In [175]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [227]:
# Configuration for this run: the shortform(s) to disambiguate and the genes
# whose Entrez-linked texts are used as extra labelled training data.
shortforms = ['FER']
genes = ['FER']
families = {}  # family name -> member gene symbols; grounded with the FPLX prefix

# `genes` and `groundings` are parallel lists (they are zipped together when
# the Entrez texts are collected), so they must always have the same length.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    for member in members:
        # Each family member contributes texts labelled with the family
        # grounding. Appending the grounding once per member (instead of once
        # per family) keeps the two lists aligned for families with several
        # members.
        genes.append(member)
        groundings.append(f'FPLX:{family}')

# Mapping from gene symbol to the PMIDs associated with that gene.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [228]:
genes

['FER']

In [229]:
# Build labelled training texts from the PMIDs listed for each gene:
# every non-empty piece of content becomes a (plain_text, grounding) pair.
# entrez_refs records every content key returned so that later cells can
# exclude those references.
entrez_texts, entrez_refs = [], set()
for gene, grounding in zip(genes, groundings):
    if gene not in all_pmids:
        # No PMIDs recorded for this gene in all_pmids; nothing to add.
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    entrez_texts.extend(
        (universal_extract_text(raw), grounding)
        for raw in content.values()
        if raw
    )
    entrez_refs.update(content.keys())

In [230]:
# For each shortform, fetch the text content behind statements whose agent
# text matches it, skip references already used for the Entrez texts, and feed
# the extracted texts to an AdeftMiner.
# NOTE: shortform_texts is rebound on every iteration, so after the loop it
# holds only the texts of the last shortform; all_texts holds the union.
miners = {}
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [
        universal_extract_text(text, contains=shortforms)
        for ref, text in content.items()
        if text and ref not in entrez_refs
    ]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    all_texts.update(shortform_texts)

It's then necessary to check whether Acromine produced the correct results. Any errors in the mined longforms must be fixed manually.

In [231]:
top = miners['FER'].top()

In [232]:
top

[('feronia', 35.81081081081081),
 ('facial emotion recognition', 6.0),
 ('fusarium ear rot', 5.5),
 ('artificial reef', 5.4),
 ('fusarium ear rot severity', 4.0),
 ('receptor like kinase feronia', 3.8571428571428568),
 ('efficiency ratio', 3.75),
 ('at a suspended artificial reef', 3.6),
 ('the receptor like kinase feronia', 3.5),
 ('kinase feronia', 3.2),
 ('food efficiency ratio', 2.8),
 ('ferrihydrite', 2.5),
 ('related', 2.5),
 ('rlk feronia', 2.5),
 ('that feronia', 2.0),
 ('the feronia', 2.0),
 ('feed efficiency ratio', 2.0),
 ('the food efficiency ratio', 2.0),
 ('qtls for fusarium ear rot', 2.0),
 ('s2 pearson s correlation coefficients between days to silking dts fusarium ear rot severity',
  2),
 ('andard deviation sd f 3 population range median and mean ranks for fusarium ear rot severity',
  2),
 ('ent nov 2016 and for b control c and exclusion e treatments at one of the artificial reef',
  2),
 ('ear rot', 1.7777777777777777),
 ('suspended artificial reef', 1.6666666666666

In [234]:
longforms0 = miners['FER'].get_longforms()

In [235]:
list(enumerate(longforms0))

[(0, ('feronia', 35.81081081081081)),
 (1, ('facial emotion recognition', 6.0)),
 (2, ('fusarium ear rot', 5.5)),
 (3, ('artificial reef', 5.4)),
 (4, ('fusarium ear rot severity', 4.0)),
 (5, ('efficiency ratio', 3.75)),
 (6, ('related', 2.5)),
 (7, ('ferrihydrite', 2.5)),
 (8, ('kinase', 1.5)),
 (9, ('zeolite ferrierite', 1.3333333333333333))]

In [236]:
# Drop mined candidates that are not real expansions of the shortform.
# These are the entries shown at positions 6 ('related') and 8 ('kinase') in
# the listing above. Filtering by name instead of by position makes the cell
# idempotent: re-running it no longer removes whatever has shifted into
# positions 6 and 8 after the first pass.
rejected_longforms = {'related', 'kinase'}
longforms0 = [(longform, score) for longform, score in longforms0
              if longform not in rejected_longforms]

In [238]:
list(enumerate(top))

[(0, ('feronia', 35.81081081081081)),
 (1, ('facial emotion recognition', 6.0)),
 (2, ('fusarium ear rot', 5.5)),
 (3, ('artificial reef', 5.4)),
 (4, ('fusarium ear rot severity', 4.0)),
 (5, ('receptor like kinase feronia', 3.8571428571428568)),
 (6, ('efficiency ratio', 3.75)),
 (7, ('at a suspended artificial reef', 3.6)),
 (8, ('the receptor like kinase feronia', 3.5)),
 (9, ('kinase feronia', 3.2)),
 (10, ('food efficiency ratio', 2.8)),
 (11, ('ferrihydrite', 2.5)),
 (12, ('related', 2.5)),
 (13, ('rlk feronia', 2.5)),
 (14, ('that feronia', 2.0)),
 (15, ('the feronia', 2.0)),
 (16, ('feed efficiency ratio', 2.0)),
 (17, ('the food efficiency ratio', 2.0)),
 (18, ('qtls for fusarium ear rot', 2.0)),
 (19,
  ('s2 pearson s correlation coefficients between days to silking dts fusarium ear rot severity',
   2)),
 (20,
  ('andard deviation sd f 3 population range median and mean ranks for fusarium ear rot severity',
   2)),
 (21,
  ('ent nov 2016 and for b control c and exclusion e 

In [237]:
longforms0

[('feronia', 35.81081081081081),
 ('facial emotion recognition', 6.0),
 ('fusarium ear rot', 5.5),
 ('artificial reef', 5.4),
 ('fusarium ear rot severity', 4.0),
 ('efficiency ratio', 3.75),
 ('ferrihydrite', 2.5),
 ('zeolite ferrierite', 1.3333333333333333)]

In [239]:
# Manually add candidates from the miner's full `top` listing (selected by
# index) that the automatic longform selection missed.
# Pairs already present are skipped so that re-running this cell does not
# append duplicates to longforms0.
manual_indices = {25}
manual_additions = [(longform, score)
                    for i, (longform, score) in enumerate(top)
                    if i in manual_indices]
longforms0.extend([pair for pair in manual_additions
                   if pair not in longforms0])

In [240]:
longforms0

[('feronia', 35.81081081081081),
 ('facial emotion recognition', 6.0),
 ('fusarium ear rot', 5.5),
 ('artificial reef', 5.4),
 ('fusarium ear rot severity', 4.0),
 ('efficiency ratio', 3.75),
 ('ferrihydrite', 2.5),
 ('zeolite ferrierite', 1.3333333333333333),
 ('tyrosine kinase', 1.3333333333333333)]

In [241]:
# Order candidates by descending score, in place. The sort is stable, so
# entries with equal scores keep their current relative order.
longforms0.sort(key=lambda pair: pair[1], reverse=True)

In [242]:
longforms, scores = zip(*longforms0)

In [243]:
longforms

('feronia',
 'facial emotion recognition',
 'fusarium ear rot',
 'artificial reef',
 'fusarium ear rot severity',
 'efficiency ratio',
 'ferrihydrite',
 'zeolite ferrierite',
 'tyrosine kinase')

In [244]:
# Seed the grounding map automatically: for every longform that gilda_ground
# resolves (first element of the result is truthy), store '<ns>:<id>' as the
# grounding and remember the name returned for that grounding.
grounding_map, names = {}, {}
for longform in longforms:
    match = gilda_ground(longform)
    if not match[0]:
        # No grounding found; leave this longform out of the map.
        continue
    curie = f'{match[0]}:{match[1]}'
    grounding_map[longform] = curie
    names[curie] = match[2]

In [245]:
grounding_map

{'tyrosine kinase': 'MESH:D011505'}

In [246]:
names

{'MESH:D011505': 'Protein-Tyrosine Kinases'}

In [247]:
# Curate the groundings with adeft's GUI helper, seeded with the automatic
# groundings computed above. Rebinds grounding_map and names and produces
# pos_labels.
# NOTE(review): presumably interactive, so the result depends on manual input;
# the curated output is pasted as a literal in a later cell for reproducibility.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [147]:
names

{'PUBCHEM:10869981': 'Fluoroestradiol F-18',
 'MESH:D004620': 'Embolism, Fat',
 'MESH:D004599': 'Electric Stimulation Therapy'}

In [148]:
grounding_map

{'estradiol': 'PUBCHEM:10869981',
 'fat embolism syndrome': 'MESH:D004620',
 'first episode schizophrenia': 'ungrounded',
 'fluoroestradiol': 'PUBCHEM:10869981',
 'free energy surface': 'ungrounded',
 'functional electrical stimulation': 'MESH:D004599'}

In [248]:
result = grounding_map, names, pos_labels

In [249]:
result

({'artificial reef': 'ungrounded',
  'efficiency ratio': 'ungrounded',
  'facial emotion recognition': 'ungrounded',
  'feronia': 'UP:Q9SCZ4',
  'ferrihydrite': 'ungrounded',
  'fusarium ear rot': 'ungrounded',
  'fusarium ear rot severity': 'ungrounded',
  'tyrosine kinase': 'HGNC:3655',
  'zeolite ferrierite': 'ungrounded'},
 {'UP:Q9SCZ4': 'FER', 'HGNC:3655': 'FER'},
 ['HGNC:3655', 'UP:Q9SCZ4'])

In [250]:
# Frozen copy of the manually curated grounding result, so that the rest of
# the notebook can run without repeating the GUI step.

# longform -> grounding ('ungrounded' when no grounding was assigned)
grounding_map = {
    'artificial reef': 'ungrounded',
    'efficiency ratio': 'ungrounded',
    'facial emotion recognition': 'ungrounded',
    'feronia': 'UP:Q9SCZ4',
    'ferrihydrite': 'ungrounded',
    'fusarium ear rot': 'ungrounded',
    'fusarium ear rot severity': 'ungrounded',
    'tyrosine kinase': 'HGNC:3655',
    'zeolite ferrierite': 'ungrounded',
}

# grounding -> standard name
names = {'UP:Q9SCZ4': 'FER', 'HGNC:3655': 'FER'}

# groundings passed to the classifier as positive labels
pos_labels = ['HGNC:3655', 'UP:Q9SCZ4']

In [251]:
grounding_dict = {'FER': grounding_map}

In [252]:
classifier = AdeftClassifier('FER', pos_labels=pos_labels)

In [253]:
# Hyperparameter grid for cross-validation; a single point, so no real search
# is performed.
param_grid = dict(C=[100.0], max_features=[10000])

In [254]:
labeler = AdeftLabeler(grounding_dict)

In [255]:
corpus = labeler.build_from_texts(shortform_texts)

In [256]:
# Add the (text, grounding) pairs collected from the Entrez PMIDs to the
# labelled corpus.
# NOTE(review): this mutates corpus in place, so re-running this cell without
# first rebuilding corpus appends the Entrez texts a second time.
corpus.extend(entrez_texts)

In [257]:
texts, labels = zip(*corpus)

In [258]:
# Fit the classifier with a cross-validated grid search over param_grid
# (cv=5, using 8 parallel jobs); the resulting scores are read from
# classifier.stats in the next cell.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-21 10:33:19] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-21 10:33:22] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9610480217432276 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [259]:
classifier.stats

{'label_distribution': {'UP:Q9SCZ4': 32, 'ungrounded': 17, 'HGNC:3655': 78},
 'f1': {'mean': 0.9610480217432276, 'std': 0.015913358823083618},
 'precision': {'mean': 0.934003304879264, 'std': 0.02025652527603831},
 'recall': {'mean': 0.9913043478260869, 'std': 0.017391304347826077}}

In [260]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [261]:
d = disamb.disambiguate(shortform_texts)

In [262]:
# Pair each text with its prediction and keep only those whose predicted
# grounding (first element of the prediction) starts with 'HGNC'.
a = [
    (text, pred)
    for text, pred in zip(shortform_texts, d)
    if pred[0].startswith('HGNC')
]

In [263]:
len(a)

55

In [264]:
a[40]

('The focal adhesion kinase (FAK) signaling cascade in cancer cells was profoundly inhibited by methyl violet 2B identified with the structure-based virtual screening. Methyl violet 2B was shown to be a non-competitive inhibitor of full-length FAK enzyme vs. ATP. It turned out that methyl violet 2B possesses extremely high kinase selectivity in biochemical kinase profiling using a large panel of kinases. Anti-proliferative activity measurement against several different cancer cells and Western blot analysis showed that this substance is capable of suppressing significantly the proliferation of cancer cells and is able to strongly block FAK/AKT/MAPK signaling pathways in a dose dependent manner at low nanomolar concentration. Especially, phosphorylation of Tyr925-FAK that is required for full activation of FAK was nearly completely suppressed even with 1nM of methyl violet 2B in A375P cancer cells. To the best of our knowledge, it has never been reported that methyl violet possesses ant

In [265]:
# Write the disambiguator to the '../results' directory under the name 'FER'.
disamb.dump('FER', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
# Reload the disambiguator that this notebook dumped to '../results' to check
# that it round-trips. The name must match the one passed to disamb.dump
# ('FER'); the previous value 'FES' referred to a model this notebook never
# writes.
# NOTE: this rebinds `d`, which earlier held the list of predictions.
d = load_disambiguator('FER', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [266]:
# Publish the trained disambiguator via adeft_indra's S3 helper.
# NOTE(review): presumably uploads to a remote bucket and needs credentials;
# confirm before running.
model_to_s3(disamb)