In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

INFO: [2020-01-17 14:40:11] /Users/albertsteppi/adeft/adeft/recognize.py - OneShotRecognizer not available. AdeftLongformScorer has not been built successfully.


In [2]:
# Shortforms to disambiguate, the genes they may refer to, and optional
# families (family name -> member genes, grounded with the FPLX prefix).
shortforms = ['RET']
genes = ['RET']
families = {}
# `genes` and `groundings` are zipped together in a later cell, so they must
# stay parallel: exactly one grounding per gene, in the same order.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    for member in members:
        genes.append(member)
        # Every member carries its family's grounding. Appending the family
        # grounding only once per family would leave `groundings` shorter than
        # `genes`, and zip() would then mispair or silently drop members.
        groundings.append(f'FPLX:{family}')
# Loaded JSON is indexed by gene name in a later cell to obtain PMIDs.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
# Collect (extracted text, grounding) pairs for the PMIDs listed under each
# gene, and record which content references were used.
entrez_texts, entrez_refs = [], set()
for gene, grounding in zip(genes, groundings):
    # Genes with no entry in all_pmids contribute nothing.
    if gene not in all_pmids:
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    # Keep only non-empty content.
    entrez_texts += [
        (universal_extract_text(text), grounding)
        for text in content.values()
        if text
    ]
    # These references are excluded from the mined texts in a later cell.
    entrez_refs.update(content.keys())

In [4]:
# For each shortform, fetch the texts of statements whose agent text matches
# it and feed them to an AdeftMiner.
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    # Skip empty content and any reference already present in entrez_refs.
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # set.union() returns a new set and leaves all_texts unchanged, so the
    # texts were never accumulated; update() adds them in place.
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [5]:
# Candidate (longform, score) pairs for 'RET' as returned by the miner's top() method.
top = miners['RET'].top()

In [6]:
top

[('rearranged during transfection', 105.7983193277311),
 ('resistance exercise training', 37.97297297297298),
 ('the rearranged during transfection', 36.36585365853659),
 ('reverse electron transport', 27.666666666666668),
 ('transfer', 19.487179487179485),
 ('resonance energy transfer', 17.736842105263154),
 ('reverse electron transfer', 17.684210526315788),
 ('ret proto oncogene', 11.846153846153847),
 ('and rearranged during transfection', 11.538461538461537),
 ('transfection', 9.621212121212125),
 ('mutations in the rearranged during transfection', 8.90909090909091),
 ('of the rearranged during transfection', 8.333333333333334),
 ('receptor tyrosine kinase', 7.555555555555555),
 ('reticulocyte', 5.714285714285714),
 ('proto oncogene', 5.125),
 ('the reverse electron transport', 5.0),
 ('mutations of the rearranged during transfection', 4.666666666666667),
 ('receptor rearranged during transfection', 4.333333333333333),
 ('the proto oncogene rearranged during transfection', 4.333333

In [24]:
top[50]

('postconcussion syndrome', 2.5)

In [12]:
# Longforms returned by the miner's get_longforms() method; they are reviewed
# and edited by hand in the cells that follow.
longforms0 = miners['RET'].get_longforms()

In [13]:
longforms0

[('rearranged during transfection', 105.7983193277311),
 ('resistance exercise training', 37.97297297297298),
 ('reverse electron transport', 27.666666666666668),
 ('transfer', 19.487179487179485),
 ('ret proto oncogene', 11.846153846153847),
 ('receptor tyrosine kinase', 7.555555555555555),
 ('reticulocyte', 5.714285714285714),
 ('retroperitoneal', 4.0),
 ('rapid ethanol tolerance', 4.0),
 ('receptor', 3.0),
 ('gene', 2.0),
 ('retene', 2.0),
 ('retardation', 2.0),
 ('transformation', 2.0),
 ('y mos main intracellular pathways activating by proto oncogene tyrosine protein kinase receptor ret',
  2),
 ('re arranged during t ransfection', 1.3333333333333333)]

In [14]:
# Positions in the longforms0 listing that were rejected on manual review.
rejected = {3, 9, 10, 12, 13, 14, 15}
longforms0 = [pair for idx, pair in enumerate(longforms0) if idx not in rejected]

In [15]:
longforms0

[('rearranged during transfection', 105.7983193277311),
 ('resistance exercise training', 37.97297297297298),
 ('reverse electron transport', 27.666666666666668),
 ('ret proto oncogene', 11.846153846153847),
 ('receptor tyrosine kinase', 7.555555555555555),
 ('reticulocyte', 5.714285714285714),
 ('retroperitoneal', 4.0),
 ('rapid ethanol tolerance', 4.0),
 ('retene', 2.0)]

In [17]:
list(enumerate(top))

[(0, ('rearranged during transfection', 105.7983193277311)),
 (1, ('resistance exercise training', 37.97297297297298)),
 (2, ('the rearranged during transfection', 36.36585365853659)),
 (3, ('reverse electron transport', 27.666666666666668)),
 (4, ('transfer', 19.487179487179485)),
 (5, ('resonance energy transfer', 17.736842105263154)),
 (6, ('reverse electron transfer', 17.684210526315788)),
 (7, ('ret proto oncogene', 11.846153846153847)),
 (8, ('and rearranged during transfection', 11.538461538461537)),
 (9, ('transfection', 9.621212121212125)),
 (10, ('mutations in the rearranged during transfection', 8.90909090909091)),
 (11, ('of the rearranged during transfection', 8.333333333333334)),
 (12, ('receptor tyrosine kinase', 7.555555555555555)),
 (13, ('reticulocyte', 5.714285714285714)),
 (14, ('proto oncogene', 5.125)),
 (15, ('the reverse electron transport', 5.0)),
 (16, ('mutations of the rearranged during transfection', 4.666666666666667)),
 (17, ('receptor rearranged during t

In [18]:
# Add entries 5 and 6 of `top` ('resonance energy transfer' and
# 'reverse electron transfer'), which get_longforms() did not return.
longforms0.extend(top[5:7])

In [19]:
longforms0

[('rearranged during transfection', 105.7983193277311),
 ('resistance exercise training', 37.97297297297298),
 ('reverse electron transport', 27.666666666666668),
 ('ret proto oncogene', 11.846153846153847),
 ('receptor tyrosine kinase', 7.555555555555555),
 ('reticulocyte', 5.714285714285714),
 ('retroperitoneal', 4.0),
 ('rapid ethanol tolerance', 4.0),
 ('retene', 2.0),
 ('resonance energy transfer', 17.736842105263154),
 ('reverse electron transfer', 17.684210526315788)]

In [20]:
# Order by score, highest first; ties keep their existing relative order.
longforms0.sort(key=lambda pair: pair[1], reverse=True)

In [21]:
# Split the (longform, score) pairs into two parallel tuples.
longforms, scores = zip(*longforms0)

In [22]:
longforms

('rearranged during transfection',
 'resistance exercise training',
 'reverse electron transport',
 'resonance energy transfer',
 'reverse electron transfer',
 'ret proto oncogene',
 'receptor tyrosine kinase',
 'reticulocyte',
 'retroperitoneal',
 'rapid ethanol tolerance',
 'retene')

In [23]:
# Seed the grounding map with gilda_ground results. Only longforms whose
# result has a truthy first element are recorded.
grounding_map, names = {}, {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if not grounding[0]:
        continue
    # Key of the form '<first element>:<second element>', e.g. 'MESH:D012156'.
    curie = f'{grounding[0]}:{grounding[1]}'
    grounding_map[longform] = curie
    names[curie] = grounding[2]

In [24]:
grounding_map

{'reticulocyte': 'MESH:D012156'}

In [25]:
names

{'MESH:D012156': 'Reticulocytes'}

In [74]:
# Pass the longforms, scores and preliminary groundings to adeft's
# ground_with_gui; its return value replaces grounding_map and names and
# also supplies pos_labels.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    scores,
    grounding_map=grounding_map,
    names=names,
)

In [75]:
# Bundle the three values so they can be displayed together; the displayed
# tuple is pasted back as a literal in a later cell.
result = (grounding_map, names, pos_labels)

In [76]:
result

({'rapid ethanol tolerance': 'ungrounded',
  'rearranged during transfection': 'HGNC:9967',
  'receptor tyrosine kinase': 'HGNC:9967',
  'resistance exercise training': 'ungrounded',
  'resonance energy transfer': 'ungrounded',
  'ret proto oncogene': 'HGNC:9967',
  'retene': 'ungrounded',
  'reticulocyte': 'ungrounded',
  'retroperitoneal': 'ungrounded',
  'reverse electron transfer': 'ungrounded',
  'reverse electron transport': 'ungrounded'},
 {'HGNC:9967': 'RET'},
 ['HGNC:9967'])

In [77]:
# Literal copy of the values displayed for `result`, so the remaining cells
# have these values without going through the grounding GUI call.

# Longform -> grounding ('ungrounded' where no grounding was assigned).
grounding_map = {
    'rapid ethanol tolerance': 'ungrounded',
    'rearranged during transfection': 'HGNC:9967',
    'receptor tyrosine kinase': 'HGNC:9967',
    'resistance exercise training': 'ungrounded',
    'resonance energy transfer': 'ungrounded',
    'ret proto oncogene': 'HGNC:9967',
    'retene': 'ungrounded',
    'reticulocyte': 'ungrounded',
    'retroperitoneal': 'ungrounded',
    'reverse electron transfer': 'ungrounded',
    'reverse electron transport': 'ungrounded',
}
# Grounding -> name.
names = {'HGNC:9967': 'RET'}
# Value later passed to AdeftClassifier as pos_labels.
pos_labels = ['HGNC:9967']

In [78]:
# Grounding maps keyed by shortform; this dictionary is what AdeftLabeler and
# AdeftDisambiguator receive in later cells.
grounding_dict = {'RET': grounding_map}

In [79]:
# Classifier for the 'RET' shortform, with ['HGNC:9967'] passed as pos_labels.
classifier = AdeftClassifier('RET', pos_labels=pos_labels)

In [80]:
# Search space handed to classifier.cv: a single candidate value per parameter.
param_grid = dict(C=[100.0], max_features=[10000])

In [81]:
# Labeler constructed from the shortform -> grounding map dictionary.
labeler = AdeftLabeler(grounding_dict)

In [82]:
# Build the corpus from shortform_texts. NOTE: shortform_texts is the variable
# left over from the last iteration of the mining loop, so with more than one
# shortform only the last shortform's texts would be used here.
corpus = labeler.build_from_texts(shortform_texts)

In [83]:
# Add the Entrez-derived (text, grounding) pairs to the corpus.
# NOTE(review): extend() mutates corpus, so re-running this cell alone would
# add the same pairs again — confirm before re-executing out of order.
corpus.extend(entrez_texts)

In [84]:
# Split the corpus pairs into parallel tuples of texts and labels.
texts, labels = zip(*corpus)

In [85]:
# Run the grid search over param_grid with cv=5 and n_jobs=8; the resulting
# statistics are displayed from classifier.stats in the next cell.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-17 14:57:15] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-17 14:57:23] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9920518376807328 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [86]:
classifier.stats

{'label_distribution': {'ungrounded': 92, 'HGNC:9967': 872},
 'f1': {'mean': 0.9920518376807328, 'std': 0.004873855908121181},
 'precision': {'mean': 0.984275512009176, 'std': 0.00961380996783067},
 'recall': {'mean': 1.0, 'std': 0.0}}

In [87]:
# Combine the classifier, the grounding dictionary and the names mapping into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [88]:
# Disambiguate every text in shortform_texts; d is zipped against
# shortform_texts in the next cell.
d = disamb.disambiguate(shortform_texts)

In [89]:
# Texts whose prediction's first element does not start with 'HGNC'.
a = [
    text
    for pred, text in zip(d, shortform_texts)
    if not pred[0].startswith('HGNC')
]

In [92]:
len(a)

113

In [52]:
# Entries of `a` that contain the phrase 'tyrosine kinase'.
y = [text for text in a if 'tyrosine kinase' in text]

In [54]:
y[0]

'Non-small-cell lung cancer (NSCLC) patients with mutated or rearranged oncogene drivers can be treated with upfront selective inhibitors achieving higher response rates and longer survival than chemotherapy. The RET gene can undergo chromosomal rearrangements in 1%–2% of all NSCLC patients, involving various upstream fusion partners such as KIF5B, CCDC6, NCOA4, and TRIM33. Many multikinase inhibitors are active against rearranged RET. Cabozantinib, vandetanib, sunitinib, lenvatinib, and nintedanib achieved tumor responses in about 30% of these patients in retrospective studies. Prospective phase II trials investigated the activity and toxicity of cabozantinib, vandetanib, sorafenib, and lenvatinib, and did not reach significantly higher response rates. VEGFR and EGFR inhibition represented the main ways of developing off-target toxicity. An intrinsic resistance emerged according to the type of RET fusion partners, as KIF5B-RET fusion is the most resistant. Also acquired mutations in r

In [93]:
# Dump the disambiguator with the name 'RET' and the path '../results'.
disamb.dump('RET', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
# Load back the model this notebook dumps as 'RET' into '../results'. The
# earlier name 'ARG' is not produced anywhere in this notebook, so the load
# depended on an unrelated artifact. Note that this rebinds `d`, which
# previously held the predictions for shortform_texts.
d = load_disambiguator('RET', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
# NOTE(review): loads a disambiguator named 'AR' with no path argument and
# rebinds `a`, which earlier held the list of non-HGNC texts — confirm this
# scratch cell is still wanted in the RET notebook.
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
# Take the step registered as 'logit' from the classifier's estimator
# (presumably a scikit-learn Pipeline, given named_steps — TODO confirm).
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [94]:
# Hand the disambiguator to model_to_s3 (imported from adeft_indra.s3;
# presumably uploads the model to S3 — confirm).
model_to_s3(disamb)