In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [2]:
# Shortform(s) to disambiguate, the gene symbols they may denote, and any
# gene families (family name -> list of member gene symbols).
shortforms = ['FGR']
genes = ['FGR']
families = {}

# `groundings` is kept index-aligned with `genes`: the two lists are zipped
# together later when the Entrez texts are labelled.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # Repeat the family grounding once per member. Appending it only once
    # would leave `groundings` shorter than `genes`, so a later
    # zip(genes, groundings) would mispair members and silently drop the rest.
    groundings.extend([f'FPLX:{family}'] * len(members))

# Mapping from gene symbol to the PMIDs curated for it in Entrez.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
# Collect (extracted text, grounding) pairs for every gene that has curated
# PMIDs, and remember which content references were used so they can be
# excluded from the mined texts later.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    if gene not in all_pmids:
        # No curated PMIDs for this gene: nothing to add.
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    labelled = [(universal_extract_text(raw), grounding)
                for raw in content.values() if raw]
    entrez_texts += labelled
    entrez_refs |= set(content)

In [4]:
# For each shortform: fetch the texts of statements whose agent text matches
# it, skip references already used for the Entrez corpus, and feed the
# remaining texts to a dedicated AdeftMiner.
# Note: after the loop `shortform_texts` holds the texts of the last
# shortform processed; `all_texts` accumulates the texts of all of them.
miners = {}
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [
        universal_extract_text(text, contains=shortforms)
        for ref, text in content.items()
        if text and ref not in entrez_refs
    ]
    miner = miners[shortform] = AdeftMiner(shortform)
    miner.process_texts(shortform_texts)
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [5]:
# Candidate longforms mined for 'FGR'; the displayed value is a list of
# (phrase, score) tuples.
top = miners['FGR'].top()

In [6]:
top

[('fetal growth restriction', 192.85142857142858),
 ('of fetal growth restriction', 26.275862068965512),
 ('and fetal growth restriction', 17.81818181818182),
 ('growth restriction', 17.427230046948353),
 ('with fetal growth restriction', 9.23076923076923),
 ('fluorescence guided resection', 7.0),
 ('induced fetal growth restriction', 6.5),
 ('to fetal growth restriction', 5.25),
 ('functional group richness', 5.142857142857142),
 ('foetal growth restriction', 5.0),
 ('in fetal growth restriction', 5.0),
 ('preeclampsia pe and fetal growth restriction', 5.0),
 ('fgr fetal growth restriction', 4.857142857142858),
 ('by fetal growth restriction', 4.444444444444445),
 ('fetal growth retardation', 4.0),
 ('associated with fetal growth restriction', 4.0),
 ('causes of fetal growth restriction', 4.0),
 ('such as fetal growth restriction', 3.6666666666666665),
 ('or fetal growth restriction', 3.6),
 ('pe and fetal growth restriction', 3.25),
 ('including fetal growth restriction', 3.0),
 ('co

In [7]:
# Longforms selected by the miner for 'FGR', as (phrase, score) tuples
# (displayed in the next cell).
longforms0 = miners['FGR'].get_longforms()

In [8]:
longforms0

[('fetal growth restriction', 192.85142857142858),
 ('fluorescence guided resection', 7.0),
 ('functional group richness', 5.142857142857142),
 ('fetal growth retardation', 4.0),
 ('saponins from actinidia arguta radix saa total flavonoids from glycyrrhizae radix et rhizoma',
  2),
 ('ratio', 1.3333333333333333)]

In [9]:
# Manual curation: drop the last two entries of longforms0. In the output
# displayed above these are the long "saponins from actinidia arguta ..."
# phrase and 'ratio'; the first four entries are kept.
longforms = longforms0[:-2]

In [12]:
list(enumerate(top))

[(0, ('fetal growth restriction', 192.85142857142858)),
 (1, ('of fetal growth restriction', 26.275862068965512)),
 (2, ('and fetal growth restriction', 17.81818181818182)),
 (3, ('growth restriction', 17.427230046948353)),
 (4, ('with fetal growth restriction', 9.23076923076923)),
 (5, ('fluorescence guided resection', 7.0)),
 (6, ('induced fetal growth restriction', 6.5)),
 (7, ('to fetal growth restriction', 5.25)),
 (8, ('functional group richness', 5.142857142857142)),
 (9, ('foetal growth restriction', 5.0)),
 (10, ('in fetal growth restriction', 5.0)),
 (11, ('preeclampsia pe and fetal growth restriction', 5.0)),
 (12, ('fgr fetal growth restriction', 4.857142857142858)),
 (13, ('by fetal growth restriction', 4.444444444444445)),
 (14, ('fetal growth retardation', 4.0)),
 (15, ('associated with fetal growth restriction', 4.0)),
 (16, ('causes of fetal growth restriction', 4.0)),
 (17, ('such as fetal growth restriction', 3.6666666666666665)),
 (18, ('or fetal growth restriction'

In [13]:
# Manual curation: add hand-picked candidates from `top` to the longform
# list. Position 9 is 'foetal growth restriction' in the enumerated listing
# displayed above.
selected_positions = [9]
longforms += [(phrase, weight)
              for position, (phrase, weight) in enumerate(top)
              if position in selected_positions]

In [14]:
# Order the (longform, score) pairs by score, highest first; pairs with equal
# scores keep their existing relative order.
longforms.sort(key=lambda pair: pair[1], reverse=True)

In [15]:
# Split the (longform, score) pairs into two parallel tuples.
# NOTE: this rebinds `longforms` from a list of pairs to a tuple of strings
# (see the displayed value in the next cell), so this cell and the
# extend/sort cells before it cannot be re-run on their own; re-create
# `longforms` from `longforms0` first.
longforms, scores = zip(*longforms)

In [16]:
longforms

('fetal growth restriction',
 'fluorescence guided resection',
 'functional group richness',
 'foetal growth restriction',
 'fetal growth retardation')

In [17]:
# Automatic first pass at grounding: look each longform up with gilda_ground
# and record the result only when its first element is truthy. The label is
# built as '<element 0>:<element 1>' and the name is taken from element 2.
grounding_map, names = {}, {}
for longform in longforms:
    match = gilda_ground(longform)
    if not match[0]:
        # No usable grounding returned for this longform.
        continue
    label = f'{match[0]}:{match[1]}'
    grounding_map[longform] = label
    names[label] = match[2]

In [18]:
grounding_map

{'fetal growth restriction': 'MESH:D005317',
 'fetal growth retardation': 'MESH:D005317'}

In [19]:
names

{'MESH:D005317': 'Fetal Growth Retardation'}

In [21]:
# Manual review step: ground_with_gui receives the longforms, their scores
# and the automatically derived grounding_map/names; its three return values
# replace grounding_map and names and define pos_labels.
# Presumably this opens a GUI and depends on interactive input, so it is not
# reproducible from code alone; the outcome of this session is pinned as a
# literal in a later cell (In [24]).
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [22]:
result = (grounding_map, names, pos_labels)

In [23]:
result

({'fetal growth restriction': 'MESH:D005317',
  'fetal growth retardation': 'MESH:D005317',
  'fluorescence guided resection': 'ungrounded',
  'foetal growth restriction': 'MESH:D005317',
  'functional group richness': 'ungrounded'},
 {'MESH:D005317': 'Fetal Growth Retardation'},
 ['MESH:D005317'])

In [24]:
# Pinned outcome of the manual grounding step, so the rest of the notebook
# can run without repeating the interactive curation.
grounding_map = {
    'fetal growth restriction': 'MESH:D005317',
    'fetal growth retardation': 'MESH:D005317',
    'fluorescence guided resection': 'ungrounded',
    'foetal growth restriction': 'MESH:D005317',
    'functional group richness': 'ungrounded',
}
names = {'MESH:D005317': 'Fetal Growth Retardation'}
pos_labels = ['MESH:D005317']

In [25]:
# Register the gene sense itself: give 'HGNC:3697' the standard name 'FGR'
# and treat it as a positive label alongside the curated ones.
names['HGNC:3697'] = 'FGR'
# Guard the append so that re-running this cell does not add the same
# positive label a second time.
if 'HGNC:3697' not in pos_labels:
    pos_labels.append('HGNC:3697')

In [26]:
grounding_dict = {'FGR': grounding_map}

In [27]:
classifier = AdeftClassifier('FGR', pos_labels=pos_labels)

In [28]:
param_grid = {'C': [100.0], 'max_features': [10000]}

In [29]:
labeler = AdeftLabeler(grounding_dict)

In [30]:
# Build the labelled corpus from the mined texts with the labeler created
# from grounding_dict.
# NOTE(review): `shortform_texts` is the variable left over from the last
# iteration of the shortform loop, so with more than one shortform only the
# last shortform's texts would be used here — confirm this is intended.
corpus = labeler.build_from_texts(shortform_texts)

In [31]:
# Add the Entrez-derived (text, grounding) pairs to the training corpus.
# NOTE: extend mutates `corpus` in place, so re-running only this cell adds
# the Entrez texts a second time; re-run the build_from_texts cell first.
corpus.extend(entrez_texts)

In [32]:
texts, labels = zip(*corpus)

In [33]:
# Grid search over param_grid (the log below reports the best f1 score);
# cv=5 is presumably the number of cross-validation folds and n_jobs=8 the
# number of parallel workers — see the adeft AdeftClassifier.cv docs.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-27 12:30:41] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-27 12:30:45] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9939967916548713 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [34]:
classifier.stats

{'label_distribution': {'MESH:D005317': 150, 'ungrounded': 8, 'HGNC:3697': 93},
 'f1': {'mean': 0.9939967916548713, 'std': 0.00799643508150585},
 'precision': {'mean': 0.9882225148123766, 'std': 0.015673297062414965},
 'recall': {'mean': 1.0, 'std': 0.0}}

In [35]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [36]:
# Run the trained disambiguator over the mined texts. The result is consumed
# pairwise with shortform_texts in the next cell, where the first element of
# each prediction is compared against a grounding label.
d = disamb.disambiguate(shortform_texts)

In [37]:
# Keep the mined texts whose predicted grounding (first element of each
# prediction) is the FGR gene, for manual spot-checking.
a = [
    text
    for prediction, text in zip(d, shortform_texts)
    if prediction[0] == 'HGNC:3697'
]

In [39]:
a[0]

'We next sought to determine whether specimens harboring mutant  CSF3R  show in vitro sensitivity to small-molecule inhibitors of kinases or siRNA directed against kinases that become dysregulated downstream of mutant CSF3R. Analysis of cells from Patient 3, who had CNL with the  CSF3R  S783fs mutation ( Table S3 and Fig. S2 in the Supplementary Appendix ), revealed dramatic sensitivity to the multikinase inhibitor dasatinib (Sprycel, Bristol-Myers Squibb) but no sensitivity to inhibitors of JAK family kinases ( Fig. 1B ). Further interrogation with our panel of tyrosine kinase– specific siRNAs revealed sensitivity to silencing of tyrosine kinase nonreceptor 2 (TNK2) and an SRC family kinase, FGR, both of which are potently inhibited by dasatinib  36  ( Fig. 1C ). We also performed drug-sensitivity profiling on samples from two patients with the  CSF3R  T618I mutation (one with CNL and one with ETP-T-ALL). In contrast to the drug-sensitivity pattern in the patients with truncation muta

In [40]:
disamb.dump('FGR', '../results')

In [68]:
from adeft.disambiguate import load_disambiguator, load_disambiguator_directly

In [42]:
disamb.classifier.training_set_digest

'baeb1ba13324f4192620577a2d3e6ab9'

In [41]:
# Hand the FGR disambiguator to adeft_indra's model_to_s3 helper (presumably
# this uploads the model to S3).
# NOTE(review): the identical call appears again further down with the same
# execution count (In [41]); one of the two is presumably a stale copy.
model_to_s3(disamb)

In [76]:
# NOTE(review): in top-to-bottom order `d` was last assigned the return value
# of disamb.disambiguate(shortform_texts), not a disambiguator object, and
# the output recorded below lists groundings (CHEBI:..., HGNC:651, ...) that
# are not in the FGR grounding map. This cell looks like a leftover from a
# session in which `d` was a different loaded model — confirm, then rebind
# `d` explicitly or remove the cell.
d.disambiguate(texts[0])

('CHEBI:CHEBI:29016',
 'arginine',
 {'CHEBI:CHEBI:29016': 0.9667882794828576,
  'CHEBI:CHEBI:79': 0.004343534773612251,
  'HGNC:651': 0.00798509086185913,
  'HGNC:77': 0.00973784879530277,
  'HGNC:9965': 0.011145246086368127})

In [79]:
print(d.info())

Disambiguation model for ARG

Produces the disambiguations:
	(-)-Arctigenin*	CHEBI:CHEBI:79
	ABL2*	HGNC:77
	AREG*	HGNC:651
	RERE*	HGNC:9965
	arginine*	CHEBI:CHEBI:29016

Training data had class balance:
	AREG*	157
	ABL2*	88
	RERE*	39
	arginine*	15
	(-)-Arctigenin*	2

Classification Metrics:
	F1 score:	0.81188
	Precision:	0.82394
	Recall:		0.8205

* Positive labels
See Docstring for explanation



In [32]:
a = load_disambiguator('AR')

In [33]:
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
logit = d.classifier.estimator.named_steps['logit']

In [37]:
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')

In [41]:
model_to_s3(disamb)

In [44]:
# NOTE(review): 'FPLX:RAC' is not among the labels in the FGR classifier's
# label_distribution shown earlier ('MESH:D005317', 'ungrounded',
# 'HGNC:3697'), so on a fresh top-to-bottom run this lookup presumably fails.
# The recorded output appears to come from a different model — confirm.
classifier.feature_importances()['FPLX:RAC']

[('rac1', 3.0203950681340763),
 ('cdc42', 0.46463658443237615),
 ('cells', 0.26067826443238284),
 ('rhoa', 0.2284452268653803),
 ('rho', 0.201414697794678),
 ('rac1b', 0.1862119230497441),
 ('actin', 0.16300556582714468),
 ('gtpases', 0.15543881285531247),
 ('rictor', 0.14896197766149943),
 ('pak1', 0.14776120596579687),
 ('jnk', 0.13802477317328446),
 ('activation', 0.13739172186857945),
 ('s1p', 0.12195584486540219),
 ('thrombin', 0.12054088946040988),
 ('endothelial', 0.11948523924516169),
 ('proteins', 0.11834598588132215),
 ('cell', 0.11418065253318789),
 ('phox', 0.11144187989246301),
 ('chimaerin', 0.1102266733731222),
 ('elmo1', 0.10886660370678143),
 ('vav', 0.10685654194092864),
 ('ros', 0.1017412296369346),
 ('gtp', 0.09970566367485924),
 ('escc', 0.09632367768212988),
 ('ras', 0.09346518511044424),
 ('nef', 0.08938792674953275),
 ('ubiquitylation', 0.0893305800391189),
 ('p85', 0.08885763883676143),
 ('gtpase', 0.08833886568389247),
 ('figure', 0.087475101884638),
 ('comple

In [46]:
d = load_disambiguator('ALK', '../results')



In [47]:
d.info()

'Disambiguation model for ALK\n\nProduces the disambiguations:\n\tALK*\tHGNC:427\n\tAlkaline Phosphatase*\tMESH:D000469\n\tRSTK1*\tFPLX:RSTK1\n\nTraining data had class balance:\n\tALK*\t1181\n\tRSTK1*\t155\n\tAlkaline Phosphatase*\t6\n\nClassification Metrics:\n\tF1 score:\t0.97493\n\tPrecision:\t0.97367\n\tRecall:\t\t0.97767\n\n* Positive labels\nSee Docstring for explanation\n'

In [48]:
print(d.info())

Disambiguation model for ALK

Produces the disambiguations:
	ALK*	HGNC:427
	Alkaline Phosphatase*	MESH:D000469
	RSTK1*	FPLX:RSTK1

Training data had class balance:
	ALK*	1181
	RSTK1*	155
	Alkaline Phosphatase*	6

Classification Metrics:
	F1 score:	0.97493
	Precision:	0.97367
	Recall:		0.97767

* Positive labels
See Docstring for explanation



In [49]:
model_to_s3(d)

In [58]:
d = load_disambiguator('TAK', '../results')



In [59]:
print(d.info())

Disambiguation model for TAK

Produces the disambiguations:
	MAP3K7*	HGNC:6859
	Takayasu Arteritis*	MESH:D013625

Training data had class balance:
	MAP3K7*	327
	Ungrounded*	241
	Ungrounded	36
	Takayasu Arteritis*	13

Classification Metrics:
	F1 score:	0.94043
	Precision:	0.93685
	Recall:		0.94658

* Positive labels
See Docstring for explanation



In [57]:
model_to_s3(d)

In [60]:
from adeft import available_shortforms

In [63]:
print(d.info())

Disambiguation model for TAK

Produces the disambiguations:
	MAP3K7*	HGNC:6859
	Takayasu Arteritis*	MESH:D013625

Training data had class balance:
	MAP3K7*	327
	Ungrounded*	241
	Ungrounded	36
	Takayasu Arteritis*	13

Classification Metrics:
	F1 score:	0.94043
	Precision:	0.93685
	Recall:		0.94658

* Positive labels
See Docstring for explanation



In [64]:
# NOTE(review): the recorded output of this cell is
# "TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'",
# so the cell fails for the model loaded into `d` here. Either fix the cause
# or remove the cell so that Run All completes.
d.classifier.feature_importances()

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [2]:
from adeft import __version__

In [3]:
__version__

'0.5.5'

In [4]:
from adeft.disambiguate import load_disambiguator_directly

In [59]:
d = load_disambiguator_directly('../results/TEK/')

In [60]:
print(d.info())

Disambiguation model for TEK

Produces the disambiguations:
	TEK*	HGNC:11724

Training data had class balance:
	TEK*	217
	Ungrounded	9

Classification Metrics:
	F1 score:	0.99316
	Precision:	0.98646
	Recall:		1.0

* Positive labels
See Docstring for explanation



In [61]:
model_to_s3(d)

In [32]:
d.grounding_dict

{'TEC': {'ternary elongation complex': 'transcription elongation complex',
  'thermal expansion coefficient': 'ungrounded',
  'thromboembolic complications': 'ungrounded',
  'thymic epithelial cells': 'NCIT:C33771',
  'thymic epithelial cells line': 'NCIT:C33771',
  'thyrocyte': 'MESH:D000072637',
  'thyroid epithelial cells': 'MESH:D000072637',
  'tissue engineered construct': 'ungrounded',
  'total electron content': 'ungrounded',
  'tracheal epithelial cells': 'ungrounded',
  'transcription elongation complex': 'transcription elongation complex',
  'triethyl citrate': 'PUBCHEM:6506',
  'tubular epithelial cells': 'NCIT:C61147',
  'tumor associated endothelial cells': 'NCIT:C37088',
  'tumor endothelial cells': 'NCIT:C37088'},
 'TECs': {'thymic epithelial cells': 'NCIT:C33771',
  'thymus epithelial cells': 'NCIT:C33771',
  'tissue engineered constructs': 'ungrounded',
  'transcription elongation complexes': 'transcription elongation complex',
  'tubular epithelial cells': 'NCIT:C6114

In [25]:
!python -m adeft.download --update

100% [......................................................] 1181008 / 1181008

In [26]:
from adeft import available_shortforms

In [27]:
len(available_shortforms)

72

In [28]:
available_shortforms

{'PC': 'PC',
 'DOG1': 'DOG1',
 'TEK': 'TEK',
 'EMT': 'EMT',
 'SP': 'SP',
 'PCS': 'PCS',
 'CKI': 'CKI',
 'PE': 'PE',
 'RET': 'RET',
 'TEC': 'TEC',
 'ROS': 'ROS',
 'NP': 'NP:NP_S',
 'NPs': 'NP:NP_S',
 'MS': 'MS',
 'MT': 'MT',
 'BP': 'BP',
 'GH': 'GH',
 'AD': 'AD',
 'GT': 'GT',
 'SNS': 'SNS',
 'GARP': 'GARP',
 'DA': 'DA',
 'BCR': 'BCR',
 'GR': 'GR',
 'HIR': 'HIR',
 'IR': 'IR',
 'HK2': 'HK2',
 'ARF': 'ARF',
 'CS': 'CS',
 'UFO': 'UFO',
 'LAK': 'LAK',
 'EC': 'EC',
 'ARG': 'ARG',
 'MOS': 'MOS',
 'STD': 'STD',
 'PD1': 'PD1',
 'TGH': 'TGH',
 'PKD': 'PKD',
 'MIP': 'MIP',
 'RA': 'RA',
 'PCP': 'PCP',
 'EAG': 'EAG',
 'PI': 'PI',
 'PS': 'PS',
 'PA': 'PA',
 'MB': 'MB',
 'HA': 'HA',
 'FES': 'FES',
 'AR': 'AR',
 'HR': 'HR',
 'NE': 'NE',
 'UBC': 'UBC',
 'GSC': 'GSC',
 'AA': 'AA',
 'FER': 'FER',
 'NIS': 'NIS',
 'GC': 'GC',
 'DLK': 'DLK',
 'HK1': 'HK1',
 'CM': 'CM',
 'RB': 'RB:R_B',
 'Rb': 'RB:R_B',
 'LH': 'LH',
 'ER': 'ER',
 'TF': 'TF',
 'PGP': 'PGP',
 'GCA': 'GCA',
 'RK': 'RK',
 'RAC': 'RAC',
 'MCT': 'M

In [33]:
'TEC' in available_shortforms

True

In [34]:
'TECs' in available_shortforms

False

In [35]:
!python -m adeft.download --update

Shortform TEC has multiple adeft modelsThis may lead to unexpected behavior
Shortform TEC has multiple adeft modelsThis may lead to unexpected behavior
100% [........................................................] 194559 / 194559Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/albertsteppi/adeft/adeft/download/__main__.py", line 18, in <module>
    download_models(update=args.update)
  File "/Users/albertsteppi/adeft/adeft/download/download.py", line 65, in download_models
    out=resource_path)
  File "/Users/albertsteppi/.virtualenvs/py37/lib/python3.7/site-packages/wget.py", line 526, in download
    (tmpfile, headers) = ulib.urlretrieve(binurl, tmpfile, callback)
  

In [36]:
!python -m adeft.download --update

100% [......................................................] 1181008 / 1181008