In [12]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [13]:
# Shortform(s) to model, plus the genes and gene families they may refer to.
shortforms = ['DGK']
genes = ['DGUOK']
families = {'DGK': ['DGKA', 'DGKB', 'DGKD', 'DGKG', 'DGKH', 'DGKQ', 'DGKZ', 'DGKE',
                    'DGKI', 'DGKK']}
# `groundings` is index-aligned with `genes`: the next cell pairs them with
# zip(genes, groundings), so both lists must always have the same length.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # Each member gene is labeled with its family's grounding. Adding only a
    # single entry per family would leave `groundings` shorter than `genes`,
    # and zip() would then silently drop every member after the first.
    groundings.extend([f'FPLX:{family}'] * len(members))
assert len(genes) == len(groundings), 'genes and groundings must stay aligned'
# Mapping from gene name to a list of PMIDs (looked up per gene in the next cell).
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [14]:
# Build (text, grounding) training pairs from the PMIDs recorded for each gene
# in `all_pmids`, and remember which content references were consumed.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    if gene not in all_pmids:
        # No PMIDs recorded for this gene; nothing to add.
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    labeled = [(universal_extract_text(raw), grounding)
               for raw in content.values() if raw]
    entrez_texts.extend(labeled)
    # Track the references so the shortform search can skip these texts later.
    entrez_refs.update(content.keys())

In [15]:
# For every shortform, fetch the texts of statements whose agent text matches
# it, mine candidate longforms from them, and pool the texts in `all_texts`.
miners = {}
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = []
    for ref, text in content.items():
        # Skip empty content and anything already used for the entrez texts.
        if not text or ref in entrez_refs:
            continue
        shortform_texts.append(universal_extract_text(text, contains=shortforms))
    miner = miners[shortform] = AdeftMiner(shortform)
    miner.process_texts(shortform_texts)
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [16]:
top = miners['DGK'].top()

In [17]:
top

[('diacylglycerol kinase', 145.79661016949152),
 ('kinase', 74.41624365482234),
 ('dag kinase', 20.08),
 ('of diacylglycerol kinase', 15.411764705882353),
 ('by diacylglycerol kinase', 12.666666666666666),
 ('and diacylglycerol kinase', 9.999999999999998),
 ('dg kinase', 7.666666666666666),
 ('the diacylglycerol kinase', 7.0),
 ('by dag kinase', 6.5),
 ('diacylglycerol dag kinase', 6.0),
 ('a diacylglycerol kinase', 3.0),
 ('diacylglycerol dg kinase', 3),
 ('phospholipase d pld and diacylglycerol kinase', 2.5),
 ('by dg kinase', 2.0),
 ('for diacylglycerol kinase', 2.0),
 ('mammalian diacylglycerol kinase', 2.0),
 ('that diacylglycerol kinase', 2.0),
 ('is catalyzed by diacylglycerol kinase', 2.0),
 ('pa by diacylglycerol kinase', 1.5),
 ('plc and diacylglycerol kinase', 1.5),
 ('role of diacylglycerol kinase', 1.3333333333333333),
 ('to pa by diacylglycerol kinase', 1.3333333333333333),
 ('phospholipase c plc and diacylglycerol kinase', 1.3333333333333333),
 ('diacyglycerol kinase', 1

In [18]:
longforms0 = miners['DGK'].get_longforms()

In [19]:
longforms0

[('diacylglycerol kinase', 153, 0.9178480832694187),
 ('dag kinase', 25, 0.8197686543937565),
 ('dg kinase', 10, 0.7466214775824778),
 ('diacyl glycerol kinase', 2, 0.47990724345877034),
 ('diglyceride kinase', 2, 0.46536259029351856),
 ('diacyglycerol kinase', 2, 0.4324463613461819),
 ('diacylglycerol kinase s', 1, 0.7518306787959074),
 ('dag metabolism enzyme', 1, 0.45667871256010617),
 ('diacylglyercol kinase', 1, 0.2943166935057051),
 ('diacylglyc erol kinase', 1, 0.2307857994733727)]

In [11]:
longforms0

[('diacylglycerol kinase', 153, 0.8901396723238695),
 ('dag kinase', 25, 0.9492877035169986),
 ('dg kinase', 10, 0.9655144982530459),
 ('diacyl glycerol kinase', 2, 0.9292139700983575),
 ('diglyceride kinase', 2, 0.9010520802840223),
 ('diacyglycerol kinase', 2, 0.8373184730136229),
 ('diacylglyercol kinase', 1, 0.8039259047601771),
 ('diacylglycerol kinase s', 1, 0.7518306787959074),
 ('diacylglyc erol kinase', 1, 0.6809150624102774),
 ('dag metabolism enzyme', 1, 0.45667871256010617)]

In [10]:
longforms = longforms0[:-2]

In [11]:
# Split the mined entries into parallel tuples of longforms and scores.
# The entries displayed above are (longform, count, score) triples, so a plain
# two-name unpack raises ValueError; the starred target absorbs any middle
# columns and still accepts (longform, score) pairs.
longforms, *_, scores = zip(*longforms)

In [12]:
longforms

('raclopride',
 'radial alveolar count',
 'ractopamine',
 'ribosome associated complex',
 'regenerated amorphous cellulose',
 'ractopamine hydrochloride',
 'relative area change',
 'renal artery calcium',
 'right auditory cortex',
 'rabbit articular chondrocytes',
 'ractopamine hcl',
 'renal artery calcification',
 'regional anticoagulation with citrate',
 'ras relative c3 botulinum toxin substrate')

In [103]:
# Seed the curation state with automatic groundings from gilda_ground.
grounding_map = {}
names = {}
# Start with no positive labels. The ground_with_gui cell below is passed
# `pos_labels`, which would otherwise be undefined on a fresh kernel.
pos_labels = []
for longform in longforms:
    # The result is indexed as (namespace, id, name); a falsy namespace is
    # treated as "no grounding found" and the longform is left unmapped.
    grounding = gilda_ground(longform)
    if grounding[0]:
        grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'
        names[grounding_map[longform]] = grounding[2]

In [100]:
gilda_ground('raclopride')

('MESH', 'D020891', 'Raclopride')

In [104]:
grounding_map

{'raclopride': 'MESH:D020891', 'ractopamine': 'CHEBI:CHEBI:82644'}

In [105]:
names

{'MESH:D020891': 'Raclopride', 'CHEBI:CHEBI:82644': 'ractopamine'}

In [17]:
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names, pos_labels=pos_labels)

In [18]:
result = (grounding_map, names, pos_labels)

In [19]:
result

({'rabbit articular chondrocytes': 'ungrounded',
  'raclopride': 'MESH:D020891',
  'ractopamine': 'CHEBI:CHEBI:82644',
  'ractopamine hcl': 'CHEBI:CHEBI:82644',
  'ractopamine hydrochloride': 'CHEBI:CHEBI:82644',
  'radial alveolar count': 'radial alveolar count',
  'ras relative c3 botulinum toxin substrate': 'FPLX:RAC',
  'regenerated amorphous cellulose': 'ungrounded',
  'regional anticoagulation with citrate': 'ungrounded',
  'relative area change': 'ungrounded',
  'renal artery calcification': 'ungrounded',
  'renal artery calcium': 'ungrounded',
  'ribosome associated complex': 'ungrounded',
  'right auditory cortex': 'ungrounded'},
 {'MESH:D020891': 'Raclopride',
  'CHEBI:CHEBI:82644': 'ractopamine',
  'radial alveolar count': 'radial alveolar count',
  'FPLX:RAC': 'RAC'},
 ['CHEBI:CHEBI:82644', 'FPLX:RAC', 'MESH:D020891'])

In [20]:
# Curation result pasted back in as literals, so the GUI session does not
# have to be repeated to restore this state.

# Longform -> grounding ('ungrounded' marks longforms without a grounding).
grounding_map = {
    'rabbit articular chondrocytes': 'ungrounded',
    'raclopride': 'MESH:D020891',
    'ractopamine': 'CHEBI:CHEBI:82644',
    'ractopamine hcl': 'CHEBI:CHEBI:82644',
    'ractopamine hydrochloride': 'CHEBI:CHEBI:82644',
    'radial alveolar count': 'radial alveolar count',
    'ras relative c3 botulinum toxin substrate': 'FPLX:RAC',
    'regenerated amorphous cellulose': 'ungrounded',
    'regional anticoagulation with citrate': 'ungrounded',
    'relative area change': 'ungrounded',
    'renal artery calcification': 'ungrounded',
    'renal artery calcium': 'ungrounded',
    'ribosome associated complex': 'ungrounded',
    'right auditory cortex': 'ungrounded',
}

# Grounding -> display name.
names = {
    'MESH:D020891': 'Raclopride',
    'CHEBI:CHEBI:82644': 'ractopamine',
    'radial alveolar count': 'radial alveolar count',
    'FPLX:RAC': 'RAC',
}

# Groundings treated as positive labels.
pos_labels = ['CHEBI:CHEBI:82644', 'FPLX:RAC', 'MESH:D020891']

In [21]:
# Manually register HGNC:391 (named 'AKT1' here) as an extra positive label.
# No longform in grounding_map maps to it, so it has to be added by hand.
names['HGNC:391'] = 'AKT1'
# Guard the append so re-running this cell does not duplicate the label.
if 'HGNC:391' not in pos_labels:
    pos_labels.append('HGNC:391')

In [22]:
# Shortform -> (longform -> grounding) mapping, passed to AdeftLabeler and
# AdeftDisambiguator in later cells.
# NOTE(review): the key 'RAC' does not match shortforms = ['DGK'] set at the
# top of this notebook -- confirm which shortform this model is for.
grounding_dict = {'RAC': grounding_map}

In [23]:
# Classifier for the shortform, with the curated positive labels.
# NOTE(review): 'RAC' is hard-coded here while shortforms = ['DGK'] at the top
# of this notebook -- confirm the intended shortform.
classifier = AdeftClassifier('RAC', pos_labels=pos_labels)

In [24]:
param_grid = {'C': [100.0], 'max_features': [10000]}

In [25]:
labeler = AdeftLabeler(grounding_dict)

In [27]:
corpus = labeler.build_from_texts(shortform_texts)

In [28]:
# Add the (text, grounding) pairs built from the per-gene PMIDs to the corpus.
# NOTE(review): this mutates `corpus` in place, so re-running this cell without
# rebuilding `corpus` first appends the entrez texts a second time.
corpus.extend(entrez_texts)

In [29]:
texts, labels = zip(*corpus)

In [30]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-24 10:40:37] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-24 10:41:43] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9491480221449358 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [31]:
classifier.stats

{'label_distribution': {'radial alveolar count': 10,
  'CHEBI:CHEBI:82644': 31,
  'FPLX:RAC': 950,
  'ungrounded': 32,
  'MESH:D020891': 19,
  'HGNC:391': 2636},
 'f1': {'mean': 0.9491480221449358, 'std': 0.005321067717975318},
 'precision': {'mean': 0.9472554921097684, 'std': 0.005037114274819043},
 'recall': {'mean': 0.9518640618177944, 'std': 0.006001817426961046}}

In [36]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [37]:
d = disamb.disambiguate(shortform_texts)

In [38]:
# Keep only the texts whose predicted grounding (first element of each
# disambiguation result) is HGNC:391.
a = [
    candidate_text
    for prediction, candidate_text in zip(d, shortform_texts)
    if prediction[0] == 'HGNC:391'
]

In [39]:
a[1]

'The influence of inositol phosphates and phosphoinositides on the alpha isoform of the RAC-protein kinase B (RAC/PKB) was studied using purified wild type and mutant kinase preparations and a recombinant pleckstrin homology (PH) domain. Binding of inositol phosphates and phosphoinositides to the PH domain was measured as the quenching of intrinsic tryptophan fluorescence. Inositol phosphates and D3-phosphorylated phosphoinositides bound with affinities of 1-10 microM and 0.5 microM, respectively. Similar values were obtained using RAC/PKB expressed and purified from baculovirus-infected Sf9 cells in the fluorescence assay. The influence of synthetic dioctanoyl derivatives of phosphatidylinositol 3,4-bisphosphate and phosphatidylinositol 3,4,5-trisphosphate on the activity of RAC/PKB purified from transfected COS-1 cells was studied. Phosphatidylinositol 3,4,5-trisphosphate was found to inhibit the RAC/PKB kinase activity with half-maximal inhibition at 2.5 microM. In contrast, phospha

In [40]:
disamb.dump('RAC', '../results')

In [68]:
from adeft.disambiguate import load_disambiguator, load_disambiguator_directly

In [87]:
d = load_disambiguator_directly('../results/EK/')

In [88]:
d.info()

'Disambiguation model for EK\n\nProduces the disambiguations:\n\tCHK*\tFPLX:CHK\n\tEnkephalins*\tMESH:D004745\n\tTMPRSS15*\tHGNC:9490\n\nTraining data had class balance:\n\tCHK*\t58\n\tTMPRSS15*\t30\n\tUngrounded\t11\n\tEnkephalins*\t7\n\nClassification Metrics:\n\tF1 score:\t0.81333\n\tPrecision:\t0.83439\n\tRecall:\t\t0.84234\n\n* Positive labels\nSee Docstring for explanation\n'

In [89]:
model_to_s3(d)

In [76]:
d.disambiguate(texts[0])

('CHEBI:CHEBI:29016',
 'arginine',
 {'CHEBI:CHEBI:29016': 0.9667882794828576,
  'CHEBI:CHEBI:79': 0.004343534773612251,
  'HGNC:651': 0.00798509086185913,
  'HGNC:77': 0.00973784879530277,
  'HGNC:9965': 0.011145246086368127})

In [79]:
print(d.info())

Disambiguation model for ARG

Produces the disambiguations:
	(-)-Arctigenin*	CHEBI:CHEBI:79
	ABL2*	HGNC:77
	AREG*	HGNC:651
	RERE*	HGNC:9965
	arginine*	CHEBI:CHEBI:29016

Training data had class balance:
	AREG*	157
	ABL2*	88
	RERE*	39
	arginine*	15
	(-)-Arctigenin*	2

Classification Metrics:
	F1 score:	0.81188
	Precision:	0.82394
	Recall:		0.8205

* Positive labels
See Docstring for explanation



In [32]:
a = load_disambiguator('AR')

In [33]:
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
logit = d.classifier.estimator.named_steps['logit']

In [37]:
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')

In [41]:
model_to_s3(disamb)

In [44]:
classifier.feature_importances()['FPLX:RAC']

[('rac1', 3.0203950681340763),
 ('cdc42', 0.46463658443237615),
 ('cells', 0.26067826443238284),
 ('rhoa', 0.2284452268653803),
 ('rho', 0.201414697794678),
 ('rac1b', 0.1862119230497441),
 ('actin', 0.16300556582714468),
 ('gtpases', 0.15543881285531247),
 ('rictor', 0.14896197766149943),
 ('pak1', 0.14776120596579687),
 ('jnk', 0.13802477317328446),
 ('activation', 0.13739172186857945),
 ('s1p', 0.12195584486540219),
 ('thrombin', 0.12054088946040988),
 ('endothelial', 0.11948523924516169),
 ('proteins', 0.11834598588132215),
 ('cell', 0.11418065253318789),
 ('phox', 0.11144187989246301),
 ('chimaerin', 0.1102266733731222),
 ('elmo1', 0.10886660370678143),
 ('vav', 0.10685654194092864),
 ('ros', 0.1017412296369346),
 ('gtp', 0.09970566367485924),
 ('escc', 0.09632367768212988),
 ('ras', 0.09346518511044424),
 ('nef', 0.08938792674953275),
 ('ubiquitylation', 0.0893305800391189),
 ('p85', 0.08885763883676143),
 ('gtpase', 0.08833886568389247),
 ('figure', 0.087475101884638),
 ('comple

In [46]:
d = load_disambiguator('ALK', '../results')



In [47]:
d.info()

'Disambiguation model for ALK\n\nProduces the disambiguations:\n\tALK*\tHGNC:427\n\tAlkaline Phosphatase*\tMESH:D000469\n\tRSTK1*\tFPLX:RSTK1\n\nTraining data had class balance:\n\tALK*\t1181\n\tRSTK1*\t155\n\tAlkaline Phosphatase*\t6\n\nClassification Metrics:\n\tF1 score:\t0.97493\n\tPrecision:\t0.97367\n\tRecall:\t\t0.97767\n\n* Positive labels\nSee Docstring for explanation\n'

In [48]:
print(d.info())

Disambiguation model for ALK

Produces the disambiguations:
	ALK*	HGNC:427
	Alkaline Phosphatase*	MESH:D000469
	RSTK1*	FPLX:RSTK1

Training data had class balance:
	ALK*	1181
	RSTK1*	155
	Alkaline Phosphatase*	6

Classification Metrics:
	F1 score:	0.97493
	Precision:	0.97367
	Recall:		0.97767

* Positive labels
See Docstring for explanation



In [49]:
model_to_s3(d)

In [58]:
d = load_disambiguator('TAK', '../results')



In [59]:
print(d.info())

Disambiguation model for TAK

Produces the disambiguations:
	MAP3K7*	HGNC:6859
	Takayasu Arteritis*	MESH:D013625

Training data had class balance:
	MAP3K7*	327
	Ungrounded*	241
	Ungrounded	36
	Takayasu Arteritis*	13

Classification Metrics:
	F1 score:	0.94043
	Precision:	0.93685
	Recall:		0.94658

* Positive labels
See Docstring for explanation



In [57]:
model_to_s3(d)

In [60]:
from adeft import available_shortforms

In [63]:
print(d.info())

Disambiguation model for TAK

Produces the disambiguations:
	MAP3K7*	HGNC:6859
	Takayasu Arteritis*	MESH:D013625

Training data had class balance:
	MAP3K7*	327
	Ungrounded*	241
	Ungrounded	36
	Takayasu Arteritis*	13

Classification Metrics:
	F1 score:	0.94043
	Precision:	0.93685
	Recall:		0.94658

* Positive labels
See Docstring for explanation



In [64]:
d.classifier.feature_importances()

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [2]:
from adeft import __version__

In [3]:
__version__

'0.5.5'

In [4]:
from adeft.disambiguate import load_disambiguator_directly

In [59]:
d = load_disambiguator_directly('../results/TEK/')

In [60]:
print(d.info())

Disambiguation model for TEK

Produces the disambiguations:
	TEK*	HGNC:11724

Training data had class balance:
	TEK*	217
	Ungrounded	9

Classification Metrics:
	F1 score:	0.99316
	Precision:	0.98646
	Recall:		1.0

* Positive labels
See Docstring for explanation



In [61]:
model_to_s3(d)

In [32]:
d.grounding_dict

{'TEC': {'ternary elongation complex': 'transcription elongation complex',
  'thermal expansion coefficient': 'ungrounded',
  'thromboembolic complications': 'ungrounded',
  'thymic epithelial cells': 'NCIT:C33771',
  'thymic epithelial cells line': 'NCIT:C33771',
  'thyrocyte': 'MESH:D000072637',
  'thyroid epithelial cells': 'MESH:D000072637',
  'tissue engineered construct': 'ungrounded',
  'total electron content': 'ungrounded',
  'tracheal epithelial cells': 'ungrounded',
  'transcription elongation complex': 'transcription elongation complex',
  'triethyl citrate': 'PUBCHEM:6506',
  'tubular epithelial cells': 'NCIT:C61147',
  'tumor associated endothelial cells': 'NCIT:C37088',
  'tumor endothelial cells': 'NCIT:C37088'},
 'TECs': {'thymic epithelial cells': 'NCIT:C33771',
  'thymus epithelial cells': 'NCIT:C33771',
  'tissue engineered constructs': 'ungrounded',
  'transcription elongation complexes': 'transcription elongation complex',
  'tubular epithelial cells': 'NCIT:C6114

In [25]:
!python -m adeft.download --update

100% [......................................................] 1181008 / 1181008

In [26]:
from adeft import available_shortforms

In [27]:
len(available_shortforms)

72

In [28]:
available_shortforms

{'PC': 'PC',
 'DOG1': 'DOG1',
 'TEK': 'TEK',
 'EMT': 'EMT',
 'SP': 'SP',
 'PCS': 'PCS',
 'CKI': 'CKI',
 'PE': 'PE',
 'RET': 'RET',
 'TEC': 'TEC',
 'ROS': 'ROS',
 'NP': 'NP:NP_S',
 'NPs': 'NP:NP_S',
 'MS': 'MS',
 'MT': 'MT',
 'BP': 'BP',
 'GH': 'GH',
 'AD': 'AD',
 'GT': 'GT',
 'SNS': 'SNS',
 'GARP': 'GARP',
 'DA': 'DA',
 'BCR': 'BCR',
 'GR': 'GR',
 'HIR': 'HIR',
 'IR': 'IR',
 'HK2': 'HK2',
 'ARF': 'ARF',
 'CS': 'CS',
 'UFO': 'UFO',
 'LAK': 'LAK',
 'EC': 'EC',
 'ARG': 'ARG',
 'MOS': 'MOS',
 'STD': 'STD',
 'PD1': 'PD1',
 'TGH': 'TGH',
 'PKD': 'PKD',
 'MIP': 'MIP',
 'RA': 'RA',
 'PCP': 'PCP',
 'EAG': 'EAG',
 'PI': 'PI',
 'PS': 'PS',
 'PA': 'PA',
 'MB': 'MB',
 'HA': 'HA',
 'FES': 'FES',
 'AR': 'AR',
 'HR': 'HR',
 'NE': 'NE',
 'UBC': 'UBC',
 'GSC': 'GSC',
 'AA': 'AA',
 'FER': 'FER',
 'NIS': 'NIS',
 'GC': 'GC',
 'DLK': 'DLK',
 'HK1': 'HK1',
 'CM': 'CM',
 'RB': 'RB:R_B',
 'Rb': 'RB:R_B',
 'LH': 'LH',
 'ER': 'ER',
 'TF': 'TF',
 'PGP': 'PGP',
 'GCA': 'GCA',
 'RK': 'RK',
 'RAC': 'RAC',
 'MCT': 'M

In [33]:
'TEC' in available_shortforms

True

In [34]:
'TECs' in available_shortforms

False

In [35]:
!python -m adeft.download --update

Shortform TEC has multiple adeft modelsThis may lead to unexpected behavior
Shortform TEC has multiple adeft modelsThis may lead to unexpected behavior
100% [........................................................] 194559 / 194559Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/albertsteppi/adeft/adeft/download/__main__.py", line 18, in <module>
    download_models(update=args.update)
  File "/Users/albertsteppi/adeft/adeft/download/download.py", line 65, in download_models
    out=resource_path)
  File "/Users/albertsteppi/.virtualenvs/py37/lib/python3.7/site-packages/wget.py", line 526, in download
    (tmpfile, headers) = ulib.urlretrieve(binurl, tmpfile, callback)
  

In [36]:
!python -m adeft.download --update

100% [......................................................] 1181008 / 1181008