In [238]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [240]:
shortforms = ['GCA']
genes = ['GCA', 'NPR1']
families = {'ungrounded': [f'TRC-GCA{i}-{j}' for i in range(1, 26)
                           for j in range(1, 6)]}
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    if family != 'ungrounded':
        groundings.append(f'FPLX:{family}')
    else:
        groundings.append('ungrounded')
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [241]:
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    try:
        pmids = all_pmids[gene]
    except KeyError:
        continue
    _, content = get_text_content_from_pmids(pmids)
    entrez_texts.extend([(universal_extract_text(text), grounding)
                          for text in content.values() if text])
    entrez_refs.update(content.keys())

miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    all_texts.union(shortform_texts)

It's then necessary to check if Acromine produced the correct results. We must fix errors manually

In [242]:
top = miners['GCA'].top()

In [243]:
top

[('giant cell arteritis', 174.70967741935485),
 ('gastric cardia adenocarcinoma', 40.46341463414634),
 ('golgicide a', 25.241379310344826),
 ('adenocarcinoma', 20.842105263157897),
 ('glycocholic acid', 16.88888888888889),
 ('patients with giant cell arteritis', 15.9),
 ('of giant cell arteritis', 13.6),
 ('ability', 13.0),
 ('in giant cell arteritis', 13.0),
 ('general cognitive ability', 11.5),
 ('and giant cell arteritis', 10.888888888888886),
 ('a', 10.228571428571428),
 ('acid', 10.08),
 ('general combining ability', 9.538461538461538),
 ('polymyalgia rheumatica pmr and giant cell arteritis', 9.142857142857142),
 ('with giant cell arteritis', 8.8),
 ('for giant cell arteritis', 7.0),
 ('gastric cardiac adenocarcinoma', 6.666666666666667),
 ('in gastric cardia adenocarcinoma', 6.0),
 ('in patients with giant cell arteritis', 6.0),
 ('gca giant cell arteritis', 5.428571428571429),
 ('of golgicide a', 4.666666666666667),
 ('glycocholic', 4.0),
 ('the general combining ability', 4.0),

In [244]:
longforms0 = miners['GCA'].get_longforms()

In [245]:
list(enumerate(longforms0))

[(0, ('giant cell arteritis', 174.70967741935485)),
 (1, ('gastric cardia adenocarcinoma', 40.46341463414634)),
 (2, ('golgicide a', 25.241379310344826)),
 (3, ('glycocholic acid', 16.88888888888889)),
 (4, ('ability', 13.0)),
 (5, ('glycocholic', 4.0)),
 (6, ('cancer', 3.0)),
 (7, ('granulated coal ash', 2.0)),
 (8, ('gastric cardiac carcinoma', 2.0)),
 (9, ('graft coronary arteriosclerosis', 2)),
 (10, ('german cockroach', 1.3333333333333333)),
 (11, ('gd 1 − x ca x alo 3 − δ', 1.3333333333333333))]

In [246]:
longforms0 = [x for i, x in enumerate(longforms0) if i not in [4, 6]]

In [247]:
longforms0

[('giant cell arteritis', 174.70967741935485),
 ('gastric cardia adenocarcinoma', 40.46341463414634),
 ('golgicide a', 25.241379310344826),
 ('glycocholic acid', 16.88888888888889),
 ('glycocholic', 4.0),
 ('granulated coal ash', 2.0),
 ('gastric cardiac carcinoma', 2.0),
 ('graft coronary arteriosclerosis', 2),
 ('german cockroach', 1.3333333333333333),
 ('gd 1 − x ca x alo 3 − δ', 1.3333333333333333)]

In [248]:
list(enumerate(top))

[(0, ('giant cell arteritis', 174.70967741935485)),
 (1, ('gastric cardia adenocarcinoma', 40.46341463414634)),
 (2, ('golgicide a', 25.241379310344826)),
 (3, ('adenocarcinoma', 20.842105263157897)),
 (4, ('glycocholic acid', 16.88888888888889)),
 (5, ('patients with giant cell arteritis', 15.9)),
 (6, ('of giant cell arteritis', 13.6)),
 (7, ('ability', 13.0)),
 (8, ('in giant cell arteritis', 13.0)),
 (9, ('general cognitive ability', 11.5)),
 (10, ('and giant cell arteritis', 10.888888888888886)),
 (11, ('a', 10.228571428571428)),
 (12, ('acid', 10.08)),
 (13, ('general combining ability', 9.538461538461538)),
 (14,
  ('polymyalgia rheumatica pmr and giant cell arteritis', 9.142857142857142)),
 (15, ('with giant cell arteritis', 8.8)),
 (16, ('for giant cell arteritis', 7.0)),
 (17, ('gastric cardiac adenocarcinoma', 6.666666666666667)),
 (18, ('in gastric cardia adenocarcinoma', 6.0)),
 (19, ('in patients with giant cell arteritis', 6.0)),
 (20, ('gca giant cell arteritis', 5.4285

In [249]:
longforms0.extend([x for i, x in enumerate(top) if i in [9, 13, 33, 34, 36]])

In [250]:
longforms0

[('giant cell arteritis', 174.70967741935485),
 ('gastric cardia adenocarcinoma', 40.46341463414634),
 ('golgicide a', 25.241379310344826),
 ('glycocholic acid', 16.88888888888889),
 ('glycocholic', 4.0),
 ('granulated coal ash', 2.0),
 ('gastric cardiac carcinoma', 2.0),
 ('graft coronary arteriosclerosis', 2),
 ('german cockroach', 1.3333333333333333),
 ('gd 1 − x ca x alo 3 − δ', 1.3333333333333333),
 ('general cognitive ability', 11.5),
 ('general combining ability', 9.538461538461538),
 ('gastric cancer', 2.0),
 ('glycyrrhizic acid', 2.0),
 ('gastric cardiac carcinoma', 2.0)]

In [251]:
longforms0.sort(key=lambda x: -x[1])
longforms, scores = zip(*longforms0)

In [252]:
longforms

('giant cell arteritis',
 'gastric cardia adenocarcinoma',
 'golgicide a',
 'glycocholic acid',
 'general cognitive ability',
 'general combining ability',
 'glycocholic',
 'granulated coal ash',
 'gastric cardiac carcinoma',
 'graft coronary arteriosclerosis',
 'gastric cancer',
 'glycyrrhizic acid',
 'gastric cardiac carcinoma',
 'german cockroach',
 'gd 1 − x ca x alo 3 − δ')

In [253]:
grounding_map = {}
names = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if grounding[0]:
        grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'
        names[grounding_map[longform]] = grounding[2]

In [210]:
grounding_map

{'giant cell arteritis': 'MESH:D013700',
 'golgicide a': 'CHEBI:CHEBI:139037',
 'glycocholic acid': 'CHEBI:CHEBI:17687',
 'glycyrrhizic acid': 'MESH:D019695',
 'german cockroach': 'MESH:D020048'}

In [211]:
names

{'MESH:D013700': 'Giant Cell Arteritis',
 'CHEBI:CHEBI:139037': 'golgicide A',
 'CHEBI:CHEBI:17687': 'glycocholic acid',
 'MESH:D019695': 'Glycyrrhizic Acid',
 'MESH:D020048': 'Blattellidae'}

In [212]:
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [213]:
result = (grounding_map, names, pos_labels)

In [214]:
result

({'gastric cancer': 'NCIT:C5247',
  'gastric cardia adenocarcinoma': 'NCIT:C5247',
  'gastric cardiac carcinoma': 'NCIT:C5247',
  'gd 1 − x ca x alo 3 − δ': 'ungrounded',
  'general cognitive ability': 'ungrounded',
  'general combining ability': 'ungrounded',
  'german cockroach': 'MESH:D020048',
  'giant cell arteritis': 'MESH:D013700',
  'glycocholic': 'CHEBI:CHEBI:17687',
  'glycocholic acid': 'CHEBI:CHEBI:17687',
  'glycyrrhizic acid': 'MESH:D019695',
  'golgicide a': 'CHEBI:CHEBI:139037',
  'graft coronary arteriosclerosis': 'ungrounded',
  'granulated coal ash': 'ungrounded'},
 {'NCIT:C5247': 'Gastric Cardia Adenocarcinoma',
  'MESH:D020048': 'Blattellidae',
  'MESH:D013700': 'Giant Cell Arteritis',
  'CHEBI:CHEBI:17687': 'glycocholic acid',
  'MESH:D019695': 'Glycyrrhizic Acid',
  'CHEBI:CHEBI:139037': 'golgicide A'},
 ['CHEBI:CHEBI:139037',
  'CHEBI:CHEBI:17687',
  'MESH:D013700',
  'MESH:D019695',
  'MESH:D020048',
  'NCIT:C5247'])

In [254]:
grounding_map, names, pos_labels = ({'gastric cancer': 'NCIT:C5247',
  'gastric cardia adenocarcinoma': 'NCIT:C5247',
  'gastric cardiac carcinoma': 'NCIT:C5247',
  'gd 1 − x ca x alo 3 − δ': 'ungrounded',
  'general cognitive ability': 'ungrounded',
  'general combining ability': 'ungrounded',
  'german cockroach': 'MESH:D020048',
  'giant cell arteritis': 'MESH:D013700',
  'glycocholic': 'CHEBI:CHEBI:17687',
  'glycocholic acid': 'CHEBI:CHEBI:17687',
  'glycyrrhizic acid': 'MESH:D019695',
  'golgicide a': 'CHEBI:CHEBI:139037',
  'graft coronary arteriosclerosis': 'ungrounded',
  'granulated coal ash': 'ungrounded'},
 {'NCIT:C5247': 'Gastric Cardia Adenocarcinoma',
  'MESH:D020048': 'Blattellidae',
  'MESH:D013700': 'Giant Cell Arteritis',
  'CHEBI:CHEBI:17687': 'glycocholic acid',
  'MESH:D019695': 'Glycyrrhizic Acid',
  'CHEBI:CHEBI:139037': 'golgicide A'},
 ['CHEBI:CHEBI:139037',
  'CHEBI:CHEBI:17687',
  'MESH:D013700',
  'MESH:D019695',
  'MESH:D020048',
  'NCIT:C5247'])

In [255]:
grounding_dict = {'GCA': grounding_map}
classifier = AdeftClassifier('GCA', pos_labels=pos_labels)
param_grid = {'C': [100.0], 'max_features': [10000]}
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(shortform_texts)
corpus.extend(entrez_texts)
texts, labels = zip(*corpus)

In [256]:
names['HGNC:15990'] = 'GCA'
names['HGNC:7943'] = 'NPR1'
pos_labels.extend(['HGNC:2676', 'HGNC:2907'])

In [257]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-17 16:00:03] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-17 16:00:58] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.932267858317106 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [258]:
classifier.stats

{'label_distribution': {'CHEBI:CHEBI:139037': 17,
  'NCIT:C5247': 36,
  'MESH:D013700': 105,
  'ungrounded': 22,
  'CHEBI:CHEBI:17687': 17,
  'MESH:D019695': 3,
  'MESH:D020048': 1,
  'HGNC:15990': 28,
  'HGNC:7943': 80},
 'f1': {'mean': 0.932267858317106, 'std': 0.03768727667923726},
 'precision': {'mean': 0.9332112111523877, 'std': 0.030799219355164494},
 'recall': {'mean': 0.9408318337730103, 'std': 0.04390146958726988}}

In [259]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [260]:
d = disamb.disambiguate(shortform_texts)

In [270]:
a = [text for pred, text in zip(d, shortform_texts) if pred[0] == 'ungrounded']

In [271]:
len(a)

24

In [274]:
a[12]

'The phenotypic and genetic relationship between global cortical size and general cognitive ability (GCA) appears to be driven by surface area (SA) and not cortical thickness (CT). Gyrification (cortical folding) is an important property of the cortex that helps to increase SA within a finite space, and may also improve connectivity by reducing distance between regions. Hence, gyrification may be what underlies the SA-GCA relationship. In previous phenotypic studies, a 3-dimensional gyrification index (3DGI) has been positively associated with cognitive ability and negatively associated with mild cognitive impairment, Alzheimer’s disease, and psychiatric disorders affecting cognition. However, the differential genetic associations of 3DGI and SA with GCA are still unclear. We examined the heritability of 3DGI, and the phenotypic, genetic, and environmental associations of 3DGI with SA and GCA in a large sample of adult male twins (N = 512). Nearly 85% of the variance in 3DGI was due to

In [52]:
y =[x for x in a if 'tyrosine kinase' in x]

In [54]:
y[0]

'Non-small-cell lung cancer (NSCLC) patients with mutated or rearranged oncogene drivers can be treated with upfront selective inhibitors achieving higher response rates and longer survival than chemotherapy. The RET gene can undergo chromosomal rearrangements in 1%–2% of all NSCLC patients, involving various upstream fusion partners such as KIF5B, CCDC6, NCOA4, and TRIM33. Many multikinase inhibitors are active against rearranged RET. Cabozantinib, vandetanib, sunitinib, lenvatinib, and nintedanib achieved tumor responses in about 30% of these patients in retrospective studies. Prospective phase II trials investigated the activity and toxicity of cabozantinib, vandetanib, sorafenib, and lenvatinib, and did not reach significantly higher response rates. VEGFR and EGFR inhibition represented the main ways of developing off-target toxicity. An intrinsic resistance emerged according to the type of RET fusion partners, as KIF5B-RET fusion is the most resistant. Also acquired mutations in r

In [268]:
disamb.dump('GCA', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
d = load_disambiguator('ARG', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [269]:
model_to_s3(disamb)