In [1]:
import json
import pickle

from indra.databases.hgnc_client import get_hgnc_name
from indra.literature.adeft_tools import universal_extract_text

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.ground import gilda_ground

In [2]:
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [4]:
KCNJ6_pmids = all_pmids['KCNJ6']

KCNJ6_entrez_refs, KCNJ6_entrez_content = get_text_content_from_pmids(KCNJ6_pmids)

KCNJ6_entrez_texts = [universal_extract_text(text) for text in KCNJ6_entrez_content.values() if text]

In [20]:
shortform = 'BIR1'

In [21]:
stmts = get_stmts_with_agent_text_like(shortform)[shortform]

refs, content = get_text_content_from_stmt_ids(stmts)

It's then necessary to check if Acromine produced the correct results. We must fix errors manually

In [7]:
overlap = set(refs.values()) & set(KCNJ6_entrez_refs.values())

In [8]:
overlap

set()

In [22]:
shortform_texts = [universal_extract_text(text, contains=[shortform]) for ref, text in content.items()
                   if text and ref not in overlap]

In [16]:
miner = AdeftMiner(shortform)
miner.process_texts(shortform_texts)

In [17]:
top = miner.top(15)

In [23]:
len(shortform_texts)

84

In [250]:
longforms = [lf for i, lf in enumerate(top) if i in [0, 2, 3, 5, 7, 9, 11, 12, 13, 14]]

In [251]:
longforms

[('cerebral ischemia reperfusion', 14.6),
 ('cumulative incidence of relapse', 10.5),
 ('chimeric immune receptor', 5.0),
 ('cumulative incidence of recurrence', 4.0),
 ('carbon ion radiation', 3.4000000000000004),
 ('cirrhosis', 2.0),
 ('cranial irradiation', 2.0),
 ('cave in rock', 2.0),
 ('corrected insulin response', 2.0),
 ('cerebral ischemia reperfusion injury', 2.0)]

In [252]:
longforms, scores = zip(*longforms)

In [253]:
grounding_map = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if grounding[0]:
        grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'

In [254]:
grounding_map

{'cranial irradiation': 'MESH:D016371'}

In [255]:
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map)

In [265]:
names['HGNC:24217'] = 'CIR1'
names['HGNC:6266'] = 'KCNJ5'
pos_labels.extend(['HGNC:24217', 'HGNC:6266'])

In [266]:
result = (grounding_map, names, pos_labels)

In [267]:
result

({'carbon ion radiation': 'ungrounded',
  'cave in rock': 'ungrounded',
  'cerebral ischemia reperfusion': 'MESH:D015427',
  'cerebral ischemia reperfusion injury': 'MESH:D015427',
  'chimeric immune receptor': 'MESH:D000076962',
  'cirrhosis': 'MESH:D008103',
  'corrected insulin response': 'ungrounded',
  'cranial irradiation': 'MESH:D016371',
  'cumulative incidence of recurrence': 'ungrounded',
  'cumulative incidence of relapse': 'ungrounded'},
 {'MESH:D015427': 'Reperfusion Injury',
  'MESH:D000076962': 'Receptors, Chimeric Antigen',
  'MESH:D008103': 'Liver Cirrhosis',
  'MESH:D016371': 'Cranial Irradiation',
  'HGNC:24217': 'CIR1',
  'HGNC:6266': 'KCNJ5'},
 ['MESH:D000076962',
  'MESH:D008103',
  'MESH:D015427',
  'MESH:D016371',
  'HGNC:24217',
  'HGNC:6266'])

In [268]:
grounding_map, names, pos_labels = ({'carbon ion radiation': 'ungrounded',
                                     'cave in rock': 'ungrounded',
                                     'cerebral ischemia reperfusion': 'MESH:D015427',
                                      'cerebral ischemia reperfusion injury': 'MESH:D015427',
                                      'chimeric immune receptor': 'MESH:D000076962',
                                      'cirrhosis': 'MESH:D008103',
                                      'corrected insulin response': 'ungrounded',
                                      'cranial irradiation': 'MESH:D016371',
                                      'cumulative incidence of recurrence': 'ungrounded',
                                      'cumulative incidence of relapse': 'ungrounded'},
                                     {'MESH:D015427': 'Reperfusion Injury',
                                      'MESH:D000076962': 'Receptors, Chimeric Antigen',
                                      'MESH:D008103': 'Liver Cirrhosis',
                                      'MESH:D016371': 'Cranial Irradiation',
                                      'HGNC:24217': 'CIR1',
                                      'HGNC:6266': 'KCNJ5'},
                                     ['MESH:D000076962',
                                      'MESH:D008103',
                                      'MESH:D015427',
                                      'MESH:D016371',
                                      'HGNC:24217',
                                      'HGNC:6266'])

In [269]:
grounding_dict = {'CIR': grounding_map}

In [270]:
classifier = AdeftClassifier('CIR', pos_labels=pos_labels)

In [271]:
param_grid = {'C': [10.0, 100.0], 'max_features': [100, 1000, 10000]}

In [272]:
labeler = AdeftLabeler(grounding_dict)

In [273]:
corpus = labeler.build_from_texts(shortform_texts)

In [274]:
corpus.extend([(text, 'HGNC:6266') for text in KCNJ5_entrez_texts])
corpus.extend([(text, 'HGNC:24217') for text in CIR1_entrez_texts])

In [275]:
texts, labels = zip(*corpus)

In [276]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-17 13:55:25] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [10.0, 100.0], 'max_features': [100, 1000, 10000]}
INFO: [2019-12-17 14:00:29] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.8719200376547886 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [158]:
classifier.grid_search.cv_results_

{'mean_fit_time': array([ 4.14369922, 13.86394496, 18.93544898, 17.07925839, 17.04021072,
        19.1184999 ]),
 'std_fit_time': array([0.11469509, 3.16742204, 0.21266206, 3.98558026, 4.00093192,
        0.10145038]),
 'mean_score_time': array([1.05905571, 0.92401719, 0.9861238 , 0.91746535, 0.93600969,
        0.97546144]),
 'std_score_time': array([0.10238048, 0.06669783, 0.07155401, 0.08566801, 0.05687187,
        0.06643738]),
 'param_logit__C': masked_array(data=[10.0, 10.0, 10.0, 100.0, 100.0, 100.0],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_tfidf__max_features': masked_array(data=[100, 1000, 10000, 100, 1000, 10000],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'logit__C': 10.0, 'tfidf__max_features': 100},
  {'logit__C': 10.0, 'tfidf__max_features': 1000},
  {'logit__C': 10.0, 'tfidf__max_features': 10000},
  {'log

In [277]:
classifier.stats

{'label_distribution': {'ungrounded': 21,
  'MESH:D015427': 13,
  'MESH:D016371': 2,
  'MESH:D008103': 2,
  'MESH:D000076962': 3,
  'HGNC:6266': 93,
  'HGNC:24217': 18},
 'f1': {'mean': 0.8716847706728948, 'std': 0.07039172887832547},
 'precision': {'mean': 0.8879051635798263, 'std': 0.07126047966257217},
 'recall': {'mean': 0.8658448553501028, 'std': 0.07060311872134131}}

In [278]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [279]:
disamb.dump('CIR', '../results')

In [171]:
from adeft.disambiguate import load_disambiguator

In [172]:
d = load_disambiguator('EAG', '../results')

In [173]:
d.disambiguate(texts[0])

('ungrounded',
 None,
 {'FPLX:KCNH': 0.010634706292363183, 'ungrounded': 0.9893652937076368})

In [32]:
a = load_disambiguator('AR')

In [33]:
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
logit = d.classifier.estimator.named_steps['logit']

In [37]:
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')