In [145]:
import json
import pickle

from indra.databases.hgnc_client import get_hgnc_name
from indra.literature.adeft_tools import universal_extract_text

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.ground import gilda_ground

In [146]:
# Shortform this notebook works on; it is passed to the statement lookup and
# to AdeftMiner in the next cell.
shortform = 'HIR'

In [147]:
# Look up statements for the shortform (the helper's result is indexed by the
# shortform itself), then fetch the text content for those statements.
stmts = get_stmts_with_agent_text_like(shortform)[shortform]
ids, content = get_text_content_from_stmt_ids(stmts)

# Drop falsy (empty/missing) content entries and run text extraction on the
# rest, passing the shortform as the `contains` argument.
shortform_texts = [
    universal_extract_text(raw_text, contains=[shortform])
    for raw_text in filter(None, content.values())
]

# Feed the extracted texts to a miner created for this shortform.
miner = AdeftMiner(shortform)
miner.process_texts(shortform_texts)

It's then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [152]:
# Ask the miner for its top 15 entries; the recorded output shows them as
# (candidate longform, score) pairs.
top = miner.top(15)

In [153]:
# Display the candidates for manual review.
top

[('hepatic ischemia reperfusion', 16.666666666666668),
 ('the human insulin receptor', 12.0),
 ('human insulin receptor', 10.0),
 ('mab against the human insulin receptor', 9.23076923076923),
 ('reperfusion', 6.916666666666668),
 ('response', 6.363636363636363),
 ('risk', 4.444444444444445),
 ('monoclonal antibody mab against the human insulin receptor',
  3.6666666666666665),
 ('high intermediate risk', 3.6),
 ('high irradiation response', 3.2),
 ('hepatic irradiation', 3.0),
 ('high infertility risk', 3.0),
 ('hypoxia induced relaxation', 3.0),
 ('of human insulin receptor', 3.0),
 ('e abbreviations glycycoumarin gc geissoschizine methyl ether gm hirsuteine hte hirsutine',
  3)]

In [159]:
# Ask the miner for its longform selection with a cutoff of 2.9.
# NOTE(review): 2.9 looks hand-tuned for this shortform — confirm before reuse.
longforms = miner.get_longforms(cutoff=2.9)

In [160]:
# Display the miner's selection for manual review.
longforms

[('hepatic ischemia reperfusion', 16.666666666666668),
 ('the human insulin receptor', 12.0),
 ('response', 6.363636363636363),
 ('risk', 4.444444444444445),
 ('hepatic irradiation', 3.0),
 ('hypoxia induced relaxation', 3.0),
 ('e abbreviations glycycoumarin gc geissoschizine methyl ether gm hirsuteine hte hirsutine',
  3)]

In [165]:
# Manual curation; the indices refer to the two lists displayed above.
# Kept from `longforms`: [0] 'hepatic ischemia reperfusion',
#                        [4] 'hepatic irradiation',
#                        [5] 'hypoxia induced relaxation'.
# NOTE: `longforms` is rebuilt from itself, so rerunning this cell without
# rerunning the get_longforms cell filters the already-filtered list.
longforms = [
    entry
    for position, entry in enumerate(longforms)
    if position in (0, 4, 5)
]
# Added from `top`: [2] 'human insulin receptor', [8] 'high intermediate risk',
#                   [9] 'high irradiation response', [11] 'high infertility risk'.
longforms.extend([top[position] for position in (2, 8, 9, 11)])

In [167]:
# Split the (longform, score) pairs into two parallel tuples.
# NOTE: this rebinds `longforms` from a list of pairs to a tuple of strings,
# and raises ValueError if the list is empty.
longforms, scores = zip(*longforms)

In [168]:
# Automatic grounding pass: keep a 'first:second' string for every longform
# whose gilda_ground result has a truthy first element; skip the others.
grounding_map = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    namespace = grounding[0]
    if not namespace:
        continue
    grounding_map[longform] = '{}:{}'.format(namespace, grounding[1])

In [169]:
# Display the automatic groundings. An empty dict means gilda_ground returned
# a falsy first element for every longform, so nothing was added.
grounding_map

{}

In [170]:
# Ground the longforms through the GUI helper.
# NOTE(review): presumably interactive, so this result cannot be reproduced by
# a plain rerun — the following cell hard-codes the value that was obtained.
result = ground_with_gui(longforms, scores)

In [171]:
# Display the GUI result: per the recorded output, a 3-tuple of
# (longform -> grounding, grounding -> name, list of grounding ids).
result

({'hepatic ischemia reperfusion': 'MESH:D015427',
  'high infertility risk': 'ungrounded',
  'high intermediate risk': 'ungrounded',
  'high irradiation response': 'ungrounded',
  'human insulin receptor': 'HGNC:6091'},
 {'MESH:D015427': 'Reperfusion Injury', 'HGNC:6091': 'INSR'},
 ['HGNC:6091', 'MESH:D015427'])

In [172]:
# Hard-coded copy of the GUI result displayed above, split into its three
# parts so later cells do not depend on `result`.

# Longform -> grounding id ('ungrounded' where no id was assigned).
grounding_map = {
    'hepatic ischemia reperfusion': 'MESH:D015427',
    'high infertility risk': 'ungrounded',
    'high intermediate risk': 'ungrounded',
    'high irradiation response': 'ungrounded',
    'human insulin receptor': 'HGNC:6091',
}

# Grounding id -> name.
names = {
    'MESH:D015427': 'Reperfusion Injury',
    'HGNC:6091': 'INSR',
}

# Grounding ids passed to the classifier as its positive labels.
pos_labels = ['HGNC:6091', 'MESH:D015427']

In [173]:
# Shortform -> (longform -> grounding) mapping consumed by the labeler and
# the disambiguator. Keyed by `shortform` rather than a repeated 'HIR' literal
# so it cannot drift from the value the texts were mined with.
grounding_dict = {shortform: grounding_map}

In [174]:
# Classifier for the shortform, with the grounded ids as positive labels.
# Uses `shortform` (as AdeftMiner does above) instead of a repeated literal.
classifier = AdeftClassifier(shortform, pos_labels)

In [175]:
# Single-point grid for the cross-validated search. Per the recorded log, `C`
# is applied to the 'logit' step and `max_features` to the 'tfidf' step.
param_grid = {
    'C': [10.0],
    'max_features': [1000],
}

In [176]:
# Labeler built from the shortform -> longform-grounding mapping.
labeler = AdeftLabeler(grounding_dict)

In [177]:
# Build the corpus from the extracted texts; the next cell unpacks it as
# (text, label) pairs.
corpus = labeler.build_from_texts(shortform_texts)

In [178]:
# Split the corpus pairs into parallel tuples of texts and labels.
# Raises ValueError if the corpus is empty.
texts, labels = zip(*corpus)

In [180]:
# Grid search over param_grid with cv=5; the recorded log reports the best f1.
# NOTE(review): n_jobs=8 is tied to the machine this was run on — confirm it
# suits the target environment.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-17 10:08:08] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [10.0], 'max_features': [1000]}
INFO: [2019-12-17 10:08:09] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 1.0 found for parameter values:
{'logit__C': 10.0, 'tfidf__max_features': 1000}


In [181]:
# Show the label distribution and the f1/precision/recall statistics.
# NOTE(review): the recorded perfect scores come from only 43 labeled texts
# (19 + 9 + 15), so treat them with caution.
classifier.stats

{'label_distribution': {'HGNC:6091': 19, 'ungrounded': 9, 'MESH:D015427': 15},
 'f1': {'mean': 1.0, 'std': 0.0},
 'precision': {'mean': 1.0, 'std': 0.0},
 'recall': {'mean': 1.0, 'std': 0.0}}

In [182]:
# Bundle the classifier with the grounding mapping and the id -> name lookup.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [185]:
# Spot check on one of the corpus texts. The recorded output is a triple of
# (grounding, name, per-label probabilities); the name is None for 'ungrounded'.
disamb.disambiguate(texts[2])

('ungrounded',
 None,
 {'HGNC:6091': 0.03221201907376195,
  'MESH:D015427': 0.046365491206260684,
  'ungrounded': 0.9214224897199773})

In [187]:
# Save the disambiguator under the shortform's name in ../results. Uses
# `shortform` instead of a repeated literal so the saved model always matches
# the shortform the notebook was run for.
disamb.dump(shortform, '../results')

In [188]:
from adeft.disambiguate import load_disambiguator

In [189]:
# Reload the saved disambiguator from ../results to verify the round trip.
# Uses `shortform` so the model loaded is the one this notebook saved.
d = load_disambiguator(shortform, '../results')

In [190]:
# Spot check that the reloaded model produces predictions on a corpus text.
d.disambiguate(texts[0])

('HGNC:6091',
 'INSR',
 {'HGNC:6091': 0.9495047219792215,
  'MESH:D015427': 0.03378744088190508,
  'ungrounded': 0.016707837138873322})

In [32]:
# NOTE(review): likely a leftover cell. It loads a model named 'AR' (not the
# shortform handled above) without the explicit '../results' path used
# earlier, and its execution count (32) is lower than the preceding cells',
# so it ran in an earlier kernel state.
a = load_disambiguator('AR')

In [33]:
# Spot check of the 'AR' model on a one-word input.
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
# Pull the 'logit' step out of the loaded disambiguator's estimator pipeline.
# NOTE(review): stale state. The classes recorded for the next cell
# ('FPLX:AQP', 'polymer', ...) are not the HIR labels, and this cell's
# execution count (36) is lower than that of the cell that assigns `d` (189),
# so `d` referred to a different model when this ran.
logit = d.classifier.estimator.named_steps['logit']

In [37]:
# Show the class labels held by the 'logit' step.
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')