In [1]:
import json
import pickle

from indra.databases.hgnc_client import get_hgnc_name
from indra.literature.adeft_tools import universal_extract_text

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.ground import gilda_ground

In [2]:
# Shortform (abbreviation) passed to the statement lookup, text extraction and miner below.
# NOTE(review): later cells hard-code 'MED' and 'HIR' instead of reusing this
# variable — confirm which shortform this notebook is meant to process.
shortform = 'BI'

In [3]:
# Look up the statements for this shortform; the helper's result is indexed
# by the shortform itself.
stmts = get_stmts_with_agent_text_like(shortform)[shortform]

# Fetch the text content behind those statements (only `content` is used below).
ids, content = get_text_content_from_stmt_ids(stmts)

# Extract text for every non-empty content entry, asking the extractor for
# material that contains the shortform.
shortform_texts = [
    universal_extract_text(raw_text, contains=[shortform])
    for raw_text in content.values()
    if raw_text
]

# Feed the extracted texts to a miner dedicated to this shortform.
miner = AdeftMiner(shortform)
miner.process_texts(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [14]:
# Spot-check one of the extracted texts (index 5 is an arbitrary sample).
shortform_texts[5]

"After more than 50 years of research it is apparent that the pacemaker function is orchestrated via intrinsic signaling mechanisms originating at multiple levels of organization, including subcellular (e.g., phosphorylation cascades, SR, mitochondria), cellular (i.e., surface membrane), hierarchical brain-heart signaling (i.e., neurotransmitter or hormonal stimulation of surface membrane receptors) and modulated by environmental mechanical, chemical and thermal factors (Figure  1 ). The SR rhythmically discharges local diastolic Ca 2+  releases (LCRs) beneath the cell surface membrane; LCRs activate an inward Na + /Ca 2+  exchange (NCX) current that prompts the surface membrane clock (M clock), an ensemble of sarcolemmal electrogenic molecules, to generate an AP. LCR Ca 2+  signal is regulated not only by the SR Ca 2+  pumping, which depends not only on SR proteins (phospholamban and RyR), and their phosphorylation status, but also by functions and phosphorylation status of M-clock pr

In [4]:
# Ask the miner for its top 15 candidates; the output displayed in the next
# cell shows them as (phrase, score) pairs.
top = miner.top(15)

In [6]:
# Display the top candidates for manual inspection.
top

[('behavioral inhibition', 64.07142857142857),
 ('index', 61.76),
 ('brief intervention', 53.357142857142854),
 ('bayesian inference', 36.56410256410256),
 ('inhibition', 19.81818181818182),
 ('barthel index', 17.310344827586206),
 ('intervention', 16.529411764705884),
 ('the barthel index', 15.333333333333334),
 ('basal insulin', 10.833333333333334),
 ('maximum likelihood ml and bayesian inference', 10.5),
 ('of brief intervention', 10.153846153846153),
 ('injury', 9.88888888888889),
 ('brain infarction', 9.833333333333334),
 ('burn injury', 9.777777777777779),
 ('insulin', 8.736842105263158)]

In [203]:
# Retrieve the miner's longform candidates using a cutoff of 2.9.
# NOTE(review): 2.9 is an unexplained magic number — presumably tuned by hand; confirm.
# NOTE(review): the output saved for the next cell lists expansions of 'MED',
# not of the 'BI' shortform set at the top — the saved output looks stale.
longforms = miner.get_longforms(cutoff=2.9)

In [204]:
# Display the candidate (longform, score) pairs before manual curation.
longforms

[('dose', 46.06060606060606),
 ('multiple epiphyseal dysplasia', 46.048780487804876),
 ('mediterranean', 11.2),
 ('mycoepoxydiene', 11.0),
 ('medetomidine', 10.5),
 ('mediator', 10.0),
 ('metathesis electrodialysis', 6.166666666666667),
 ('microendoscopic discectomy', 5.0),
 ('diet', 4.25),
 ('medical', 3.2),
 ('b tabaci', 3.0)]

In [205]:
# Manual curation: keep only the candidates at positions 1 through 7 of the
# mined list, then add three hand-picked entries from `top`.
# NOTE: this rebinds `longforms`, so re-running the cell filters the already
# filtered list again.
longforms = [candidate for position, candidate in enumerate(longforms) if 1 <= position <= 7]
longforms += [top[3], top[7], top[11]]
# Order by score (second element of each pair), highest first.
longforms.sort(key=lambda pair: pair[1], reverse=True)

In [206]:
# Display the curated, score-sorted longform list.
longforms

[('multiple epiphyseal dysplasia', 46.048780487804876),
 ('minimal erythema dose', 14.999999999999996),
 ('mediterranean', 11.2),
 ('mycoepoxydiene', 11.0),
 ('morphine equivalent dose', 10.666666666666666),
 ('medetomidine', 10.5),
 ('mediator', 10.0),
 ('minimal effective dose', 8.0),
 ('metathesis electrodialysis', 6.166666666666667),
 ('microendoscopic discectomy', 5.0)]

In [207]:
# Split the (longform, score) pairs into two parallel tuples.
# NOTE: `longforms` is rebound from a list of pairs to a tuple of strings, so
# this cell cannot be re-run without re-running the cells that build the list.
# An empty list raises ValueError here, because zip() then yields nothing to unpack.
longforms, scores = zip(*longforms)

In [208]:
# Build an initial longform -> 'first:second' grounding string from
# gilda_ground, skipping longforms whose first result field is falsy.
grounding_map = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if not grounding[0]:
        # Nothing usable came back for this longform; leave it out of the map.
        continue
    grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'

In [209]:
# Display the automatically produced groundings before manual review.
grounding_map

{'medetomidine': 'CHEBI:CHEBI:48552'}

In [210]:
# Review the groundings via ground_with_gui, seeded with the automatic map.
# NOTE(review): presumably this opens an interactive GUI, which would make the
# result non-reproducible from code alone — a later cell pastes the result as a literal.
# The displayed result is a 3-tuple: (grounding map, names, positive labels).
result = ground_with_gui(longforms, scores, grounding_map=grounding_map)

In [211]:
# Display the reviewed result so it can be copied into a literal.
result

({'medetomidine': 'CHEBI:CHEBI:48552',
  'mediator': 'ungrounded',
  'mediterranean': 'ungrounded',
  'metathesis electrodialysis': 'ungrounded',
  'microendoscopic discectomy': 'ungrounded',
  'minimal effective dose': 'ungrounded',
  'minimal erythema dose': 'ungrounded',
  'morphine equivalent dose': 'ungrounded',
  'multiple epiphyseal dysplasia': 'MESH:D010009',
  'mycoepoxydiene': 'PUBCHEM:11300750'},
 {'MESH:D010009': 'Osteochondrodysplasias',
  'PUBCHEM:11300750': 'Mycoepoxydiene'},
 ['CHEBI:CHEBI:48552', 'PUBCHEM:11300750'])

In [212]:
# Frozen copy of the reviewed grounding result, pasted as literals.

# Longform -> grounding identifier, or 'ungrounded' when none was assigned.
grounding_map = {
    'medetomidine': 'CHEBI:CHEBI:48552',
    'mediator': 'ungrounded',
    'mediterranean': 'ungrounded',
    'metathesis electrodialysis': 'ungrounded',
    'microendoscopic discectomy': 'ungrounded',
    'minimal effective dose': 'ungrounded',
    'minimal erythema dose': 'ungrounded',
    'morphine equivalent dose': 'ungrounded',
    'multiple epiphyseal dysplasia': 'MESH:D010009',
    'mycoepoxydiene': 'PUBCHEM:11300750',
}

# Grounding identifier -> name. Note there is no entry for 'CHEBI:CHEBI:48552'.
names = {
    'MESH:D010009': 'Osteochondrodysplasias',
    'PUBCHEM:11300750': 'Mycoepoxydiene',
}

# Groundings handed to the classifier as its positive labels.
pos_labels = ['CHEBI:CHEBI:48552', 'PUBCHEM:11300750']

In [213]:
# Wrap the grounding map in a dict keyed by shortform.
# NOTE(review): 'MED' is hard-coded while `shortform` is set to 'BI' at the top
# of the notebook — confirm which is intended and reuse the variable.
grounding_dict = {'MED': grounding_map}

In [219]:
# Create the classifier for the shortform with the chosen positive labels.
# NOTE(review): 'MED' is hard-coded here as well; it does not match the
# `shortform = 'BI'` assignment at the top of the notebook.
classifier = AdeftClassifier('MED', pos_labels)

In [220]:
# NOTE(review): in the visible notebook `texts` is first assigned in a later
# cell (`texts, labels = zip(*corpus)`), so on a fresh kernel this cell raises
# NameError under Run All. Move it below that cell or remove it.
len(texts)

43

In [228]:
# Grid passed to classifier.cv below; each parameter has a single value, so
# the "search" evaluates exactly one configuration.
param_grid = {'C': [100.0], 'max_features': [1000]}

In [229]:
# Labeler configured with the shortform -> grounding map dictionary.
labeler = AdeftLabeler(grounding_dict)

In [230]:
# Build the training corpus from the extracted texts; the next cell unpacks
# its items as (text, label) pairs.
corpus = labeler.build_from_texts(shortform_texts)

In [231]:
# Split the corpus into parallel tuples of texts and labels.
# An empty corpus raises ValueError here, because zip() then yields nothing to unpack.
texts, labels = zip(*corpus)

In [232]:
# Run the grid search over `param_grid` with cv=5 (the log output below reports
# the best f1 score found).
# NOTE(review): n_jobs=8 is hard-coded — consider a config constant.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-17 10:25:47] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [1000]}
INFO: [2019-12-17 10:25:48] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.6633333333333333 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 1000}


In [234]:
# Show the label distribution and the f1/precision/recall mean and std
# recorded on the classifier after cross-validation.
classifier.stats

{'label_distribution': {'ungrounded': 57,
  'CHEBI:CHEBI:48552': 11,
  'MESH:D010009': 35,
  'PUBCHEM:11300750': 10},
 'f1': {'mean': 0.6633333333333333, 'std': 0.1796292478040997},
 'precision': {'mean': 0.73, 'std': 0.24819347291981714},
 'recall': {'mean': 0.63, 'std': 0.15033296378372907}}

In [235]:
# Bundle the trained classifier with the grounding dictionary and names.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [236]:
# Smoke test on one training text; the displayed result is
# (grounding, name, per-label probabilities).
disamb.disambiguate(texts[2])

('MESH:D010009',
 'Osteochondrodysplasias',
 {'CHEBI:CHEBI:48552': 0.0004532352177294433,
  'MESH:D010009': 0.9990150835227889,
  'PUBCHEM:11300750': 0.00030488616098269574,
  'ungrounded': 0.00022679509849898903})

In [237]:
# Write the disambiguator out under the name 'MED' in the relative '../results' path.
# NOTE(review): 'MED' is hard-coded and does not match `shortform = 'BI'` set
# at the top of the notebook.
disamb.dump('MED', '../results')

In [188]:
from adeft.disambiguate import load_disambiguator

In [189]:
# Load a previously saved disambiguator for 'HIR' from '../results'.
# NOTE(review): this notebook dumps 'MED', not 'HIR' — this looks like a
# leftover check of a different model; `d` is also an uninformative name.
d = load_disambiguator('HIR', '../results')

In [190]:
# Apply the loaded 'HIR' disambiguator to the first text of the current corpus.
# NOTE(review): `texts` is built from `shortform_texts` earlier in this notebook,
# which is not a 'HIR' corpus as written — confirm this check is meaningful.
d.disambiguate(texts[0])

('HGNC:6091',
 'INSR',
 {'HGNC:6091': 0.9495047219792215,
  'MESH:D015427': 0.03378744088190508,
  'ungrounded': 0.016707837138873322})

In [32]:
# Load the 'AR' disambiguator; no path is given here, unlike the 'HIR' load above.
# NOTE(review): unrelated to the shortform processed in this notebook — likely
# leftover exploration.
a = load_disambiguator('AR')

In [33]:
# Try the 'AR' disambiguator on a one-word input.
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
# Pull the step named 'logit' out of the loaded disambiguator's estimator.
logit = d.classifier.estimator.named_steps['logit']

In [37]:
# Inspect the class labels known to the 'logit' step.
# NOTE(review): the saved output lists labels (e.g. 'FPLX:AQP') that do not
# match the 'HIR' result shown earlier, so `d` was presumably a different
# model when this cell last ran.
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')