In [1]:
import json
import pickle

from indra.databases.hgnc_client import get_hgnc_name
from indra.literature.adeft_tools import universal_extract_text

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.ground import gilda_ground

In [2]:
# Shortform whose candidate longforms are mined and disambiguated in this notebook.
shortform = 'MED'
# NOTE(review): the original list literal was left unclosed (a SyntaxError) and
# its entries are not recoverable from this file; closed as an empty list.
# `genes` is not referenced by any later visible cell -- fill in or drop.
genes = []

In [3]:
# Look up statements via get_stmts_with_agent_text_like and keep the entry
# keyed by the shortform itself.
stmts_by_agent_text = get_stmts_with_agent_text_like(shortform)
stmts = stmts_by_agent_text[shortform]

# Fetch the text content belonging to those statements.
ids, content = get_text_content_from_stmt_ids(stmts)

# Skip falsy (empty / missing) content, then extract text for the shortform.
shortform_texts = [
    universal_extract_text(raw_text, contains=[shortform])
    for raw_text in filter(None, content.values())
]

# Feed the extracted texts to a miner dedicated to this shortform.
miner = AdeftMiner(shortform)
miner.process_texts(shortform_texts)

In [4]:
# Repeat the mining pipeline for the mixed-case agent text 'Med'.
Med_lookup = get_stmts_with_agent_text_like('Med')
Med_stmts = Med_lookup['Med']

# Note: this rebinds `ids` and `content`, replacing the values fetched for the
# upper-case shortform in the preceding cell.
ids, content = get_text_content_from_stmt_ids(Med_stmts)

# Only truthy content entries are passed through text extraction.
Med_texts = [
    universal_extract_text(body, contains=['Med'])
    for body in content.values()
    if body
]

Med_miner = AdeftMiner('Med')
Med_miner.process_texts(Med_texts)

In [5]:
# Number of texts extracted for 'Med' (one per truthy content entry above).
len(Med_texts)

104

It is then necessary to check whether Acromine produced the correct results; any errors must be fixed manually.

In [6]:
# Ask the 'Med' miner for 15 candidates (presumably the highest-scoring ones --
# the recorded output below is listed in descending score order).
Med_top = Med_miner.top(15)

In [7]:
Med_top

[('medea', 6.0),
 ('medicarpin', 6.0),
 ('mad and medea', 2.0),
 ('mediterranean', 1.0),
 ('medulloblastoma', 1),
 ('with medea', 1.0),
 ('natural pterocarpan medicarpin', 1),
 ('osteogenic activation was identified in medicarpin', 1),
 ('as necessitates research into the etiology and therapeutic interventions in this study medicarpin',
  1),
 ('es exerted through inhibition of cancer cell proliferation invasion and metastasis 7 medicarpin',
  1),
 ('en deficiency in experimental animals and humans we have evaluated osteogenic effect of medicarpin',
  1),
 ('for test the combination of med and trail agonists in management of myeloid leukemia medicarpin',
  1),
 ('rly people twenty participants received three diet for 4 weeks each sfa rich diet mediterranean',
  1),
 ('tail of the mothers against dpp mad transcription factor pmad which then interacts with medea',
  1),
 ('tion of target genes by smad family transcription factor including the drosophila mad and medea',
  1)]

In [8]:
# Same query against the 'MED' miner; `top` is indexed by position further down
# to pull in extra longforms, so the count of 15 matters for those indices.
top = miner.top(15)

In [9]:
top

[('dose', 46.06060606060606),
 ('multiple epiphyseal dysplasia', 46.048780487804876),
 ('effective dose', 17.52),
 ('minimal erythema dose', 15.0),
 ('pseudoachondroplasia psach and multiple epiphyseal dysplasia', 12.8),
 ('mediterranean', 11.2),
 ('mycoepoxydiene', 11.0),
 ('morphine equivalent dose', 10.666666666666666),
 ('the minimal erythema dose', 10.636363636363637),
 ('medetomidine', 10.5),
 ('mediator', 9.999999999999996),
 ('minimal effective dose', 8.0),
 ('metathesis electrodialysis', 6.166666666666667),
 ('microendoscopic discectomy', 5.0),
 ('by metathesis electrodialysis', 4.75)]

In [10]:
# Candidate longforms from the miner with cutoff=2.9; in the recorded output
# every returned score is at least 3.0.
# NOTE(review): 2.9 is a hand-picked threshold -- confirm how it was chosen.
longforms = miner.get_longforms(cutoff=2.9)

In [11]:
longforms

[('dose', 46.06060606060606),
 ('multiple epiphyseal dysplasia', 46.048780487804876),
 ('mediterranean', 11.2),
 ('mycoepoxydiene', 11.0),
 ('medetomidine', 10.5),
 ('mediator', 9.999999999999996),
 ('metathesis electrodialysis', 6.166666666666667),
 ('microendoscopic discectomy', 5.0),
 ('diet', 4.25),
 ('medical', 3.2),
 ('b tabaci', 3.0)]

In [12]:
# Manual curation: keep the mined longforms at positions 1 through 7 (dropping
# position 0 and everything from position 8 on), then add three entries picked
# by position from `top`.
kept_longforms = longforms[1:8]
extra_longforms = [top[position] for position in (3, 7, 11)]

# Highest score first; sorted() is stable, so ties keep their current order.
longforms = sorted(
    kept_longforms + extra_longforms,
    key=lambda pair: pair[1],
    reverse=True,
)

In [13]:
longforms

[('multiple epiphyseal dysplasia', 46.048780487804876),
 ('minimal erythema dose', 15.0),
 ('mediterranean', 11.2),
 ('mycoepoxydiene', 11.0),
 ('morphine equivalent dose', 10.666666666666666),
 ('medetomidine', 10.5),
 ('mediator', 9.999999999999996),
 ('minimal effective dose', 8.0),
 ('metathesis electrodialysis', 6.166666666666667),
 ('microendoscopic discectomy', 5.0)]

In [14]:
# Split the (longform, score) pairs into two parallel tuples.
# This rebinds `longforms` from a list of pairs to a tuple of strings, so the
# cell cannot be run twice in a row without rebuilding the pair list first.
longforms, scores = zip(*longforms)

In [15]:
# Ground every longform with gilda_ground. A longform is kept only when the
# first element of its grounding is truthy, and is stored as
# '<first element>:<second element>'.
grounded_pairs = ((text, gilda_ground(text)) for text in longforms)
grounding_map = {
    text: f'{hit[0]}:{hit[1]}'
    for text, hit in grounded_pairs
    if hit[0]
}

In [209]:
grounding_map

{'medetomidine': 'CHEBI:CHEBI:48552'}

In [210]:
# Launch the adeft grounding GUI, seeded with the automatic groundings.
# The recorded result is a 3-tuple: longform -> grounding, grounding -> name,
# and a list of grounding labels.
# NOTE(review): this step is interactive, so its result is not reproducible
# from code alone.
result = ground_with_gui(longforms, scores, grounding_map=grounding_map)

In [211]:
result

({'medetomidine': 'CHEBI:CHEBI:48552',
  'mediator': 'ungrounded',
  'mediterranean': 'ungrounded',
  'metathesis electrodialysis': 'ungrounded',
  'microendoscopic discectomy': 'ungrounded',
  'minimal effective dose': 'ungrounded',
  'minimal erythema dose': 'ungrounded',
  'morphine equivalent dose': 'ungrounded',
  'multiple epiphyseal dysplasia': 'MESH:D010009',
  'mycoepoxydiene': 'PUBCHEM:11300750'},
 {'MESH:D010009': 'Osteochondrodysplasias',
  'PUBCHEM:11300750': 'Mycoepoxydiene'},
 ['CHEBI:CHEBI:48552', 'PUBCHEM:11300750'])

In [212]:
# Curated groundings pasted in as literals so the notebook does not depend on
# re-running the interactive GUI step.

# Longform -> grounding label ('ungrounded' where no grounding was assigned).
grounding_map = {
    'medetomidine': 'CHEBI:CHEBI:48552',
    'mediator': 'ungrounded',
    'mediterranean': 'ungrounded',
    'metathesis electrodialysis': 'ungrounded',
    'microendoscopic discectomy': 'ungrounded',
    'minimal effective dose': 'ungrounded',
    'minimal erythema dose': 'ungrounded',
    'morphine equivalent dose': 'ungrounded',
    'multiple epiphyseal dysplasia': 'MESH:D010009',
    'mycoepoxydiene': 'PUBCHEM:11300750',
}

# Grounding label -> display name. The CHEBI entry was written with a comma
# instead of a colon, which made the whole literal a SyntaxError; it is a
# key/value pair like the others.
names = {
    'MESH:D010009': 'Osteochondrodysplasias',
    'PUBCHEM:11300750': 'Mycoepoxydiene',
    'CHEBI:CHEBI:48552': 'medetomidine',
}

# Labels passed to the classifier as its positive labels.
pos_labels = ['CHEBI:CHEBI:48552', 'PUBCHEM:11300750']

In [213]:
# Wrap the longform groundings under their shortform key, 'MED'.
grounding_dict = dict(MED=grounding_map)

In [219]:
# Classifier for the 'MED' shortform, given the list of positive labels.
# NOTE(review): 'MED' is repeated as a literal here although `shortform`
# holds the same value.
classifier = AdeftClassifier('MED', pos_labels)

In [220]:
# NOTE(review): in the visible notebook `texts` is first assigned further down
# (texts, labels = zip(*corpus)), so on a fresh top-to-bottom run this line
# would raise NameError. The recorded value presumably comes from an earlier
# kernel state -- move this check below the corpus cell or remove it.
len(texts)

43

In [228]:
# Single-point search grid handed to the classifier's cross-validation.
param_grid = dict(
    C=[100.0],
    max_features=[1000],
)

In [229]:
# Labeler configured with the {shortform: {longform: grounding}} dictionary.
labeler = AdeftLabeler(grounding_dict)

In [230]:
# Build the training corpus from the texts gathered for the shortform.
# Each item is unpacked as a (text, label) pair in the following cell.
corpus = labeler.build_from_texts(shortform_texts)

In [231]:
# Split the corpus into parallel tuples of texts and labels.
# Raises ValueError if the corpus is empty, since there is nothing to unpack.
texts, labels = zip(*corpus)

In [232]:
# Grid search over param_grid with cv=5 (presumably 5-fold cross-validation).
# NOTE(review): n_jobs=8 is hard-coded; adjust to the cores available.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-17 10:25:47] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [1000]}
INFO: [2019-12-17 10:25:48] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.6633333333333333 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 1000}


In [234]:
classifier.stats

{'label_distribution': {'ungrounded': 57,
  'CHEBI:CHEBI:48552': 11,
  'MESH:D010009': 35,
  'PUBCHEM:11300750': 10},
 'f1': {'mean': 0.6633333333333333, 'std': 0.1796292478040997},
 'precision': {'mean': 0.73, 'std': 0.24819347291981714},
 'recall': {'mean': 0.63, 'std': 0.15033296378372907}}

In [235]:
# Bundle the classifier with the grounding dictionary and the
# grounding -> name map into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [236]:
# Spot check on one corpus text. The recorded result is
# (grounding, name, {label: score}).
# NOTE(review): texts[2] is part of the data the classifier was given, so this
# is not an out-of-sample check.
disamb.disambiguate(texts[2])

('MESH:D010009',
 'Osteochondrodysplasias',
 {'CHEBI:CHEBI:48552': 0.0004532352177294433,
  'MESH:D010009': 0.9990150835227889,
  'PUBCHEM:11300750': 0.00030488616098269574,
  'ungrounded': 0.00022679509849898903})

In [237]:
# Persist the disambiguator under the name 'MED' (presumably written beneath
# the relative '../results' directory -- confirm against adeft's dump()).
disamb.dump('MED', '../results')

In [188]:
from adeft.disambiguate import load_disambiguator

In [189]:
# NOTE(review): this loads a disambiguator named 'HIR', not the 'MED' model
# dumped above. It looks like a leftover from another session; confirm whether
# 'MED' was intended or whether the cell should be removed.
d = load_disambiguator('HIR', '../results')

In [190]:
# Runs the loaded 'HIR' disambiguator on a text from the 'MED' corpus.
# NOTE(review): the model and the text come from different shortforms.
d.disambiguate(texts[0])

('HGNC:6091',
 'INSR',
 {'HGNC:6091': 0.9495047219792215,
  'MESH:D015427': 0.03378744088190508,
  'ungrounded': 0.016707837138873322})

In [32]:
# Load a disambiguator named 'AR' without an explicit path (whatever default
# location load_disambiguator uses -- TODO confirm). Unrelated to 'MED'.
a = load_disambiguator('AR')

In [33]:
# Ad-hoc check of the 'AR' model on the one-word input 'Androgen'.
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
# Pull the step named 'logit' out of the loaded disambiguator's estimator.
# NOTE(review): the classes recorded for the next cell do not match the labels
# recorded for the 'HIR' model above, so `d` presumably pointed at a different
# model when that output was produced.
logit = d.classifier.estimator.named_steps['logit']

In [37]:
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')