In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.disambiguate import AdeftDisambiguator
from adeft.modeling.classify import AdeftClassifier


from adeft_indra.ground import gilda_ground

In [2]:
shortforms = ['MED']
genes = ['SCN8A']
families = {'Macrophage_inflammatory_proteins': ['CCL3', 'CCL4']}
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    groundings.append(f'FPLX:{family}')
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
groundings

['HGNC:10596', 'FPLX:Macrophage_inflammatory_proteins']

In [4]:
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    entrez_texts.extend([(universal_extract_text(text), grounding)
                          for text in content.values() if text])
    entrez_refs.update(content.keys())

In [5]:
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    all_texts.union(shortform_texts)

It's then necessary to check if Acromine produced the correct results. We must fix errors manually

In [6]:
miners['MED'].compute_alignment_scores()

In [8]:
trie = miners['MED']._internal_trie

In [12]:
trie.children['electrodialysi']

<adeft.discover._TrieNode at 0x13b2df260>

In [13]:
_12.children['metathesi']

<adeft.discover._TrieNode at 0x13b2df100>

In [16]:
_13.children['dure']

<adeft.discover._TrieNode at 0x13a8bff70>

In [24]:
_13.best_ancestor_score

6.166666666666667

In [7]:
miners['MED'].get_longforms(use_alignment_based_scoring=False)

[('dose', 79, 0.9302274232722455),
 ('multiple epiphyseal dysplasia', 55, 0.9217114246700046),
 ('mediterranean', 15, 0.7183098591549295),
 ('medetomidine', 14, 0.7037037037037037),
 ('mycoepoxydiene', 12, 0.7142857142857143),
 ('mediator', 12, 0.6923076923076923),
 ('metathesis electrodialysis', 12, 0.5636363636363636),
 ('diet', 8, 0.4482758620689655),
 ('microendoscopic discectomy', 6, 0.5),
 ('medical', 5, 0.3548387096774194),
 ('b tabaci', 4, 0.3333333333333333),
 ('distillate', 4, 0.1111111111111111)]

In [9]:
miners['MED'].get_longforms(use_alignment_based_scoring=True, cutoff=0.8)

[('multiple epiphyseal dysplasia', 55, 0.9963986197984428),
 ('minimal erythema dose', 26, 0.9633978506922196),
 ('mediterranean', 15, 0.9971413643893225),
 ('medetomidine', 14, 0.997198513316415),
 ('morphine equivalent dose', 13, 0.9577536558487252),
 ('minimal effective dose', 13, 0.9560813018210231),
 ('mediator', 12, 0.9972431934685796),
 ('metathesis electrodialysis', 12, 0.9440712954161106),
 ('mycoepoxydiene', 12, 0.9063801075107808),
 ('minimum effective dose', 11, 0.9537501416611961),
 ('microendoscopic discectomy', 6, 0.9605827067560145),
 ('maximal effective dose', 6, 0.9495777028244041),
 ('medical', 5, 0.9985822053072961),
 ('methionine enriched diet', 5, 0.9979189008592835),
 ('minimal erythemal dose', 4, 0.9498796556349615),
 ('main excretory duct', 2, 1.0),
 ('medial', 2, 1.0),
 ('medulla', 2, 1.0),
 ('medullectomy', 2, 1.0),
 ('medulloblastoma', 2, 1.0),
 ('miniaturized electromagnetic device', 2, 1.0),
 ('minimum exposure distance', 2, 1.0),
 ('marihuana extract dist

In [12]:
longforms0

[('multiple epiphyseal dysplasia', 55, 0.9990434922264482),
 ('minimal erythema dose', 26, 0.9642443532842938),
 ('mediterranean', 15, 0.9990939146055442),
 ('medetomidine', 14, 0.999099522137419),
 ('morphine equivalent dose', 13, 0.958288491577354),
 ('minimal effective dose', 13, 0.9565237918123347),
 ('mediator', 12, 0.9991040378772884),
 ('metathesis electrodialysis', 12, 0.9454843494997125),
 ('mycoepoxydiene', 12, 0.9083184515856181),
 ('minimum effective dose', 11, 0.9540639072913988),
 ('microendoscopic discectomy', 6, 0.961780309952817),
 ('maximal effective dose', 6, 0.9496610705039266),
 ('medical', 5, 0.9993132556957216),
 ('methionine enriched diet', 5, 0.9987402834423367),
 ('minimal erythemal dose', 4, 0.9499796968503884),
 ('mediator complex', 3, 0.757689025371995),
 ('main excretory duct', 2, 1.0),
 ('medial', 2, 1.0),
 ('medulla', 2, 1.0),
 ('medullectomy', 2, 1.0),
 ('medulloblastoma', 2, 1.0),
 ('miniaturized electromagnetic device', 2, 1.0),
 ('minimum exposure di

In [27]:
from adeft.score import AlignmentBasedScorer

In [50]:
abs1 = AlignmentBasedScorer('MED', beta=0.95, gamma=0.95)
abs2 = AlignmentBasedScorer('MED')

In [51]:
abs1.expanding_score(['during', 'mycoepxydiene'])

[0.9161259357582442, 0.6942936289193057]

In [52]:
abs2.expanding_score(['during', 'mycoepxydiene'])

[0.7770510291922018, 0.8746748864246718]

In [53]:
longforms0

[('multiple epiphyseal dysplasia', 55, 0.9921905749853817),
 ('minimal erythema dose', 26, 0.7154196017671662),
 ('mediterranean', 15, 0.9812707760965216),
 ('medetomidine', 14, 0.9812269573851179),
 ('morphine equivalent dose', 13, 0.6680166975529871),
 ('minimal effective dose', 13, 0.6539713926006377),
 ('mediator', 12, 0.9812003885576985),
 ('metathesis electrodialysis', 12, 0.868079762881967),
 ('minimum effective dose', 11, 0.634393088727666),
 ('microendoscopic discectomy', 6, 0.8971397047791269),
 ('maximal effective dose', 6, 0.5993507622303903),
 ('methionine enriched diet', 5, 0.9875851864050953),
 ('medical', 5, 0.9822901288955395),
 ('minimal erythemal dose', 4, 0.6018867200690089),
 ('mediator complex', 3, 0.7477442596713679),
 ('main excretory duct', 2, 1.0),
 ('miniaturized electromagnetic device', 2, 1.0),
 ('minimum exposure distance', 2, 1.0),
 ('marihuana extract distillate', 2, 0.9916874653211372),
 ('medial', 2, 0.9888473769867046),
 ('medulla', 2, 0.9888473769867

In [17]:
from adeft.score import AlignmentBasedScorer

In [18]:
scorer = AlignmentBasedScorer('MED')

In [21]:
scorer.expanding_score(['during', 'metathesis', 'electrodialysis'])

[0.0, 0.869683331152402, 0.9024999773502351]

In [22]:
trie = miners['MED']._internal_trie

In [23]:
trie.children['electrodialysi']

<adeft.discover._TrieNode at 0x1105f9d60>

In [25]:
_23.children['metathesi']

<adeft.discover._TrieNode at 0x1105f9ec0>

In [26]:
_25.scaled_score(smoothing_param=4)

0.34444444444444444

In [27]:
_25.children['dure']

<adeft.discover._TrieNode at 0x1107d0680>

In [28]:
_27.scaled_score(smoothing_param=4)

0.0

In [51]:
longforms0

[('molecularly imprinted polymer', 91.88),
 ('protein', 53.23711340206186),
 ('maximal inspiratory pressure', 38.82051282051282),
 ('maximum intensity projection', 20.363636363636363),
 ('mycobacterium indicus pranii', 18.857142857142858),
 ('peptide', 17.846153846153847),
 ('minimally invasive parathyroidectomy', 11.8),
 ('molecularly inversion probe', 11.0),
 ('malaria in pregnancy', 8.666666666666666),
 ('promoter', 4.0),
 ('myo inositol 1 phosphate', 3.0),
 ('micropapillary', 2.5),
 ('mitochondrial intermediate peptidase', 2.5),
 ('maximal inspiratory', 2.0),
 ('maximum intensity persistence', 2.0),
 ('mercury intrusion porosimetry', 2.0),
 ('macrophage infectivity potential', 2.0),
 ('methylation induced premeiotically', 2.0),
 ('pain', 1.5)]

In [61]:
candidates[17]

('myoinhibitory peptide', 7.555555555555555)

In [53]:
longforms = longforms0[:1] + longforms0[2:5] + longforms0[6:9] + longforms0[10:]

In [54]:
longforms

[('molecularly imprinted polymer', 91.88),
 ('maximal inspiratory pressure', 38.82051282051282),
 ('maximum intensity projection', 20.363636363636363),
 ('mycobacterium indicus pranii', 18.857142857142858),
 ('minimally invasive parathyroidectomy', 11.8),
 ('molecularly inversion probe', 11.0),
 ('malaria in pregnancy', 8.666666666666666),
 ('myo inositol 1 phosphate', 3.0),
 ('micropapillary', 2.5),
 ('mitochondrial intermediate peptidase', 2.5),
 ('maximal inspiratory', 2.0),
 ('maximum intensity persistence', 2.0),
 ('mercury intrusion porosimetry', 2.0),
 ('macrophage infectivity potential', 2.0),
 ('methylation induced premeiotically', 2.0),
 ('pain', 1.5)]

In [62]:
longforms.extend([candidates[2], candidates[8], candidates[17], candidates[21]])
longforms.sort(key=lambda x: -x[1])

In [63]:
longforms

[('molecularly imprinted polymer', 91.88),
 ('macrophage inflammatory protein', 53.22807017543858),
 ('maximal inspiratory pressure', 38.82051282051282),
 ('maximum intensity projection', 20.363636363636363),
 ('mycobacterium indicus pranii', 18.857142857142858),
 ('major intrinsic protein', 17.88888888888889),
 ('minimally invasive parathyroidectomy', 11.8),
 ('molecularly inversion probe', 11.0),
 ('malaria in pregnancy', 8.666666666666666),
 ('myoinhibitory peptide', 7.555555555555555),
 ('myd88 inhibitory peptide', 5.714285714285714),
 ('myo inositol 1 phosphate', 3.0),
 ('micropapillary', 2.5),
 ('mitochondrial intermediate peptidase', 2.5),
 ('maximal inspiratory', 2.0),
 ('maximum intensity persistence', 2.0),
 ('mercury intrusion porosimetry', 2.0),
 ('macrophage infectivity potential', 2.0),
 ('methylation induced premeiotically', 2.0),
 ('pain', 1.5)]

In [23]:
longforms, scores = zip(*longforms)

In [24]:
grounding_map = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if grounding[0]:
        grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'

In [25]:
grounding_map

{'mitochondrial intermediate peptidase': 'HGNC:7104', 'pain': 'MESH:D010146'}

In [26]:
result = ground_with_gui(longforms, scores, grounding_map=grounding_map)

In [28]:
grounding_map, names, pos_labels = result

In [36]:
names.update({f'HGNC:{get_hgnc_id(gene)}': gene for gene in genes})
pos_labels = list(set(pos_labels) | set(f'HGNC:{get_hgnc_id(gene)}' for gene in genes))

In [38]:
result = (grounding_map, names, pos_labels)

In [64]:
grounding_map, names, pos_labels = \
    ({'macrophage infectivity potential': 'ungrounded',
      'macrophage inflammatory protein': 'FPLX:Macrophage_inflammatory_proteins',
      'major intrinsic protein': 'FPLX:AQP',
      'malaria in pregnancy': 'ungrounded',
      'maximal inspiratory': 'ungrounded',
      'maximal inspiratory pressure': 'ungrounded',
      'maximum intensity persistence': 'ungrounded',
      'maximum intensity projection': 'ungrounded',
      'mercury intrusion porosimetry': 'ungrounded',
      'methylation induced premeiotically': 'ungrounded',
      'micropapillary': 'ungrounded',
      'minimally invasive parathyroidectomy': 'ungrounded',
      'mitochondrial intermediate peptidase': 'HGNC:7104',
      'molecularly imprinted polymer': 'molecularly imprinted polymer',
      'molecularly inversion probe': 'ungrounded',
      'mycobacterium indicus pranii': 'ungrounded',
      'myd88 inhibitory peptide': 'ungrounded',
      'myo inositol 1 phosphate': 'ungrounded',
      'myoinhibitory peptide': 'ungrounded',
      'pain': 'ungrounded'},
     {'FPLX:Macrophage_inflammatory_proteins': 'Macrophage_inflammatory_proteins',
      'FPLX:AQP': 'AQP',
      'HGNC:7104': 'MIPEP',
      'molecularly imprinted polymer': 'molecularly imprinted polymer',
      'HGNC:7103': 'MIP',
      'HGNC:31102': 'MAFIP',
      'HGNC:6401': 'TNPO1'},
     ['HGNC:31102',
      'FPLX:AQP',
      'FPLX:Macrophage_inflammatory_proteins',
      'HGNC:7103',
      'HGNC:7104',
      'HGNC:6401'])

In [65]:
grounding_dict = {'MIP': grounding_map}

In [66]:
classifier = AdeftClassifier('MIP', pos_labels)

In [67]:
param_grid = {'C': [10.0, 100.0, 1000.0], 'max_features': [1000, 10000]}

In [68]:
labeler = AdeftLabeler(grounding_dict)

In [69]:
corpus = labeler.build_from_texts(shortform_texts)

In [70]:
corpus.extend(entrez_texts)

In [71]:
texts, labels = zip(*corpus)

In [72]:
labels

('molecularly imprinted polymer',
 'ungrounded',
 'FPLX:Macrophage_inflammatory_proteins',
 'FPLX:Macrophage_inflammatory_proteins',
 'molecularly imprinted polymer',
 'molecularly imprinted polymer',
 'molecularly imprinted polymer',
 'ungrounded',
 'FPLX:Macrophage_inflammatory_proteins',
 'FPLX:AQP',
 'ungrounded',
 'ungrounded',
 'ungrounded',
 'molecularly imprinted polymer',
 'molecularly imprinted polymer',
 'ungrounded',
 'FPLX:AQP',
 'molecularly imprinted polymer',
 'ungrounded',
 'molecularly imprinted polymer',
 'molecularly imprinted polymer',
 'ungrounded',
 'molecularly imprinted polymer',
 'ungrounded',
 'HGNC:7104',
 'molecularly imprinted polymer',
 'FPLX:Macrophage_inflammatory_proteins',
 'molecularly imprinted polymer',
 'ungrounded',
 'molecularly imprinted polymer',
 'FPLX:Macrophage_inflammatory_proteins',
 'FPLX:Macrophage_inflammatory_proteins',
 'molecularly imprinted polymer',
 'FPLX:Macrophage_inflammatory_proteins',
 'ungrounded',
 'ungrounded',
 'molecula

In [73]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-19 10:09:22] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [10.0, 100.0, 1000.0], 'max_features': [1000, 10000]}
INFO: [2019-12-19 10:11:19] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.8787164473103702 found for parameter values:
{'logit__C': 1000.0, 'tfidf__max_features': 1000}


In [74]:
pos_labels

['HGNC:31102',
 'FPLX:AQP',
 'FPLX:Macrophage_inflammatory_proteins',
 'HGNC:7103',
 'HGNC:7104',
 'HGNC:6401']

In [75]:
classifier.stats

{'label_distribution': {'molecularly imprinted polymer': 94,
  'ungrounded': 109,
  'FPLX:Macrophage_inflammatory_proteins': 304,
  'FPLX:AQP': 17,
  'HGNC:7104': 24,
  'HGNC:7103': 80,
  'HGNC:31102': 10,
  'HGNC:6401': 116},
 'f1': {'mean': 0.8585349306239577, 'std': 0.03350549602999615},
 'precision': {'mean': 0.862530896548089, 'std': 0.03367118570631103},
 'recall': {'mean': 0.8641440466440468, 'std': 0.033809042915954414}}

In [76]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [77]:
from adeft import __version__

In [79]:
from adeft_indra.s3 import model_to_s3

In [80]:
model_to_s3(disamb)

In [29]:
a = [text for text, disamb in zip(shortform_texts, disamb.disambiguate(shortform_texts)) if disamb[0] == 'HGNC:7104']

In [34]:
a[3]

'Morphine has cardioprotective effects against ischemic-reperfusion injuries. This study investigates whether morphine could mimic the antiapoptotic effect of preconditioning using a model of cultured neonatal rat cardiomyocytes subjected to metabolic inhibition (MI). To quantify MI-induced apoptosis, DNA fragmentation and mitochondrial cytochrome c release levels were measured by ELISA. MI-dependent DNA fragmentation was prevented by both Z-VAD-fmk (20 microM), a pan-caspase inhibitor, and cyclosporine A (CsA; 5 microM), a mitochondrial pore transition blocker, added during MI (36% and 54% decrease, respectively). MI-dependent cytochrome c release was not blocked by Z-VAD-fmk but was decreased (38%) by CsA during MI. Metabolic preconditioning (MIP) and preconditioning with morphine (1 microM) were also assessed. MI-dependent DNA fragmentation and cytochrome c release were prevented by MIP (40% and 45% decrease, respectively) and morphine (34% and 45%, respectively). The antiapoptotic 

In [139]:
texts[2]

'Mesenchymal stem cells (MSCs), which have multipotential differentiation and self-renewal potential, are possible cells for tissue engineering. Transforming growth factor β1 (TGFβ1) can be produced by MSCs in an inactive form, and the activation of TGFβ1 functions as an important regulator of osteogenic differentiation in MSCs. Recently, studies showed that GARP participated in the activation of latent TGFβ1, but the interaction between GARP and TGFβ1 is still undefined. In our study, we successfully isolated the MSCs from bone marrow of rats, and showed that GARP was detected in bone mesenchymal stem cells (BMSCs). During the osteogenic differentiation of BMSCs, GARP expression was increased over time. To elucidate the interaction between GARP and TGFβ1, we downregulated GARP expression in BMSCs to examine the level of active TGFβ1. We then verified that the downregulation of GARP decreased the secretion of active TGFβ1. Furthermore, osteogenic differentiation experiments, alkaline p

In [140]:
disamb.dump('MED', '../results')