In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [2]:
# Configuration for this disambiguation run.
# NOTE(review): the shortform here is 'FRT', but the genes below and the later
# cells (grounding dict, classifier and dump keyed by 'MOS') refer to MOS --
# confirm which shortform this notebook is meant to model.
shortforms = ['FRT']
genes = ['MOS', 'MOCOS']
# Maps a family identifier (emitted below with the FPLX: prefix) to an
# iterable of its member gene symbols.
families = {}
# `genes` and `groundings` must stay index-aligned: a later cell pairs them
# with zip(genes, groundings) to label the Entrez texts.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    members = list(members)
    genes.extend(members)
    # One grounding entry per member gene. Appending a single entry per family
    # would leave `groundings` shorter than `genes`, so zip() would pair
    # members with the wrong family or silently drop them.
    groundings.extend([f'FPLX:{family}'] * len(members))
# Loaded table is indexed by gene symbol further down (all_pmids[gene]).
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
genes

['MOS', 'MOCOS']

In [4]:
# Gather labeled training texts from the Entrez PMID table: every non-empty
# text fetched for a gene's PMIDs is paired with that gene's grounding.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    # Genes without an entry in the PMID table contribute nothing.
    if gene not in all_pmids:
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    entrez_texts += [
        (universal_extract_text(text), grounding)
        for text in content.values()
        if text
    ]
    # Record the refs used here; the shortform-mining cell skips them.
    entrez_refs.update(content.keys())

In [6]:
# For each shortform, fetch the texts behind statements whose agent text
# matches it, feed them to a dedicated AdeftMiner, and accumulate the texts.
# Refs already consumed as Entrez training texts are excluded.
miners = {}
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [
        universal_extract_text(text, contains=shortforms)
        for ref, text in content.items()
        if text and ref not in entrez_refs
    ]
    miner = AdeftMiner(shortform)
    miners[shortform] = miner
    miner.process_texts(shortform_texts)
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [7]:
# Candidate longforms with their scores, as reported by the miner for 'FRT'.
top = miners['FRT'].top()

In [8]:
top

[('the female reproductive tract', 31.72340425531915),
 ('in the female reproductive tract', 23.120000000000005),
 ('target', 22.941176470588236),
 ('female reproductive tract', 22.459016393442624),
 ('rat thyroid', 12.32),
 ('fischer rat thyroid', 12.14285714285714),
 ('recognition target', 10.0),
 ('of the female reproductive tract', 9.818181818181818),
 ('fisher rat thyroid', 9.4),
 ('flp recognition target', 5.555555555555555),
 ('the human female reproductive tract', 4.6),
 ('flippase recognition target', 4.333333333333333),
 ('flp recombination target', 3.6),
 ('folate receptor target', 3.5),
 ('time', 3.2),
 ('recombinase target', 3.2),
 ('the flp recognition target', 3.2),
 ('in fischer rat thyroid', 3.0),
 ('immune system in the female reproductive tract', 2.5),
 ('site', 2.0),
 ('free running type', 2.0),
 ('fischer rat thyroid cells', 2.0),
 ('in fisher rat thyroid', 2.0),
 ('cells in the female reproductive tract', 2.0),
 ('from saccharomyces cerevisiae induces dna bending 

In [31]:
# Longform candidates returned by the 'FRT' miner with smoothing_param=10;
# this list is curated by hand in the following cells.
longforms0 = miners['FRT'].get_longforms(smoothing_param=10)

In [32]:
longforms0

[('in the female reproductive tract', 0.6505882352941178),
 ('target', 0.5102599179206566),
 ('fischer rat thyroid', 0.4844720496894409),
 ('fisher rat thyroid', 0.42000000000000004),
 ('time', 0.15714285714285717),
 ('site', 0.08333333333333333),
 ('free running type', 0.08333333333333333),
 ('fischer rat thyroid cells', 0.08333333333333333),
 ('therapy', 0.027777777777777773),
 ('rosa roxburghii tratt', 0.027777777777777773)]

In [11]:
list(enumerate(longforms0))

[(0, ('in the female reproductive tract', 0.7900000000000001)),
 (1, ('fischer rat thyroid', 0.6554621848739495)),
 (2, ('of the female reproductive tract', 0.6298701298701299)),
 (3, ('fisher rat thyroid', 0.6)),
 (4, ('target', 0.5930047694753577)),
 (5, ('the human female reproductive tract', 0.39999999999999997)),
 (6, ('time', 0.275)),
 (7, ('site', 0.16666666666666666)),
 (8, ('free running type', 0.16666666666666666)),
 (9, ('fischer rat thyroid cells', 0.16666666666666666)),
 (10, ('therapy', 0.055555555555555546)),
 (11, ('rosa roxburghii tratt', 0.055555555555555546))]

In [26]:
# Manually reject mined candidates by their position in the enumerated
# listing of longforms0.
# NOTE(review): longforms0 is rebuilt from itself, so running this cell a
# second time removes additional entries; re-run the get_longforms cell first.
rejected_positions = (7, 8, 10, 12)
longforms0 = [
    entry
    for position, entry in enumerate(longforms0)
    if position not in rejected_positions
]

In [27]:
list(enumerate(top))

[(0, ('mannan oligosaccharide', 34.74999999999999)),
 (1, ('mannanoligosaccharides', 15.588235294117647)),
 (2, ('oligosaccharide', 14.244897959183675)),
 (3, ('mannooligosaccharides', 11.615384615384615)),
 (4, ('mitochondrial oxidative stress', 10.8)),
 (5, ('mosapride citrate', 9.125)),
 (6, ('of mannan oligosaccharide', 7.833333333333333)),
 (7, ('margin of stability', 7.6)),
 (8, ('agonist mosapride citrate', 4.8)),
 (9, ('the effects of mannan oligosaccharide', 4.0)),
 (10, ('5 ht 4 r agonist mosapride citrate', 4.0)),
 (11, ('of dietary mannan oligosaccharide', 3.4285714285714284)),
 (12, ('dietary mannan oligosaccharide', 3.333333333333333)),
 (13, ('of mannanoligosaccharides', 3.2)),
 (14, ('motor optimality score', 3.0)),
 (15, ('yeast cell wall mannooligosaccharides', 3.0)),
 (16, ('the effects of dietary mannan oligosaccharide', 3.0)),
 (17,
  ('ention period consisting of control con soccer soc moderate intensity continuous swimming',
   3)),
 (18, ('the main olfactory sys

In [28]:
# Manually add candidates from `top`, selected by their position in the
# enumerated `top` listing.
# Entries already present in longforms0 are skipped, so re-running this cell
# does not append duplicates (which would otherwise show up as repeated
# longforms in the later grounding step).
longforms0.extend([(longform, score) for i, (longform, score) in enumerate(top)
                   if i in [30, 243] and (longform, score) not in longforms0])

In [30]:
longforms0

[('mannan oligosaccharide', 34.74999999999999),
 ('mannanoligosaccharides', 15.588235294117647),
 ('mannooligosaccharides', 11.615384615384615),
 ('mitochondrial oxidative stress', 10.8),
 ('mosapride citrate', 9.125),
 ('margin of stability', 7.6),
 ('motor optimality score', 3.0),
 ('metal oxidative semiconductor', 2.5),
 ('maltooligosaccharides', 2.0),
 ('occlusal splint', 1.3333333333333333),
 ('main olfactory system', 1.666666666666667),
 ('swimming', 0.0)]

In [31]:
# Order the curated (longform, score) pairs from highest to lowest score,
# in place; ties keep their existing relative order.
longforms0.sort(key=lambda entry: entry[1], reverse=True)

In [32]:
# Split the (longform, score) pairs into two parallel tuples.
# Raises ValueError if longforms0 is empty (nothing to unpack).
longforms, scores = zip(*longforms0)

In [33]:
longforms

('mannan oligosaccharide',
 'mannanoligosaccharides',
 'mannooligosaccharides',
 'mitochondrial oxidative stress',
 'mosapride citrate',
 'margin of stability',
 'motor optimality score',
 'metal oxidative semiconductor',
 'maltooligosaccharides',
 'main olfactory system',
 'occlusal splint',
 'swimming')

In [34]:
# Seed the grounding map automatically: each longform is passed to
# gilda_ground, and results whose first element is empty are left out so
# they can be handled during manual curation.
grounding_map = {}
names = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if not grounding[0]:
        continue
    # Key is "<first element>:<second element>" of the result; the third
    # element is stored as the display name for that key.
    grounded_id = f'{grounding[0]}:{grounding[1]}'
    grounding_map[longform] = grounded_id
    names[grounded_id] = grounding[2]

In [35]:
grounding_map

{'mannooligosaccharides': 'CHEBI:CHEBI:25174',
 'mosapride citrate': 'CHEBI:CHEBI:31867',
 'maltooligosaccharides': 'CHEBI:CHEBI:17593',
 'occlusal splint': 'MESH:D017090',
 'swimming': 'GO:GO:0036268'}

In [36]:
names

{'CHEBI:CHEBI:25174': 'mannooligosaccharide',
 'CHEBI:CHEBI:31867': 'Mosapride citrate',
 'CHEBI:CHEBI:17593': 'maltooligosaccharide',
 'MESH:D017090': 'Occlusal Splints',
 'GO:GO:0036268': 'swimming'}

In [37]:
# Manual curation step, seeded with the automatic groundings and names.
# NOTE(review): presumably interactive (adeft.gui) -- the returned values then
# depend on choices made by hand and are not reproducible from code alone.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [38]:
names

{'CHEBI:CHEBI:17593': 'maltooligosaccharide',
 'CHEBI:CHEBI:25174': 'mannooligosaccharide',
 'MESH:D018384': 'Oxidative Stress',
 'CHEBI:CHEBI:31867': 'Mosapride citrate'}

In [39]:
grounding_map

{'main olfactory system': 'ungrounded',
 'maltooligosaccharides': 'CHEBI:CHEBI:17593',
 'mannan oligosaccharide': 'CHEBI:CHEBI:25174',
 'mannanoligosaccharides': 'CHEBI:CHEBI:25174',
 'mannooligosaccharides': 'CHEBI:CHEBI:25174',
 'margin of stability': 'ungrounded',
 'metal oxidative semiconductor': 'ungrounded',
 'mitochondrial oxidative stress': 'MESH:D018384',
 'mosapride citrate': 'CHEBI:CHEBI:31867',
 'motor optimality score': 'ungrounded',
 'occlusal splint': 'ungrounded',
 'swimming': 'ungrounded'}

In [40]:
# Register the two gene groundings with their display names and mark them as
# positive labels.
for gene_grounding, gene_name in [('HGNC:7199', 'MOS'), ('HGNC:18234', 'MOCOS')]:
    names[gene_grounding] = gene_name
    # Guarded append: re-running this cell must not duplicate entries in
    # pos_labels (an unconditional extend would add them again each time).
    if gene_grounding not in pos_labels:
        pos_labels.append(gene_grounding)

In [41]:
# Bundle the three curated objects into one tuple (displayed in the next cell).
result = grounding_map, names, pos_labels

In [42]:
result

({'main olfactory system': 'ungrounded',
  'maltooligosaccharides': 'CHEBI:CHEBI:17593',
  'mannan oligosaccharide': 'CHEBI:CHEBI:25174',
  'mannanoligosaccharides': 'CHEBI:CHEBI:25174',
  'mannooligosaccharides': 'CHEBI:CHEBI:25174',
  'margin of stability': 'ungrounded',
  'metal oxidative semiconductor': 'ungrounded',
  'mitochondrial oxidative stress': 'MESH:D018384',
  'mosapride citrate': 'CHEBI:CHEBI:31867',
  'motor optimality score': 'ungrounded',
  'occlusal splint': 'ungrounded',
  'swimming': 'ungrounded'},
 {'CHEBI:CHEBI:17593': 'maltooligosaccharide',
  'CHEBI:CHEBI:25174': 'mannooligosaccharide',
  'MESH:D018384': 'Oxidative Stress',
  'CHEBI:CHEBI:31867': 'Mosapride citrate',
  'HGNC:7199': 'MOS',
  'HGNC:18234': 'MOCOS'},
 ['CHEBI:CHEBI:17593',
  'CHEBI:CHEBI:25174',
  'CHEBI:CHEBI:31867',
  'MESH:D018384',
  'HGNC:7199',
  'HGNC:18234'])

In [43]:
# Curated grounding results pinned as literals.
# NOTE(review): presumably pasted from the displayed `result` so that later
# cells do not depend on re-running the manual grounding step -- confirm.

# Longform -> grounding ('ungrounded' for longforms with no assigned entity).
grounding_map = {
    'main olfactory system': 'ungrounded',
    'maltooligosaccharides': 'CHEBI:CHEBI:17593',
    'mannan oligosaccharide': 'CHEBI:CHEBI:25174',
    'mannanoligosaccharides': 'CHEBI:CHEBI:25174',
    'mannooligosaccharides': 'CHEBI:CHEBI:25174',
    'margin of stability': 'ungrounded',
    'metal oxidative semiconductor': 'ungrounded',
    'mitochondrial oxidative stress': 'MESH:D018384',
    'mosapride citrate': 'CHEBI:CHEBI:31867',
    'motor optimality score': 'ungrounded',
    'occlusal splint': 'ungrounded',
    'swimming': 'ungrounded',
}

# Grounding -> display name.
names = {
    'CHEBI:CHEBI:17593': 'maltooligosaccharide',
    'CHEBI:CHEBI:25174': 'mannooligosaccharide',
    'MESH:D018384': 'Oxidative Stress',
    'CHEBI:CHEBI:31867': 'Mosapride citrate',
    'HGNC:7199': 'MOS',
    'HGNC:18234': 'MOCOS',
}

# Groundings passed to the classifier as positive labels.
pos_labels = [
    'CHEBI:CHEBI:17593',
    'CHEBI:CHEBI:25174',
    'CHEBI:CHEBI:31867',
    'MESH:D018384',
    'HGNC:7199',
    'HGNC:18234',
]

In [44]:
# Map the shortform to its curated longform -> grounding map.
# NOTE(review): the key is hard-coded as 'MOS' while `shortforms` is ['FRT']
# and the miner cells index by 'FRT' -- confirm these are meant to agree.
grounding_dict = {'MOS': grounding_map}

In [45]:
# Classifier for the 'MOS' shortform; pos_labels are passed as its positive labels.
classifier = AdeftClassifier('MOS', pos_labels=pos_labels)

In [46]:
# Single-point search grid: only C=100.0 with max_features=10000 is evaluated.
param_grid = {'C': [100.0], 'max_features': [10000]}

In [47]:
# Labeler built from the shortform -> grounding-map dictionary.
labeler = AdeftLabeler(grounding_dict)

In [48]:
# Build the labeled corpus from the mined texts.
# NOTE(review): shortform_texts is the variable left over from the last
# iteration of the mining loop; with more than one shortform, texts for the
# earlier shortforms would be missing here (all_texts holds the union) --
# confirm intent.
corpus = labeler.build_from_texts(shortform_texts)

In [49]:
# Add the Entrez-derived (text, grounding) pairs to the corpus.
# NOTE: extends in place -- re-running only this cell appends them again.
corpus.extend(entrez_texts)

In [50]:
# Split the corpus pairs into parallel tuples of texts and labels.
texts, labels = zip(*corpus)

In [51]:
# Grid search over param_grid with cv=5 and n_jobs=8 (parallelism is
# hard-coded; adjust for the machine running the notebook).
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-21 11:08:44] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-21 11:09:27] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.7980731561371396 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [259]:
classifier.stats

{'label_distribution': {'UP:Q9SCZ4': 32, 'ungrounded': 17, 'HGNC:3655': 78},
 'f1': {'mean': 0.9610480217432276, 'std': 0.015913358823083618},
 'precision': {'mean': 0.934003304879264, 'std': 0.02025652527603831},
 'recall': {'mean': 0.9913043478260869, 'std': 0.017391304347826077}}

In [52]:
# Combine the trained classifier with the grounding dictionary and names.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [53]:
# Predictions for the mined texts, in the same order as shortform_texts.
d = disamb.disambiguate(shortform_texts)

In [54]:
# Spot-check set: (text, prediction) pairs whose prediction's first element
# starts with 'HGNC', i.e. texts predicted to refer to one of the genes.
a = [
    (text, pred)
    for text, pred in zip(shortform_texts, d)
    if pred[0].startswith('HGNC')
]

In [55]:
len(a)

37

In [57]:
a[1]

('Interestingly, this potent kinase signaling pathway that regulates oocyte maturation is the mitogen-activated protein kinase (MAPK) pathway. At the top of the cascade is MOS, an oocyte-specific mitogen-activated protein kinase (Mek) kinase similar to Raf ( Fig. 1 ).  Mos  mRNA is present in immature oocytes, but essentially none is translated into stable MOS protein. Steroid-triggered reductions in cAMP lead to increased polyadenylation of  Mos  mRNA in a process that involves interactions of regulatory proteins EG2 and cytoplasmic polyadenylation element-binding protein (CPEB) with the 3′ untranslated region of  Mos  mRNA [ 21 – 26 ]. Increased polyadenylation of  Mos  mRNA leads to increased MOS protein expression. Once present, MOS protein is constitutively active and activates Mek1 [ 27 ], which in turn activates extracellular signaling-regulated kinase 2 (Erk2, or p42), and finally cycle-dependent kinase 1 (CDK1). Activated Erk2 then feeds back to promote further increases in MO

In [58]:
# Save the disambiguator under the name 'MOS' in ../results.
disamb.dump('MOS', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
# NOTE(review): unexecuted scratch cell. It loads 'FES' although the model
# dumped above is named 'MOS', and it rebinds `d` (previously the predictions
# list) -- confirm whether this should load 'MOS' or be removed.
d = load_disambiguator('FES', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
# NOTE(review): unexecuted scratch cell unrelated to this notebook's
# shortform; it rebinds `a` (previously the HGNC spot-check list).
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
# Pull the 'logit' step out of the loaded disambiguator's estimator pipeline
# (here `d` is the object returned by load_disambiguator, not the predictions).
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [59]:
# Publish the disambiguator built in this notebook (`disamb`), not the
# objects loaded in the scratch cells above.
# NOTE(review): presumably uploads to S3 (adeft_indra.s3) -- an external side
# effect; make sure the model is final before running.
model_to_s3(disamb)