In [106]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [107]:
# Configuration: the shortform(s) to disambiguate and the genes / gene families
# they may refer to.
# NOTE(review): cells further down hard-code 'FRT' (miner lookup, classifier,
# grounding_dict) while this cell says 'DGK' — confirm which shortform is intended.
shortforms = ['DGK']
genes = ['FLT1']
families = {}
# `groundings` must stay parallel to `genes`: a later cell iterates
# zip(genes, groundings) to label the Entrez texts of each gene.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # One family grounding per member gene keeps the two lists the same length;
    # appending a single entry per family would shift every later pairing.
    groundings.extend([f'FPLX:{family}'] * len(members))
# Table mapping gene symbol -> PMIDs, read from a path relative to the notebook.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [62]:
genes

['FLT1']

In [108]:
# Gather (text, grounding) training pairs from the PMIDs listed for each gene,
# and remember which content keys were used so they can be excluded later.
entrez_texts, entrez_refs = [], set()
for gene, grounding in zip(genes, groundings):
    # Genes missing from the PMID table contribute nothing.
    if gene not in all_pmids:
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    labeled = [(universal_extract_text(raw_text), grounding)
               for raw_text in content.values() if raw_text]
    entrez_texts.extend(labeled)
    entrez_refs.update(content.keys())

In [109]:
# Mine candidate longforms for every shortform from the texts of statements
# whose agent text matches it, skipping content already used for Entrez labels.
miners = {}
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [
        universal_extract_text(body, contains=shortforms)
        for key, body in content.items()
        if body and key not in entrez_refs
    ]
    miners[shortform] = miner = AdeftMiner(shortform)
    miner.process_texts(shortform_texts)
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results; any errors must be fixed manually.

In [111]:
# Scored candidate longforms from the miner stored under the key 'DGK'.
# NOTE(review): the key repeats the value in `shortforms`; keep the two in sync.
top = miners['DGK'].top()

In [112]:
top

[('diacylglycerol kinase', 132.88888888888889),
 ('kinase', 65.5195530726257),
 ('dag kinase', 20.64),
 ('of diacylglycerol kinase', 15.411764705882353),
 ('by diacylglycerol kinase', 12.0),
 ('and diacylglycerol kinase', 8.833333333333334),
 ('dg kinase', 7.0),
 ('diacylglycerol dag kinase', 6.0),
 ('by dag kinase', 5.428571428571429),
 ('the diacylglycerol kinase', 5.0),
 ('a diacylglycerol kinase', 3.0),
 ('diacylglycerol dg kinase', 3),
 ('that diacylglycerol kinase', 3.0),
 ('phospholipase d pld and diacylglycerol kinase', 2.5),
 ('for diacylglycerol kinase', 2.0),
 ('mammalian diacylglycerol kinase', 2.0),
 ('is catalyzed by diacylglycerol kinase', 2.0),
 ('plc and diacylglycerol kinase', 1.5),
 ('pa by diacylglycerol kinase', 1.3333333333333333),
 ('role of diacylglycerol kinase', 1.3333333333333333),
 ('phospholipase c plc and diacylglycerol kinase', 1.3333333333333333),
 ('diacyglycerol kinase', 1.0),
 ('by dg kinase', 1.0),
 ('cycle diacylglycerol kinase', 1.0),
 ('dn diacylg

In [67]:
# NOTE(review): `miners` is built from `shortforms` (currently ['DGK']), so on a
# fresh kernel there is no 'FRT' entry and this lookup raises KeyError —
# confirm which shortform this notebook is meant to process.
longforms0 = miners['FRT'].get_longforms()

In [68]:
list(enumerate(longforms0))

[(0, ('the female reproductive tract', 31.72340425531915)),
 (1, ('target', 22.941176470588236)),
 (2, ('rat thyroid', 12.32)),
 (3, ('time', 3.2)),
 (4, ('site', 2.0)),
 (5, ('free running type', 2.0)),
 (6, ('fischer rat thyroid cells', 2.0)),
 (7, ('therapy', 1.3333333333333333)),
 (8, ('rosa roxburghii tratt', 1.3333333333333333))]

In [69]:
# Discard the entries rejected by hand; positions refer to the enumerate()
# listing of longforms0 displayed just before this cell.
rejected = (0, 1, 2, 3, 4, 7)
longforms0 = [entry for position, entry in enumerate(longforms0) if position not in rejected]

In [71]:
list(enumerate(top))

[(0, ('the female reproductive tract', 31.72340425531915)),
 (1, ('in the female reproductive tract', 23.12)),
 (2, ('target', 22.941176470588236)),
 (3, ('female reproductive tract', 22.459016393442624)),
 (4, ('rat thyroid', 12.32)),
 (5, ('fischer rat thyroid', 12.142857142857142)),
 (6, ('recognition target', 10.0)),
 (7, ('of the female reproductive tract', 9.818181818181818)),
 (8, ('fisher rat thyroid', 9.4)),
 (9, ('flp recognition target', 5.555555555555555)),
 (10, ('the human female reproductive tract', 4.6)),
 (11, ('flippase recognition target', 4.333333333333333)),
 (12, ('flp recombination target', 3.6)),
 (13, ('folate receptor target', 3.5)),
 (14, ('time', 3.2)),
 (15, ('recombinase target', 3.2)),
 (16, ('the flp recognition target', 3.2)),
 (17, ('in fischer rat thyroid', 3.0)),
 (18, ('immune system in the female reproductive tract', 2.5)),
 (19, ('site', 2.0)),
 (20, ('free running type', 2.0)),
 (21, ('fischer rat thyroid cells', 2.0)),
 (22, ('in fisher rat thyr

In [72]:
# Add the hand-picked entries of `top` (identified by their position in the
# enumerate() listing) to the curated longform list, in place.
selected = (3, 5, 8, 9, 11, 12, 13, 32, 33, 35)
longforms0 += [
    (name, value)
    for position, (name, value) in enumerate(top)
    if position in selected
]

In [74]:
longforms0

[('free running type', 2.0),
 ('fischer rat thyroid cells', 2.0),
 ('rosa roxburghii tratt', 1.3333333333333333),
 ('female reproductive tract', 22.459016393442624),
 ('fischer rat thyroid', 12.142857142857142),
 ('fisher rat thyroid', 9.4),
 ('flp recognition target', 5.555555555555555),
 ('flippase recognition target', 4.333333333333333),
 ('flp recombination target', 3.6),
 ('folate receptor target', 3.5),
 ('flip recombinase target', 1.0),
 ('flp recombinase target', 1.0),
 ('fractionated radiation therapy', 1.0)]

In [75]:
# Order the curated (longform, score) pairs from highest to lowest score;
# the sort is stable, so equal scores keep their current relative order.
longforms0.sort(key=lambda pair: pair[1], reverse=True)

In [76]:
# Split the (longform, score) pairs into two parallel tuples.
# NOTE(review): the unpacking raises ValueError if longforms0 is empty.
longforms, scores = zip(*longforms0)

In [77]:
longforms

('female reproductive tract',
 'fischer rat thyroid',
 'fisher rat thyroid',
 'flp recognition target',
 'flippase recognition target',
 'flp recombination target',
 'folate receptor target',
 'free running type',
 'fischer rat thyroid cells',
 'rosa roxburghii tratt',
 'flip recombinase target',
 'flp recombinase target',
 'fractionated radiation therapy')

In [78]:
# Seed the grounding tables automatically: every longform for which
# gilda_ground returns a truthy first element is mapped to '<elem0>:<elem1>',
# and that label is given the third element as its name.
grounding_map, names = {}, {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if not grounding[0]:
        # Nothing usable returned; leave this longform for manual curation.
        continue
    label = f'{grounding[0]}:{grounding[1]}'
    grounding_map[longform] = label
    names[label] = grounding[2]

In [79]:
grounding_map

{}

In [80]:
names

{}

In [81]:
# Hand the automatic groundings to the adeft GUI and take back the curated
# grounding_map, names and pos_labels.
# NOTE(review): presumably interactive — the returned values would then depend on
# manual input and not be reproducible from code alone.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [38]:
names

{'CHEBI:CHEBI:17593': 'maltooligosaccharide',
 'CHEBI:CHEBI:25174': 'mannooligosaccharide',
 'MESH:D018384': 'Oxidative Stress',
 'CHEBI:CHEBI:31867': 'Mosapride citrate'}

In [39]:
grounding_map

{'main olfactory system': 'ungrounded',
 'maltooligosaccharides': 'CHEBI:CHEBI:17593',
 'mannan oligosaccharide': 'CHEBI:CHEBI:25174',
 'mannanoligosaccharides': 'CHEBI:CHEBI:25174',
 'mannooligosaccharides': 'CHEBI:CHEBI:25174',
 'margin of stability': 'ungrounded',
 'metal oxidative semiconductor': 'ungrounded',
 'mitochondrial oxidative stress': 'MESH:D018384',
 'mosapride citrate': 'CHEBI:CHEBI:31867',
 'motor optimality score': 'ungrounded',
 'occlusal splint': 'ungrounded',
 'swimming': 'ungrounded'}

In [40]:
# NOTE(review): these MOS / MOCOS labels look like leftovers from a run on a
# different shortform. grounding_map, names and pos_labels are reassigned
# wholesale a few cells further down, so these additions do not persist.
names['HGNC:7199'] = 'MOS'
names['HGNC:18234'] = 'MOCOS'
pos_labels.extend(['HGNC:7199', 'HGNC:18234'])

In [82]:
# Bundle the three curation outputs into one tuple so they can be displayed together.
result = grounding_map, names, pos_labels

In [83]:
result

({'female reproductive tract': 'ungrounded',
  'fischer rat thyroid': 'ungrounded',
  'fischer rat thyroid cells': 'ungrounded',
  'fisher rat thyroid': 'ungrounded',
  'flip recombinase target': 'ungrounded',
  'flippase recognition target': 'ungrounded',
  'flp recognition target': 'ungrounded',
  'flp recombinase target': 'ungrounded',
  'flp recombination target': 'ungrounded',
  'folate receptor target': 'ungrounded',
  'fractionated radiation therapy': 'ungrounded',
  'free running type': 'ungrounded',
  'rosa roxburghii tratt': 'ungrounded'},
 {},
 [])

In [84]:
# Curation result pinned as literals: every longform is mapped to 'ungrounded',
# and there are no named groundings or positive labels yet.
grounding_map = dict.fromkeys(
    [
        'female reproductive tract',
        'fischer rat thyroid',
        'fischer rat thyroid cells',
        'fisher rat thyroid',
        'flip recombinase target',
        'flippase recognition target',
        'flp recognition target',
        'flp recombinase target',
        'flp recombination target',
        'folate receptor target',
        'fractionated radiation therapy',
        'free running type',
        'rosa roxburghii tratt',
    ],
    'ungrounded',
)
names = {}
pos_labels = []

In [87]:
# Register the gene grounding HGNC:3763 under the name 'FLT1' and mark it as a
# positive label.
names['HGNC:3763'] = 'FLT1'
# Guarded so that re-running this cell does not add the label a second time.
if 'HGNC:3763' not in pos_labels:
    pos_labels.append('HGNC:3763')

In [88]:
# Map the shortform 'FRT' to its curated longform -> grounding table.
# NOTE(review): the shortform is hard-coded here and differs from `shortforms`.
grounding_dict = {'FRT': grounding_map}

In [89]:
# Classifier for the shortform 'FRT' that treats `pos_labels` as the positive labels.
classifier = AdeftClassifier('FRT', pos_labels=pos_labels)

In [90]:
# Grid handed to the cross-validation call; it holds one value per parameter,
# so only a single combination is evaluated.
param_grid = {'C': [100.0], 'max_features': [10000]}

In [91]:
# Labeler configured with the shortform -> grounding-map dictionary.
labeler = AdeftLabeler(grounding_dict)

In [92]:
# Build the labeled corpus from the mined shortform texts.
# NOTE(review): `shortform_texts` only holds the texts of the last shortform
# handled by the mining loop; with several shortforms `all_texts` is probably
# what is wanted here — confirm.
corpus = labeler.build_from_texts(shortform_texts)

In [93]:
# Add the Entrez-derived (text, grounding) pairs to the corpus.
# NOTE(review): mutates `corpus` in place, so re-running this cell alone adds
# the same pairs again.
corpus.extend(entrez_texts)

In [94]:
# Split the corpus pairs into parallel tuples of texts and labels.
texts, labels = zip(*corpus)

In [95]:
# Grid search over param_grid with cv=5 and n_jobs=8 (presumably 5-fold
# cross-validation on 8 parallel workers — confirm against the adeft docs).
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-21 12:02:24] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-21 12:02:31] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9931913660727221 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [96]:
classifier.stats

{'label_distribution': {'ungrounded': 84, 'HGNC:3763': 585},
 'f1': {'mean': 0.9931913660727221, 'std': 0.0034044710713760622},
 'precision': {'mean': 0.9898585075055664, 'std': 0.006273519191442433},
 'recall': {'mean': 0.9965811965811966, 'std': 0.004187161953475503}}

In [97]:
# Combine the trained classifier, the grounding dictionary and the grounding
# names into a disambiguator object.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [98]:
# Run the disambiguator over the mined shortform texts; the filtering cell that
# follows treats the result as one prediction per input text.
d = disamb.disambiguate(shortform_texts)

In [99]:
# Keep only the texts whose predicted grounding (first element of the
# prediction) starts with 'HGNC', paired as (text, prediction) for inspection.
a = [
    (passage, prediction)
    for prediction, passage in zip(d, shortform_texts)
    if prediction[0].startswith('HGNC')
]

In [105]:
a[4]

('In this study, we have analyzed the presence and subsets of NK cells throughout the tissues of the FRT. We demonstrate that there are NK cells in the various FRT tissues and that their phenotype and regulation are largely dependent upon the FRT tissue where they reside. NK cells in the Fallopian tube, endometrium, cervix, and ectocervix expressed CD9 while blood NK cells did not. We have also found that unique subsets of NK cells are in specific locations of the FRT. The NK cells in the lower reproductive tract did not express CD94, but they did express CD16. In contrast, NK cells in the upper FRT express high amounts of CD94 and CD69, but few NK cells expressed CD16. All of these FRT NK cells were able to produce IFN-gamma upon stimulation with cytokines. Furthermore, the number of NK cells varied with the menstrual cycle in the endometrium but not in the cervix or ectocervix. These data suggest that unique characteristics of the tissues may account of specific localization of diffe

In [57]:
a[1]

('Interestingly, this potent kinase signaling pathway that regulates oocyte maturation is the mitogen-activated protein kinase (MAPK) pathway. At the top of the cascade is MOS, an oocyte-specific mitogen-activated protein kinase (Mek) kinase similar to Raf ( Fig. 1 ).  Mos  mRNA is present in immature oocytes, but essentially none is translated into stable MOS protein. Steroid-triggered reductions in cAMP lead to increased polyadenylation of  Mos  mRNA in a process that involves interactions of regulatory proteins EG2 and cytoplasmic polyadenylation element-binding protein (CPEB) with the 3′ untranslated region of  Mos  mRNA [ 21 – 26 ]. Increased polyadenylation of  Mos  mRNA leads to increased MOS protein expression. Once present, MOS protein is constitutively active and activates Mek1 [ 27 ], which in turn activates extracellular signaling-regulated kinase 2 (Erk2, or p42), and finally cycle-dependent kinase 1 (CDK1). Activated Erk2 then feeds back to promote further increases in MO

In [58]:
# Persist the disambiguator under the name of the shortform it was built for.
# The grounding_dict and classifier in scope are for 'FRT'; dumping under 'MOS'
# would store this model under another shortform's name.
disamb.dump('FRT', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
# Load a previously dumped disambiguator named 'FES' from the results directory.
# NOTE(review): this rebinds `d`, which earlier held the prediction list, and
# 'FES' is unrelated to the shortform processed above — looks like scratch work.
d = load_disambiguator('FES', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
# Load the disambiguator named 'AR'; no path is passed here, unlike the 'FES' load.
# NOTE(review): this rebinds `a`, which earlier held the filtered
# (text, prediction) pairs.
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
# Pull the step registered under 'logit' out of the loaded disambiguator's
# estimator (presumably a scikit-learn Pipeline — confirm).
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [59]:
# Hand the disambiguator to the project's S3 helper (presumably an upload, so
# it needs credentials and network access — confirm before running).
model_to_s3(disamb)