In [131]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [132]:
# Shortform to disambiguate, plus the genes / FamPlex families it may refer to.
shortforms = ['FES']
genes = ['FES']
families = {}
# One HGNC grounding per gene, in the same order as `genes`.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # Add one family grounding per appended member so that `genes` and
    # `groundings` stay the same length. A later cell pairs them with zip(),
    # which stops at the shorter sequence; a single grounding per family
    # would leave members unpaired or paired with the wrong grounding.
    groundings.extend([f'FPLX:{family}'] * len(members))
# JSON mapping that is indexed by gene symbol in a later cell to obtain PMIDs.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [133]:
# Show the gene list, including any family members appended in the config cell.
genes

['FES']

In [134]:
# Build labelled examples from the text content of the PMIDs listed for each
# gene in `all_pmids`: every non-empty text is paired with that gene's grounding.
# `entrez_refs` records the content keys so the mining cell can skip them.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    # Genes without an entry in the PMID mapping contribute nothing.
    if gene not in all_pmids:
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    for raw_text in content.values():
        if raw_text:
            entrez_texts.append((universal_extract_text(raw_text), grounding))
    entrez_refs.update(content.keys())

In [135]:
# For every shortform: fetch the text content behind statements whose agent
# text matches it, drop empty texts and references already collected in
# `entrez_refs`, and feed the extracted texts to an AdeftMiner.
miners = {}
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = []
    for ref, raw_text in content.items():
        if not raw_text or ref in entrez_refs:
            continue
        shortform_texts.append(
            universal_extract_text(raw_text, contains=shortforms)
        )
    miner = AdeftMiner(shortform)
    miner.process_texts(shortform_texts)
    miners[shortform] = miner
    # Union of the texts seen across all shortforms.
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results; any errors must be fixed manually.

In [136]:
# Ranked (candidate longform, score) pairs from the 'FES' miner; kept so that
# entries can be picked by index in a later cell.
top = miners['FES'].top()

In [137]:
# Inspect the ranked candidates (the recorded output appears cut off mid-list).
top

[('functional electrical stimulation', 175.17391304347825),
 ('fat embolism syndrome', 25.368421052631575),
 ('syndrome', 13.5),
 ('of functional electrical stimulation', 12.25),
 ('and functional electrical stimulation', 11.846153846153847),
 ('first episode schizophrenia', 10.83333333333333),
 ('electrical stimulation', 7.858695652173907),
 ('floppy eyelid syndrome', 7.5),
 ('schizophrenia', 6.470588235294118),
 ('fluoroestradiol', 6.0),
 ('effects of functional electrical stimulation', 5.142857142857142),
 ('free energy surface', 5.0),
 ('with functional electrical stimulation', 5.0),
 ('as functional electrical stimulation', 3.6666666666666665),
 ('that functional electrical stimulation', 3.6),
 ('through functional electrical stimulation', 3.6),
 ('estradiol', 3.142857142857143),
 ('by functional electrical stimulation', 3.0),
 ('during functional electrical stimulation', 3.0),
 ('induced by functional electrical stimulation', 3.0),
 ('patients with first episode schizophrenia', 3

In [138]:
# The miner's own selection of longforms for 'FES', as (longform, score) pairs.
longforms0 = miners['FES'].get_longforms()

In [139]:
# Inspect the automatically selected longforms before manual curation.
longforms0

[('functional electrical stimulation', 175.17391304347825),
 ('fat embolism syndrome', 25.368421052631575),
 ('first episode schizophrenia', 10.83333333333333),
 ('fluoroestradiol', 6.0),
 ('free energy surface', 5.0),
 ('estradiol', 3.142857142857143)]

In [97]:
# Manual curation step: keep only the mined longforms at the listed positions.
# The index list must be re-chosen for every shortform. For FES all six mined
# longforms are kept, which matches the `longforms` tuple and grounding map
# recorded further down. A shorter list such as [0, 1] (left over from another
# shortform's session) would silently drop four of them on Restart & Run All.
longforms0 = [x for i, x in enumerate(longforms0) if i in [0, 1, 2, 3, 4, 5]]

In [98]:
# Inspect the curated longforms.
# NOTE(review): the output recorded under this cell ('clock', 'cdc2 like
# kinase') does not match the FES longforms shown earlier and its execution
# count is out of sequence. It looks like leftover output from another
# session; re-run to refresh it.
longforms0

[('clock', 86.3921568627451), ('cdc2 like kinase', 6.0)]

In [99]:
# Manual curation step: optionally add candidates from `top` (by position)
# that get_longforms() did not return. The index list must be re-chosen for
# every shortform. For FES nothing is added: the recorded `longforms` tuple
# contains only the six mined longforms, whereas index 13 of the FES `top`
# list is 'as functional electrical stimulation', a leftover selection from
# another shortform's session that would pollute the longform list.
longforms0.extend([(longform, score) for i, (longform, score) in enumerate(top)
                   if i in []])

In [100]:
# Inspect the longforms after manual additions.
# NOTE(review): the output recorded under this cell ('clock', 'cdc2 like
# kinase', 'contralateral kidney') is unrelated to FES and appears to be stale
# output from another session; re-run to refresh it.
longforms0

[('clock', 86.3921568627451),
 ('cdc2 like kinase', 6.0),
 ('contralateral kidney', 2.0)]

In [101]:
# Order the (longform, score) pairs from highest to lowest score, in place.
longforms0.sort(key=lambda entry: entry[1], reverse=True)

In [140]:
# Split the (longform, score) pairs into two parallel tuples.
# Raises ValueError if `longforms0` is empty, because zip() then yields nothing
# to unpack.
longforms, scores = zip(*longforms0)

In [141]:
# Inspect the final longforms that will be grounded.
longforms

('functional electrical stimulation',
 'fat embolism syndrome',
 'first episode schizophrenia',
 'fluoroestradiol',
 'free energy surface',
 'estradiol')

In [142]:
# Seed the grounding map automatically: ask gilda for a grounding of each
# longform and, when one is found, record it as '<first>:<second>' together
# with the name gilda returned. Longforms without a match are left out.
grounding_map, names = {}, {}
for longform in longforms:
    hit = gilda_ground(longform)
    if not hit[0]:
        continue
    label = f'{hit[0]}:{hit[1]}'
    grounding_map[longform] = label
    names[label] = hit[2]

In [143]:
# Inspect the automatic groundings (only 'estradiol' matched in the recorded run).
grounding_map

{'estradiol': 'CHEBI:CHEBI:23965'}

In [144]:
# Inspect the grounding -> name mapping produced by the automatic step.
names

{'CHEBI:CHEBI:23965': 'estradiol'}

In [146]:
# Interactive curation: review and correct the automatic groundings in the
# adeft GUI. The result depends on manual input, so it cannot be reproduced by
# re-running this cell alone.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    scores,
    grounding_map=grounding_map,
    names=names,
)

In [147]:
# Inspect the grounding -> name mapping after manual curation in the GUI.
names

{'PUBCHEM:10869981': 'Fluoroestradiol F-18',
 'MESH:D004620': 'Embolism, Fat',
 'MESH:D004599': 'Electric Stimulation Therapy'}

In [148]:
# Inspect the curated longform -> grounding mapping ('ungrounded' marks
# longforms that were deliberately given no grounding in the recorded run).
grounding_map

{'estradiol': 'PUBCHEM:10869981',
 'fat embolism syndrome': 'MESH:D004620',
 'first episode schizophrenia': 'ungrounded',
 'fluoroestradiol': 'PUBCHEM:10869981',
 'free energy surface': 'ungrounded',
 'functional electrical stimulation': 'MESH:D004599'}

In [149]:
# Register the gene itself as a named, positive label. It has no longform in
# the grounding map; its training texts come from the Entrez-derived examples.
names['HGNC:3657'] = 'FES'
# Guard the append so that re-running this cell does not duplicate the label.
if 'HGNC:3657' not in pos_labels:
    pos_labels.append('HGNC:3657')

In [150]:
# Bundle the curated objects so their repr can be copied into a literal.
result = grounding_map, names, pos_labels

In [151]:
# Display the curated (grounding_map, names, pos_labels) triple.
result

({'estradiol': 'PUBCHEM:10869981',
  'fat embolism syndrome': 'MESH:D004620',
  'first episode schizophrenia': 'ungrounded',
  'fluoroestradiol': 'PUBCHEM:10869981',
  'free energy surface': 'ungrounded',
  'functional electrical stimulation': 'MESH:D004599'},
 {'PUBCHEM:10869981': 'Fluoroestradiol F-18',
  'MESH:D004620': 'Embolism, Fat',
  'MESH:D004599': 'Electric Stimulation Therapy',
  'HGNC:3657': 'FES'},
 ['MESH:D004599', 'MESH:D004620', 'PUBCHEM:10869981', 'HGNC:3657'])

In [152]:
# Hard-coded copy of the curated (grounding_map, names, pos_labels) triple,
# identical to the `result` value displayed in the previous cell. Pinning it
# here means the cells below do not depend on the interactive GUI step.
grounding_map, names, pos_labels = ({'estradiol': 'PUBCHEM:10869981',
  'fat embolism syndrome': 'MESH:D004620',
  'first episode schizophrenia': 'ungrounded',
  'fluoroestradiol': 'PUBCHEM:10869981',
  'free energy surface': 'ungrounded',
  'functional electrical stimulation': 'MESH:D004599'},
 {'PUBCHEM:10869981': 'Fluoroestradiol F-18',
  'MESH:D004620': 'Embolism, Fat',
  'MESH:D004599': 'Electric Stimulation Therapy',
  'HGNC:3657': 'FES'},
 ['MESH:D004599', 'MESH:D004620', 'PUBCHEM:10869981', 'HGNC:3657'])

In [153]:
# Map each shortform to its longform -> grounding map (a single shortform here).
grounding_dict = {'FES': grounding_map}

In [154]:
# Classifier for the 'FES' shortform; `pos_labels` lists the groundings
# treated as positive labels.
classifier = AdeftClassifier('FES', pos_labels=pos_labels)

In [155]:
# Grid for the cross-validated search below; it contains a single combination,
# so no actual tuning takes place.
param_grid = {'C': [100.0], 'max_features': [10000]}

In [156]:
# Labeler built from the curated grounding dictionary.
labeler = AdeftLabeler(grounding_dict)

In [157]:
# Build the labelled corpus from the mined texts.
# NOTE(review): `shortform_texts` holds only the texts of the last shortform
# processed by the mining loop. `all_texts` accumulates every shortform's
# texts but is not used here; confirm which is intended when `shortforms`
# has more than one entry.
corpus = labeler.build_from_texts(shortform_texts)

In [158]:
# Add the Entrez-derived (text, grounding) pairs as extra labelled examples.
# This mutates `corpus` in place: re-running this cell without rebuilding
# `corpus` first appends the same examples again.
corpus.extend(entrez_texts)

In [159]:
# Split the (text, label) pairs into parallel tuples for training.
texts, labels = zip(*corpus)

In [160]:
# Run the cross-validated grid search over `param_grid` (cv=5, eight parallel
# jobs). The recorded log below reports the best f1 score that was found.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-21 09:41:58] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-21 09:42:03] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9754195047836213 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [161]:
# Inspect the label distribution and cross-validation metrics of the fit.
classifier.stats

{'label_distribution': {'MESH:D004599': 115,
  'MESH:D004620': 17,
  'ungrounded': 12,
  'PUBCHEM:10869981': 10,
  'HGNC:3657': 87},
 'f1': {'mean': 0.9754195047836213, 'std': 0.013901984206467233},
 'precision': {'mean': 0.9635044143060567, 'std': 0.015031648051337746},
 'recall': {'mean': 0.9911111111111112, 'std': 0.010886621079036362}}

In [162]:
# Bundle the trained classifier with the grounding dictionary and names.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [163]:
# Run the disambiguator on the mined texts; `d` holds one prediction per text.
# NOTE(review): these texts were part of the training corpus, so this is a
# sanity check rather than a held-out evaluation.
d = disamb.disambiguate(shortform_texts)

In [164]:
# Collect the (text, prediction) pairs whose predicted grounding, the first
# element of each prediction, is an HGNC identifier.
a = []
for pred, text in zip(d, shortform_texts):
    if pred[0].startswith('HGNC'):
        a.append((text, pred))

In [165]:
# Number of mined texts predicted to refer to the gene.
len(a)

38

In [171]:
# Spot-check one text that was predicted as the gene.
# NOTE(review): the recorded example discusses floppy eyelid syndrome (FES),
# not the FES gene, and 'floppy eyelid syndrome' is not among the curated
# longforms. It looks like a misclassification worth following up.
a[5]

('To report a case of superficial punctate keratopathy (SPK) which coexisted with floppy eyelid syndrome (FES) and presented as intermittent red eye and blurred vision in an 11-year-old boy who slept in the prone position. His condition did not improve despite treatment with topical antibiotics (levofloxacin, tobramycin), steroid eye drops (prednisolone), and artificial tears. The patient was diagnosed with tonsil hypertrophy and nasopharyngeal adenoid hypertrophy and obstructive sleep apnea syndrome (OSAS). He underwent tonsillectomy and adenoidectomy. Then he started sleeping in the supine position postoperatively. The SPK, red eye and blurred vision completely resolved after surgery without additional treatment. The corneal sensation also recovered gradually during the next 7 years. However, the floppy eyelid did not resolve.\nKnown causes of superficial punctate keratopathy (SPK) in children include entropion, keratoconjunctivitis, and eye-drop toxicity [ 1 – 3 ]. However, there ar

In [172]:
# Persist the trained disambiguator under the name 'FES' in ../results.
disamb.dump('FES', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
# Reload the saved model to check that it round-trips (this cell was not
# executed in the recorded run).
# NOTE(review): this rebinds `d`, which previously held the prediction list.
d = load_disambiguator('FES', '../results')

In [None]:
# Disambiguate a single training text with the reloaded model.
d.disambiguate(texts[0])

In [None]:
# Print the reloaded model's info() summary.
print(d.info())

In [None]:
# NOTE(review): scratch cell that loads the 'AR' model, which is unrelated to
# FES, and rebinds `a`, which previously held the HGNC-predicted texts. It was
# not executed in the recorded run; consider removing it.
a = load_disambiguator('AR')

In [None]:
# Scratch check of the 'AR' model on a one-word input (not part of the FES
# workflow).
a.disambiguate('Androgen')

In [None]:
# Pull the 'logit' step out of the reloaded model's estimator pipeline.
logit = d.classifier.estimator.named_steps['logit']

In [None]:
# Inspect the class labels held by the 'logit' step.
logit.classes_

In [173]:
# Publish the trained disambiguator with adeft_indra's model_to_s3 helper.
# Presumably this uploads to S3 (judging by the helper's name), so it has an
# external side effect; run it only once the model has been reviewed.
model_to_s3(disamb)