In [175]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [205]:
shortforms = ['RK']
genes = ['GRK1', 'RBKS']
families = {}
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    groundings.append(f'FPLX:{family}')
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [206]:
genes

['GRK1', 'RBKS']

In [207]:
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    try:
        pmids = all_pmids[gene]
    except KeyError:
        continue
    _, content = get_text_content_from_pmids(pmids)
    entrez_texts.extend([(universal_extract_text(text), grounding)
                          for text in content.values() if text])
    entrez_refs.update(content.keys())

In [179]:
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    all_texts |= set(shortform_texts)

It's then necessary to check if Acromine produced the correct results. We must fix errors manually

In [180]:
top = miners['RK'].top()

In [181]:
top

[('raspberry ketone', 26.85714285714286),
 ('rhodopsin kinase', 23.923076923076923),
 ('kinase', 13.37142857142857),
 ('radial keratotomy', 7.777777777777778),
 ('remnant kidney', 6.25),
 ('by rhodopsin kinase', 4.857142857142858),
 ('kernels', 4.2),
 ('rye bread with kernels', 4.0),
 ('arabinoxylan ax bread β glucan bg bread dark ground rye gr bread and rye bread with kernels',
  4),
 ('ribokinase', 3.5),
 ('kidney', 3.2),
 ('sence and presence of microtubule obstacles two concentrations of a tau or b rigor kinesin',
  3),
 ('reactivating kinase', 2.8),
 ('lysine', 2.0),
 ('rapid kindling', 2.0),
 ('red kohlrabi', 2.0),
 ('of raspberry ketone', 2.0),
 ('phosphorylation by rhodopsin kinase', 2.0),
 ('upon illumination rhodopsin kinase', 2),
 ('yoni trapped per week when supplemented pre release as immatures 2 3 days with raspberry ketone',
  2),
 ('oxylan bread ax ○ β glucan bread bg + dark ground rye bread gr □ rye bread with kernels',
  2),
 ('rigor kinesin', 1.5),
 ('rye kernels', 1

In [182]:
longforms0 = miners['RK'].get_longforms()

In [185]:
list(enumerate(longforms0))

[(0, ('raspberry ketone', 26.85714285714286)),
 (1, ('rhodopsin kinase', 23.923076923076923)),
 (2, ('radial keratotomy', 7.777777777777778)),
 (3, ('remnant kidney', 6.25)),
 (4, ('kernels', 4.2)),
 (5, ('ribokinase', 3.5)),
 (6,
  ('sence and presence of microtubule obstacles two concentrations of a tau or b rigor kinesin',
   3)),
 (7, ('lysine', 2.0)),
 (8, ('red kohlrabi', 2.0)),
 (9, ('rapid kindling', 2.0))]

In [186]:
longforms0 = [x for i, x in enumerate(longforms0) if i not in [6]]

In [187]:
longforms0

[('raspberry ketone', 26.85714285714286),
 ('rhodopsin kinase', 23.923076923076923),
 ('radial keratotomy', 7.777777777777778),
 ('remnant kidney', 6.25),
 ('kernels', 4.2),
 ('ribokinase', 3.5),
 ('lysine', 2.0),
 ('red kohlrabi', 2.0),
 ('rapid kindling', 2.0)]

In [99]:
longforms0.extend([(longform, score) for i, (longform, score) in enumerate(top)
                   if i in [13]])

In [100]:
longforms0

[('clock', 86.3921568627451),
 ('cdc2 like kinase', 6.0),
 ('contralateral kidney', 2.0)]

In [101]:
longforms0.sort(key=lambda x: -x[1])

In [188]:
longforms, scores = zip(*longforms0)

In [189]:
longforms

('raspberry ketone',
 'rhodopsin kinase',
 'radial keratotomy',
 'remnant kidney',
 'kernels',
 'ribokinase',
 'lysine',
 'red kohlrabi',
 'rapid kindling')

In [190]:
grounding_map = {}
names = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if grounding[0]:
        grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'
        names[grounding_map[longform]] = grounding[2]

In [191]:
grounding_map

{'raspberry ketone': 'CHEBI:CHEBI:68656',
 'rhodopsin kinase': 'MESH:D051606',
 'radial keratotomy': 'MESH:D007646',
 'ribokinase': 'HGNC:30325',
 'lysine': 'CHEBI:CHEBI:25094'}

In [192]:
names

{'CHEBI:CHEBI:68656': 'raspberry ketone',
 'MESH:D051606': 'G-Protein-Coupled Receptor Kinase 1',
 'MESH:D007646': 'Keratotomy, Radial',
 'HGNC:30325': 'RBKS',
 'CHEBI:CHEBI:25094': 'lysine'}

In [193]:
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

ERROR: [2020-01-21 09:57:25] werkzeug - Error on request:
Traceback (most recent call last):
  File "/Users/albertsteppi/.virtualenvs/py37/lib/python3.7/site-packages/werkzeug/serving.py", line 302, in run_wsgi
    execute(self.server.app)
  File "/Users/albertsteppi/.virtualenvs/py37/lib/python3.7/site-packages/werkzeug/serving.py", line 293, in execute
    write(data)
  File "/Users/albertsteppi/.virtualenvs/py37/lib/python3.7/site-packages/werkzeug/serving.py", line 274, in write
    self.wfile.write(data)
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/socketserver.py", line 799, in write
    self._sock.sendall(b)
OSError: [Errno 41] Protocol wrong type for socket


In [147]:
names

{'PUBCHEM:10869981': 'Fluoroestradiol F-18',
 'MESH:D004620': 'Embolism, Fat',
 'MESH:D004599': 'Electric Stimulation Therapy'}

In [148]:
grounding_map

{'estradiol': 'PUBCHEM:10869981',
 'fat embolism syndrome': 'MESH:D004620',
 'first episode schizophrenia': 'ungrounded',
 'fluoroestradiol': 'PUBCHEM:10869981',
 'free energy surface': 'ungrounded',
 'functional electrical stimulation': 'MESH:D004599'}

In [194]:
result = grounding_map, names, pos_labels

In [195]:
result

({'kernels': 'ungrounded',
  'lysine': 'ungrounded',
  'radial keratotomy': 'MESH:D007646',
  'rapid kindling': 'ungrounded',
  'raspberry ketone': 'CHEBI:CHEBI:68656',
  'red kohlrabi': 'ungrounded',
  'remnant kidney': 'ungrounded',
  'rhodopsin kinase': 'MESH:D051606',
  'ribokinase': 'HGNC:30325'},
 {'MESH:D007646': 'Keratotomy, Radial',
  'CHEBI:CHEBI:68656': 'raspberry ketone',
  'MESH:D051606': 'G-Protein-Coupled Receptor Kinase 1',
  'HGNC:30325': 'RBKS'},
 ['CHEBI:CHEBI:68656', 'HGNC:30325', 'MESH:D007646', 'MESH:D051606'])

In [208]:
grounding_map, names, pos_labels = ({'kernels': 'ungrounded',
  'lysine': 'ungrounded',
  'radial keratotomy': 'MESH:D007646',
  'rapid kindling': 'ungrounded',
  'raspberry ketone': 'CHEBI:CHEBI:68656',
  'red kohlrabi': 'ungrounded',
  'remnant kidney': 'ungrounded',
  'rhodopsin kinase': 'MESH:D051606',
  'ribokinase': 'HGNC:30325'},
 {'MESH:D007646': 'Keratotomy, Radial',
  'CHEBI:CHEBI:68656': 'raspberry ketone',
  'MESH:D051606': 'G-Protein-Coupled Receptor Kinase 1',
  'HGNC:30325': 'RBKS'},
 ['CHEBI:CHEBI:68656', 'HGNC:30325', 'MESH:D007646', 'MESH:D051606'])

In [197]:
grounding_dict = {'RK': grounding_map}

In [198]:
classifier = AdeftClassifier('RK', pos_labels=pos_labels)

In [199]:
param_grid = {'C': [100.0], 'max_features': [10000]}

In [209]:
labeler = AdeftLabeler(grounding_dict)

In [210]:
corpus = labeler.build_from_texts(shortform_texts)

In [212]:
corpus.extend(entrez_texts)

In [215]:
texts, labels = zip(*corpus)

In [216]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-21 09:59:22] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-21 09:59:44] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.7337845992391446 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [217]:
classifier.stats

{'label_distribution': {'CHEBI:CHEBI:68656': 10,
  'MESH:D051606': 24,
  'HGNC:30325': 14,
  'ungrounded': 15,
  'MESH:D007646': 7,
  'HGNC:10013': 42},
 'f1': {'mean': 0.7337845992391446, 'std': 0.13323452134462177},
 'precision': {'mean': 0.8299242424242423, 'std': 0.10772177930463969},
 'recall': {'mean': 0.6989898989898989, 'std': 0.17001083288315097}}

In [218]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [219]:
d = disamb.disambiguate(shortform_texts)

In [220]:
a = [(text, pred) for pred, text in zip(d, shortform_texts)if pred[0].startswith('HGNC')]

In [221]:
len(a)

88

In [224]:
a[40]

('PDF to Text Batch Convert Multiple Files Software - Please purchase\npersonal license.  Pergamon    Psychoneuroendocrinology,Vol. 22. No.\n1, pp. 1-11, ]997 Copyright 1997 Elsevier Science Ltd. All rights\nreserved Printed in Great Britain 0306-4530/97 $1%00+ .00 P I h\nS0306-4530(96)00042-X    CENTRAL MEDIATION OF THE EFFECTS OF\nINTERLEUKIN-1 O N S O C I A L E X P L O R A T I O N A N D B O D Y W E\nI G H T IN M I C E Rose-Marie Bluth61, Robert Dantzer 1 and Keith W.\nKelley e  IINRA-INSERM U394, Domaine de Carreire, Rue Camille Saint-\nSa~ns, 33077 Bordeaux Cedex, France; and 2Laboratory of\nImmunophysiology, University of Illinois, Urbana, Illinois, USA\n(Received 2 July 1996; in final form 19 September 1996)    SUMMARY  To\nstudy the role of central IL-1 receptors in the effects of recombinant\nhuman IL-1B (1L-lfl) on behavior and body weight,\nintracerebroventricular (ICV) injection of the specific antagonist of\nIL-1 receptors, IL-lra, was administered to mice injected\nintrape

In [225]:
disamb.dump('RK', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
d = load_disambiguator('FES', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [226]:
model_to_s3(disamb)