In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

INFO: [2020-01-13 09:45:49] /Users/albertsteppi/adeft/adeft/recognize.py - OneShotRecognizer not available. AdeftLongformScorer has not been built successfully.


In [2]:
shortforms = ['ALK']
genes = ['ALK']
families = {'RSTK1': ['ACVRL1', 'ACVR1', 'ACVR1B', 'ACVR1C', 'BMPR1A',
                      'BMPR1B', 'TGFBR1']}
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    groundings.append(f'FPLX:{family}')
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    try:
        pmids = all_pmids[gene]
    except KeyError:
        continue
    _, content = get_text_content_from_pmids(pmids)
    entrez_texts.extend([(universal_extract_text(text), grounding)
                          for text in content.values() if text])
    entrez_refs.update(content.keys())

In [4]:
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    all_texts.union(shortform_texts)

It's then necessary to check if Acromine produced the correct results. We must fix errors manually

In [5]:
top = miners['ALK'].top()

In [6]:
top

[('anaplastic lymphoma kinase', 948.8728813559322),
 ('the anaplastic lymphoma kinase', 185.81188118811886),
 ('of anaplastic lymphoma kinase', 109.87826086956522),
 ('kinase', 92.91263940520446),
 ('and anaplastic lymphoma kinase', 64.29729729729729),
 ('of the anaplastic lymphoma kinase', 62.833333333333336),
 ('in anaplastic lymphoma kinase', 26.96551724137931),
 ('patients with anaplastic lymphoma kinase', 22.652173913043477),
 ('activin receptor like kinase', 20.454545454545453),
 ('rearrangements of the anaplastic lymphoma kinase', 20.0),
 ('an anaplastic lymphoma kinase', 19.909090909090914),
 ('with anaplastic lymphoma kinase', 19.63157894736842),
 ('for anaplastic lymphoma kinase', 18.5),
 ('in the anaplastic lymphoma kinase', 14.545454545454547),
 ('epidermal growth factor receptor egfr and anaplastic lymphoma kinase',
  13.733333333333334),
 ('or anaplastic lymphoma kinase', 13.0),
 ('echinoderm microtubule associated protein like 4 eml4 anaplastic lymphoma kinase',
  12.705

In [7]:
longforms0 = miners['ALK'].get_longforms()

In [8]:
longforms0

[('anaplastic lymphoma kinase', 948.8728813559322),
 ('alkaline phosphatase', 6.75),
 ('kinase gene', 6.428571428571429),
 ('2p23', 3.2),
 ('anaplastic lymphomakinase', 2.0),
 ('receptor', 1.3333333333333333)]

In [9]:
longforms0 = [x for i, x in enumerate(longforms0) if i in [0, 1, 3, 4]]

In [10]:
top[8]

('activin receptor like kinase', 20.454545454545453)

In [11]:
longforms0.extend([x for i, x in enumerate(top) if i in [8]])

In [12]:
longforms0

[('anaplastic lymphoma kinase', 948.8728813559322),
 ('alkaline phosphatase', 6.75),
 ('2p23', 3.2),
 ('anaplastic lymphomakinase', 2.0),
 ('activin receptor like kinase', 20.454545454545453)]

In [13]:
longforms0.sort(key=lambda x: -x[1])

In [14]:
longforms, scores = zip(*longforms0)

In [15]:
longforms

('anaplastic lymphoma kinase',
 'activin receptor like kinase',
 'alkaline phosphatase',
 '2p23',
 'anaplastic lymphomakinase')

In [None]:
grounding_map = {}
names = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if grounding[0]:
        grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'
        names[grounding_map[longform]] = grounding[2]

In [None]:
grounding_map

In [None]:
names

In [None]:
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [None]:
names.update({f'HGNC:{get_hgnc_id(gene)}': gene for gene in genes})
pos_labels = list(set(pos_labels) | set(f'HGNC:{get_hgnc_id(gene)}' for gene in genes))

In [None]:
result = (grounding_map, names, pos_labels)

In [None]:
result

In [16]:
grounding_map, names, pos_labels = ({'2p23': 'HGNC:427',
  'activin receptor like kinase': 'FPLX:RSTK1',
  'alkaline phosphatase': 'MESH:D000469',
  'anaplastic lymphoma kinase': 'HGNC:427',
  'anaplastic lymphomakinase': 'HGNC:427'},
 {'HGNC:427': 'ALK',
  'MESH:D000469': 'Alkaline Phosphatase',
  'FPLX:RSTK1': 'RSTK1'},
 ['FPLX:RSTK1', 'HGNC:427', 'MESH:D000469'])

In [17]:
grounding_dict = {'ALK': grounding_map}

In [21]:
classifier = AdeftClassifier('ALK', pos_labels=pos_labels)

In [22]:
param_grid = {'C': [100.0], 'max_features': [10000]}

In [23]:
labeler = AdeftLabeler(grounding_dict)

In [24]:
corpus = labeler.build_from_texts(shortform_texts)

In [25]:
corpus.extend(entrez_texts)

In [26]:
texts, labels = zip(*corpus)

In [27]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-13 09:51:54] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-13 09:52:40] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.974932693929812 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [28]:
classifier.stats

{'label_distribution': {'HGNC:427': 1181,
  'MESH:D000469': 6,
  'FPLX:RSTK1': 155},
 'f1': {'mean': 0.974932693929812, 'std': 0.011692411541541438},
 'precision': {'mean': 0.9736705272228592, 'std': 0.010918668748222329},
 'recall': {'mean': 0.9776727473742399, 'std': 0.010176447404917046}}

In [29]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [30]:
d = disamb.disambiguate(shortform_texts)

In [31]:
a = [text for pred, text in zip(d, shortform_texts)if pred[0] == 'FPLX:RSTK1']

In [32]:
len(a)

53

In [34]:
a[1]

'The fully reprogrammed pluripotency can be sorted into ICM-like state (naïve) and post-implantation epiblasts state (primed) 23 . Dissections of each pluripotent state indicated that the naïve state was dependent on JAK/STAT signaling that was activated by LIF, and the primed state was dependent on PI3K/AKT and ALK/SMADs signaling that was activated by b-FGF and transforming growth factor-β1 (TGF-β1)/Activin A. The primed state pluripotency in human and mouse PSCs showed similar gene expression profiles and culture requirements 24 – 26 ; however, the naïve pluripotency was different between the two species, which required different stimulations 24 , 27 – 30 . Unfortunately, both defined states were illusive in pig since none of the above conditions were capable of deriving fully reprogrammed porcine ESCs 31 . The species-related regulatory signaling pathway as reported in mouse and human PSCs is likely to be applied in pig and other animals 32 , in which PI3K/AKT and TGF-beta signalin

In [35]:
disamb.dump('ALK', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
d = load_disambiguator('ARG', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [None]:
model_to_s3(disamb)