In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

INFO: [2020-01-13 09:55:59] /Users/albertsteppi/adeft/adeft/recognize.py - OneShotRecognizer not available. AdeftLongformScorer has not been built successfully.


In [2]:
# Shortform(s) to disambiguate and the gene symbols that share the shortform.
shortforms = ['LAK']
genes = ['ALPK1', 'ARL4C']
# Optional mapping of family name -> list of member gene symbols.
families = {}
# `groundings` is positionally aligned with `genes`: the two lists are zipped
# together when the Entrez texts are labeled, so they must stay the same length.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # One family grounding per member keeps `genes` and `groundings` aligned;
    # appending a single entry per family would shift every later pairing.
    groundings.extend([f'FPLX:{family}'] * len(members))
# JSON file that is indexed by gene symbol in the next cell.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
# Build (text, grounding) training pairs from the PMIDs listed for each gene,
# and remember which content references were used.
entrez_texts, entrez_refs = [], set()
for gene, grounding in zip(genes, groundings):
    if gene not in all_pmids:
        # No PMIDs recorded for this gene, so there is nothing to fetch.
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    # Skip empty content; label every remaining text with the gene's grounding.
    labeled_texts = [
        (universal_extract_text(raw_text), grounding)
        for raw_text in content.values()
        if raw_text
    ]
    entrez_texts.extend(labeled_texts)
    entrez_refs.update(content.keys())

In [4]:
# For each shortform: fetch the texts of statements whose agent text matches it,
# drop texts already used as Entrez training data, and feed the rest to a miner.
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    # Exclude empty content and any reference already collected in entrez_refs.
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # set.union() returns a new set and leaves the receiver untouched, so the
    # texts must be accumulated with update() instead.
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [5]:
# Fetch the LAK miner's top-scoring candidates for manual inspection.
top = miners['LAK'].top()

In [6]:
# Display the candidates; the output is a list of (phrase, score) pairs.
top

[('lymphokine activated killer', 347.8528610354222),
 ('of lymphokine activated killer', 78.72527472527473),
 ('and lymphokine activated killer', 51.04615384615384),
 ('activated killer', 40.39408866995075),
 ('lymphokine activated killer cells', 39.94871794871795),
 ('natural killer nk and lymphokine activated killer', 17.666666666666668),
 ('the generation of lymphokine activated killer', 14.88888888888889),
 ('induced lymphokine activated killer', 13.0),
 ('nk and lymphokine activated killer', 11.846153846153847),
 ('generation of lymphokine activated killer', 11.481481481481481),
 ('cells', 10.941176470588232),
 ('natural killer nk cells and lymphokine activated killer',
  8.777777777777779),
 ('induction of lymphokine activated killer', 8.714285714285715),
 ('activated of lymphokine activated killer', 8.6),
 ('with lymphokine activated killer', 7.818181818181818),
 ('to lymphokine activated killer', 7.555555555555555),
 ('the induction of lymphokine activated killer', 7.4285714285

In [7]:
# Ask the LAK miner for the longforms it selected from the processed texts.
longforms0 = miners['LAK'].get_longforms()

In [8]:
# Display the selected longforms; the output is a list of (longform, score) pairs.
longforms0

[('lymphokine activated killer', 347.8528610354222),
 ('lymphokine activated killer cells', 39.94871794871795),
 ('lymphokine activated killing', 6.5),
 ('activated', 5.142857142857142),
 ('nk', 1.3333333333333333),
 ('cytotoxic', 1.3333333333333333)]

In [9]:
# Manual curation: drop the last three entries of longforms0. In the output
# displayed above these are 'activated', 'nk' and 'cytotoxic'.
# NOTE(review): the slice is positional, so it must be revisited if the mined
# longforms change.
longforms = longforms0[:-3]

In [10]:
# Split the (longform, score) pairs into two parallel tuples.
# NOTE(review): this rebinds `longforms`, so the cell cannot be re-run on its
# own; re-run the cell that builds `longforms` from `longforms0` first.
longforms, scores = zip(*longforms)

In [11]:
# Display the curated longforms.
longforms

('lymphokine activated killer',
 'lymphokine activated killer cells',
 'lymphokine activated killing')

In [None]:
# Automatically ground each longform with gilda_ground. A result whose first
# element is falsy is skipped, leaving that longform out of both mappings.
grounding_map, names = {}, {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if not grounding[0]:
        continue
    # Elements 0 and 1 are joined into the "<prefix>:<id>" grounding string;
    # element 2 is stored as the name for that grounding.
    grounded_id = f'{grounding[0]}:{grounding[1]}'
    grounding_map[longform] = grounded_id
    names[grounded_id] = grounding[2]

In [None]:
# Display the automatic longform -> grounding assignments.
grounding_map

In [None]:
# Display the grounding -> name mapping.
names

In [None]:
# Review the automatic groundings with adeft's ground_with_gui, seeded with the
# mappings built so far; it returns the updated grounding_map and names plus pos_labels.
# NOTE(review): presumably interactive, so the result depends on manual input.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [None]:
# Register every gene under its HGNC grounding, both as a display name and as
# a positive label.
gene_groundings = {f'HGNC:{get_hgnc_id(gene)}': gene for gene in genes}
names.update(gene_groundings)
pos_labels = list(set(pos_labels) | set(gene_groundings))

In [None]:
# Bundle the three curated objects into one tuple so they can be displayed together.
result = (grounding_map, names, pos_labels)

In [None]:
# Display the curated (grounding_map, names, pos_labels) tuple.
result

In [12]:
# Hard-coded curated groundings, so the rest of the notebook does not depend on
# the interactive grounding step (presumably pasted from a displayed `result`).
# All three longforms share one MeSH grounding.
grounding_map = dict.fromkeys(
    [
        'lymphokine activated killer',
        'lymphokine activated killer cells',
        'lymphokine activated killing',
    ],
    'MESH:D015979',
)
# Display name for every grounding used as a label.
names = {
    'MESH:D015979': 'Killer Cells, Lymphokine-Activated',
    'HGNC:20917': 'ALPK1',
    'HGNC:698': 'ARL4C',
}
# Groundings passed to the classifier as its positive labels.
pos_labels = ['HGNC:698', 'MESH:D015979', 'HGNC:20917']

In [13]:
# Map each shortform to its longform -> grounding map; this is passed to the
# labeler and the disambiguator below.
grounding_dict = {'LAK': grounding_map}

In [14]:
# Create the classifier for the LAK shortform with the curated positive labels.
classifier = AdeftClassifier('LAK', pos_labels=pos_labels)

In [15]:
# Grid for the cross-validated search; each parameter has a single candidate value.
param_grid = dict(C=[100.0], max_features=[10000])

In [16]:
# Create the labeler from the shortform -> grounding map dictionary.
labeler = AdeftLabeler(grounding_dict)

In [17]:
# Build the labeled corpus from the shortform texts.
# NOTE(review): `shortform_texts` is the variable left over from the last
# iteration of the mining loop, so with several shortforms only the last one's
# texts would be labeled here.
corpus = labeler.build_from_texts(shortform_texts)

In [18]:
# Add the (text, grounding) pairs gathered from the Entrez PMIDs.
# NOTE(review): this mutates `corpus` in place, so re-running the cell without
# rebuilding `corpus` appends the Entrez texts again.
corpus.extend(entrez_texts)

In [19]:
# Split the corpus entries into parallel tuples of texts and labels.
texts, labels = zip(*corpus)

In [20]:
# Run the grid search over param_grid with cv=5 and n_jobs=8; the log output
# reports the best f1 score and the parameter values that achieved it.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-13 09:56:44] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-13 09:56:48] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9697013670661081 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [21]:
# Display the classifier's stats: label distribution plus mean/std of f1,
# precision and recall.
classifier.stats

{'label_distribution': {'MESH:D015979': 412, 'HGNC:20917': 23, 'HGNC:698': 15},
 'f1': {'mean': 0.9697013670661081, 'std': 0.008747556320121906},
 'precision': {'mean': 0.9722186854769191, 'std': 0.008544994583387209},
 'recall': {'mean': 0.9711327873125626, 'std': 0.008759240691242223}}

In [22]:
# Combine the trained classifier, the grounding dictionary and the names into
# a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [23]:
# Disambiguate the shortform texts; the next cell reads element 0 of each
# prediction as the predicted grounding.
d = disamb.disambiguate(shortform_texts)

In [24]:
# Keep the texts whose predicted grounding is HGNC:698 (named ARL4C in `names`).
a = [
    text
    for prediction, text in zip(d, shortform_texts)
    if prediction[0] == 'HGNC:698'
]

In [25]:
# Count the texts that were predicted as HGNC:698.
len(a)

0

In [None]:
# Dump the disambiguator under the name 'LAK' with '../results' as the target path.
disamb.dump('LAK', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
# Reload the model that was dumped as 'LAK' into '../results' to check that it
# round-trips. Loading any other name from this directory would not give back
# the model built in this notebook.
# NOTE(review): this rebinds `d`, which previously held the predictions.
d = load_disambiguator('LAK', '../results')

In [None]:
# Smoke-test the reloaded disambiguator on the first corpus text.
d.disambiguate(texts[0])

In [None]:
# Print the value returned by the disambiguator's info() method.
print(d.info())

In [None]:
# Load the disambiguator for the 'AR' shortform; no path is given, so
# presumably adeft's default model location is used (TODO confirm).
# NOTE(review): 'AR' is unrelated to the LAK model built in this notebook, and
# this rebinds `a`, which previously held the list of HGNC:698 texts.
a = load_disambiguator('AR')

In [None]:
# Try the AR disambiguator on the single-word text 'Androgen'.
a.disambiguate('Androgen')

In [None]:
# Pull the step named 'logit' out of the loaded classifier's estimator.
logit = d.classifier.estimator.named_steps['logit']

In [None]:
# Display the `classes_` attribute of the 'logit' step.
logit.classes_

In [None]:
# Hand the disambiguator to adeft_indra's model_to_s3 helper; presumably this
# uploads the model to S3 (per the helper's name; confirm target and credentials).
model_to_s3(disamb)