In [15]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [16]:
shortforms = ['CLK']
genes = ['CLOCK']
families = {'CLK': ['CLK1', 'CLK2', 'CLK3', 'CLK4']}
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    groundings.append(f'FPLX:{family}')
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [17]:
genes

['CLOCK', 'CLK1', 'CLK2', 'CLK3', 'CLK4']

In [18]:
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    try:
        pmids = all_pmids[gene]
    except KeyError:
        continue
    _, content = get_text_content_from_pmids(pmids)
    entrez_texts.extend([(universal_extract_text(text), grounding)
                          for text in content.values() if text])
    entrez_refs.update(content.keys())

In [19]:
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    all_texts |= set(shortform_texts)

It's then necessary to check if Acromine produced the correct results. We must fix errors manually

In [20]:
top = miners['CLK'].top()

In [21]:
top

[('clock', 89.51428571428572),
 ('transcription factors clock', 22.756756756756758),
 ('the transcription factors clock', 19.476190476190474),
 ('the clock', 9.09090909090909),
 ('cdc2 like kinase', 6.0),
 ('the transcription activators clock', 5.5),
 ('and clock', 4.666666666666667),
 ('of clock', 4.0),
 ('of the transcription factors clock', 4.0),
 ('kidney', 3.0),
 ('the proteins clock', 3.0),
 ('two transcription factors clock', 3.0),
 ('bhlh pas transcription factors clock', 2.5),
 ('of the transcription activators clock', 2.5),
 ('contralateral kidney', 2.0),
 ('by the transcription factors clock', 2.0),
 ('activators clock', 1.8000000000000007),
 ('transcription activators clock', 1.7777777777777777),
 ('like kinase', 1.75),
 ('proteins clock', 1.6),
 ('timeless tim clock', 1.5),
 ('by clock', 1.3333333333333333),
 ('by the clock', 1.3333333333333333),
 ('contra lateral kidney', 1.3333333333333333),
 ('per timeless tim clock', 1.3333333333333333),
 ('drosophila the transcription

In [22]:
longforms = miners['CLK'].get_longforms(abs_decay_param=0.001)

In [23]:
longforms

[('clock', 106, 0.9246367609300921),
 ('cdc2 like kinase', 7, 0.9975062395963412),
 ('contra lateral kidney', 3, 0.9981129987413702),
 ('contralateral kidney', 3, 0.8827786327110382),
 ('contra lateral', 2, 0.7457923787155183),
 ('clk proteins levels', 2, 0.637207331456189),
 ('cdc like kinase', 1, 0.9990628514746259),
 ('cycle kaput', 1, 0.9434263027606444),
 ('chemotactic lymphokines', 1, 0.8842122898223528),
 ('c kit + lin', 1, 0.846736156908713),
 ('clk alone', 1, 0.7494061755245505),
 ('clock ortholog', 1, 0.7086398896955698)]

In [12]:
longforms

[('clock', 106, 0.9246367609300921),
 ('cdc2 like kinase', 7, 0.9975062395963412),
 ('contra lateral kidney', 3, 0.9981129987413702),
 ('contralateral kidney', 3, 0.8827786327110382),
 ('contra lateral', 2, 0.7457923787155183),
 ('clk proteins levels', 2, 0.637207331456189),
 ('cdc like kinase', 1, 0.9990628514746259),
 ('cycle kaput', 1, 0.9434263027606444),
 ('chemotactic lymphokines', 1, 0.8842122898223528),
 ('c kit + lin', 1, 0.846736156908713),
 ('clk alone', 1, 0.7494061755245505),
 ('clock ortholog', 1, 0.7086398896955698)]

In [14]:
from adeft.ground import AdeftGrounder
grounder = AdeftGrounder()

In [97]:
longforms0 = [x for i, x in enumerate(longforms0) if i in [0, 1]]

In [98]:
longforms0

[('clock', 86.3921568627451), ('cdc2 like kinase', 6.0)]

In [99]:
longforms0.extend([(longform, score) for i, (longform, score) in enumerate(top)
                   if i in [13]])

In [100]:
longforms0

[('clock', 86.3921568627451),
 ('cdc2 like kinase', 6.0),
 ('contralateral kidney', 2.0)]

In [101]:
longforms0.sort(key=lambda x: -x[1])

In [102]:
longforms, scores = zip(*longforms0)

In [103]:
longforms

('clock', 'cdc2 like kinase', 'contralateral kidney')

In [104]:
grounding_map = {}
names = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if grounding[0]:
        grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'
        names[grounding_map[longform]] = grounding[2]

In [105]:
grounding_map

{'clock': 'HGNC:2082'}

In [106]:
names

{'HGNC:2082': 'CLOCK'}

In [107]:
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [108]:
names

{'FPLX:CLK': 'CLK', 'HGNC:2082': 'CLOCK'}

In [109]:
grounding_map

{'cdc2 like kinase': 'FPLX:CLK',
 'clock': 'HGNC:2082',
 'contralateral kidney': 'ungrounded'}

In [110]:
result = grounding_map, names, pos_labels

In [111]:
result

({'cdc2 like kinase': 'FPLX:CLK',
  'clock': 'HGNC:2082',
  'contralateral kidney': 'ungrounded'},
 {'FPLX:CLK': 'CLK', 'HGNC:2082': 'CLOCK'},
 ['FPLX:CLK', 'HGNC:2082'])

In [112]:
grounding_map, names, pos_labels = ({'cdc2 like kinase': 'FPLX:CLK',
  'clock': 'HGNC:2082',
  'contralateral kidney': 'ungrounded'},
 {'FPLX:CLK': 'CLK', 'HGNC:2082': 'CLOCK'},
 ['FPLX:CLK', 'HGNC:2082'])

In [113]:
grounding_dict = {'CLK': grounding_map}

In [114]:
classifier = AdeftClassifier('CLK', pos_labels=pos_labels)

In [115]:
param_grid = {'C': [100.0], 'max_features': [10000]}

In [116]:
labeler = AdeftLabeler(grounding_dict)

In [117]:
corpus = labeler.build_from_texts(shortform_texts)

In [118]:
corpus.extend(entrez_texts)

In [119]:
texts, labels = zip(*corpus)

In [120]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-17 14:31:29] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-17 14:31:34] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9507457227323768 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [121]:
classifier.stats

{'label_distribution': {'ungrounded': 3, 'HGNC:2082': 295, 'FPLX:CLK': 49},
 'f1': {'mean': 0.9507457227323768, 'std': 0.029135992296673168},
 'precision': {'mean': 0.947456159148541, 'std': 0.03093001570141328},
 'recall': {'mean': 0.9564791133844842, 'std': 0.02587786533592789}}

In [122]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [123]:
d = disamb.disambiguate(shortform_texts)

In [127]:
a = [(text, pred) for pred, text in zip(d, shortform_texts)if pred[0].startswith('ung')]

In [128]:
len(a)

3

In [129]:
a

[('Unilateral ureteral obstruction (UUO) results in increased renal resistance as well as in exaggerated prostaglandin (PG) release from the obstructed hydronephrotic kidney (HNK). We have reported previously that platelet-activating factor (PAF) dose-dependently stimulates the release of PGs from both the HNK and unobstructed contralateral kidney (CLK), with CLK release being 10% that of the HNK. In the present report, we studied the interaction of PAF with its receptor by examining the effects of PAF-receptor antagonists on the release of PGs from the isolated perfused rabbit HNK and CLK stimulated by PAF; angiotensin II (AII), and bradykinin (BK) were also used as agonists. In the HNK, kadsurenone (3 microM) inhibited PAF-stimulated PGE2 and thromboxane B2 (TxB2) release by 28.2 and 62.5% respectively. CV-3988 (20 microM) and triazolam (5 microM) also preferentially diminished PAF-stimulated TxB2 release. In addition, all three drugs significantly diminished BK- and AII-stimulated T

In [130]:
disamb.dump('CLK', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
d = load_disambiguator('ARG', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [61]:
model_to_s3(disamb)