In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [2]:
# Shortform(s) to disambiguate and the gene symbols they may stand for.
shortforms = ['EK']
genes = ['TMPRSS15', 'CHKA']
# Maps a family name to the list of its member gene symbols (none for EK).
families = {}
# `groundings` is kept parallel to `genes`: the two lists are zipped together
# when the Entrez texts are labeled, so they must have one entry per gene.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # One family grounding per member gene. Appending a single grounding per
    # family would leave `groundings` shorter than `genes`, and the later zip
    # would mislabel or silently drop member genes.
    groundings.extend([f'FPLX:{family}'] * len(members))
# Mapping from gene symbol to PMIDs, read from the precomputed Entrez dump.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
genes

['TMPRSS15', 'CHKA']

In [4]:
# Build (text, grounding) training pairs from the literature listed for each
# gene, and remember which text references were used for them.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    # Genes without an entry in the PMID mapping contribute nothing.
    if gene not in all_pmids:
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    for text in content.values():
        # Skip references for which no text content came back.
        if text:
            entrez_texts.append((universal_extract_text(text), grounding))
    entrez_refs.update(content.keys())

In [5]:
# For each shortform, pull the texts of statements whose agent text matches
# it, mine candidate longforms from them, and pool the texts in `all_texts`.
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    # References already used for the Entrez-labeled texts are excluded.
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # set.union returns a new set and leaves the receiver untouched, so the
    # previous `all_texts.union(...)` call left `all_texts` empty; update()
    # adds the texts in place.
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [6]:
top = miners['EK'].top()

In [7]:
top

[('enterokinase', 6.428571428571429),
 ('electrokinetic', 6.0),
 ('enkephalin', 5.428571428571429),
 ('epidermal keratinocytes', 3.6),
 ('ethanolamine kinase', 3.6),
 ('enteropeptidase', 2.0),
 ('endothelial keratoplasty', 2.0),
 ('iib x 417 wt 407', 2),
 ('iia 1 898 wt 1 980', 2),
 ('iib x 1 965 wt 2 323', 2),
 ('number of individual fibers in edl measured iia 612 wt 505', 2),
 ('ach genotype were analyzed number of individual fibers in soleus measured type i 980 wt 1 116',
  2),
 ('enamel knot', 1.5),
 ('the enamel knot', 1.3333333333333333),
 ('erythrokeratoderma', 1),
 ('knowledge', 1.0),
 ('and enterokinase', 1.0),
 ('by enterokinase', 1.0),
 ('euphorbia kansui', 1),
 ('exposure keratopathy', 1.0),
 ('mammalian enterokinase', 1),
 ('met enkephalin', 1.0),
 ('of enkephalin', 1.0),
 ('k + equilibrium potential', 1.0),
 ('the production of enkephalin', 1),
 ('cultured normal human epidermal keratinocytes', 1.0),
 ('escherichia coli and klebsiella species', 1.0),
 ('ethyl acetate extr

In [12]:
longforms0 = miners['EK'].get_longforms(scale=True)

In [13]:
longforms0

[('electrokinetic', 0.5),
 ('enterokinase', 0.4935064935064935),
 ('enkephalin', 0.4428571428571429),
 ('ethanolamine kinase', 0.325),
 ('epidermal keratinocytes', 0.325),
 ('iib x 417 wt 407', 0.2),
 ('iia 1 898 wt 1 980', 0.2),
 ('iib x 1 965 wt 2 323', 0.2),
 ('number of individual fibers in edl measured iia 612 wt 505', 0.2),
 ('ach genotype were analyzed number of individual fibers in soleus measured type i 980 wt 1 116',
  0.2),
 ('enteropeptidase', 0.16666666666666666),
 ('endothelial keratoplasty', 0.16666666666666666),
 ('enamel knot', 0.07142857142857142)]

In [12]:
# Keep only the first six (longform, score) pairs.
# NOTE(review): this selection is purely positional, so it depends on the
# ordering of the list produced by the previous cell. The recorded outputs
# around this cell do not match (scaled scores before, unscaled after) --
# confirm on a fresh kernel that the intended longforms are the ones kept.
longforms0 = longforms0[:6]

In [13]:
longforms0

[('enterokinase', 6.428571428571429),
 ('electrokinetic', 6.0),
 ('enkephalin', 5.428571428571429),
 ('ethanolamine kinase', 3.6),
 ('epidermal keratinocytes', 3.6),
 ('enteropeptidase', 2.0)]

In [14]:
# Order the (longform, score) pairs from highest to lowest score, in place.
longforms0.sort(key=lambda pair: pair[1], reverse=True)

In [15]:
longforms, scores = zip(*longforms0)

In [16]:
longforms

('enterokinase',
 'electrokinetic',
 'enkephalin',
 'ethanolamine kinase',
 'epidermal keratinocytes',
 'enteropeptidase')

In [17]:
# Seed the grounding map with automatic groundings: for every longform that
# gilda_ground resolves, record its "namespace:id" key and the name returned
# for that key. Longforms without a result are left out.
grounding_map = {}
names = {}
for longform in longforms:
    match = gilda_ground(longform)
    if not match[0]:
        continue
    label = f'{match[0]}:{match[1]}'
    grounding_map[longform] = label
    names[label] = match[2]

In [18]:
grounding_map

{'enterokinase': 'HGNC:9490',
 'enkephalin': 'MESH:D004745',
 'ethanolamine kinase': 'HGNC:1937',
 'enteropeptidase': 'HGNC:9490'}

In [19]:
names

{'HGNC:9490': 'TMPRSS15', 'MESH:D004745': 'Enkephalins', 'HGNC:1937': 'CHKA'}

In [20]:
# Review the automatic groundings: the longforms, their scores and the
# pre-filled grounding_map/names are passed to ground_with_gui, and the three
# values it returns replace grounding_map and names and define pos_labels.
# NOTE(review): presumably this opens an interactive GUI, so its result is not
# reproducible from code alone -- confirm.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [33]:
names

{'MESH:D004745': 'Enkephalins', 'HGNC:9490': 'TMPRSS15', 'HGNC:1937': 'CHKA'}

In [180]:
grounding_map

{'electrokinetic': 'ungrounded',
 'enkephalin': 'MESH:D004745',
 'enterokinase': 'HGNC:9490',
 'enteropeptidase': 'HGNC:9490',
 'epidermal keratinocytes': 'ungrounded',
 'ethanolamine kinase': 'FPLX:CHK'}

In [21]:
result = (grounding_map, names, pos_labels)

In [22]:
result

({'electrokinetic': 'ungrounded',
  'enkephalin': 'MESH:D004745',
  'enterokinase': 'HGNC:9490',
  'enteropeptidase': 'HGNC:9490',
  'epidermal keratinocytes': 'ungrounded',
  'ethanolamine kinase': 'HGNC:1937'},
 {'MESH:D004745': 'Enkephalins', 'HGNC:9490': 'TMPRSS15', 'HGNC:1937': 'CHKA'},
 ['HGNC:1937', 'HGNC:9490', 'MESH:D004745'])

In [24]:
# Pin the curated grounding result as a literal so the cells below start from
# these exact values regardless of what the grounding step above returned:
#   grounding_map: longform -> grounding ('ungrounded' where none was assigned)
#   names:         grounding -> name
#   pos_labels:    groundings passed to the classifier as positive labels
grounding_map, names, pos_labels = ({'electrokinetic': 'ungrounded',
  'enkephalin': 'MESH:D004745',
  'enterokinase': 'HGNC:9490',
  'enteropeptidase': 'HGNC:9490',
  'epidermal keratinocytes': 'ungrounded',
  'ethanolamine kinase': 'HGNC:1937'},
 {'MESH:D004745': 'Enkephalins', 'HGNC:9490': 'TMPRSS15', 'HGNC:1937': 'CHKA'},
 ['HGNC:1937', 'HGNC:9490', 'MESH:D004745'])

In [25]:
grounding_dict = {'EK': grounding_map}

In [26]:
classifier = AdeftClassifier('EK', pos_labels=pos_labels)

In [27]:
param_grid = {'C': [100.0], 'max_features': [10000]}

In [28]:
labeler = AdeftLabeler(grounding_dict)

In [29]:
# Build the labeled corpus from the shortform texts.
# NOTE(review): `shortform_texts` is the value left over from the last
# iteration of the loop over `shortforms`; with more than one shortform only
# the final shortform's texts would be labeled here.
corpus = labeler.build_from_texts(shortform_texts)

In [30]:
# Add the Entrez-derived (text, grounding) pairs to the labeled corpus.
# NOTE(review): this mutates `corpus` in place, so re-running this cell
# without rebuilding `corpus` first appends the Entrez texts again.
corpus.extend(entrez_texts)

In [31]:
texts, labels = zip(*corpus)

In [32]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-16 15:41:27] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-16 15:41:30] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.8012468805469674 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [194]:
classifier.stats

{'label_distribution': {'ungrounded': 11,
  'HGNC:9490': 30,
  'MESH:D004745': 7,
  'FPLX:CHK': 58},
 'f1': {'mean': 0.8133311951206688, 'std': 0.04073679018039223},
 'precision': {'mean': 0.8343876357560568, 'std': 0.04378480128192264},
 'recall': {'mean': 0.8423391812865498, 'std': 0.04507797165586637}}

In [195]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [196]:
d = disamb.disambiguate(shortform_texts)

In [197]:
# Collect the texts predicted as the 'ethanolamine kinase' sense for manual
# spot-checking. The label is looked up in grounding_map instead of being
# hard-coded as 'FPLX:CHK': the grounding pinned earlier in this notebook is
# 'HGNC:1937', so on a fresh run the hard-coded filter matches nothing and the
# indexing cells below fail.
chk_label = grounding_map['ethanolamine kinase']
a = [text for pred, text in zip(d, shortform_texts) if pred[0] == chk_label]

In [200]:
a[2]

'1. Extracellular K+ activity and transmembrane potential were simultaneously monitored with a K+-selective micro-electrode placed in the extracellular space and a standard KCl-filled micro-electrode in the intracellular space of the frog ventricular muscle. 2. K+ was found to accumulate during activity and had the approximate magnitude and time course to account for the measured membrane depolarization. 3. The magnitude of the K+ accumulation depended on the frequency of stimulation, diameter of the muscle and temperature of the bathing solution. 4. The time constants of accumulation and decay were dependent only on the diameter and the temperature of the strip. A Q10 of 2 was measured for the decay of accumulated K+. 5. Double barrelled K+-electrodes were used to monitor the change in K+ activity accompanying a single action potential, since the reference barrel allowed for rapid compensation of the electrical potential fluctuations encountered in the subendothelial space. 6. K+ accu

In [88]:
a[16]

'The OsPCS2 exhibits root- and shoot-specific differential ratios of alternatively spliced transcripts in indica rice under Cd stress, and plays role in Cd and As stress tolerance and accumulation. Enzymatic activity of phytochelatin synthase (PCS) in plant produces phytochelatins, which help in sequestration of heavy metal(loid)s inside the cell vacuole to alleviate toxicity. Here we report that among the two PCS genes-OsPCS1 and OsPCS2 in indica rice (Oryza sativa) cultivar, the OsPCS2 produces an alternatively spliced OsPCS2b transcript that bears the unusual premature termination codon besides the canonically spliced OsPCS2a transcript. Root- and shoot-specific differential ratios of alternatively spliced OsPCS2a and OsPCS2b transcript expressions were observed under cadmium stress. Saccharomyces cerevisiae cells transformed with OsPCS2a exhibited increased cadmium (Cd) and arsenic (As) tolerance and accumulation, unlike the OsPCS2b transformed yeast cells. An intron-containing hai

In [201]:
disamb.dump('EK', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
# Reload the model written by disamb.dump('EK', '../results') to check that it
# round-trips. The previous 'ARG' name belongs to a different shortform and
# was never dumped to '../results' by this notebook.
# Note: this rebinds `d`, which earlier held the list of predictions.
d = load_disambiguator('EK', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
# Load the 'AR' disambiguator; no path is passed, so the loader's default
# location is used.
# NOTE(review): 'AR' is unrelated to the 'EK' model built in this notebook and
# this rebinds `a`, which earlier held the list of spot-check texts -- looks
# like a leftover from another notebook; confirm whether it is still needed.
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [90]:
model_to_s3(disamb)