In [5]:
import json
import pickle

from indra.databases.hgnc_client import get_hgnc_name
from indra.literature.adeft_tools import universal_extract_text

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.disambiguate import load_disambiguator
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [11]:
# Read the JSON mapping that later cells index by gene symbol
# ('HK1', 'HOOK1', 'KCNA4').
with open('../data/entrez_all_pmids.json', 'r') as pmid_file:
    all_pmids = json.loads(pmid_file.read())

In [15]:
# Take the 'HK1' entry of all_pmids and fetch the text content for it.
HK1_pmids = all_pmids['HK1']
HK1_entrez_refs, HK1_entrez_content = get_text_content_from_pmids(HK1_pmids)
# Skip falsy content entries and pass the rest through universal_extract_text.
HK1_entrez_texts = list(
    map(universal_extract_text, filter(None, HK1_entrez_content.values()))
)

In [16]:
# Take the 'HOOK1' entry of all_pmids and fetch the text content for it.
HOOK1_pmids = all_pmids['HOOK1']
HOOK1_entrez_refs, HOOK1_entrez_content = get_text_content_from_pmids(HOOK1_pmids)
# Skip falsy content entries and pass the rest through universal_extract_text.
HOOK1_entrez_texts = list(
    map(universal_extract_text, filter(None, HOOK1_entrez_content.values()))
)

In [22]:
# Take the 'KCNA4' entry of all_pmids and fetch the text content for it.
KCNA4_pmids = all_pmids['KCNA4']
KCNA4_entrez_refs, KCNA4_entrez_content = get_text_content_from_pmids(KCNA4_pmids)
# Skip falsy content entries and pass the rest through universal_extract_text.
KCNA4_entrez_texts = list(
    map(universal_extract_text, filter(None, KCNA4_entrez_content.values()))
)

In [24]:
# The ambiguous shortform this notebook builds a disambiguation model for.
shortform = 'HK1'

In [25]:
# Query statements by agent text; the result is indexed by the shortform itself.
stmts_by_agent_text = get_stmts_with_agent_text_like(shortform)
stmts = stmts_by_agent_text[shortform]
# Fetch the refs and text content associated with those statements.
refs, content = get_text_content_from_stmt_ids(stmts)

It is then necessary to check whether Acromine produced the correct results; any errors must be fixed manually.

In [29]:
# Gather the ref values from the three Entrez-derived fetches, then keep only
# those that also appear among the ref values of the statement-derived content.
entrez_ref_ids = set().union(
    KCNA4_entrez_refs.values(),
    HK1_entrez_refs.values(),
    HOOK1_entrez_refs.values(),
)
overlap = entrez_ref_ids.intersection(refs.values())

In [30]:
# Show how many refs are shared between the two content sources.
len(overlap)

8

In [31]:
# Build the texts used for mining and labeling: skip empty or very short
# content (fewer than 5 characters) and anything whose ref is in `overlap`,
# i.e. already covered by the Entrez-derived texts.
shortform_texts = [
    universal_extract_text(text, contains=[shortform])
    for ref, text in content.items()
    if text and len(text) >= 5 and ref not in overlap
]

In [32]:
# Create a miner for the shortform and feed it the filtered texts.
miner = AdeftMiner(shortform)
miner.process_texts(shortform_texts)

In [33]:
# Candidate longforms with scores, as (longform, score) pairs (see output below).
top = miner.top()

In [34]:
# Display the scored candidates for manual inspection.
top

[('hexokinase 1', 33.0),
 ('1', 3.8378378378378386),
 ('hexokinase', 3.0),
 ('and hexokinase 1', 3.0),
 ('npc cells line', 2.0),
 ('of hexokinase 1', 2.0),
 ('enzyme hexokinase 1', 1.5),
 ('npc cells', 1.3333333333333333),
 ('glycolytic enzyme hexokinase 1', 1.3333333333333333),
 ('rv0600c', 1.0),
 ('hexokinase i', 1.0),
 ('including hexokinase 1', 1.0),
 ('with hexokinase 1', 1.0),
 ('a histidine aspartate kinase', 1.0),
 ('compared with npc cells', 1.0),
 ('unique type 1 hexokinase', 1),
 ('the glycolytic enzyme hexokinase 1', 1.0),
 ('its has been known for many years that both hexokinase 1', 1),
 ('rbitol dehydrogenase sord phosphoribosyl pyrophosphate synthetase 1 prps1 and hexokinase 1',
  1),
 ('tal phosphoribosyl pyrophosphate synthetase 1 prps1 sorbitol dehydrogenase sord hexokinase 1',
  1),
 ('in transcriptional regulated cytosolic smad5 enhances glycolysis by interacting with hexokinase 1',
  1),
 ('xpression of several genes associated with glycolysis including the glucose

In [35]:
# Display the longforms the miner selects from the candidates.
miner.get_longforms()

[('hexokinase 1', 33.0),
 ('hexokinase', 3.0),
 ('npc cells line', 2.0),
 ('npc cells', 1.3333333333333333)]

In [36]:
# Keep the miner's selected (longform, score) pairs for manual curation.
longforms = miner.get_longforms()

In [38]:
# Manually add a longform that appears among the miner's scored candidates but
# not in its selected longforms.
# The membership check keeps this cell idempotent: running it again does not
# append a duplicate entry.
extra_longform = ('histidine aspartate kinase', 1.0)
if extra_longform not in longforms:
    longforms.append(extra_longform)

In [39]:
# Display the curated list of (longform, score) pairs.
longforms

[('hexokinase 1', 33.0),
 ('hexokinase', 3.0),
 ('npc cells line', 2.0),
 ('npc cells', 1.3333333333333333),
 ('histidine aspartate kinase', 1.0)]

In [40]:
# Split the (longform, score) pairs into two parallel tuples.
# NOTE(review): this rebinds `longforms` from a list of pairs to a tuple of
# strings, so the cell must not be run twice without first re-running the cells
# that build `longforms`.
longforms, scores = zip(*longforms)

In [43]:
# Call gilda_ground once per longform and keep only the results whose first
# element is truthy, stored as '<first>:<second>'.
candidate_groundings = ((longform, gilda_ground(longform)) for longform in longforms)
grounding_map = {
    longform: f'{grounding[0]}:{grounding[1]}'
    for longform, grounding in candidate_groundings
    if grounding[0]
}

In [44]:
# Display the automatically proposed groundings before manual curation.
grounding_map

{'hexokinase 1': 'HGNC:4922', 'hexokinase': 'MESH:D006593'}

In [45]:
# Curate the groundings via ground_with_gui, seeded with the automatic proposals.
# NOTE(review): the result presumably depends on manual input in the GUI; a
# literal copy of the outcome is assigned in a later cell so the rest of the
# notebook can be reproduced from code alone.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map)

In [46]:
# Register the two additional genes whose Entrez-derived texts are added to the
# corpus: give each grounding a name and mark it as a positive label.
# The KCNA4 key previously carried a trailing space ('HGNC:6222 '), so it did
# not match the 'HGNC:6222' label used in pos_labels and in the corpus.
# The membership check keeps the cell idempotent when it is re-run.
for grounding_id, gene_name in (('HGNC:6222', 'KCNA4'), ('HGNC:19884', 'HOOK1')):
    names[grounding_id] = gene_name
    if grounding_id not in pos_labels:
        pos_labels.append(grounding_id)

In [47]:
# Bundle the curated grounding map, names and positive labels for display.
result = (grounding_map, names, pos_labels)

In [48]:
# Display the curation result.
result

({'hexokinase': 'HGNC:4922',
  'hexokinase 1': 'HGNC:4922',
  'histidine aspartate kinase': 'ungrounded',
  'npc cells': 'ungrounded',
  'npc cells line': 'ungrounded'},
 {'HGNC:4922': 'HK1', 'HGNC:6222 ': 'KCNA4', 'HGNC:19884': 'HOOK1'},
 ['HGNC:4922', 'HGNC:6222', 'HGNC:19884'])

In [64]:
# Hard-coded copy of the curation result, so the cells below do not depend on
# the interactive grounding step.

# Longform -> grounding; longforms without a grounding are marked 'ungrounded'.
grounding_map = {
    'hexokinase': 'HGNC:4922',
    'hexokinase 1': 'HGNC:4922',
    'histidine aspartate kinase': 'ungrounded',
    'npc cells': 'ungrounded',
    'npc cells line': 'ungrounded',
}

# Grounding -> name. The KCNA4 key previously had a trailing space
# ('HGNC:6222 ') and therefore did not match the 'HGNC:6222' label below.
names = {
    'HGNC:4922': 'HK1',
    'HGNC:6222': 'KCNA4',
    'HGNC:19884': 'HOOK1',
}

# Groundings passed to the classifier as positive labels.
pos_labels = ['HGNC:4922', 'HGNC:6222', 'HGNC:19884']

In [65]:
# Wrap the grounding map in a dict keyed by the shortform 'HK1'.
grounding_dict = {'HK1': grounding_map}

In [66]:
# Classifier for the shortform 'HK1', using the curated positive labels.
classifier = AdeftClassifier('HK1', pos_labels=pos_labels)

In [67]:
# Hyperparameter grid handed to classifier.cv below.
param_grid = {'C': [10.0, 100.0], 'max_features': [100, 1000, 10000]}

In [68]:
# Labeler built from the curated shortform -> grounding map.
labeler = AdeftLabeler(grounding_dict)

In [69]:
# Build the initial corpus from the shortform texts; it is extended with
# (text, label) pairs in the next cell.
corpus = labeler.build_from_texts(shortform_texts)

In [70]:
# Append the Entrez-derived texts to the corpus, each paired with the grounding
# label of the gene whose PMID list produced it.
entrez_texts_by_label = (
    ('HGNC:4922', HK1_entrez_texts),
    ('HGNC:6222', KCNA4_entrez_texts),
    ('HGNC:19884', HOOK1_entrez_texts),
)
corpus.extend(
    (text, label)
    for label, entrez_texts in entrez_texts_by_label
    for text in entrez_texts
)

In [71]:
# Split the (text, label) pairs into parallel tuples for training.
texts, labels = zip(*corpus)

In [72]:
# Grid search over param_grid (see the log output below), passing cv=5 and
# n_jobs=8 through to the classifier.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-17 15:49:23] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [10.0, 100.0], 'max_features': [100, 1000, 10000]}
INFO: [2019-12-17 15:50:27] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.8659681206190537 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [73]:
# Display the label distribution and the f1 / precision / recall statistics.
classifier.stats

{'label_distribution': {'HGNC:4922': 128,
  'ungrounded': 5,
  'HGNC:6222': 50,
  'HGNC:19884': 23},
 'f1': {'mean': 0.8496007582015285, 'std': 0.02149518324514752},
 'precision': {'mean': 0.8561127883393815, 'std': 0.024016486586897434},
 'recall': {'mean': 0.8609130706691683, 'std': 0.017801572857693276}}

In [74]:
# Combine the trained classifier, the grounding dict and the names mapping into
# a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [75]:
# Save the disambiguator under the name 'HK1' in ../results.
disamb.dump('HK1', '../results')

In [76]:
# Run the disambiguator over the shortform texts.
disambs = disamb.disambiguate(shortform_texts)

In [2]:
# Reload the saved 'HK1' disambiguator from ../results.
# NOTE(review): the execution counts restart here (In [2]), so this cell was
# presumably run in a fresh kernel after the model had been dumped.
disamb = load_disambiguator('HK1', path='../results')



In [3]:
# Confirm that the disambiguator object was loaded.
disamb

<adeft.disambiguate.AdeftDisambiguator at 0x141753190>

In [4]:
# Display the shortforms the loaded disambiguator covers.
disamb.shortforms

['HK1']

In [6]:
# Hand the loaded disambiguator to model_to_s3 (presumably an upload to S3 -
# confirm against adeft_indra.s3).
model_to_s3(disamb)