In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used further down to propose a grounding for each mined longform
# (via adeft_grounder.ground(longform)).
adeft_grounder = AdeftGrounder()

In [3]:
# Shortforms this model disambiguates.
shortforms = ['ODC']
# The model name is the filename-escaped shortforms, sorted and ':'-joined.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Results live two directories up, in results/<model_name>.
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [4]:
# Mine candidate longforms for every shortform from the texts mentioning it.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Subsample WITHOUT replacement. random.choices draws with
        # replacement, which yields duplicate pmids and therefore fewer than
        # 10000 distinct texts. NOTE: no random seed is set in this notebook,
        # so the subsample differs between runs.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Discard texts whose length is 5 or less.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Keep only the longforms whose count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# Sum longform counts across all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Pre-populate groundings with the first candidate returned by the grounder;
# longforms with no candidate are left out of grounding_map.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']
# Parallel tuples of longforms and counts, most frequent first.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [5]:
# Show each mined longform next to its count.
[(longform, count) for longform, count in zip(longforms, counts)]

[('ornithine decarboxylase', 1492),
 ('omithine decarboxylase', 18),
 ('ornithine decarboxylase activity', 18),
 ('oligodendrocytes', 9),
 ('ornithine decarboxylase ec 4 1 1 17', 9),
 ('ornithine decarboxy lase', 4),
 ('oxidative dehydrogenative carboxylation', 3),
 ('ornithine de carboxylase', 3),
 ('oxygen dissociation curve', 3),
 ('ornithine decar boxylase', 3)]

In [6]:
# Review the automatically proposed groundings in the adeft grounding GUI;
# the call returns the updated grounding map, names and positive labels.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

INFO: [2020-10-02 02:56:47] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [7]:
# Bundle the three grounding outputs so they can be displayed together.
result = [grounding_map, names, pos_labels]

In [8]:
# Display the grounding map, names and positive labels.
result

[{'oligodendrocytes': 'MESH:D009836',
  'omithine decarboxylase': 'HGNC:8109',
  'ornithine de carboxylase': 'HGNC:8109',
  'ornithine decar boxylase': 'HGNC:8109',
  'ornithine decarboxy lase': 'HGNC:8109',
  'ornithine decarboxylase': 'HGNC:8109',
  'ornithine decarboxylase activity': 'HGNC:8109',
  'ornithine decarboxylase ec 4 1 1 17': 'HGNC:8109',
  'oxidative dehydrogenative carboxylation': 'ungrounded',
  'oxygen dissociation curve': 'ungrounded'},
 {'MESH:D009836': 'Oligodendroglia', 'HGNC:8109': 'ODC1'},
 ['HGNC:8109', 'MESH:D009836']]

In [9]:
# Hard-coded copy of the grounding results displayed in the previous cell
# (presumably so the notebook can be re-run without the interactive GUI step).

# longform -> grounding ('ungrounded' marks longforms without a grounding)
grounding_map = {
    'oligodendrocytes': 'MESH:D009836',
    'omithine decarboxylase': 'HGNC:8109',
    'ornithine de carboxylase': 'HGNC:8109',
    'ornithine decar boxylase': 'HGNC:8109',
    'ornithine decarboxy lase': 'HGNC:8109',
    'ornithine decarboxylase': 'HGNC:8109',
    'ornithine decarboxylase activity': 'HGNC:8109',
    'ornithine decarboxylase ec 4 1 1 17': 'HGNC:8109',
    'oxidative dehydrogenative carboxylation': 'ungrounded',
    'oxygen dissociation curve': 'ungrounded',
}

# grounding -> standard name
names = {
    'MESH:D009836': 'Oligodendroglia',
    'HGNC:8109': 'ODC1',
}

# groundings treated as positive labels by the classifier
pos_labels = ['HGNC:8109', 'MESH:D009836']

In [10]:
# Longforms listed here are filtered out when grounding_dict is built;
# none are excluded for this model.
excluded_longforms = []

In [11]:
# For each shortform, map its mined longforms to their groundings, skipping
# longforms that have no grounding or that were explicitly excluded.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs(..., exist_ok=True) also creates a missing parent directory and
# avoids the check-then-create race of os.path.exists() + os.mkdir().
os.makedirs(results_path, exist_ok=True)
# Persist the preliminary grounding info as JSON in the results directory.
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [12]:
# Extra entities, keyed by 'NAMESPACE:ID' strings (the keys are split on ':'
# when pmids are looked up), for which additional texts are fetched.
# Empty for this model.
additional_entities = {}

In [13]:
# Maps an entity label to an iterable of agent texts; texts mentioning those
# agent texts are added to the corpus under that label. Empty for this model.
unambiguous_agent_texts = {}

In [14]:
# Label every mined text using the curated grounding_dict.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the ids of the labeled texts by the label they received.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# For each additional entity ('NAMESPACE:ID'), collect the pmids for which it
# is a major topic.
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in additional_entities
}

In [15]:
# Pairwise pmid overlap between all additional entities (including each
# entity with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [16]:
# pmid overlap between every corpus label and every additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [17]:
# Display the entity-vs-entity pmid overlaps.
# NOTE(review): additional_entities is empty in this notebook, so a fresh run
# should show an empty list here - confirm the saved output is not stale.
intersection1

[('HGNC:17981', 'HGNC:17981', 29)]

In [18]:
# Display the label-vs-entity pmid overlaps.
# NOTE(review): additional_entities is empty in this notebook, so a fresh run
# should show an empty list here - confirm the saved output is not stale.
intersection2

[('ungrounded', 'HGNC:17981', 0)]

In [20]:
# Extend the corpus with texts for agent texts that denote a single entity.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip pmids already in the mined texts or already fetched for this
        # entity, so that no text is added twice.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts for the additional entities.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Subsample WITHOUT replacement. random.choices draws with
        # replacement and would return duplicate pmids.
        new_pmids = random.sample(new_pmids, k=10000)
    # NOTE(review): this hard-coded synonym list does not relate to the 'ODC'
    # shortform and looks copied from a different model's notebook. It is
    # inert while additional_entities is empty, but must be replaced with
    # synonyms for the actual entity before any entity is added - confirm.
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=['RTCA', 'RTCD1', 'RPC', 'RTC1', 'RTC'])
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [None]:
# Register the additional entities in the names mapping. The variable is
# `additional_entities`; the previous spelling `additional_entitie` was an
# undefined name and raised NameError.
# Assumes additional_entities maps grounding -> name, like `names` - TODO confirm.
names.update(additional_entities)

In [15]:
%%capture

# Build the classifier for these shortforms; random_state is fixed so the
# cross-validation is repeatable.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
# Grid-search space; it contains a single parameter combination.
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split the corpus triples into parallel tuples of texts, labels and ids.
texts, labels, pmids = zip(*corpus)
# Run the cross-validated grid search (cv=5, 5 parallel jobs).
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-02 03:01:44] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-02 03:01:55] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9947229619751227 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [16]:
# Display the label distribution and cross-validation metrics.
classifier.stats

{'label_distribution': {'HGNC:8109': 1349, 'ungrounded': 4, 'MESH:D009836': 7},
 'f1': {'mean': 0.994723, 'std': 0.002896},
 'precision': {'mean': 0.992658, 'std': 0.004646},
 'recall': {'mean': 0.997051, 'std': 0.001475},
 'ungrounded': {'f1': {'mean': 0.2, 'std': 0.4},
  'pr': {'mean': 0.2, 'std': 0.4},
  'rc': {'mean': 0.2, 'std': 0.4}},
 'HGNC:8109': {'f1': {'mean': 0.997416, 'std': 0.001476},
  'pr': {'mean': 1.0, 'std': 0.0},
  'rc': {'mean': 0.99485, 'std': 0.00294}},
 'MESH:D009836': {'f1': {'mean': 0.333333, 'std': 0.421637},
  'pr': {'mean': 0.3, 'std': 0.4},
  'rc': {'mean': 0.4, 'std': 0.489898}}}

In [17]:
# Combine the trained classifier with the grounding dict and names into a
# disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [27]:
# Save the disambiguator under model_name in the results directory.
disamb.dump(model_name, results_path)

In [18]:
# Print the model summary: disambiguations produced and per-class metrics.
print(disamb.info())

Disambiguation model for ODC

Produces the disambiguations:
	ODC1*	HGNC:8109
	Oligodendroglia*	MESH:D009836

Class level metrics:
--------------------
Grounding      	Count	F1     
           ODC1*	1349	0.99742
Oligodendroglia*	   7	0.33333
     Ungrounded	   4	    0.2

Weighted Metrics:
-----------------
	F1 score:	0.99472
	Precision:	0.99266
	Recall:		0.99705

* Positive labels
See Docstring for explanation



In [28]:
# Upload the disambiguator with the project's S3 helper.
# NOTE(review): this has an external side effect - run it only once the
# metrics above have been reviewed.
model_to_s3(disamb)

In [19]:
# Disambiguate every mined text; predictions come back in the same order as
# all_texts.values().
preds = disamb.disambiguate(all_texts.values())

In [20]:
# Collect the texts whose predicted grounding (first element of each
# prediction) is not an HGNC identifier, for manual inspection.
other = []
for pred, text in zip(preds, all_texts.values()):
    if pred[0].startswith('HGNC'):
        continue
    other.append(text)

In [22]:
# Display the texts that were not predicted as an HGNC grounding.
# NOTE(review): this dumps full article texts; consider showing a slice.
other

['Respiration Physiology (1983) 51, 21-30 21  Elsevier Biomedical Press\nCONSEQUENCES OF AN ACUTE INCREASE IN P5o IN ANAESTHETIZED  GUINEA PIGS\nCLAUDE D. SOULARD, BERNARD P. TEISSEIRE, LILIANE J. TEISSEIRE  and\nROBERT A. HERIGAULT  Service de Physiologie et d\'Explorations\nFonctionnelles, and 1NSERM U. 138, Hdpital Henri Mondor,  94010\nCreteil, France  Abstract. In anaesthetized guinea pigs, ventilated\nwith ambient air, the peripheral haemodynamics and  oxygen transport\ncharacteristics have been studied following a blood exchange\ntransfusion with rat  erytrocytes suspended in guinea pig plasma.\nSince the rat haemoglobin exhibited a lower oxygen affinity  than\nguinea pig haemoglobin, the oxygen partial pressure at 50% of oxygen\nhaemoglobin saturation  (Ps0) increased from 25.2 _+ 1.1 to 37.2 _+\n0.9 mm Hg (n = 10). This increase in Ps0 was accompanied  by a\nsignificant increase in arterial oxygen partial pressure (Pao2) and in\narterio-venous difference  (AVDo2). Cardiac outp

In [27]:
# Inspect one of the non-HGNC texts.
other[4]

"Following inflammatory insults to white matter of the central nervous system (CNS), frequently myelin is poorly repaired in both humans and animals. This deficit in remyelination contributes to axonal dysfunction and associated neurological disability in multiple sclerosis (MS), the most common syndrome of inflammatory demyelination in humans. In MS and similar diseases, the inflammatory response is associated with damage of oligodendrocytes (ODC) and myelin (reviewed in  Hunter and Rodriguez, 1995 ). A number of histologically similar diseases of animals are of viral origin. One naturally occurring picornavirus, Theiler's murine encephalomyelitis virus (TMEV), causes a slowly progressive myelopathy associated with inflammatory demyelinating lesions in spinal cord and brainstem (susceptibility and pathogenesis reviewed in  Rodriguez et al., 1987b ). This well-characterized mouse model of progressive MS has been useful in the study of immunological processes underlying CNS demyelinatio