In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used later in the notebook to propose groundings for mined longforms.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortforms this notebook builds a disambiguation model for.
shortforms = ['RTCA']
# Model name: the filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of this model's output directory under ../../results.
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [24]:
# Step 1: collect texts for each shortform and mine candidate longforms from them.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Sample WITHOUT replacement. random.choices draws with replacement, so
        # the capped list could contain the same pmid several times and cover
        # fewer than 10000 distinct articles.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop near-empty texts.
    # NOTE(review): the corpus-extension cell filters with `len(text) >= 5`;
    # confirm which threshold is intended.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Step 2: keep only longforms whose count * score exceeds 0.5.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 0.5]
    longform_dict[shortform] = longforms

# Step 3: merge the per-shortform longform counts into a single counter.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Step 4: propose a grounding for each longform, taking the grounder's first result.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Longforms and counts as parallel tuples, most frequent first. Guard the empty
# case: zip(*[]) yields nothing and the two-name unpacking would raise ValueError.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()
pos_labels = []

In [25]:
# Show each mined longform with its count, most frequent first.
list(zip(longforms, counts))

[('real time cell analysis', 8),
 ('real time cell analyser', 4),
 ('real time cell analyzer', 4),
 ('real time cellular analysis', 2),
 ('real time cellular assay', 1),
 ('reductive tca', 1),
 ('reverse tca', 1),
 ('real time cell proliferation assay', 1),
 ('realtime cell monitored', 1),
 ('nuclear receptor modulator on cell index curves', 1)]

In [27]:
# Pass the mined longforms and the automatically proposed groundings to adeft's
# grounding GUI (served on port 8891, without opening a browser). The returned
# values overwrite grounding_map, names and pos_labels.
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8891)

INFO: [2020-10-30 01:56:25] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.4/bio_ontology.pkl


In [28]:
# Bundle the grounding map, names and positive labels into one list for display.
result = [grounding_map, names, pos_labels]

In [29]:
# Display the bundled grounding results.
result

[{'nuclear receptor modulator on cell index curves': 'ungrounded',
  'real time cell analyser': 'ungrounded',
  'real time cell analysis': 'ungrounded',
  'real time cell analyzer': 'ungrounded',
  'real time cell proliferation assay': 'ungrounded',
  'real time cellular analysis': 'ungrounded',
  'real time cellular assay': 'ungrounded',
  'realtime cell monitored': 'ungrounded',
  'reductive tca': 'ungrounded',
  'reverse tca': 'ungrounded'},
 {},
 []]

In [77]:
# Hard-coded grounding results (identical to the values displayed above), so
# grounding_map, names and pos_labels can be set directly from this cell.
# Every mined longform is marked 'ungrounded'; there are no names and no
# positive labels at this stage.
grounding_map = dict.fromkeys(
    [
        'nuclear receptor modulator on cell index curves',
        'real time cell analyser',
        'real time cell analysis',
        'real time cell analyzer',
        'real time cell proliferation assay',
        'real time cellular analysis',
        'real time cellular assay',
        'realtime cell monitored',
        'reductive tca',
        'reverse tca',
    ],
    'ungrounded',
)
names = {}
pos_labels = []

In [78]:
# Longforms to leave out when the grounding dictionary is built below (none here).
excluded_longforms = []

In [79]:
# For each shortform, keep the groundings of its mined longforms, skipping any
# longform with no entry in grounding_map or listed in excluded_longforms.
# The inner loop variable is named longform_rows so it does not shadow the
# notebook-level `longforms` tuple.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longform_rows
                              if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longform_rows in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs also creates missing parent directories (e.g. a fresh checkout with
# no `results` folder), and exist_ok=True removes the check-then-create race.
os.makedirs(results_path, exist_ok=True)
# Save the preliminary grounding information next to the model outputs.
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [80]:
# Extra entities to add training texts for. Each value is
# [name, list of strings passed as `contains` when fetching that entity's texts];
# the name is later copied into `names`.
additional_entities = {'HGNC:17981': ['RTCA', ['cyclase']]}

In [81]:
# Maps a grounding to [name, list of agent texts]; read by the corpus-extension
# cell and when updating `names`/`pos_labels`. Empty for this model.
unambiguous_agent_texts = {}

In [82]:
# Label the mined texts using the grounding dictionary; each corpus entry is
# unpacked below as (text, label, id).
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the ids of the labelled texts by their label.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# For every additional entity, fetch the set of PMIDs returned for it with
# major_topic=True; the key 'NAMESPACE:ID' is split on the first ':'.
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in additional_entities
}

In [83]:
# PMID overlap for every ordered pair of additional entities (each entity is
# also paired with itself, which gives its total PMID count).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [84]:
# PMID overlap between each label found in the mined corpus and each
# additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [85]:
# Display the entity-vs-entity PMID overlaps.
intersection1

[('HGNC:17981', 'HGNC:17981', 29)]

In [86]:
# Display the corpus-label-vs-entity PMID overlaps.
intersection2

[('ungrounded', 'HGNC:17981', 0)]

In [87]:
# Extend the labelled corpus with extra texts, never reusing a PMID that is
# already in all_texts or was already pulled in by this cell.

# Part 1: texts for unambiguous agent texts, labelled with their entity.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    # agent_texts is [name, list of agent texts]; only index 1 holds search strings.
    for agent_text in agent_texts[1]:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        # Pass the flat list of agent text strings. The original passed the
        # whole [name, [texts]] pair, unlike every other `contains=` call in
        # this notebook, which passes a flat list of strings.
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts[1])
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Part 2: texts for the additional entities, capped at 10000 PMIDs per entity.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample WITHOUT replacement; random.choices could return the same
        # pmid more than once.
        new_pmids = random.sample(new_pmids, k=10000)
    # Each additional_entities value is [name, contains]; only `contains` is needed here.
    _, contains = additional_entities[entity]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [88]:
# Register the display name (first element of each value) of every additional
# entity and every unambiguous-agent-text entity.
names.update({key: value[0] for key, value in additional_entities.items()})
names.update({key: value[0] for key, value in unambiguous_agent_texts.items()})
# Positive labels: existing ones plus all newly added entities. sorted() gives
# a deterministic order; list(set(...)) can change between kernels because
# string hashing is randomised per process.
pos_labels = sorted(set(pos_labels) | additional_entities.keys() |
                    unambiguous_agent_texts.keys())

In [89]:
%%capture

# Train the disambiguation classifier with cross-validation over a
# single-point parameter grid (C=100.0, max_features=10000), using 5 folds
# and 5 parallel jobs. random_state is fixed at 1729.
# NOTE(review): the label distribution reported by `classifier.stats` shows
# only 4 'HGNC:17981' examples, fewer than the 5 folds requested here.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split the corpus triples into parallel tuples. This rebinds the
# notebook-level names `texts`, `labels` and `pmids`.
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-30 02:08:28] /adeft/PP/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-30 02:08:29] /adeft/PP/adeft/adeft/modeling/classify.py - Best f1 score of 0.8 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [90]:
# Display the cross-validation statistics recorded on the classifier.
classifier.stats

{'label_distribution': {'ungrounded': 13, 'HGNC:17981': 4},
 'f1': {'mean': 0.8, 'std': 0.4},
 'precision': {'mean': 0.8, 'std': 0.4},
 'recall': {'mean': 0.8, 'std': 0.4},
 'HGNC:17981': {'f1': {'mean': 0.8, 'std': 0.4},
  'pr': {'mean': 0.8, 'std': 0.4},
  'rc': {'mean': 0.8, 'std': 0.4}},
 'ungrounded': {'f1': {'mean': 1.0, 'std': 0.0},
  'pr': {'mean': 1.0, 'std': 0.0},
  'rc': {'mean': 1.0, 'std': 0.0}}}

In [91]:
# Build the disambiguator from the trained classifier, the grounding dictionary and the names.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [92]:
# Write the disambiguator under results_path using the model name.
disamb.dump(model_name, results_path)

In [93]:
# Print the disambiguator's summary (disambiguations produced and metrics).
print(disamb.info())

Disambiguation model for RTCA

Produces the disambiguations:
	RTCA*	HGNC:17981

Class level metrics:
--------------------
Grounding	Count	F1     
Ungrounded	13	    1.0
RTCA*	 4	    0.8

Weighted Metrics:
-----------------
	F1 score:	0.8
	Precision:	0.8
	Recall:		0.8

* Positive labels
See Docstring for explanation



In [119]:
# Hand the finished disambiguator to adeft_indra's model_to_s3 helper.
# NOTE(review): presumably uploads the model to S3 — confirm against the helper.
model_to_s3(disamb)

In [95]:
# Run the disambiguator on every mined text, in all_texts iteration order.
preds = [disamb.disambiguate(text) for text in all_texts.values()]

In [97]:
# Keep only the mined texts whose prediction starts with 'HGNC:17981'
# (the first element of each prediction is compared to that grounding).
texts = [
    text
    for text, pred in zip(all_texts.values(), preds)
    if pred[0] == 'HGNC:17981'
]

In [118]:
# Inspect the second text that the model predicted as HGNC:17981.
# The original cell was the unterminated expression `a[1` (a SyntaxError), and
# `a` is not assigned until a later cell, so a top-to-bottom run has no `a` here.
# NOTE(review): assumes `a` held the filtered `texts` list built in the cell
# above when this output was produced — confirm.
texts[1]

'Etoposide (ETP) and anthracyclines are applied for wide anti-cancer treatments. However, the ETP-induced cardiotoxicity remains to be a major safety issue and the underlying cardiotoxic mechanisms are not well understood. This study is aiming to unravel the cardiotoxicity profile of ETP in comparison to anthracyclines using physiologically relevant human pluripotent stem cell-derived cardiomyocytes (hPSC-CMs). Using xCELLigence real-time cell analyser (RTCA), we found that single high dose of ETP induces irreversible increase in hPSC-CMs beating rate and decrease in beating amplitude. We also identified 58 deregulated genes consisting of 33 upregulated and 25 downregulated genes in hPSC-CMs after ETP treatment. Gene ontology (GO) and pathway analysis showed that most upregulated genes are enriched in GO categories like positive regulation of apoptotic process, regulation of cell death, and mitochondria organization, whereas most downregulated genes were enriched in GO categories like 

In [58]:
# Fetch the PMIDs for HGNC:17981. Unlike the earlier call that built
# entity_pmid_map, no major_topic argument is passed here.
pmids = get_pmids_for_entity('HGNC', '17981')

In [59]:
# Fetch plaintexts for those PMIDs with no `contains` filter. This rebinds
# `texts`, replacing the list of predicted-positive texts built earlier.
texts = get_plaintexts_for_pmids(pmids)

In [60]:
# Number of texts retrieved for the entity.
len(texts)

29

In [61]:
# Materialise the retrieved texts as a list so they can be inspected by position.
a = list(texts.values())

In [63]:
# Inspect the second retrieved text.
a[1]

'Large-scale sequencing of cDNAs randomly picked from libraries has proven to be a very powerful approach to discover (putatively) expressed sequences that, in turn, once mapped, may greatly expedite the process involved in the identification and cloning of human disease genes. However, the integrity of the data and the pace at which novel sequences can be identified depends to a great extent on the cDNA libraries that are used. Because altogether, in a typical cell, the mRNAs of the prevalent and intermediate frequency classes comprise as much as 50-65% of the total mRNA mass, but represent no more than 1000-2000 different mRNAs, redundant identification of mRNAs of these two frequency classes is destined to become overwhelming relatively early in any such random gene discovery programs, thus seriously compromising their cost-effectiveness. With the goal of facilitating such efforts, previously we developed a method to construct directionally cloned normalized cDNA libraries and appli

In [64]:
# Inspect the first retrieved text.
a[0]

"RNA 3'-terminal phosphate cyclase from HeLa cells.\n"

In [70]:
# Inspect the seventh retrieved text.
a[6]

"Large-scale mapping of human protein–protein interactions by mass spectrometry\nMapping protein–protein interactions is an invaluable tool for understanding protein function. Here, we report the first large-scale study of protein–protein interactions in human cells using a mass spectrometry-based approach. The study maps protein interactions for 338 bait proteins that were selected based on known or suspected disease and functional associations. Large-scale immunoprecipitation of Flag-tagged versions of these proteins followed by LC-ESI-MS/MS analysis resulted in the identification of 24 540 potential protein interactions. False positives and redundant hits were filtered out using empirical criteria and a calculated interaction confidence score, producing a data set of 6463 interactions between 2235 distinct proteins. This data set was further cross-validated using previously published and predicted human protein interactions. In-depth mining of the data set shows that it represents a

In [71]:
from indra.literature.pubmed_client import get_ids_for_gene

In [72]:
# Look up PubMed ids for the gene symbol 'RTCA' through INDRA's pubmed client.
# This rebinds `pmids`.
pmids = get_ids_for_gene('RTCA')

In [76]:
# Inspect the third id returned for the gene.
pmids[2]

'19690099'