In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used further down to propose a grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) this disambiguation model is built for.
shortforms = ['NOP']
# Model name: filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of this model's results directory (two levels above the CWD).
results_path = os.path.abspath(os.path.join('../..', 'results', model_name))

In [68]:
# Mine candidate longforms for each shortform from the texts that mention it,
# then propose an initial grounding for every longform that survives filtering.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Downsample WITHOUT replacement. random.choices draws with
        # replacement, which yields duplicate PMIDs and therefore fewer
        # than 10000 distinct texts.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Discard empty / near-empty texts (5 characters or fewer).
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Keep only longforms whose count * score exceeds 1.
longform_dict = {}
for shortform in shortforms:
    longform_dict[shortform] = [
        (longform, count, score)
        for longform, count, score in miners[shortform].get_longforms()
        if count*score > 1
    ]

# Sum the counts of each longform across all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Seed the grounding map with the grounder's first suggestion, when it has one.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Parallel tuples of longforms and counts, most common first. Guard the
# unpacking: zip(*[]) yields nothing and would raise ValueError when no
# longform passed the filter above.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()
pos_labels = []

In [70]:
# Show each mined longform next to its count (most common first).
list(zip(longforms, counts))

[('n ofq peptide', 20),
 ('nociceptin orphanin fq peptide', 20),
 ('n ofq receptor', 15),
 ('nociceptin', 14),
 ('nociceptin opioid peptide', 12),
 ('n ofq peptide receptor', 11),
 ('nociceptin orphanin fq', 11),
 ('nitrooxypropanol', 10),
 ('non otitis prone', 7),
 ('n ofq', 7),
 ('nociceptin opioid', 5),
 ('nociceptin opioid receptor', 5),
 ('nitro o phenylenediamine', 4),
 ('nociceptin orphanin fq opioid peptide', 4),
 ('nociceptin orphanin fq receptor', 4),
 ('nonsurgical organ preservation modalities', 3),
 ('nociceptive opioid peptide receptor', 3),
 ('nociceptin orphanin fq peptide receptor', 3),
 ('neurite outgrowth promoting', 2),
 ('no production', 2),
 ('novel object preference', 2),
 ('nociceptin opioid peptide receptor', 2)]

In [73]:
# Manual curation step: review the proposed groundings in the adeft GUI.
# The call returns the curated grounding map, names and positive labels.
# NOTE(review): no_browser=True / port=8890 presumably serve the GUI on that
# port without opening a local browser -- confirm against the adeft docs.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

In [74]:
# Bundle the curated grounding map, names and positive labels into one list.
result = [grounding_map, names, pos_labels]

In [75]:
# Display the curation result.
result

[{'n ofq': 'HGNC:8155',
  'n ofq peptide': 'HGNC:8155',
  'n ofq peptide receptor': 'HGNC:8155',
  'n ofq receptor': 'HGNC:8155',
  'neurite outgrowth promoting': 'ungrounded',
  'nitro o phenylenediamine': 'PUBCHEM:5111791',
  'nitrooxypropanol': 'PUBCHEM:10011893',
  'no production': 'ungrounded',
  'nociceptin': 'HGNC:8155',
  'nociceptin opioid': 'HGNC:8155',
  'nociceptin opioid peptide': 'HGNC:8155',
  'nociceptin opioid peptide receptor': 'HGNC:8155',
  'nociceptin opioid receptor': 'HGNC:8155',
  'nociceptin orphanin fq': 'HGNC:8155',
  'nociceptin orphanin fq opioid peptide': 'HGNC:8155',
  'nociceptin orphanin fq peptide': 'HGNC:8155',
  'nociceptin orphanin fq peptide receptor': 'HGNC:8155',
  'nociceptin orphanin fq receptor': 'HGNC:8155',
  'nociceptive opioid peptide receptor': 'HGNC:8155',
  'non otitis prone': 'ungrounded',
  'nonsurgical organ preservation modalities': 'ungrounded',
  'novel object preference': 'ungrounded'},
 {'HGNC:8155': 'OPRL1',
  'PUBCHEM:5111791'

In [76]:
# Hard-coded curation result, so the notebook can be re-run without repeating
# the interactive grounding step.

# Longform -> grounding; 'ungrounded' marks longforms with no assigned entity.
grounding_map = {
    'n ofq': 'HGNC:8155',
    'n ofq peptide': 'HGNC:8155',
    'n ofq peptide receptor': 'HGNC:8155',
    'n ofq receptor': 'HGNC:8155',
    'neurite outgrowth promoting': 'ungrounded',
    'nitro o phenylenediamine': 'PUBCHEM:5111791',
    'nitrooxypropanol': 'PUBCHEM:10011893',
    'no production': 'ungrounded',
    'nociceptin': 'HGNC:8155',
    'nociceptin opioid': 'HGNC:8155',
    'nociceptin opioid peptide': 'HGNC:8155',
    'nociceptin opioid peptide receptor': 'HGNC:8155',
    'nociceptin opioid receptor': 'HGNC:8155',
    'nociceptin orphanin fq': 'HGNC:8155',
    'nociceptin orphanin fq opioid peptide': 'HGNC:8155',
    'nociceptin orphanin fq peptide': 'HGNC:8155',
    'nociceptin orphanin fq peptide receptor': 'HGNC:8155',
    'nociceptin orphanin fq receptor': 'HGNC:8155',
    'nociceptive opioid peptide receptor': 'HGNC:8155',
    'non otitis prone': 'ungrounded',
    'nonsurgical organ preservation modalities': 'ungrounded',
    'novel object preference': 'ungrounded',
}

# Grounding -> standard name.
names = {
    'HGNC:8155': 'OPRL1',
    'PUBCHEM:5111791': '4-Nitro-o-phenylenediamine',
    'PUBCHEM:10011893': '3-Hydroxypropyl Nitrate',
}

# Groundings treated as positive labels when scoring the classifier.
pos_labels = ['HGNC:8155']

In [77]:
# Longforms to leave out of the grounding dict built in the next cell
# (none are excluded for this model).
excluded_longforms = []

In [78]:
# Per-shortform mapping of longform -> grounding, restricted to longforms
# that were grounded and not explicitly excluded. The inner loop variable is
# named longform_rows so it does not shadow the global `longforms` tuple.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in longform_rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs creates any missing parent (e.g. the `results` directory itself),
# and exist_ok avoids the check-then-create race that os.mkdir had.
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [79]:
# Extra entities to add as labels, keyed by 'NAMESPACE:ID' grounding (the keys
# are split on ':' below). Empty for this model.
# NOTE(review): values are presumably standard names, since this dict is later
# merged into `names` -- confirm.
additional_entities = {}

In [80]:
# Entity -> list of agent texts; texts fetched for those agent texts are later
# added to the corpus labeled with that entity. Empty for this model.
unambiguous_agent_texts = {}

In [81]:
# Label the collected texts using the curated grounding dict.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the ids of the labeled texts by the label they received.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    pmid_list = agent_text_pmid_map[label]
    pmid_list.append(id_)

# For every additional entity ('NAMESPACE:ID'), collect the set of PMIDs
# returned for it with major_topic=True.
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in additional_entities
}

In [82]:
# Size of the PMID overlap for every ordered pair of additional entities
# (self-pairs included).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [83]:
# Size of the PMID overlap between every corpus label and every additional
# entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [84]:
# Inspect entity-vs-entity PMID overlaps.
intersection1

[]

In [85]:
# Inspect label-vs-entity PMID overlaps.
intersection2

[]

In [31]:
# Extend the corpus with texts for unambiguous agent texts, then with texts
# for the additional entities, skipping PMIDs that were already used.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Only fetch PMIDs not already in the mined texts or used for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Downsample WITHOUT replacement. random.choices draws with
        # replacement and would put duplicate PMIDs in the sample.
        new_pmids = random.sample(new_pmids, k=10000)
    # NOTE(review): these search terms (NOL3, CARD2, MYP, NOP30, nucleolar) do
    # not correspond to any grounding curated for NOP in this notebook; they
    # look like a leftover from another model and are applied to every
    # additional entity -- confirm before adding entries to additional_entities.
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=['NOL3', 'CARD2', 'MYP', 'NOP30',
                                                             'nucleolar'])
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [34]:
# Register the additional entities as named groundings and positive labels.
names.update(additional_entities)
# Only append labels that are not already present, so re-running this cell
# does not accumulate duplicate positive labels.
pos_labels.extend([entity for entity in additional_entities
                   if entity not in pos_labels])

In [86]:
%%capture

# Train the disambiguation classifier on the labeled corpus.
# random_state is fixed at 1729 for the classifier.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
# Single-point parameter grid: one value each for C and max_features.
param_grid = {'C': [100.0], 'max_features': [10000]}
# corpus rows are (text, label, pmid) triples; split them into parallel tuples.
texts, labels, pmids = zip(*corpus)
# Cross-validated grid search over param_grid with cv=5.
# NOTE(review): n_jobs=5 presumably sets the number of parallel workers -- confirm.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-06 01:33:35] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-06 01:33:37] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.957026713124274 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [41]:
# Cross-validation statistics of the trained classifier.
# NOTE(review): the saved output of this cell has an older execution count
# than the training cell and lists a label (HGNC:7869) that is not in the
# current grounding map -- it looks stale; re-run before relying on it.
classifier.stats

{'label_distribution': {'PUBCHEM:5111791': 4,
  'HGNC:8155': 99,
  'ungrounded': 2,
  'PUBCHEM:10011893': 3,
  'HGNC:7869': 50},
 'f1': {'mean': 0.977196, 'std': 0.013396},
 'precision': {'mean': 0.962244, 'std': 0.013674},
 'recall': {'mean': 0.993333, 'std': 0.013333},
 'HGNC:8155': {'f1': {'mean': 0.990244, 'std': 0.011949},
  'pr': {'mean': 1.0, 'std': 0.0},
  'rc': {'mean': 0.980952, 'std': 0.023328}},
 'PUBCHEM:10011893': {'f1': {'mean': 0.0, 'std': 0.0},
  'pr': {'mean': 0.0, 'std': 0.0},
  'rc': {'mean': 0.0, 'std': 0.0}},
 'HGNC:7869': {'f1': {'mean': 0.951429, 'std': 0.031644},
  'pr': {'mean': 0.98, 'std': 0.04},
  'rc': {'mean': 0.925455, 'std': 0.037439}},
 'PUBCHEM:5111791': {'f1': {'mean': 0.8, 'std': 0.4},
  'pr': {'mean': 0.8, 'std': 0.4},
  'rc': {'mean': 0.8, 'std': 0.4}},
 'ungrounded': {'f1': {'mean': 0.0, 'std': 0.0},
  'pr': {'mean': 0.0, 'std': 0.0},
  'rc': {'mean': 0.0, 'std': 0.0}}}

In [87]:
# Combine the trained classifier with the grounding dict and names into a
# disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [27]:
# Write the disambiguator to results_path under the model name.
# NOTE(review): this cell's execution count predates the cell that builds
# `disamb`; re-run it so the dumped model matches the current one.
disamb.dump(model_name, results_path)

In [88]:
# Print the model summary (disambiguations produced and class-level metrics).
print(disamb.info())

Disambiguation model for NOP

Produces the disambiguations:
	3-Hydroxypropyl Nitrate	PUBCHEM:10011893
	4-Nitro-o-phenylenediamine	PUBCHEM:5111791
	OPRL1*	HGNC:8155

Class level metrics:
--------------------
Grounding                 	Count	F1     
                     OPRL1*	100	0.95703
                Ungrounded	  6	    0.0
4-Nitro-o-phenylenediamine	  4	    0.8
   3-Hydroxypropyl Nitrate	  3	    0.0

Weighted Metrics:
-----------------
	F1 score:	0.95703
	Precision:	0.91775
	Recall:		1.0

* Positive labels
See Docstring for explanation



In [92]:
# Publish the model. NOTE(review): presumably uploads to S3 and needs
# credentials / network access -- confirm before a full re-run.
model_to_s3(disamb)

In [89]:
# Texts whose top disambiguation is anything other than HGNC:8155 (OPRL1),
# collected for manual inspection.
other_texts = [
    document
    for document in all_texts.values()
    if disamb.disambiguate(document)[0] != 'HGNC:8155'
]

In [91]:
# Inspect the texts not disambiguated to HGNC:8155.
other_texts

['Chlorophyllin, the sodium and copper salt of chlorophyll, chlorophyll a, and chlorophyll b were tested for their ability to inhibit the mutagenic activity of the direct-acting mutagen 4-nitro-o-phenylenediamine (NOP) and its plant-activated mutagenic enhancement. All three forms of chlorophyll were antimutagenic against both NOP and its plant-activated product, with chlorophyllin proving most effective. Chlorophyll-containing plant extracts, however, proved very efficient at activating NOP into a mutagen of greater potency. When these extracts were assayed for total chlorophyll content it was found that they contained far less chlorophyll than was required for an antimutagenic effect to occur. Thus, the balance between chemical mutagen activation and/or enhancement by chlorophyll-containing plant extracts and the potential antimutagenicity of these plant extracts is a function of chlorophyll concentration. The data presented here indicate that this balance must be taken into consider

In [67]:
# NOTE(review): NOL3_texts is not defined in any cell visible in this
# notebook, so this cell depends on leftover kernel state and would raise
# NameError after Restart & Run All -- define it or remove the cell.
NOL3_texts[11]

'Deducing biological function often relies on interpreting the effects of gene knockouts or targeted mutations reducing catalytic activity of enzymes. In the case of RNA-modifying enzymes with multiple substrates, it is unclear how to assign mutant phenotypes to loss of modification in specific RNA species. For instance, the NOP/Sun2RNA methyltransferase family member 2 (NSUN2) targets the majority of nuclear-encoded tRNAs, but also other cRNAs and ncRNAs [78,88]. A genetic knock-out in mice caused a variety of phenotypes ranging from impaired cellular differentiation to sex-specific infertility [89,90], while human individuals with congenital mutations in a splice-acceptor site of the NSUN2 gene display pleiotropic phenotypes defined by intellectual disability and skin differentiation defects [91,92]. Which methylation site(s) in which RNA substrate(s) cause(s) these phenotypes? While NSUN2 is a robust tRNA methyltransferase on (many) tRNAs, it is unclear which non-modified tRNA speci

In [64]:
# Number of texts collected for the shortform(s).
len(all_texts)

209