In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder instance used further down to propose a grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [4]:
# Shortform(s) this model disambiguates; the cells below are keyed off this list.
# NOTE(review): the hard-coded groundings and the printed model summary further
# down in this notebook refer to FP/FPs rather than NS1 -- confirm which
# shortforms are intended before re-running from a fresh kernel.
shortforms = ['NS1']

# Model name: the filename-escaped shortforms, sorted and joined with ':'.
escaped_shortforms = sorted(map(escape_filename, shortforms))
model_name = ':'.join(escaped_shortforms)

# Absolute path of this model's results directory (resolved from '../../results').
relative_results_dir = os.path.join('../..', 'results', model_name)
results_path = os.path.abspath(relative_results_dir)

In [5]:
# For every shortform: look up the PMIDs returned for that agent text, pull
# their plaintexts, drop texts of five characters or fewer, and feed the rest
# to a fresh miner. All retained texts are pooled in all_texts (pmid -> text).
miners = {}
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    fetched_texts = get_plaintexts_for_pmids(pmids, contains=shortforms)
    text_dict = {pmid: text for pmid, text in fetched_texts.items() if len(text) > 5}
    miner = miners[shortform] = AdeftMiner(shortform)
    miner.process_texts(text_dict.values())
    all_texts.update(text_dict)

# Per shortform, keep only the (longform, count, score) rows with count*score > 2.
longform_dict = {
    shortform: [(longform, count, score)
                for longform, count, score in miners[shortform].get_longforms()
                if count*score > 2]
    for shortform in shortforms
}

# Add up each longform's count across all shortforms.
combined_longforms = Counter()
for rows in longform_dict.values():
    combined_longforms.update(dict((longform, count) for longform, count, _ in rows))

# Seed the grounding map with the first hit returned by the grounder, if any;
# longforms with no hit are simply left out of grounding_map.
grounding_map = {}
names = {}
for longform in combined_longforms:
    hits = adeft_grounder.ground(longform)
    if not hits:
        continue
    top_hit = hits[0]
    grounding = top_hit['grounding']
    grounding_map[longform] = grounding
    names[grounding] = top_hit['name']

# Parallel tuples of longforms and counts, most frequent first.
# Note: the unpacking raises ValueError when no longform passed the filter above.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [8]:
# Hand the mined longforms and the automatic groundings to the grounding GUI
# and take back the (grounding_map, names, pos_labels) triple it returns.
# The GUI is served on port 8890 without opening a browser window.
curated = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)
grounding_map, names, pos_labels = curated

INFO: [2020-09-18 22:52:39] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


KeyboardInterrupt: 

In [7]:
# Bundle the current grounding map, names and positive labels into one list for display.
result = [grounding_map, names, pos_labels]

In [10]:
# Display the [grounding_map, names, pos_labels] bundle.
result

[{'cp fine': 'MESH:D052638',
  'f poae': 'ungrounded',
  'f prostanoid': 'CHEBI:CHEBI:26347',
  'f series prostanoid': 'CHEBI:CHEBI:26347',
  'fabry perot': 'ungrounded',
  'fabry pérot': 'ungrounded',
  'faecal protease': 'ungrounded',
  'faecalibacterium prausnitzii': 'MESH:D000070037',
  'false positive': 'MESH:D005189',
  'false positives': 'MESH:D005189',
  'family physicians': 'MESH:D010821',
  'family planning': 'MESH:D005193',
  'farnesyl phosphate': 'CHEBI:CHEBI:24018',
  'fast pathway': 'CHEBI:CHEBI:34922',
  'fat pad': 'MESH:D000273',
  'fat percentage': 'ungrounded',
  'fatty pancreas': 'ungrounded',
  'female protein': 'ungrounded',
  'fenpropimorph': 'CHEBI:CHEBI:50145',
  'fenugreek powder': 'ungrounded',
  'fermentation production': 'ungrounded',
  'fertility preservation': 'MESH:D059247',
  'fetal pancreas': 'ungrounded',
  'few polyhedra': 'ungrounded',
  'fgfb pacap': 'ungrounded',
  'fiber protein': 'MESH:D012596',
  'field potential': 'field_potential',
  'field po

In [28]:
# Hard-coded curation result, unpacked into the three names used by the rest of
# the notebook (this overwrites whatever the grounding cells above produced):
#   grounding_map: longform -> grounding ID, or 'ungrounded'
#   names:         grounding ID -> display name
#   pos_labels:    grounding IDs passed to the classifier as pos_labels
# NOTE(review): these longforms all expand FP/FPs, while the shortforms cell
# near the top currently lists 'NS1' -- confirm the two are meant to match.
grounding_map, names, pos_labels = [{'cp fine': 'MESH:D052638',
  'f poae': 'ungrounded',
  'f prostanoid': 'CHEBI:CHEBI:26347',
  'f series prostanoid': 'CHEBI:CHEBI:26347',
  'fabry perot': 'ungrounded',
  'fabry pérot': 'ungrounded',
  'faecal protease': 'ungrounded',
  'faecalibacterium prausnitzii': 'MESH:D000070037',
  'false positive': 'MESH:D005189',
  'false positives': 'MESH:D005189',
  'family physicians': 'MESH:D010821',
  'family planning': 'MESH:D005193',
  'farnesyl phosphate': 'CHEBI:CHEBI:24018',
  'fast pathway': 'CHEBI:CHEBI:34922',
  'fat pad': 'MESH:D000273',
  'fat percentage': 'ungrounded',
  'fatty pancreas': 'ungrounded',
  'female protein': 'ungrounded',
  'fenpropimorph': 'CHEBI:CHEBI:50145',
  'fenugreek powder': 'ungrounded',
  'fermentation production': 'ungrounded',
  'fertility preservation': 'MESH:D059247',
  'fetal pancreas': 'ungrounded',
  'few polyhedra': 'ungrounded',
  'fgfb pacap': 'ungrounded',
  'fiber protein': 'MESH:D012596',
  'field potential': 'field_potential',
  'field potentials': 'field_potential',
  'filamentous processes': 'ungrounded',
  'filter paper': 'ungrounded',
  'fine particles': 'MESH:D052638',
  'fipronil': 'CHEBI:CHEBI:5063',
  'first progression': 'ungrounded',
  'fish peptide': 'ungrounded',
  'fixed point': 'ungrounded',
  'flagellar pocket': 'GO:GO:0020016',
  'flavonoids and phenolic acid': 'ungrounded',
  'flavopiridol': 'CHEBI:CHEBI:47344',
  'flavoprotein': 'CHEBI:CHEBI:5086',
  'floor plate': 'floor_plate',
  'flow probe': 'ungrounded',
  'flow proneness': 'ungrounded',
  'flowering period': 'ungrounded',
  'fluid percussion': 'fluid_percussion',
  'fluid pressure': 'ungrounded',
  'fluorescence polarization': 'MESH:D005454',
  'fluorescence protein': 'MESH:D008164',
  'fluorophosphonate': 'CHEBI:CHEBI:42699',
  'fluoropyrimidine': 'PUBCHEM:141643',
  'fluphenazine': 'CHEBI:CHEBI:5123',
  'flurbiprofen': 'CHEBI:CHEBI:5130',
  'fluticasone': 'CHEBI:CHEBI:5134',
  'fluticasone propionate': 'CHEBI:CHEBI:31441',
  'focused pulsed': 'ungrounded',
  'follicular phase': 'MESH:D005498',
  'foot processes': 'NCIT:C32623',
  'footpad': 'ungrounded',
  'foreperiod': 'ungrounded',
  'formyl peptide': 'MESH:D009240',
  'fowlpox': 'MESH:D005587',
  'fowlpox virus': 'MESH:D005587',
  'fp dipeptides': 'ungrounded',
  'fractional photothermolysis': 'ungrounded',
  'frailty phenotype': 'ungrounded',
  'from propolis': 'MESH:D011429',
  'frontal pole': 'ungrounded',
  'fronto parietal': 'ungrounded',
  'frontopolar cortex': 'ungrounded',
  'fructus psoraleae': 'ungrounded',
  'fucan polysaccharides': 'ungrounded',
  'fungiform papilla': 'ungrounded',
  'fusion peptide': 'MESH:D014760',
  'fusion peptides': 'MESH:D014760',
  'fusion positive': 'ungrounded',
  'fusion protein': 'MESH:D014760',
  'fusogenic peptides': 'MESH:D014760',
  'prostaglandin f': 'HGNC:9600',
  'prostaglandin f2α receptor': 'HGNC:9600'},
 {'MESH:D052638': 'Particulate Matter',
  'CHEBI:CHEBI:26347': 'prostanoid',
  'MESH:D000070037': 'Faecalibacterium prausnitzii',
  'MESH:D005189': 'False Positive Reactions',
  'MESH:D010821': 'Physicians, Family',
  'MESH:D005193': 'Family Planning Services',
  'CHEBI:CHEBI:24018': 'farnesyl phosphate',
  'CHEBI:CHEBI:34922': 'picloram',
  'MESH:D000273': 'Adipose Tissue',
  'CHEBI:CHEBI:50145': 'fenpropimorph',
  'MESH:D059247': 'Fertility Preservation',
  'MESH:D012596': 'Scleroproteins',
  'field_potential': 'field_potential',
  'CHEBI:CHEBI:5063': 'fipronil',
  'GO:GO:0020016': 'ciliary pocket',
  'CHEBI:CHEBI:47344': 'alvocidib',
  'CHEBI:CHEBI:5086': 'flavoprotein',
  'floor_plate': 'floor_plate',
  'fluid_percussion': 'fluid_percussion',
  'MESH:D005454': 'Fluorescence Polarization',
  'MESH:D008164': 'Luminescent Proteins',
  'CHEBI:CHEBI:42699': 'fluoridophosphate',
  'PUBCHEM:141643': '2-Fluoropyrimidine',
  'CHEBI:CHEBI:5123': 'fluphenazine',
  'CHEBI:CHEBI:5130': 'flurbiprofen',
  'CHEBI:CHEBI:5134': 'fluticasone',
  'CHEBI:CHEBI:31441': 'fluticasone propionate',
  'MESH:D005498': 'Follicular Phase',
  'NCIT:C32623': 'Foot Process',
  'MESH:D009240': 'N-Formylmethionine Leucyl-Phenylalanine',
  'MESH:D005587': 'Fowlpox virus',
  'MESH:D011429': 'Propolis',
  'MESH:D014760': 'Viral Fusion Proteins',
  'HGNC:9600': 'PTGFR'},
 ['CHEBI:CHEBI:47344',
  'CHEBI:CHEBI:5130',
  'MESH:D005189',
  'MESH:D005193',
  'MESH:D005454',
  'MESH:D008164',
  'MESH:D014760',
  'NCIT:C32623',
 'GO:GO:0020016']]

  

In [12]:
# Longforms to leave out when grounding_dict is built; nothing is excluded here.
excluded_longforms = []

In [29]:
# For each shortform, keep the curated grounding of every mined longform that
# has an entry in grounding_map and is not listed in excluded_longforms.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) also creates a missing parent 'results' directory and
# avoids the check-then-create race of os.path.exists followed by os.mkdir.
os.makedirs(results_path, exist_ok=True)

# Persist the preliminary grounding info next to the other model artifacts.
info_path = os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json')
with open(info_path, 'w') as f:
    json.dump(result, f)

In [30]:
# Extra entities, appended to pos_labels below, for which entity PMIDs are looked up; none here.
additional_entities = []

In [31]:
# Maps an entity label to agent texts whose documents are later added to the
# corpus under that label; empty here, so no such texts are added.
unambiguous_agent_texts = {}

In [32]:
# Build the labeled corpus from every pooled text, passing (text, pmid) pairs
# to the labeler; each corpus row unpacks as (text, label, pmid).
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the PMIDs of the corpus rows by the label they received.
agent_text_pmid_map = defaultdict(list)
for _, label, pmid in corpus:
    agent_text_pmid_map[label].append(pmid)

# For every positive label and additional entity, look up the PMIDs where the
# entity is a major topic. The entity string is split once on ':' into the
# positional arguments of get_pmids_for_entity.
entities = pos_labels + additional_entities
entity_pmid_map = {}
for entity in entities:
    entity_args = entity.split(':', maxsplit=1)
    entity_pmid_map[entity] = set(get_pmids_for_entity(*entity_args, major_topic=True))

In [17]:
# Pairwise overlap between the entity PMID sets: one (entity_a, entity_b, size)
# row for every ordered pair, including each entity with itself.
intersection1 = [
    (entity_a, entity_b, len(pmids_a & pmids_b))
    for entity_a, pmids_a in entity_pmid_map.items()
    for entity_b, pmids_b in entity_pmid_map.items()
]

In [18]:
# Overlap between the PMIDs labeled from agent text and the entity PMID sets:
# one (label, entity, size) row for every label/entity combination.
intersection2 = [
    (label, entity, len(set(label_pmids) & entity_pmids))
    for label, label_pmids in agent_text_pmid_map.items()
    for entity, entity_pmids in entity_pmid_map.items()
]

In [19]:
# Inspect the entity-vs-entity PMID overlap counts.
intersection1

[('CHEBI:CHEBI:47344', 'CHEBI:CHEBI:47344', 0),
 ('CHEBI:CHEBI:47344', 'CHEBI:CHEBI:5130', 0),
 ('CHEBI:CHEBI:47344', 'HGNC:9600', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D005189', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D005193', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D005454', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D008164', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D014760', 0),
 ('CHEBI:CHEBI:47344', 'NCIT:C32623', 0),
 ('CHEBI:CHEBI:5130', 'CHEBI:CHEBI:47344', 0),
 ('CHEBI:CHEBI:5130', 'CHEBI:CHEBI:5130', 1314),
 ('CHEBI:CHEBI:5130', 'HGNC:9600', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D005189', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D005193', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D005454', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D008164', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D014760', 0),
 ('CHEBI:CHEBI:5130', 'NCIT:C32623', 0),
 ('HGNC:9600', 'CHEBI:CHEBI:47344', 0),
 ('HGNC:9600', 'CHEBI:CHEBI:5130', 0),
 ('HGNC:9600', 'HGNC:9600', 0),
 ('HGNC:9600', 'MESH:D005189', 0),
 ('HGNC:9600', 'MESH:D005193', 0),
 ('HGNC:9600', 'MESH:D005454', 0),
 ('HGN

In [75]:
# Inspect the label-vs-entity PMID overlap counts.
intersection2

[('CHEBI:CHEBI:27812', 'CHEBI:CHEBI:27812', 0),
 ('CHEBI:CHEBI:27812', 'CHEBI:CHEBI:7035', 0),
 ('CHEBI:CHEBI:27812', 'FPLX:Mechanosensitive_ion_channels', 0),
 ('CHEBI:CHEBI:27812', 'HGNC:7321', 0),
 ('CHEBI:CHEBI:27812', 'MESH:D000080364', 0),
 ('CHEBI:CHEBI:27812', 'MESH:D000604', 0),
 ('CHEBI:CHEBI:27812', 'MESH:D059630', 0),
 ('MESH:D059630', 'CHEBI:CHEBI:27812', 0),
 ('MESH:D059630', 'CHEBI:CHEBI:7035', 0),
 ('MESH:D059630', 'FPLX:Mechanosensitive_ion_channels', 0),
 ('MESH:D059630', 'HGNC:7321', 0),
 ('MESH:D059630', 'MESH:D000080364', 0),
 ('MESH:D059630', 'MESH:D000604', 0),
 ('MESH:D059630', 'MESH:D059630', 2261),
 ('ungrounded', 'CHEBI:CHEBI:27812', 0),
 ('ungrounded', 'CHEBI:CHEBI:7035', 0),
 ('ungrounded', 'FPLX:Mechanosensitive_ion_channels', 0),
 ('ungrounded', 'HGNC:7321', 0),
 ('ungrounded', 'MESH:D000080364', 0),
 ('ungrounded', 'MESH:D000604', 0),
 ('ungrounded', 'MESH:D059630', 0),
 ('MESH:D000604', 'CHEBI:CHEBI:27812', 0),
 ('MESH:D000604', 'CHEBI:CHEBI:7035', 0),


[('HGNC:7321', ['Musculin', 'musculin'])]

In [91]:
# Extend the corpus with additional labeled texts.
# Note: this cell appends to `corpus` in place, so running it twice adds the
# same rows twice; rebuild `corpus` first if it needs to be re-run.

# Upper bound on the number of PMIDs fetched per entity.
max_pmids_per_entity = 10000

# 1) Texts found via unambiguous agent texts are labeled with their entity.
#    PMIDs already in all_texts, or already used for this entity, are skipped.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# 2) Texts tagged with each entity are labeled with that entity, skipping
#    PMIDs already in all_texts or consumed in step 1.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(pmids - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > max_pmids_per_entity:
        # random.sample draws WITHOUT replacement. The previous random.choices
        # call drew with replacement, so the capped list contained duplicate
        # PMIDs and yielded fewer distinct texts than the cap.
        # The draw is unseeded, so the selected subset differs between runs.
        new_pmids = random.sample(new_pmids, k=max_pmids_per_entity)
    text_dict = get_plaintexts_for_pmids(new_pmids)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [33]:
%%capture

# Train the classifier with 5-fold cross-validation over a one-point parameter
# grid (C=100.0, max_features=10000), using 5 parallel jobs and a fixed
# random_state. %%capture must remain the first line of the cell.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split the corpus rows into parallel tuples; pmids is not used by the call below.
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-09-18 19:41:29] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-09-18 19:42:04] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.8876012939356045 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [34]:
# Show the classifier's stats: label distribution plus f1/precision/recall summaries.
classifier.stats

{'label_distribution': {'ungrounded': 93,
  'CHEBI:CHEBI:34922': 4,
  'MESH:D014760': 109,
  'MESH:D009240': 5,
  'CHEBI:CHEBI:24018': 3,
  'MESH:D012596': 2,
  'CHEBI:CHEBI:31441': 146,
  'CHEBI:CHEBI:5130': 14,
  'MESH:D005498': 6,
  'fluid_percussion': 9,
  'field_potential': 30,
  'CHEBI:CHEBI:5086': 6,
  'HGNC:9600': 13,
  'CHEBI:CHEBI:47344': 35,
  'MESH:D008164': 72,
  'MESH:D005454': 81,
  'PUBCHEM:141643': 6,
  'MESH:D005587': 6,
  'MESH:D005189': 107,
  'MESH:D005193': 16,
  'floor_plate': 38,
  'MESH:D000273': 3,
  'CHEBI:CHEBI:26347': 9,
  'CHEBI:CHEBI:5134': 5,
  'GO:GO:0020016': 8,
  'CHEBI:CHEBI:42699': 6,
  'NCIT:C32623': 16,
  'MESH:D000070037': 3,
  'CHEBI:CHEBI:5123': 1,
  'CHEBI:CHEBI:5063': 3,
  'MESH:D052638': 3,
  'MESH:D011429': 2,
  'CHEBI:CHEBI:50145': 2,
  'MESH:D059247': 6,
  'MESH:D010821': 6},
 'f1': {'mean': 0.887601, 'std': 0.033792},
 'precision': {'mean': 0.872287, 'std': 0.04696},
 'recall': {'mean': 0.915062, 'std': 0.024389},
 'MESH:D014760': {'f1':

In [35]:
# Combine the trained classifier with the grounding dict and names into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [37]:
# Dump the disambiguator under model_name into results_path
# (presumably writes the model files to disk -- confirm the exact layout in adeft).
disamb.dump(model_name, results_path)

In [36]:
# Print the disambiguator's summary: its shortforms and the disambiguations it produces.
print(disamb.info())

Disambiguation model for FP, and FPs

Produces the disambiguations:
	2-Fluoropyrimidine	PUBCHEM:141643
	Adipose Tissue	MESH:D000273
	Faecalibacterium prausnitzii	MESH:D000070037
	False Positive Reactions*	MESH:D005189
	Family Planning Services*	MESH:D005193
	Fertility Preservation	MESH:D059247
	Fluorescence Polarization*	MESH:D005454
	Follicular Phase	MESH:D005498
	Foot Process*	NCIT:C32623
	Fowlpox virus	MESH:D005587
	Luminescent Proteins*	MESH:D008164
	N-Formylmethionine Leucyl-Phenylalanine	MESH:D009240
	PTGFR	HGNC:9600
	Particulate Matter	MESH:D052638
	Physicians, Family	MESH:D010821
	Propolis	MESH:D011429
	Scleroproteins	MESH:D012596
	Viral Fusion Proteins*	MESH:D014760
	alvocidib*	CHEBI:CHEBI:47344
	ciliary pocket*	GO:GO:0020016
	farnesyl phosphate	CHEBI:CHEBI:24018
	fenpropimorph	CHEBI:CHEBI:50145
	field_potential	field_potential
	fipronil	CHEBI:CHEBI:5063
	flavoprotein	CHEBI:CHEBI:5086
	floor_plate	floor_plate
	fluid_percussion	fluid_percussion
	fluoridophosphate	CHEBI:CHEBI:42

In [38]:
# Presumably uploads the finished model to S3 (going by the helper's name) --
# confirm the destination and credentials before running.
model_to_s3(disamb)