In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used later to propose a grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortforms that are disambiguated together by one model.
shortforms = ['PP', 'PPs']
# Model name: the filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of this model's results directory, two levels above the
# working directory.
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [4]:
# Mine candidate longforms for every shortform from the texts that mention it.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Cap the corpus at 10000 distinct PMIDs. random.sample draws without
        # replacement; random.choices draws with replacement, which returned
        # duplicate PMIDs and therefore fewer than 10000 distinct texts.
        # NOTE(review): no random seed is set in the visible cells, so this
        # subsample differs between runs.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop texts of length 5 or less.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Per shortform, keep only the longforms whose count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# Sum the longform counts over all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Pre-populate groundings and names with the first result the grounder
# returns for each longform, when it returns any.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Longforms and their counts as parallel tuples, most common first. Guard the
# empty case: zip(*[]) yields nothing, so the two-name unpacking would raise.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()
pos_labels = []

In [5]:
# Show each longform next to its combined count, most common first.
list(zip(longforms, counts))

[('pulse pressure', 442),
 ('pancreatic polypeptide', 419),
 ('peyer s patches', 365),
 ('polypropylene', 138),
 ('peroxisome proliferators', 102),
 ('protein phosphatase', 72),
 ('perforant path', 63),
 ('perfusion pressure', 57),
 ('precocious puberty', 42),
 ('pentose phosphate', 41),
 ('per protocol', 39),
 ('pyrvinium pamoate', 39),
 ('portal pressure', 37),
 ('prone positive', 31),
 ('protein phosphatases', 29),
 ('posterior probability', 26),
 ('paliperidone palmitate', 24),
 ('postpartum', 21),
 ('propyl paraben', 20),
 ('proximal promoter', 20),
 ('paired pulse', 15),
 ('peak power', 15),
 ('phosphophoryn', 15),
 ('primary progressive', 14),
 ('pyrophosphate', 14),
 ('piperine', 12),
 ('protein protein', 12),
 ('post partum', 11),
 ('physical practice', 11),
 ('plasma protein', 11),
 ('primary production', 10),
 ('peak pressure', 10),
 ('physical performance', 9),
 ('perforant pathway', 9),
 ('plant protein', 9),
 ('pulsatile perfusion', 8),
 ('phytate p', 8),
 ('peyer patches

In [5]:
# Show each longform next to its combined count, most common first.
list(zip(longforms, counts))

[('pulse pressure', 442),
 ('pancreatic polypeptide', 419),
 ('peyer s patches', 240),
 ('polypropylene', 138),
 ('protein phosphatase', 72),
 ('perforant path', 63),
 ('perfusion pressure', 57),
 ('precocious puberty', 42),
 ('pentose phosphate', 41),
 ('peroxisome proliferators', 41),
 ('per protocol', 39),
 ('pyrvinium pamoate', 39),
 ('portal pressure', 37),
 ('prone positive', 31),
 ('posterior probability', 26),
 ('paliperidone palmitate', 24),
 ('postpartum', 21),
 ('propyl paraben', 20),
 ('proximal promoter', 20),
 ('paired pulse', 15),
 ('peak power', 15),
 ('phosphophoryn', 15),
 ('primary progressive', 14),
 ('pyrophosphate', 14),
 ('piperine', 12),
 ('protein protein', 12),
 ('post partum', 11),
 ('physical practice', 11),
 ('plasma protein', 11),
 ('primary production', 10),
 ('peak pressure', 10),
 ('physical performance', 9),
 ('perforant pathway', 9),
 ('plant protein', 9),
 ('pulsatile perfusion', 8),
 ('phytate p', 8),
 ('peyer patches', 8),
 ('postprandial', 8),
 ('

In [12]:
# Run the grounding GUI on the mined longforms, seeded with the automatically
# proposed groundings, names and positive labels. The three returned values
# replace the seeds.
# NOTE(review): presumably blocks until curation in the GUI is finished -
# confirm before relying on Restart & Run All.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

INFO: [2020-09-25 19:36:52] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [13]:
# Bundle the three grounding results into one list for display.
result = [grounding_map, names, pos_labels]

In [14]:
# Display the bundled grounding map, names and positive labels.
result

[{'p putida': 'ungrounded',
  'paired pulse': 'ungrounded',
  'paliperidone palmitate': 'CHEBI:CHEBI:83807',
  'palm pocket': 'ungrounded',
  'palmoplantar': 'ungrounded',
  'palmoplantar psoriasis': 'HP:HP:0025524',
  'pancreatic polypeptide': 'CHEBI:CHEBI:80329',
  'pancreatic progenitor': 'ungrounded',
  'pancreatic progenitors': 'ungrounded',
  'paracellular permeability': 'ungrounded',
  'paralytic peptide': 'MESH:C121924',
  'particulate p': 'ungrounded',
  'partner preference': 'ungrounded',
  'pathophysiologic process': 'ungrounded',
  'peak power': 'ungrounded',
  'peak pressure': 'ungrounded',
  'pentose pathway': 'ungrounded',
  'pentose phosphate': 'CHEBI:CHEBI:84055',
  'per protocol': 'ungrounded',
  'percentage of phagocytic cells': 'ungrounded',
  'percentage points': 'ungrounded',
  'percentage positive': 'ungrounded',
  'perceptual peak': 'ungrounded',
  'perforant path': 'MESH:D019580',
  'perforant pathway': 'MESH:D019580',
  'perfusion pressure': 'perfusion_pressur

In [6]:
# Hard-coded grounding results, unpacked from a three-element list:
#   1. grounding_map: longform -> grounding ('ungrounded' when none applies)
#   2. names:         grounding -> standard name
#   3. pos_labels:    groundings treated as positive labels by the classifier
# This literal is not an exact copy of the GUI output displayed earlier in the
# notebook: 'pancreatic polypeptide' is grounded to HGNC:9327 here, while the
# displayed output shows CHEBI:CHEBI:80329.
# NOTE(review): 'phosphoprotein' maps to MESH:D010749, which `names` below
# lists as 'Phosphoprotein Phosphatases' - confirm this grounding is intended.
grounding_map, names, pos_labels = [{'p putida': 'ungrounded',
  'paired pulse': 'ungrounded',
  'paliperidone palmitate': 'CHEBI:CHEBI:83807',
  'palm pocket': 'ungrounded',
  'palmoplantar': 'ungrounded',
  'palmoplantar psoriasis': 'HP:HP:0025524',
  'pancreatic polypeptide': 'HGNC:9327',
  'pancreatic progenitor': 'ungrounded',
  'pancreatic progenitors': 'ungrounded',
  'paracellular permeability': 'ungrounded',
  'paralytic peptide': 'MESH:C121924',
  'particulate p': 'ungrounded',
  'partner preference': 'ungrounded',
  'pathophysiologic process': 'ungrounded',
  'peak power': 'ungrounded',
  'peak pressure': 'ungrounded',
  'pentose pathway': 'ungrounded',
  'pentose phosphate': 'CHEBI:CHEBI:84055',
  'per protocol': 'ungrounded',
  'percentage of phagocytic cells': 'ungrounded',
  'percentage points': 'ungrounded',
  'percentage positive': 'ungrounded',
  'perceptual peak': 'ungrounded',
  'perforant path': 'MESH:D019580',
  'perforant pathway': 'MESH:D019580',
  'perfusion pressure': 'perfusion_pressure',
  'periodic paralysis': 'HP:HP:0003768',
  'peroxisome proliferators': 'MESH:D020025',
  'persimmon peel': 'ungrounded',
  'perspective projection': 'ungrounded',
  'peyer patches': 'MESH:D010581',
  'peyer s patches': 'MESH:D010581',
  'phagocytosis production': 'ungrounded',
  'pharmacological preconditioning': 'ungrounded',
  'phenotype plasticity': 'ungrounded',
  'phenylpropanoid': 'CHEBI:CHEBI:26004',
  'phenylpropanoid pathway': 'ungrounded',
  'phloem parenchyma': 'ungrounded',
  'phosphate pathway': 'ungrounded',
  'phosphophoryn': 'MESH:C017630',
  'phosphoprotein': 'MESH:D010749',
  'physalis peruviana l': 'ungrounded',
  'physical performance': 'ungrounded',
  'physical practice': 'ungrounded',
  'phytate p': 'CHEBI:CHEBI:17401',
  'phytate phosphorus': 'CHEBI:CHEBI:17401',
  'pineal protein': 'ungrounded',
  'piperine': 'CHEBI:CHEBI:28821',
  'placenta previa': 'MESH:D010923',
  'plant polyphenols': 'CHEBI:CHEBI:26195',
  'plant protein': 'ungrounded',
  'plasma protein': 'ungrounded',
  'plateau potential': 'ungrounded',
  'plateau potentials': 'ungrounded',
  'platinum pemetrexed': 'ungrounded',
  'pleural plaque': 'ungrounded',
  'pocket protein': 'FPLX:Pocket_protein',
  'polyol pathway': 'ungrounded',
  'polyphosphates': 'CHEBI:CHEBI:26197',
  'polyproline': 'CHEBI:CHEBI:8321',
  'polypropylene': 'CHEBI:CHEBI:53550',
  'polystyrene particles': 'ungrounded',
  'pomegranate peel': 'ungrounded',
  'poor prognosis': 'ungrounded',
  'porcine pancreatic': 'ungrounded',
  'porcine pericardium': 'MESH:D010496',
  'portal pressure': 'MESH:D017082',
  'positive psychology': 'MESH:D000080032',
  'post partum': 'MESH:D049590',
  'posterior pituitary': 'ungrounded',
  'posterior probability': 'ungrounded',
  'postpartum': 'MESH:D049590',
  'postprandial': 'ungrounded',
  'potato peel': 'ungrounded',
  'pp': 'ungrounded',
  'pp stimulated': 'ungrounded',
  'precocious puberty': 'MESH:D011629',
  'premature pubarche': 'HP:HP:0012411',
  'prepupae': 'ungrounded',
  'primary pharmacophore': 'ungrounded',
  'primary prevention': 'MESH:D011322',
  'primary production': 'ungrounded',
  'primary progressive': 'ungrounded',
  'primary prophylaxis': 'ungrounded',
  'primase pol α': 'FPLX:DNA_polymerase_alpha',
  'private practitioners': 'ungrounded',
  'prone positive': 'ungrounded',
  'propyl paraben': 'CHEBI:CHEBI:32063',
  'protease and phytase': 'ungrounded',
  'protein percentage': 'ungrounded',
  'protein phosphatase': 'MESH:D010749',
  'protein phosphatases': 'MESH:D010749',
  'protein protein': 'ungrounded',
  'proteose peptone': 'MESH:C018135',
  'protoporphyrin ix': 'MESH:C028025',
  'proximal promoter': 'ungrounded',
  'pseudoporphyria': 'ungrounded',
  'pulsatile perfusion': 'MESH:D011673',
  'pulse pressure': 'NCIT:C100945',
  'pustular psoriasis': 'ungrounded',
  'pyrazolo 3 4 d pyrimidine': 'PUBCHEM:67499',
  'pyrophosphate': 'CHEBI:CHEBI:29888',
  'pyrvinium pamoate': 'CHEBI:CHEBI:8688',
  'sp and prone': 'ungrounded'},
 {'CHEBI:CHEBI:83807': 'paliperidone palmitate',
  'HP:HP:0025524': 'Palmoplantar scaling skin',
  'HGNC:9327': 'PPY',
  'MESH:C121924': 'paralytic peptide, insect',
  'CHEBI:CHEBI:84055': 'pentose phosphate',
  'MESH:D019580': 'Perforant Pathway',
  'perfusion_pressure': 'perfusion_pressure',
  'HP:HP:0003768': 'Periodic paralysis',
  'MESH:D020025': 'Peroxisome Proliferators',
  'MESH:D010581': "Peyer's Patches",
  'CHEBI:CHEBI:26004': 'phenylpropanoid',
  'MESH:C017630': 'phosphophoryn',
  'MESH:D010749': 'Phosphoprotein Phosphatases',
  'CHEBI:CHEBI:17401': 'myo-inositol hexakisphosphate',
  'CHEBI:CHEBI:28821': 'piperine',
  'MESH:D010923': 'Placenta Previa',
  'CHEBI:CHEBI:26195': 'polyphenol',
  'FPLX:Pocket_protein': 'Pocket_protein',
  'CHEBI:CHEBI:26197': 'polyphosphates',
  'CHEBI:CHEBI:8321': 'Polyproline',
  'CHEBI:CHEBI:53550': 'poly(propylene)',
  'MESH:D010496': 'Pericardium',
  'MESH:D017082': 'Portal Pressure',
  'MESH:D000080032': 'Psychology, Positive',
  'MESH:D049590': 'Postpartum Period',
  'MESH:D011629': 'Puberty, Precocious',
  'HP:HP:0012411': 'Premature pubarche',
  'MESH:D011322': 'Primary Prevention',
  'FPLX:DNA_polymerase_alpha': 'DNA_polymerase_alpha',
  'CHEBI:CHEBI:32063': 'propylparaben',
  'MESH:C018135': 'proteose-peptone',
  'MESH:C028025': 'protoporphyrin IX',
  'MESH:D011673': 'Pulsatile Flow',
  'NCIT:C100945': 'Pulse Pressure',
  'PUBCHEM:67499': '1h-Pyrazolo[3,4-d]pyrimidine',
  'CHEBI:CHEBI:29888': 'diphosphoric acid',
  'CHEBI:CHEBI:8688': 'Pyrvinium pamoate'},
 ['CHEBI:CHEBI:53550',
  'CHEBI:CHEBI:84055',
  'HGNC:9327',
  'MESH:D010581',
  'MESH:D010749',
  'MESH:D011629',
  'MESH:D017082',
  'MESH:D019580',
  'MESH:D020025',
  'NCIT:C100945']]

In [7]:
# Longforms that are left out when grounding_dict is built.
excluded_longforms = ['pp']

In [8]:
# Per-shortform grounding dict: for each shortform, map every mined longform
# that has an entry in grounding_map and is not excluded to its grounding.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in longform_rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs with exist_ok=True also creates a missing parent directory and does
# not fail if the directory already exists, unlike an exists() check followed
# by mkdir().
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [9]:
# Extra entities, keyed as 'NAMESPACE:ID', whose PMIDs are looked up later to
# add training texts. Empty for this model.
additional_entities = {}

In [10]:
# Maps an entity label to agent texts whose PMIDs are fetched later and added
# to the corpus under that label. Empty for this model.
unambiguous_agent_texts = {}

In [11]:
# Build the labeled corpus from every collected text; each corpus row is
# unpacked below as a (text, label, id) triple.
labeler = AdeftLabeler(grounding_dict)
text_pmid_pairs = ((text, pmid) for pmid, text in all_texts.items())
corpus = labeler.build_from_texts(text_pmid_pairs)

# Group the corpus ids by the label assigned to their text.
agent_text_pmid_map = defaultdict(list)
for _text, label, pmid in corpus:
    agent_text_pmid_map[label].append(pmid)

# PMID set for each additional entity; the 'NAMESPACE:ID' key is split on its
# first colon and passed as two positional arguments.
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in additional_entities
}

In [19]:
# Size of the PMID overlap for every ordered pair of additional entities
# (each entity is also paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [20]:
# Size of the PMID overlap between every corpus label and every additional
# entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [21]:
# Display the entity-vs-entity PMID overlap counts.
intersection1

[('HGNC:1848', 'HGNC:1848', 86)]

In [22]:
# Display the label-vs-entity PMID overlap counts.
intersection2

[('MESH:D018893', 'HGNC:1848', 0),
 ('HGNC:1848', 'HGNC:1848', 1),
 ('ungrounded', 'HGNC:1848', 0),
 ('CHEBI:CHEBI:64198', 'HGNC:1848', 0),
 ('MESH:D019164', 'HGNC:1848', 0),
 ('MESH:C059416', 'HGNC:1848', 0),
 ('CHEBI:CHEBI:15710', 'HGNC:1848', 0)]

[('HGNC:7321', ['Musculin', 'musculin'])]

In [23]:
# Extend the corpus with extra labeled texts. This cell appends to `corpus`,
# so running it more than once adds the same rows again.

# 1) Texts found through unambiguous agent texts, labeled with their entity.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip PMIDs already in the mined corpus or already taken for this
        # entity through an earlier agent text.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# 2) Texts found through the additional entities' PMIDs.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Cap at 10000 distinct PMIDs. random.sample draws without
        # replacement; random.choices draws with replacement and returned
        # duplicate PMIDs.
        new_pmids = random.sample(new_pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(new_pmids)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [12]:
%%capture

# Train the disambiguation classifier for the shortforms with a fixed random
# state.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
# Grid with a single candidate value for each parameter.
param_grid = {'C': [100.0], 'max_features': [10000]}
# Corpus rows are (text, label, pmid) triples; split them into parallel tuples.
texts, labels, pmids = zip(*corpus)
# Grid search over param_grid with cv=5 and five parallel jobs.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-26 02:35:07] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-26 02:36:18] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9358766804739996 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [23]:
# Display the classifier's recorded statistics (label distribution and scores).
classifier.stats

{'label_distribution': {'perfusion_pressure': 47,
  'HGNC:9327': 326,
  'NCIT:C100945': 335,
  'MESH:D010581': 300,
  'MESH:D019580': 49,
  'MESH:C028025': 3,
  'MESH:D010749': 105,
  'ungrounded': 270,
  'MESH:D017082': 31,
  'MESH:D020025': 106,
  'CHEBI:CHEBI:29888': 14,
  'PUBCHEM:67499': 3,
  'MESH:D049590': 18,
  'CHEBI:CHEBI:84055': 27,
  'MESH:C018135': 5,
  'MESH:C017630': 13,
  'MESH:D010496': 3,
  'CHEBI:CHEBI:8321': 3,
  'HP:HP:0012411': 4,
  'CHEBI:CHEBI:53550': 95,
  'MESH:D011629': 24,
  'CHEBI:CHEBI:32063': 15,
  'MESH:D011673': 7,
  'HP:HP:0025524': 2,
  'MESH:C121924': 6,
  'CHEBI:CHEBI:26004': 3,
  'CHEBI:CHEBI:26197': 2,
  'CHEBI:CHEBI:8688': 23,
  'MESH:D011322': 2,
  'CHEBI:CHEBI:83807': 15,
  'CHEBI:CHEBI:17401': 3,
  'FPLX:DNA_polymerase_alpha': 2,
  'FPLX:Pocket_protein': 5,
  'MESH:D010923': 5,
  'CHEBI:CHEBI:28821': 7,
  'HP:HP:0003768': 4,
  'MESH:D000080032': 2,
  'CHEBI:CHEBI:26195': 4},
 'f1': {'mean': 0.935759, 'std': 0.005905},
 'precision': {'mean': 0.

In [13]:
# Combine the trained classifier with the grounding dict and standard names.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [14]:
# Dump the disambiguator, passing the model name and the results directory.
disamb.dump(model_name, results_path)

In [15]:
# Print the model's text summary, including the disambiguations it produces.
print(disamb.info())

Disambiguation model for PP, and PPs

Produces the disambiguations:
	1h-Pyrazolo[3,4-d]pyrimidine	PUBCHEM:67499
	DNA_polymerase_alpha	FPLX:DNA_polymerase_alpha
	PPY*	HGNC:9327
	Palmoplantar scaling skin	HP:HP:0025524
	Perforant Pathway*	MESH:D019580
	Pericardium	MESH:D010496
	Periodic paralysis	HP:HP:0003768
	Peroxisome Proliferators*	MESH:D020025
	Peyer's Patches*	MESH:D010581
	Phosphoprotein Phosphatases*	MESH:D010749
	Placenta Previa	MESH:D010923
	Pocket_protein	FPLX:Pocket_protein
	Polyproline	CHEBI:CHEBI:8321
	Portal Pressure*	MESH:D017082
	Postpartum Period	MESH:D049590
	Premature pubarche	HP:HP:0012411
	Primary Prevention	MESH:D011322
	Psychology, Positive	MESH:D000080032
	Puberty, Precocious*	MESH:D011629
	Pulsatile Flow	MESH:D011673
	Pulse Pressure*	NCIT:C100945
	Pyrvinium pamoate	CHEBI:CHEBI:8688
	diphosphoric acid	CHEBI:CHEBI:29888
	myo-inositol hexakisphosphate	CHEBI:CHEBI:17401
	paliperidone palmitate	CHEBI:CHEBI:83807
	paralytic peptide, insect	MESH:C121924
	pentose phosp

In [16]:
# NOTE(review): presumably uploads the finished model to S3 - the behavior of
# model_to_s3 is not visible in this notebook; confirm before re-running.
model_to_s3(disamb)