In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder instance; its .ground() method is called further down to propose a
# grounding for every mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortforms handled together by one model.
shortforms = ['CAP', 'CAPs']
# Model name: the escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of the directory holding this model's results
# (resolved relative to the current working directory).
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [4]:
# Step 1: for every shortform, fetch the texts that mention it and feed them
# to a dedicated AdeftMiner. all_texts accumulates every retained text by PMID.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Keep only texts longer than five characters.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Step 2: per shortform, keep the (longform, count, score) rows whose
# count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# Step 3: pool the counts across shortforms; a longform found for several
# shortforms has its counts summed by Counter.update.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Step 4: take the first grounding returned by the grounder (if any) as the
# automatic proposal for each longform, and remember its name.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Step 5: longforms and their counts as parallel tuples, most frequent first.
# zip(*[]) yields nothing, so unpacking it into two names raises ValueError
# when no longform survived the filter; fall back to two empty tuples.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()

In [28]:
# Pass the mined longforms, their counts and the automatic groundings to
# ground_with_gui, then rebind grounding_map and names (and define pos_labels)
# from its return value.
# NOTE(review): presumably an interactive curation step. This cell's execution
# count is out of sequence, and a later cell hard-codes these three values, so
# confirm whether it should run on a fresh kernel.
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, no_browser=True, port=8890)

In [29]:
# Bundle the three curation outputs into one list so they can be displayed together.
result = [grounding_map, names, pos_labels]

In [30]:
# Display the bundled [grounding_map, names, pos_labels] as the cell output.
result

[{'adherent proliferating cells': 'ungrounded',
  'analysis of principal coordinates': 'ungrounded',
  'calcium phosphate': 'CHEBI:CHEBI:77635',
  'camp receptor protein': 'MESH:D002373',
  'candidatus accumulibacter phosphatis': 'ungrounded',
  'cap': 'ungrounded',
  'capacitance': 'ungrounded',
  'capecitabine': 'CHEBI:CHEBI:31348',
  'capreomycin': 'CHEBI:CHEBI:3371',
  'capsaicin': 'CHEBI:CHEBI:3374',
  'capsaicinoids': 'CHEBI:CHEBI:46931',
  'captopril': 'CHEBI:CHEBI:3380',
  'carcinoma of the prostate': 'DOID:DOID:10286',
  'cardiac action potential': 'GO:GO:0001508',
  'cardiac activator period': 'ungrounded',
  'carotid artery plaque': 'MESH:D016893',
  'carprofen': 'CHEBI:CHEBI:364453',
  'catabolite activator protein': 'MESH:D002373',
  'catabolite gene activator protein': 'MESH:D002373',
  'cationic antimicrobial peptide': 'MESH:D023181',
  'cbl associated protein': 'HGNC:14926',
  'cellulose acetate phthalate': 'CHEBI:CHEBI:31386',
  'cementum attachment protein': 'HGNC:963

In [5]:
# Hard-coded curation results, so the rest of the notebook does not depend on
# the values produced by the ground_with_gui cell.

# Longform -> grounding; 'ungrounded' marks longforms without a grounding.
grounding_map = {
    'adherent proliferating cells': 'ungrounded',
    'analysis of principal coordinates': 'ungrounded',
    'calcium phosphate': 'CHEBI:CHEBI:77635',
    'camp receptor protein': 'MESH:D002373',
    'candidatus accumulibacter phosphatis': 'ungrounded',
    'cap': 'ungrounded',
    'capacitance': 'ungrounded',
    'capecitabine': 'CHEBI:CHEBI:31348',
    'capreomycin': 'CHEBI:CHEBI:3371',
    'capsaicin': 'CHEBI:CHEBI:3374',
    'capsaicinoids': 'CHEBI:CHEBI:46931',
    'captopril': 'CHEBI:CHEBI:3380',
    'carcinoma of the prostate': 'DOID:DOID:10286',
    'cardiac action potential': 'GO:GO:0001508',
    'cardiac activator period': 'ungrounded',
    'carotid artery plaque': 'MESH:D016893',
    'carprofen': 'CHEBI:CHEBI:364453',
    'catabolite activator protein': 'MESH:D002373',
    'catabolite gene activator protein': 'MESH:D002373',
    'cationic antimicrobial peptide': 'MESH:D023181',
    'cbl associated protein': 'HGNC:14926',
    'cellulose acetate phthalate': 'CHEBI:CHEBI:31386',
    'cementum attachment protein': 'HGNC:9639',
    'cercarial antigen preparation': 'ungrounded',
    'channel activating proteases': 'ungrounded',
    'chemotherapy after pd 1 inhibitor treatment': 'ungrounded',
    'child and adolescent psychiatry': 'MESH:D000296',
    'chloramphenicol': 'CHEBI:CHEBI:17698',
    'chlorantraniliprole': 'CHEBI:CHEBI:67113',
    'cholinergic anti inflammatory pathway': 'MESH:D015213',
    'chronic abdominal pain': 'MESH:D015746',
    'chronic apical periodontitis': 'DOID:DOID:11269',
    'clathrin associated proteins': 'FPLX:Adaptor_protein',
    'clinical assessment protocols': 'ungrounded',
    'cochlear action potential': 'ungrounded',
    'cold atmospheric plasma': 'cold_atmospheric_plasma',
    'cold atmospheric plasmas': 'ungrounded',
    'cold atmospheric pressure plasma': 'ungrounded',
    'college of american pathologists': 'ungrounded',
    'community acquired pneumonia': 'MESH:D011014',
    'complement alternating pathway': 'MESH:D003170',
    'compound action potential': 'GO:GO:0001508',
    'compound action potential of the auditory nerve': 'GO:GO:0001508',
    'compound action potentials': 'GO:GO:0001508',
    'computer assisted planning': 'ungrounded',
    'concentrated air particles': 'MESH:D052638',
    'concentrated ambient particles': 'MESH:D052638',
    'concentrated ambient particulate matter pm 2 5': 'MESH:D052638',
    'concentrated ambient particulates': 'MESH:D052638',
    'concentrated ambient pm2 5': 'MESH:D052638',
    'concentrated fine particulate matter': 'MESH:D052638',
    'controlled attenuation parameter': 'NCIT:C174123',
    'coronary artery perforation': 'ungrounded',
    'coronary atherosclerotic plaque': 'MESH:D058226',
    'critical acromial point': 'ungrounded',
    'cyclase associated protein': 'FPLX:CAP',
    'cyclase associated proteins': 'FPLX:CAP',
    'cyclic alternating pattern': 'cyclic_alternating_pattern',
    'cyclic amp receptor protein': 'MESH:D002373',
    'prostate cancer': 'MESH:D011471',
}

# Grounding -> display name, one entry per distinct grounding used above.
names = {
    'CHEBI:CHEBI:77635': 'calcium phosphate',
    'MESH:D002373': 'Cyclic AMP Receptor Protein',
    'CHEBI:CHEBI:31348': 'capecitabine',
    'CHEBI:CHEBI:3371': 'capreomycin',
    'CHEBI:CHEBI:3374': 'capsaicin',
    'CHEBI:CHEBI:46931': 'capsaicinoid',
    'CHEBI:CHEBI:3380': 'captopril',
    'DOID:DOID:10286': 'prostate carcinoma',
    'GO:GO:0001508': 'action potential',
    'MESH:D016893': 'Carotid Stenosis',
    'CHEBI:CHEBI:364453': 'carprofen',
    'MESH:D023181': 'Antimicrobial Cationic Peptides',
    'HGNC:14926': 'SORBS1',
    'CHEBI:CHEBI:31386': 'Cellulose acetate phthalate',
    'HGNC:9639': 'HACD1',
    'MESH:D000296': 'Adolescent Psychiatry',
    'CHEBI:CHEBI:17698': 'chloramphenicol',
    'CHEBI:CHEBI:67113': 'chlorantraniliprole',
    'MESH:D015213': 'Neuroimmunomodulation',
    'MESH:D015746': 'Abdominal Pain',
    'DOID:DOID:11269': 'chronic apical periodontitis',
    'FPLX:Adaptor_protein': 'Adaptor_protein',
    'cold_atmospheric_plasma': 'cold_atmospheric_plasma',
    'MESH:D011014': 'Pneumonia',
    'MESH:D003170': 'Complement Pathway, Alternative',
    'MESH:D052638': 'Particulate Matter',
    'NCIT:C174123': 'Controlled Attenuation Parameter',
    'MESH:D058226': 'Plaque, Atherosclerotic',
    'FPLX:CAP': 'CAP',
    'cyclic_alternating_pattern': 'cyclic_alternating_pattern',
    'MESH:D011471': 'Prostatic Neoplasms',
}

# Positive labels handed to the classifier: every grounding in names that
# carries a namespace prefix (the two ad-hoc labels are not included).
pos_labels = [
    'CHEBI:CHEBI:17698',
    'CHEBI:CHEBI:31348',
    'CHEBI:CHEBI:31386',
    'CHEBI:CHEBI:3371',
    'CHEBI:CHEBI:3374',
    'CHEBI:CHEBI:3380',
    'CHEBI:CHEBI:364453',
    'CHEBI:CHEBI:46931',
    'CHEBI:CHEBI:67113',
    'CHEBI:CHEBI:77635',
    'DOID:DOID:10286',
    'DOID:DOID:11269',
    'FPLX:Adaptor_protein',
    'FPLX:CAP',
    'GO:GO:0001508',
    'HGNC:14926',
    'HGNC:9639',
    'MESH:D000296',
    'MESH:D002373',
    'MESH:D003170',
    'MESH:D011014',
    'MESH:D011471',
    'MESH:D015213',
    'MESH:D015746',
    'MESH:D016893',
    'MESH:D023181',
    'MESH:D052638',
    'MESH:D058226',
    'NCIT:C174123',
]

In [6]:
# Longforms left out of grounding_dict when it is built in the next cell.
excluded_longforms = ['cap']

In [7]:
# Per shortform, map each of its mined longforms to the curated grounding,
# skipping longforms that have no entry in grounding_map or that are listed in
# excluded_longforms. The inner iterable is named longform_rows so it does not
# read like the notebook-level `longforms` tuple.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in longform_rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) creates missing parent directories as well and does
# not fail if the directory already exists (no exists()/mkdir() race).
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [8]:
# Entities iterated below to build entity_pmid_map; each key is split on its
# first ':' before being passed to get_pmids_for_entity.
# Empty here, so entity_pmid_map ends up empty as well.
additional_entities = {}

In [9]:
# Entity -> agent texts; the corpus-extension cell fetches texts for each agent
# text and labels them with the entity. Empty here, so that loop adds nothing.
unambiguous_agent_texts = {}

In [10]:
# Build the labeled corpus from every collected text; each corpus item is
# unpacked below as a (text, label, id) triple.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Label -> ids of the corpus items carrying that label.
agent_text_pmid_map = defaultdict(list)
for text, label, text_id in corpus:
    agent_text_pmid_map[label].append(text_id)

# Additional entity -> set of PMIDs returned by get_pmids_for_entity.
# The entity string is split on its first ':' and the pieces are passed as
# positional arguments.
entity_pmid_map = {}
for entity in additional_entities:
    entity_parts = entity.split(':', maxsplit=1)
    entity_pmid_map[entity] = set(
        get_pmids_for_entity(*entity_parts, major_topic=True)
    )

In [11]:
# Size of the PMID overlap for every ordered pair of additional entities
# (self-pairs included), in dictionary iteration order.
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [12]:
# Size of the PMID overlap between every corpus label and every additional
# entity; the label's PMID list is turned into a set before intersecting.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [13]:
# Display each mined longform next to its pooled count.
list(zip(longforms, counts))

[('community acquired pneumonia', 595),
 ('capsaicin', 246),
 ('compound action potential', 198),
 ('cold atmospheric plasma', 153),
 ('chloramphenicol', 101),
 ('cholinergic anti inflammatory pathway', 55),
 ('catabolite activator protein', 54),
 ('compound action potentials', 49),
 ('concentrated ambient particles', 47),
 ('captopril', 43),
 ('cyclase associated protein', 42),
 ('catabolite gene activator protein', 39),
 ('controlled attenuation parameter', 38),
 ('cbl associated protein', 37),
 ('cellulose acetate phthalate', 24),
 ('capsaicinoids', 24),
 ('cyclic alternating pattern', 23),
 ('concentrated ambient pm2 5', 23),
 ('capecitabine', 20),
 ('cold atmospheric pressure plasma', 16),
 ('capreomycin', 14),
 ('cementum attachment protein', 14),
 ('college of american pathologists', 9),
 ('concentrated air particles', 9),
 ('cyclic amp receptor protein', 8),
 ('complement alternating pathway', 7),
 ('camp receptor protein', 6),
 ('concentrated ambient particulate matter pm 2 5'

In [39]:
# Display the entity/entity PMID overlaps.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so the recorded output may come from an earlier notebook state.
intersection1

[('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:17698', 10342),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:31348', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:31386', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:3371', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:3374', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:3380', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:364453', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:46931', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:67113', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:77635', 0),
 ('CHEBI:CHEBI:17698', 'DOID:DOID:10286', 0),
 ('CHEBI:CHEBI:17698', 'DOID:DOID:11269', 0),
 ('CHEBI:CHEBI:17698', 'FPLX:Adaptor_protein', 0),
 ('CHEBI:CHEBI:17698', 'FPLX:CAP', 0),
 ('CHEBI:CHEBI:17698', 'GO:GO:0001508', 0),
 ('CHEBI:CHEBI:17698', 'HGNC:14926', 0),
 ('CHEBI:CHEBI:17698', 'HGNC:9639', 0),
 ('CHEBI:CHEBI:17698', 'MESH:D000296', 0),
 ('CHEBI:CHEBI:17698', 'MESH:D002373', 0),
 ('CHEBI:CHEBI:17698', 'MESH:D003170', 0),
 ('CHEBI:CHEBI:17698', 'MESH:D011014', 143),
 ('CHEBI:CHEBI:17698', 'MESH:D011471'

In [40]:
# Display the label/entity PMID overlaps.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so the recorded output may come from an earlier notebook state.
intersection2

[('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:17698', 54),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:31348', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:31386', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:3371', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:3374', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:3380', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:364453', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:46931', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:67113', 0),
 ('CHEBI:CHEBI:17698', 'CHEBI:CHEBI:77635', 0),
 ('CHEBI:CHEBI:17698', 'DOID:DOID:10286', 0),
 ('CHEBI:CHEBI:17698', 'DOID:DOID:11269', 0),
 ('CHEBI:CHEBI:17698', 'FPLX:Adaptor_protein', 0),
 ('CHEBI:CHEBI:17698', 'FPLX:CAP', 0),
 ('CHEBI:CHEBI:17698', 'GO:GO:0001508', 0),
 ('CHEBI:CHEBI:17698', 'HGNC:14926', 0),
 ('CHEBI:CHEBI:17698', 'HGNC:9639', 0),
 ('CHEBI:CHEBI:17698', 'MESH:D000296', 0),
 ('CHEBI:CHEBI:17698', 'MESH:D002373', 0),
 ('CHEBI:CHEBI:17698', 'MESH:D003170', 0),
 ('CHEBI:CHEBI:17698', 'MESH:D011014', 0),
 ('CHEBI:CHEBI:17698', 'MESH:D011471', 0),

In [18]:
# Extend the corpus in place with extra labeled texts.
# NOTE(review): corpus.extend mutates corpus, so running this cell twice
# appends the same texts twice.

# Part 1: for every entity, fetch texts mentioning one of its unambiguous
# agent texts and label them with that entity.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip PMIDs already in all_texts or already fetched for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Part 2: for every additional entity, fetch texts for its PMIDs that were not
# used above, capped at 10000 PMIDs per entity.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # random.sample draws without replacement. random.choices draws with
        # replacement, so it could return the same PMID several times and
        # yield fewer than 10000 distinct texts.
        # NOTE(review): the random module is not seeded in this notebook, so
        # the subset differs between runs.
        new_pmids = random.sample(new_pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(new_pmids)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [14]:
%%capture

# Classifier for the shortforms, with the curated positive labels and a fixed
# random_state.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
# Single-point grid: one value for C and one for max_features.
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split the corpus triples into parallel tuples. This rebinds the
# notebook-level name `pmids`.
texts, labels, pmids = zip(*corpus)
# Run the grid search with cv=5 and n_jobs=5; per the recorded log it reports
# the best f1 score found.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-03 01:22:00] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-03 01:22:58] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.8940886154507333 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [45]:
# Display the statistics stored on the classifier; the recorded output shows
# the label distribution and mean/std of f1, precision and recall.
classifier.stats

{'label_distribution': {'CHEBI:CHEBI:17698': 89,
  'MESH:D002373': 85,
  'GO:GO:0001508': 218,
  'cyclic_alternating_pattern': 17,
  'CHEBI:CHEBI:3374': 193,
  'CHEBI:CHEBI:3380': 36,
  'DOID:DOID:10286': 3,
  'ungrounded': 56,
  'HGNC:9639': 10,
  'FPLX:CAP': 38,
  'MESH:D011014': 377,
  'HGNC:14926': 32,
  'MESH:D011471': 3,
  'CHEBI:CHEBI:77635': 4,
  'CHEBI:CHEBI:31386': 14,
  'CHEBI:CHEBI:31348': 17,
  'DOID:DOID:11269': 3,
  'MESH:D052638': 55,
  'MESH:D016893': 5,
  'MESH:D003170': 7,
  'MESH:D015213': 39,
  'MESH:D023181': 4,
  'CHEBI:CHEBI:3371': 7,
  'MESH:D000296': 3,
  'cold_atmospheric_plasma': 94,
  'NCIT:C174123': 18,
  'MESH:D015746': 3,
  'MESH:D058226': 2,
  'CHEBI:CHEBI:67113': 5,
  'CHEBI:CHEBI:46931': 5,
  'CHEBI:CHEBI:364453': 2,
  'FPLX:Adaptor_protein': 3},
 'f1': {'mean': 0.894089, 'std': 0.01807},
 'precision': {'mean': 0.891011, 'std': 0.023255},
 'recall': {'mean': 0.910927, 'std': 0.012052},
 'DOID:DOID:10286': {'f1': {'mean': 0.2, 'std': 0.4},
  'pr': {'me

In [15]:
# Combine the classifier with the per-shortform grounding dictionary and the
# grounding names into a disambiguator object.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [16]:
# Dump the disambiguator, passing the model name and the results directory.
# NOTE(review): presumably writes the model files under results_path - confirm
# against the adeft documentation.
disamb.dump(model_name, results_path)

In [17]:
# Print the disambiguator's summary; the recorded output lists the shortforms
# and the disambiguations the model produces.
print(disamb.info())

Disambiguation model for CAP, and CAPs

Produces the disambiguations:
	Abdominal Pain*	MESH:D015746
	Adaptor_protein*	FPLX:Adaptor_protein
	Adolescent Psychiatry*	MESH:D000296
	Antimicrobial Cationic Peptides*	MESH:D023181
	CAP*	FPLX:CAP
	Carotid Stenosis*	MESH:D016893
	Cellulose acetate phthalate*	CHEBI:CHEBI:31386
	Complement Pathway, Alternative*	MESH:D003170
	Controlled Attenuation Parameter*	NCIT:C174123
	Cyclic AMP Receptor Protein*	MESH:D002373
	HACD1*	HGNC:9639
	Neuroimmunomodulation*	MESH:D015213
	Particulate Matter*	MESH:D052638
	Plaque, Atherosclerotic*	MESH:D058226
	Pneumonia*	MESH:D011014
	Prostatic Neoplasms*	MESH:D011471
	SORBS1*	HGNC:14926
	action potential*	GO:GO:0001508
	calcium phosphate*	CHEBI:CHEBI:77635
	capecitabine*	CHEBI:CHEBI:31348
	capreomycin*	CHEBI:CHEBI:3371
	capsaicin*	CHEBI:CHEBI:3374
	capsaicinoid*	CHEBI:CHEBI:46931
	captopril*	CHEBI:CHEBI:3380
	carprofen*	CHEBI:CHEBI:364453
	chloramphenicol*	CHEBI:CHEBI:17698
	chlorantraniliprole*	CHEBI:CHEBI:67113
	ch

In [18]:
# Hand the finished disambiguator to model_to_s3.
# NOTE(review): presumably uploads the model to S3 - confirm the target and
# credentials before re-running. This cell's execution count duplicates an
# earlier cell's.
model_to_s3(disamb)