In [2]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [3]:
# Grounder used further down to propose an initial grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [4]:
# Shortforms that are disambiguated together by one model.
shortforms = ['APC', 'APCs']

# Model name: the filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))

# Absolute path of the output directory, i.e. ../../results/<model_name>
# resolved against the current working directory.
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [5]:
# Mine candidate longforms for each shortform from texts that mention it.
# NOTE(review): the down-sampling below is unseeded, so which PMIDs are kept
# can differ between runs whenever the 10000 cap is hit.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Sample WITHOUT replacement. random.choices draws with replacement,
        # so it returns repeated PMIDs and fewer than 10000 distinct ones.
        pmids = random.sample(list(pmids), 10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Discard texts of five characters or fewer.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Per shortform, keep only (longform, count, score) rows with count*score > 2.
longform_dict = {}
for shortform in shortforms:
    longform_dict[shortform] = [
        (longform, count, score)
        for longform, count, score in miners[shortform].get_longforms()
        if count*score > 2
    ]

# Add up each longform's count over all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})
if not combined_longforms:
    # Without this guard the zip(*...) unpacking below fails with an opaque
    # "not enough values to unpack" ValueError.
    raise ValueError(
        f'No longforms with count*score > 2 were mined for {shortforms}'
    )

# Seed the grounding map and the grounding -> name map with the first result
# returned by the grounder; longforms with no result are left out.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Longforms and their combined counts, ordered from most to least common.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [5]:
# Display each mined longform next to its combined count (most common first).
list(zip(longforms, counts))

[('adenomatous polyposis coli', 1608),
 ('antigen presenting cells', 1377),
 ('activated protein c', 1185),
 ('anaphase promoting complex', 585),
 ('allophycocyanin', 85),
 ('anesthetic preconditioning', 47),
 ('activated pc', 41),
 ('argon plasma coagulation', 41),
 ('adenomatosis polyposis coli', 41),
 ('adenomatous polyposis coli protein', 40),
 ('anaphase promoting complex cyclosome', 31),
 ('adenomatous polyposis coli gene', 24),
 ('anterior piriform cortex', 23),
 ('aphidicolin', 21),
 ('annual percentage changes', 17),
 ('advanced pancreatic cancer', 16),
 ('ag presenting cells', 16),
 ('alkylphosphocholine', 14),
 ('adenomatous polyposis coli gene product', 14),
 ('anesthetic induced preconditioning', 14),
 ('antibody photoabsorber conjugate', 13),
 ('allylpyrocatechol', 13),
 ('aerobic plate count', 11),
 ('photo absorber conjugate', 10),
 ('alternative pathway of complement', 10),
 ('annual percent changes', 9),
 ('adenomatous polyposis', 9),
 ('anaphase promoting complex or 

In [7]:
# Curate the groundings through the adeft GUI, starting from the maps built
# above. The three names are rebound to whatever the GUI call returns.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

INFO: [2020-10-06 04:02:24] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [8]:
# Bundle the curated grounding map, names and positive labels into one list.
result = [grounding_map, names, pos_labels]

In [9]:
# Display the curated result so it can be inspected (and copied into a cell).
result

[{'acid postconditioning': 'ungrounded',
  'acid pre conditions': 'ungrounded',
  'acid preconditioning': 'ungrounded',
  'activated pc': 'HGNC:9451',
  'activated protein c': 'HGNC:9451',
  'adematous polyposis coli': 'HGNC:583',
  'adenoma polyposis coli': 'HGNC:583',
  'adenomatosis polyposis coli': 'HGNC:583',
  'adenomatous polyposis': 'HGNC:583',
  'adenomatous polyposis coil': 'HGNC:583',
  'adenomatous polyposis coli': 'HGNC:583',
  'adenomatous polyposis coli gene': 'HGNC:583',
  'adenomatous polyposis coli gene product': 'HGNC:583',
  'adenomatous polyposis coli protein': 'HGNC:583',
  'adenomatous polyposis coli tumor suppressor gene': 'HGNC:583',
  'adenomatous polyposis coli tumor suppressor protein': 'HGNC:583',
  'adenosine plasma concentration': 'ungrounded',
  'adipocyte precursor cells': 'ungrounded',
  'adipocyte progenitor cells': 'ungrounded',
  'adipose precursor cells': 'ungrounded',
  'adipose progenitor cells': 'ungrounded',
  'advanced pancreatic cancer': 'DOI

In [10]:
# Hard-coded curated result: [longform -> grounding, grounding -> name,
# list of positive labels]. It matches the visible part of the `result`
# displayed above; presumably pasted here so the notebook can be re-run
# without repeating the interactive GUI step -- TODO confirm.
grounding_map, names, pos_labels = [{'acid postconditioning': 'ungrounded',
  'acid pre conditions': 'ungrounded',
  'acid preconditioning': 'ungrounded',
  'activated pc': 'HGNC:9451',
  'activated protein c': 'HGNC:9451',
  'adematous polyposis coli': 'HGNC:583',
  'adenoma polyposis coli': 'HGNC:583',
  'adenomatosis polyposis coli': 'HGNC:583',
  'adenomatous polyposis': 'HGNC:583',
  'adenomatous polyposis coil': 'HGNC:583',
  'adenomatous polyposis coli': 'HGNC:583',
  'adenomatous polyposis coli gene': 'HGNC:583',
  'adenomatous polyposis coli gene product': 'HGNC:583',
  'adenomatous polyposis coli protein': 'HGNC:583',
  'adenomatous polyposis coli tumor suppressor gene': 'HGNC:583',
  'adenomatous polyposis coli tumor suppressor protein': 'HGNC:583',
  'adenosine plasma concentration': 'ungrounded',
  'adipocyte precursor cells': 'ungrounded',
  'adipocyte progenitor cells': 'ungrounded',
  'adipose precursor cells': 'ungrounded',
  'adipose progenitor cells': 'ungrounded',
  'advanced pancreatic cancer': 'DOID:DOID:1793',
  'aerobic plate count': 'ungrounded',
  'aerobic plate counts': 'ungrounded',
  'ag presenting cells': 'MESH:D000938',
  'age period cohort': 'ungrounded',
  'air pollution control': 'ungrounded',
  'alkylphosphocholine': 'CHEBI:CHEBI:52589',
  'alkylphosphocholines': 'CHEBI:CHEBI:52589',
  'allophycocyanin': 'MESH:C002497',
  'allylpyrocatechol': 'MESH:C523074',
  'alternative pathway of complement': 'MESH:D003170',
  'amyloid plaque core': 'MESH:D058225',
  'anaesthetic preconditioning': 'ungrounded',
  'anaphase promoting complex': 'GO:GO:0005680',
  'anaphase promoting complex cyclosome': 'GO:GO:0005680',
  'anaphase promoting complex or cyclosome': 'GO:GO:0005680',
  'anesthetic induced preconditioning': 'ungrounded',
  'anesthetic preconditioning': 'ungrounded',
  'angiogenic progenitor cells': 'ungrounded',
  'annual percent changes': 'ungrounded',
  'annual percentage changes': 'ungrounded',
  'anoxic preconditioning': 'ungrounded',
  'anterior piriform cortex': 'MESH:D066195',
  'antibody photoabsorber conjugate': 'NCIT:C66919',
  'antibody photosensitizer conjugate': 'NCIT:C66919',
  'antigen presenting': 'MESH:D000938',
  'antigen presenting capacity': 'MESH:D000938',
  'antigen presenting cells': 'MESH:D000938',
  'antigen processing cells': 'MESH:D000938',
  'antigenpresenting cells': 'MESH:D000938',
  'antipseudomonal carbapenems': 'CHEBI:CHEBI:46633',
  'apc': 'ungrounded',
  'apc gene product': 'HGNC:583',
  'apcδ468': 'ungrounded',
  'aphidicolin': 'CHEBI:CHEBI:2766',
  'apical papilla cells': 'ungrounded',
  'apicidin': 'MESH:C102351',
  'apparent photosynthetic conductance': 'ungrounded',
  'aqueous protein concentration': 'ungrounded',
  'argon plasma coagulation': 'MESH:D057908',
  'article processing charge': 'ungrounded',
  'associated of pathology chairs': 'ungrounded',
  'atherosclerotic plaque characteristics': 'ungrounded',
  'atrial premature complex': 'MESH:D018880',
  'autologous platelet concentration': 'ungrounded',
  'photo absorber conjugate': 'NCIT:C66919',
  'photon absorber conjugate': 'NCIT:C66919'},
 {'HGNC:9451': 'PROC',
  'HGNC:583': 'APC',
  'DOID:DOID:1793': 'pancreatic cancer',
  'MESH:D000938': 'Antigen-Presenting Cells',
  'CHEBI:CHEBI:52589': '1-lyso-2-alkylphosphatidylcholine',
  'MESH:C002497': 'allophycocyanin',
  'MESH:C523074': 'allylpyrocatechol',
  'MESH:D003170': 'Complement Pathway, Alternative',
  'MESH:D058225': 'Plaque, Amyloid',
  'GO:GO:0005680': 'anaphase-promoting complex',
  'MESH:D066195': 'Piriform Cortex',
  'NCIT:C66919': 'Photoimmunotherapy',
  'CHEBI:CHEBI:46633': 'carbapenems',
  'CHEBI:CHEBI:2766': 'aphidicolin',
  'MESH:C102351': 'apicidin',
  'MESH:D057908': 'Argon Plasma Coagulation',
  'MESH:D018880': 'Atrial Premature Complexes'},
 ['CHEBI:CHEBI:2766',
  'CHEBI:CHEBI:52589',
  'DOID:DOID:1793',
  'GO:GO:0005680',
  'HGNC:583',
  'HGNC:9451',
  'MESH:D000938',
  'MESH:D018880',
  'MESH:D057908',
  'MESH:D066195',
  'NCIT:C66919']]

In [11]:
# Longforms to leave out of grounding_dict when it is built; none here.
excluded_longforms = []

In [12]:
# For each shortform, map its own mined longforms to their curated grounding,
# skipping longforms that have no entry in grounding_map or that are listed
# in excluded_longforms.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in longform_rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs with exist_ok=True also creates a missing parent 'results'
# directory and does not fail if the directory appears between a check and
# the creation, unlike an os.path.exists() test followed by os.mkdir().
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [13]:
# Entity identifiers whose PMIDs are fetched below to add extra labeled
# texts. Each entry is split on the first ':' before being passed to
# get_pmids_for_entity. Empty for this model.
additional_entities = []

In [14]:
# Maps an entity label to the agent texts used below to fetch extra training
# texts labeled with that entity. Empty for this model.
unambiguous_agent_texts = {}

In [15]:
# Build the labeled corpus from (text, pmid) pairs of every collected text.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(zip(all_texts.values(), all_texts.keys()))

# Collect, per label, the ids of the corpus entries carrying that label.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# For each additional entity, the set of PMIDs returned by
# get_pmids_for_entity for the two parts of the identifier (split on the
# first ':') with major_topic=True.
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in additional_entities
}

In [19]:
# Number of shared PMIDs for every ordered pair of additional entities
# (each entity is also paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [20]:
# Number of PMIDs shared between each corpus label and each additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [21]:
# Display the entity-vs-entity PMID overlaps.
intersection1

[('HGNC:1848', 'HGNC:1848', 86)]

In [22]:
# Display the label-vs-entity PMID overlaps.
intersection2

[('MESH:D018893', 'HGNC:1848', 0),
 ('HGNC:1848', 'HGNC:1848', 1),
 ('ungrounded', 'HGNC:1848', 0),
 ('CHEBI:CHEBI:64198', 'HGNC:1848', 0),
 ('MESH:D019164', 'HGNC:1848', 0),
 ('MESH:C059416', 'HGNC:1848', 0),
 ('CHEBI:CHEBI:15710', 'HGNC:1848', 0)]

[('HGNC:7321', ['Musculin', 'musculin'])]

In [23]:
# Extend the corpus with extra labeled texts.
# NOTE: this cell appends to `corpus`, so running it twice adds the same
# entries twice; rebuild `corpus` before re-running it.

# 1) Texts found through unambiguous agent texts, labeled with their entity.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip PMIDs already collected for the shortforms or already used
        # for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# 2) Texts found through the additional entities' PMIDs.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample WITHOUT replacement. random.choices draws with replacement,
        # so it returns repeated PMIDs and fewer than 10000 distinct ones.
        new_pmids = random.sample(new_pmids, 10000)
    text_dict = get_plaintexts_for_pmids(new_pmids)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [16]:
%%capture

# Cross-validate a classifier on the labeled corpus.
# random_state is fixed; the grid holds a single parameter combination.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
# corpus entries are (text, label, id) triples. Note this rebinds the
# notebook-level name `pmids` to the tuple of corpus ids.
texts, labels, pmids = zip(*corpus)
# 5-fold cross-validation, run with n_jobs=5.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-06 04:09:01] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-06 04:12:55] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9694963847925113 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [16]:
# Display the classifier's statistics: label distribution plus overall and
# per-label f1/precision/recall.
classifier.stats

{'label_distribution': {'HGNC:9451': 982,
  'MESH:D000938': 1225,
  'CHEBI:CHEBI:2766': 18,
  'HGNC:583': 1427,
  'CHEBI:CHEBI:52589': 13,
  'MESH:D003170': 8,
  'GO:GO:0005680': 520,
  'ungrounded': 116,
  'MESH:D057908': 31,
  'MESH:C002497': 79,
  'MESH:D066195': 16,
  'DOID:DOID:1793': 12,
  'MESH:C523074': 11,
  'MESH:C102351': 2,
  'NCIT:C66919': 19,
  'CHEBI:CHEBI:46633': 1,
  'MESH:D058225': 1},
 'f1': {'mean': 0.954099, 'std': 0.004035},
 'precision': {'mean': 0.949026, 'std': 0.004176},
 'recall': {'mean': 0.961406, 'std': 0.005049},
 'MESH:C523074': {'f1': {'mean': 0.1, 'std': 0.2},
  'pr': {'mean': 0.1, 'std': 0.2},
  'rc': {'mean': 0.1, 'std': 0.2}},
 'GO:GO:0005680': {'f1': {'mean': 0.968409, 'std': 0.008393},
  'pr': {'mean': 0.973077, 'std': 0.012756},
  'rc': {'mean': 0.963927, 'std': 0.010733}},
 'ungrounded': {'f1': {'mean': 0.884878, 'std': 0.027763},
  'pr': {'mean': 0.836594, 'std': 0.062508},
  'rc': {'mean': 0.942807, 'std': 0.01311}},
 'MESH:D066195': {'f1': {'

In [17]:
# Combine the classifier, the per-shortform grounding dict and the
# grounding -> name map into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [19]:
# Dump the disambiguator under model_name at results_path
# (presumably writes the model files into that directory -- TODO confirm).
disamb.dump(model_name, results_path)

In [18]:
# Print the disambiguator summary: the groundings it produces and
# class-level metrics. print is used because info() is printed as plain text.
print(disamb.info())

Disambiguation model for APC, and APCs

Produces the disambiguations:
	1-lyso-2-alkylphosphatidylcholine*	CHEBI:CHEBI:52589
	APC*	HGNC:583
	Antigen-Presenting Cells*	MESH:D000938
	Argon Plasma Coagulation*	MESH:D057908
	Atrial Premature Complexes*	MESH:D018880
	Complement Pathway, Alternative	MESH:D003170
	PROC*	HGNC:9451
	Photoimmunotherapy*	NCIT:C66919
	Piriform Cortex*	MESH:D066195
	Plaque, Amyloid	MESH:D058225
	allophycocyanin	MESH:C002497
	allylpyrocatechol	MESH:C523074
	anaphase-promoting complex*	GO:GO:0005680
	aphidicolin*	CHEBI:CHEBI:2766
	apicidin	MESH:C102351
	carbapenems	CHEBI:CHEBI:46633
	pancreatic cancer*	DOID:DOID:1793

Class level metrics:
--------------------
Grounding                        	Count	F1     
         Antigen-Presenting Cells*	4026	0.97074
                              APC*	1429	0.97738
                             PROC*	 982	0.97178
       anaphase-promoting complex*	 520	0.96722
                       Ungrounded	 143	0.85879
                  allophyco

In [20]:
# Hand the disambiguator to model_to_s3 (from adeft_indra.model_building.s3);
# presumably uploads the model to S3 -- TODO confirm credentials/bucket setup.
model_to_s3(disamb)