In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator, load_disambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
adeft_grounder = AdeftGrounder()

In [3]:
shortforms = ['PC']
model_name = ':'.join(sorted(escape_filename(shortform) for shortform in shortforms))
results_path = os.path.abspath(os.path.join('../..', 'results', model_name))

In [4]:
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms
    
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [6]:
list(zip(longforms, counts))

[('phosphatidylcholine', 2114),
 ('pancreatic cancer', 863),
 ('prostate cancer', 847),
 ('protein c', 295),
 ('preconditioning', 284),
 ('pyruvate carboxylase', 208),
 ('phosphocholine', 172),
 ('plasma cells', 156),
 ('purkinje cells', 152),
 ('protein carbonyl', 134),
 ('phosphorylcholine', 123),
 ('peritoneal carcinomatosis', 65),
 ('principal component', 61),
 ('positive control', 51),
 ('phosphatidyl choline', 51),
 ('platelet concentrations', 49),
 ('polycarbonate', 45),
 ('palliative care', 43),
 ('peritoneal cells', 42),
 ('piriform cortex', 41),
 ('phycocyanin', 40),
 ('pericytes', 37),
 ('portland cement', 35),
 ('pancreatic carcinoma', 34),
 ('plastocyanin', 31),
 ('primary care', 30),
 ('phytochelatin', 30),
 ('phase contrast', 29),
 ('polycomb', 27),
 ('primary cilia', 25),
 ('precore', 25),
 ('phosphocitrate', 25),
 ('paneth cells', 25),
 ('protein content', 23),
 ('proprotein convertase', 23),
 ('pyramidal cells', 23),
 ('primary cilium', 21),
 ('protein corona', 21),
 

In [7]:
try:
    disamb = load_disambiguator(shortforms[0])
    for shortform, gm in disamb.grounding_dict.items():
        for longform, grounding in gm.items():
            grounding_map[longform] = grounding
    for grounding, name in disamb.names.items():
        names[grounding] = name
    pos_labels = disamb.pos_labels
except Exception:
    pass

In [8]:
names

{'CHEBI:CHEBI:64482': 'phosphatidylcholine',
 'DOID:DOID:1793': 'pancreatic cancer',
 'MESH:D011471': 'Prostatic Neoplasms',
 'CHEBI:CHEBI:82131': 'Protein C',
 'CHEBI:CHEBI:18132': 'phosphocholine',
 'MESH:D010950': 'Plasma Cells',
 'MESH:D011689': 'Purkinje Cells',
 'EFO:0001461': 'control',
 'MESH:D010713': 'Phosphatidylcholines',
 'MESH:C005506': 'polycarbonate',
 'MESH:D010166': 'Palliative Care',
 'MESH:D002477': 'Cells',
 'MESH:D066195': 'Piriform Cortex',
 'MESH:D010798': 'Phycocyanin',
 'MESH:D020286': 'Pericytes',
 'MESH:C562463': 'Pancreatic Carcinoma',
 'MESH:D010970': 'Plastocyanin',
 'MESH:D011320': 'Primary Health Care',
 'CHEBI:CHEBI:60836': 'phytochelatin',
 'MESH:C014148': 'phosphocitrate',
 'MESH:D019879': 'Paneth Cells',
 'MESH:D017966': 'Pyramidal Cells',
 'MESH:D000066970': 'Protein Corona',
 'MESH:D053549': 'Pachyonychia Congenita',
 'MESH:D013234': 'Stem Cells',
 'HP:HP:0006725': 'Pancreatic adenocarcinoma',
 'MESH:C045990': 'propylene carbonate',
 'DOID:DOID:10

In [9]:
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8891)

INFO: [2020-11-11 02:43:04] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.4/bio_ontology.pkl


In [10]:
result = [grounding_map, names, pos_labels]

In [11]:
result

[{'ipc': 'ungrounded',
  'p cresol': 'ungrounded',
  'pachyonychia congenita': 'ungrounded',
  'pacinian': 'ungrounded',
  'paclitaxel carboplatin': 'ungrounded',
  'pain catastrophizing': 'ungrounded',
  'pair center': 'ungrounded',
  'pair complex': 'ungrounded',
  'palliative care': 'MESH:D010166',
  'palmitoyl carnitine': 'ungrounded',
  'palmitoyl l carnitine': 'CHEBI:CHEBI:17490',
  'pancreatic adenocarcinoma': 'DOID:DOID:1793',
  'pancreatic cancer': 'DOID:DOID:1793',
  'pancreatic carcinoma': 'DOID:1793',
  'pancreatic ductal adenocarcinoma': 'DOID:DOID:1793',
  'paneth cells': 'MESH:D019879',
  'paracetamol': 'CHEBI:CHEBI:46195',
  'parallel coordinates': 'ungrounded',
  'paramyotonia congenita': 'MESH:D020967',
  'parathyroid carcinoma': 'DOID:DOID:1540',
  'parenchymal cells': 'ungrounded',
  'paricalcitol': 'CHEBI:CHEBI:7931',
  'parietal cells': 'ungrounded',
  'parietal cortex': 'MESH:D010296',
  'partial colectomy': 'MESH:D003082',
  'partial cystectomy': 'MESH:D015653',

In [24]:
grounding_map, names, pos_labels = [{'ipc': 'ungrounded',
  'p cresol': 'ungrounded',
  'pachyonychia congenita': 'ungrounded',
  'pacinian': 'ungrounded',
  'paclitaxel carboplatin': 'ungrounded',
  'pain catastrophizing': 'ungrounded',
  'pair center': 'ungrounded',
  'pair complex': 'ungrounded',
  'palliative care': 'MESH:D010166',
  'palmitoyl carnitine': 'ungrounded',
  'palmitoyl l carnitine': 'CHEBI:CHEBI:17490',
  'pancreatic adenocarcinoma': 'DOID:DOID:1793',
  'pancreatic cancer': 'DOID:DOID:1793',
  'pancreatic carcinoma': 'DOID:1793',
  'pancreatic ductal adenocarcinoma': 'DOID:DOID:1793',
  'paneth cells': 'MESH:D019879',
  'paracetamol': 'CHEBI:CHEBI:46195',
  'parallel coordinates': 'ungrounded',
  'paramyotonia congenita': 'MESH:D020967',
  'parathyroid carcinoma': 'DOID:DOID:1540',
  'parenchymal cells': 'ungrounded',
  'paricalcitol': 'CHEBI:CHEBI:7931',
  'parietal cells': 'ungrounded',
  'parietal cortex': 'MESH:D010296',
  'partial colectomy': 'MESH:D003082',
  'partial cystectomy': 'MESH:D015653',
  'participation coefficient': 'ungrounded',
  'partition complex': 'ungrounded',
  'partner choice': 'ungrounded',
  'parvocellular': 'ungrounded',
  'parylene c': 'ungrounded',
  'pavement cells': 'ungrounded',
  'pc': 'ungrounded',
  'peer counseling': 'ungrounded',
  'pelvic control': 'ungrounded',
  'penile circumference': 'ungrounded',
  'pentamer complex': 'ungrounded',
  'pentameric complex': 'ungrounded',
  'pentose cycle': 'ungrounded',
  'pepc': 'ungrounded',
  'perceived competence': 'ungrounded',
  'perceived control': 'ungrounded',
  'percutaneous cholecystostomy': 'MESH:D002767',
  'pericytes': 'ungrounded',
  'perioperative chemotherapy': 'ungrounded',
  'periosteal circumference': 'ungrounded',
  'perirhinal cortex': 'MESH:D000071039',
  'peritoneal carcinomatosis': 'ungrounded',
  'peritoneal carcinosis': 'ungrounded',
  'peritoneal cavity': 'MESH:D010529',
  'peritoneal cells': 'ungrounded',
  'peritubular cells': 'ungrounded',
  'perseverative cognition': 'ungrounded',
  'personal computer': 'MESH:D003201',
  'petroleum coke': 'MESH:D003077',
  'pharmaceutical care': 'ungrounded',
  'pharmacological chaperone': 'ungrounded',
  'phase contrast': 'ungrounded',
  'phase cycle': 'ungrounded',
  'phatidylcholine': 'CHEBI:CHEBI:64482',
  'phellodendri cortex': 'ungrounded',
  'phenolic compounds': 'ungrounded',
  'pheochromocytoma': 'MESH:D010673',
  'pheochromocytoma crisis': 'MESH:D010673',
  'phospatidylcholine': 'CHEBI:CHEBI:64482',
  'phosphatidyicholine': 'CHEBI:CHEBI:64482',
  'phosphatidyl choline': 'CHEBI:CHEBI:64482',
  'phosphatidylcholine': 'CHEBI:CHEBI:64482',
  'phosphocellulose': 'ungrounded',
  'phosphocholine': 'CHEBI:CHEBI:18132',
  'phosphocitrate': 'MESH:C014148',
  'phosphocreatine': 'CHEBI:CHEBI:17287',
  'phospholipids complex': 'ungrounded',
  'phosphoryl choline': 'CHEBI:CHEBI:18132',
  'phosphorylcholine': 'CHEBI:CHEBI:18132',
  'photocatalytic': 'ungrounded',
  'photocoagulation': 'MESH:D008028',
  'photonic crystal': 'ungrounded',
  'phthalocyanine': 'CHEBI:CHEBI:34921',
  'phycocyanin': 'MESH:D010798',
  'physical capacity': 'ungrounded',
  'physical castration': 'MESH:D002369',
  'physical concerns': 'ungrounded',
  'physical cooling': 'ungrounded',
  'phytochelatin': 'MESH:D054811',
  'picryl chloride': 'CHEBI:CHEBI:53053',
  'pilocarpine': 'CHEBI:CHEBI:39462',
  'piriform cortex': 'MESH:D066195',
  'pituitary carcinoma': 'ungrounded',
  'place conditioning': 'ungrounded',
  'plasma cells': 'MESH:D010950',
  'plasma coloration': 'ungrounded',
  'plasma concentrations': 'ungrounded',
  'plasma cupping': 'ungrounded',
  'plasmacytoma': 'MESH:D010954',
  'plasticity compression': 'ungrounded',
  'plastocyanin': 'IP:IPR002387',
  'platelet component': 'ungrounded',
  'platelet concentrations': 'ungrounded',
  'platelet count': 'ungrounded',
  'pleomorphic carcinoma': 'DOID:DOID:5662',
  'pneumocephalus': 'MESH:D011007',
  'pneumocystis carinii': 'MESH:D045363',
  'podocalyxin': 'MESH:C001643',
  'policosanols': 'ungrounded',
  'polycarbonate': 'ungrounded',
  'polycomb': 'ungrounded',
  'polycomb protein': 'ungrounded',
  'polygonum cuspidatum': 'MESH:D045468',
  'polymerase complex': 'ungrounded',
  'polymeric conjugate': 'ungrounded',
  'polysaccharide': 'CHEBI:CHEBI:18154',
  'polysome content': 'ungrounded',
  'pontine cistern': 'ungrounded',
  'portland cement': 'ungrounded',
  'positive control': 'ungrounded',
  'positive control group': 'ungrounded',
  'post challenge': 'ungrounded',
  'post conditioning': 'ungrounded',
  'postchallenge': 'ungrounded',
  'postconditioning': 'ungrounded',
  'posterior capsulotomy': 'MESH:D064727',
  'posterior chamber': 'ungrounded',
  'posterior circulating': 'ungrounded',
  'posterior commissure': 'ungrounded',
  'posterior crossbite': 'ungrounded',
  'potassium citrate': 'MESH:D019357',
  'power chain': 'ungrounded',
  'praeruptorin c': 'MESH:C058808',
  'pre conditioning': 'ungrounded',
  'pre core': 'ungrounded',
  'precardiac cells': 'ungrounded',
  'preconditioning': 'ungrounded',
  'precore': 'ungrounded',
  'predicted component measured': 'ungrounded',
  'prednicarbate': 'CHEBI:CHEBI:135791',
  'prefrontal cortex': 'MESH:D017397',
  'preventive chemotherapy': 'ungrounded',
  'primary care': 'MESH:D011320',
  'primary cilia': 'ungrounded',
  'primary cilium': 'ungrounded',
  'primary closure': 'ungrounded',
  'principal': 'ungrounded',
  'principal cells': 'ungrounded',
  'principal component': 'ungrounded',
  'principle component': 'ungrounded',
  'proanthocyanidins': 'CHEBI:CHEBI:26267',
  'probiotic combination': 'ungrounded',
  'procapsid': 'ungrounded',
  'procerebral': 'ungrounded',
  'process cheeses': 'ungrounded',
  'prochymosin': 'MESH:C032443',
  'procollagen': 'MESH:D011347',
  'procyanidins': 'ungrounded',
  'procyclic': 'ungrounded',
  'progenitor cells': 'MESH:D013234',
  'prohormone convertase': 'ungrounded',
  'proliferative cholangitis': 'MESH:D002761',
  'propargyl carbonate': 'ungrounded',
  'proprotein convertase': 'MESH:D043484',
  'propylene carbonate': 'ungrounded',
  'prosencephalon': 'MESH:D016548',
  'prostacyclin': 'CHEBI:CHEBI:15552',
  'prostate cancer': 'MESH:D011471',
  'prostate carcinoma': 'MESH:D011471',
  'protein c': 'HGNC:9451',
  'protein carbonyl': 'MESH:D050050',
  'protein coding': 'ungrounded',
  'protein content': 'ungrounded',
  'protein convertase': 'ungrounded',
  'protein corona': 'MESH:D000066970',
  'provocation concentrations': 'ungrounded',
  'proximal colon': 'ungrounded',
  'pseudotumor cerebri': 'MESH:D011559',
  'psoralea corylifolia l': 'ungrounded',
  'psychological contract': 'ungrounded',
  'pulmonary contusion': 'ungrounded',
  'pulp colour': 'ungrounded',
  'pulsed current': 'ungrounded',
  'punicalagin': 'ungrounded',
  'purkinje cells': 'MESH:D011689',
  'pyloric caeca': 'ungrounded',
  'pyramidal cells': 'MESH:D017966',
  'pyridyl cholesterol': 'ungrounded',
  'pyrroline 5 carboxylation': 'ungrounded',
  'pyruvate carboxylase': 'HGNC:8636',
  'pyruvate carboxylation': 'HGNC:8636'},
 {'MESH:D010166': 'Palliative Care',
  'CHEBI:CHEBI:17490': 'O-palmitoyl-L-carnitine',
  'DOID:DOID:1793': 'pancreatic cancer',
  'DOID:1793': 'pancreatic cancer',
  'MESH:D019879': 'Paneth Cells',
  'CHEBI:CHEBI:46195': 'paracetamol',
  'MESH:D020967': 'Myotonic Disorders',
  'DOID:DOID:1540': 'parathyroid carcinoma',
  'CHEBI:CHEBI:7931': 'paricalcitol',
  'MESH:D010296': 'Parietal Lobe',
  'MESH:D003082': 'Colectomy',
  'MESH:D015653': 'Cystectomy',
  'MESH:D002767': 'Cholecystostomy',
  'MESH:D000071039': 'Perirhinal Cortex',
  'MESH:D010529': 'Peritoneal Cavity',
  'MESH:D003201': 'Computers',
  'MESH:D003077': 'Coke',
  'MESH:D010673': 'Pheochromocytoma',
  'CHEBI:CHEBI:64482': 'phosphatidylcholine',
  'CHEBI:CHEBI:18132': 'phosphocholine',
  'MESH:C014148': 'phosphocitrate',
  'CHEBI:CHEBI:17287': 'N-phosphocreatine',
  'MESH:D008028': 'Light Coagulation',
  'CHEBI:CHEBI:34921': 'phthalocyanine',
  'MESH:D010798': 'Phycocyanin',
  'MESH:D002369': 'Castration',
  'MESH:D054811': 'Phytochelatins',
  'CHEBI:CHEBI:53053': '1-chloro-2,4,6-trinitrobenzene',
  'CHEBI:CHEBI:39462': 'pilocarpine',
  'MESH:D066195': 'Piriform Cortex',
  'MESH:D010950': 'Plasma Cells',
  'MESH:D010954': 'Plasmacytoma',
  'IP:IPR002387': 'Plastocyanin',
  'DOID:DOID:5662': 'pleomorphic carcinoma',
  'MESH:D011007': 'Pneumocephalus',
  'MESH:D045363': 'Pneumocystis carinii',
  'MESH:C001643': 'podocalyxin',
  'MESH:D045468': 'Fallopia japonica',
  'CHEBI:CHEBI:18154': 'polysaccharide',
  'MESH:D064727': 'Posterior Capsulotomy',
  'MESH:D019357': 'Potassium Citrate',
  'MESH:C058808': 'praeruptorin C',
  'CHEBI:CHEBI:135791': 'prednicarbate',
  'MESH:D017397': 'Prefrontal Cortex',
  'MESH:D011320': 'Primary Health Care',
  'CHEBI:CHEBI:26267': 'proanthocyanidin',
  'MESH:C032443': 'prorennin',
  'MESH:D011347': 'Procollagen',
  'MESH:D013234': 'Stem Cells',
  'MESH:D002761': 'Cholangitis',
  'MESH:D043484': 'Proprotein Convertases',
  'MESH:D016548': 'Prosencephalon',
  'CHEBI:CHEBI:15552': 'prostaglandin I2',
  'MESH:D011471': 'Prostatic Neoplasms',
  'HGNC:9451': 'PROC',
  'MESH:D050050': 'Protein Carbonylation',
  'MESH:D000066970': 'Protein Corona',
  'MESH:D011559': 'Pseudotumor Cerebri',
  'MESH:D011689': 'Purkinje Cells',
  'MESH:D017966': 'Pyramidal Cells',
  'HGNC:8636': 'PC'},
 ['CHEBI:CHEBI:64482', 'HGNC:8636', 'HGNC:9451']]

In [25]:
excluded_longforms = ['pc', 'ipc']

In [26]:
grounding_dict = {shortform: {longform: grounding_map[longform] 
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

if not os.path.exists(results_path):
    os.mkdir(results_path)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [27]:
additional_entities = {}

In [28]:
unambiguous_agent_texts = {}

In [29]:
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts((text, pmid) for pmid, text in all_texts.items())
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

entity_pmid_map = {entity: set(get_pmids_for_entity(*entity.split(':', maxsplit=1),
                                                major_topic=True))for entity in additional_entities}

In [18]:
intersection1 = []
for entity1, pmids1 in entity_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection1.append((entity1, entity2, len(pmids1 & pmids2)))

In [19]:
intersection2 = []
for entity1, pmids1 in agent_text_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection2.append((entity1, entity2, len(set(pmids1) & pmids2)))

In [20]:
intersection1

[('HGNC:10967', 'HGNC:10967', 98)]

In [21]:
intersection2

[('GO:GO:0001837', 'HGNC:10967', 0),
 ('ungrounded', 'HGNC:10967', 0),
 ('HGNC:10967', 'HGNC:10967', 0),
 ('MESH:D055032', 'HGNC:10967', 0)]

In [22]:
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts[1]:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)
        
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        new_pmids = random.choices(new_pmids, k=10000)
    _, contains = additional_entities[entity]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [23]:
names.update({key: value[0] for key, value in additional_entities.items()})
names.update({key: value[0] for key, value in unambiguous_agent_texts.items()})
pos_labels = list(set(pos_labels) | additional_entities.keys() |
                  unambiguous_agent_texts.keys())

In [30]:
%%capture

classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-11-11 03:14:20] /adeft/PP/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-11-11 03:21:14] /adeft/PP/adeft/adeft/modeling/classify.py - Best f1 score of 0.9317006719712382 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [31]:
classifier.stats

{'label_distribution': {'CHEBI:CHEBI:18132': 243,
  'HGNC:8636': 157,
  'CHEBI:CHEBI:64482': 1705,
  'ungrounded': 1015,
  'MESH:D019879': 16,
  'HGNC:9451': 251,
  'MESH:D017966': 17,
  'CHEBI:CHEBI:46195': 7,
  'MESH:D066195': 25,
  'MESH:D011689': 111,
  'CHEBI:CHEBI:34921': 4,
  'DOID:DOID:1793': 580,
  'MESH:D045363': 10,
  'MESH:C032443': 2,
  'IP:IPR002387': 26,
  'MESH:D010954': 3,
  'CHEBI:CHEBI:53053': 6,
  'MESH:D010798': 35,
  'MESH:C014148': 17,
  'CHEBI:CHEBI:15552': 3,
  'CHEBI:CHEBI:17287': 6,
  'MESH:D010950': 122,
  'MESH:D050050': 100,
  'MESH:D011471': 614,
  'MESH:D043484': 19,
  'MESH:D020967': 5,
  'CHEBI:CHEBI:39462': 7,
  'MESH:D054811': 29,
  'MESH:D011559': 3,
  'MESH:D045468': 4,
  'CHEBI:CHEBI:18154': 2,
  'MESH:D010673': 7,
  'CHEBI:CHEBI:135791': 4,
  'MESH:D011347': 2,
  'MESH:D000071039': 3,
  'MESH:C001643': 8,
  'MESH:D019357': 3,
  'MESH:D003201': 9,
  'MESH:D002767': 9,
  'MESH:D011320': 20,
  'MESH:D013234': 14,
  'MESH:D010529': 7,
  'DOID:DOID:56

In [32]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [33]:
disamb.dump(model_name, results_path)

In [34]:
print(disamb.info())

Disambiguation model for PC

Produces the disambiguations:
	1-chloro-2,4,6-trinitrobenzene	CHEBI:CHEBI:53053
	Castration	MESH:D002369
	Cholangitis	MESH:D002761
	Cholecystostomy	MESH:D002767
	Coke	MESH:D003077
	Colectomy	MESH:D003082
	Computers	MESH:D003201
	Cystectomy	MESH:D015653
	Fallopia japonica	MESH:D045468
	Light Coagulation	MESH:D008028
	Myotonic Disorders	MESH:D020967
	N-phosphocreatine	CHEBI:CHEBI:17287
	O-palmitoyl-L-carnitine	CHEBI:CHEBI:17490
	PC*	HGNC:8636
	PROC*	HGNC:9451
	Palliative Care	MESH:D010166
	Paneth Cells	MESH:D019879
	Parietal Lobe	MESH:D010296
	Perirhinal Cortex	MESH:D000071039
	Peritoneal Cavity	MESH:D010529
	Pheochromocytoma	MESH:D010673
	Phycocyanin	MESH:D010798
	Phytochelatins	MESH:D054811
	Piriform Cortex	MESH:D066195
	Plasma Cells	MESH:D010950
	Plasmacytoma	MESH:D010954
	Plastocyanin	IP:IPR002387
	Pneumocephalus	MESH:D011007
	Pneumocystis carinii	MESH:D045363
	Posterior Capsulotomy	MESH:D064727
	Potassium Citrate	MESH:D019357
	Prefrontal Cortex	MESH:D017

In [35]:
model_to_s3(disamb)

In [30]:
preds = [disamb.disambiguate(text) for text in all_texts.values()]

In [31]:
texts = [text for pred, text in zip(preds, all_texts.values()) if pred[0] == 'HGNC:10967']

In [36]:
texts[3]

'The non-neuronal monoamine transporters (OCT1, OCT2, EMT, and PMAT) play a key role in the clearance of monoamines from extracellular compartments. In a previous report we described endometrial distribution and cyclic variation of the vesicular monoamine transporter (VMAT2) mRNA and the neuronal norepinephrine transporter (NET) mRNA. In the present study we used in situ hybridization, real-time PCR and immunohistochemistry to reveal tissue distribution and cyclic variation of mRNA for the non-neuronal monoamine transporters in the human endometrium and early pregnancy decidua. We found that non-neuronal monoamine transporters are predominantly expressed in the stroma. The plasma membrane monoamine transporter (PMAT) mRNA expression peaked in the proliferative phase, whereas the extra-neuronal monoamine transporter (EMT) mRNA expression peaked in the secretory phase. The organic cation transporter 2 (OCT2) mRNA expression was exclusively detected in few scattered stromal cells and OCT1