In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator, load_disambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used further down to propose a grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) this model disambiguates.
shortforms = ['PS']

# The model name is the sorted, filename-escaped shortforms joined with ':'.
escaped_shortforms = sorted(map(escape_filename, shortforms))
model_name = ':'.join(escaped_shortforms)

# Results are written to <two levels up>/results/<model_name>.
results_dir = os.path.join('../..', 'results', model_name)
results_path = os.path.abspath(results_dir)

In [4]:
# Mine candidate longforms for every shortform from the texts that mention it.
miners = {}
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    text_dict = {
        pmid: text
        for pmid, text in get_plaintexts_for_pmids(pmids, contains=shortforms).items()
        if len(text) > 5  # skip texts of five characters or fewer
    }
    miner = AdeftMiner(shortform)
    miners[shortform] = miner
    miner.process_texts(text_dict.values())
    all_texts.update(text_dict)

# Per shortform, keep only the longforms whose count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longform_dict[shortform] = [
        (longform, count, score)
        for longform, count, score in miners[shortform].get_longforms()
        if count * score > 2
    ]

# Add up the counts of each longform across all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    row_counts = {longform: count for longform, count, score in longform_rows}
    combined_longforms.update(row_counts)

# Take the first grounding the grounder proposes for each longform, if any.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if not groundings:
        continue
    best = groundings[0]
    grounding = best['grounding']
    grounding_map[longform] = grounding
    names[grounding] = best['name']

# Longforms and their counts as parallel tuples, most frequent first.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [5]:
# Show the mined longforms paired with their counts, most frequent first.
list(zip(longforms, counts))

[('phosphatidylserine', 2393),
 ('photosensitizer', 307),
 ('polystyrene', 213),
 ('population spike', 188),
 ('polysaccharide', 153),
 ('presenilin', 139),
 ('paradoxical sleep', 124),
 ('prenatal stress', 112),
 ('protein s', 111),
 ('phosphatidyl serine', 94),
 ('pregnenolone sulfate', 91),
 ('phosphorothioate', 89),
 ('performance status', 78),
 ('persulfate', 70),
 ('peak shortening', 66),
 ('pulmonary surfactant', 48),
 ('plant sterols', 44),
 ('protamine sulfate', 43),
 ('photosystem', 39),
 ('phytosterols', 39),
 ('surface area product', 37),
 ('primitive streak', 36),
 ('psychological stress', 36),
 ('protein synthesis', 35),
 ('propensity score', 32),
 ('porous silicon', 30),
 ('panobinostat', 28),
 ('phytosiderophores', 26),
 ('pressure support', 19),
 ('particle size', 18),
 ('phosphatidyl l serine', 16),
 ('permeability surface area', 15),
 ('primary septum', 14),
 ('prosaposin', 14),
 ('pregnenolone sulphate', 13),
 ('psoriasis', 13),
 ('proteus syndrome', 13),
 ('polysul

In [6]:
# Seed the curation with the groundings, names and positive labels of a
# previously built disambiguator for the first shortform, when one can be loaded.
try:
    disamb = load_disambiguator(shortforms[0])
    for shortform, gm in disamb.grounding_dict.items():
        for longform, grounding in gm.items():
            grounding_map[longform] = grounding
    for grounding, name in disamb.names.items():
        names[grounding] = name
    pos_labels = disamb.pos_labels
except Exception as err:
    # Still best effort (the notebook carries on with the automatic groundings),
    # but report the failure instead of hiding it so real errors are visible.
    print(f'Could not reuse an existing disambiguator for {shortforms[0]!r}: {err!r}')

In [7]:
# Inspect the grounding -> name mapping collected so far.
names

{'CHEBI:CHEBI:18303': 'phosphatidyl-L-serine',
 'CHEBI:CHEBI:53276': 'poly(styrene)',
 'CHEBI:CHEBI:18154': 'polysaccharide',
 'MESH:D053763': 'Presenilins',
 'MESH:D012895': 'Sleep, REM',
 'HGNC:9456': 'PROS1',
 'CHEBI:CHEBI:35420': 'pregnenolone sulfate',
 'MESH:D011663': 'Pulmonary Surfactants',
 'CHEBI:CHEBI:26125': 'phytosterols',
 'CHEBI:CHEBI:16189': 'sulfate',
 'GO:GO:0009521': 'photosystem',
 'MESH:D054240': 'Primitive Streak',
 'MESH:D013315': 'Stress, Psychological',
 'GO:GO:0006412': 'translation',
 'CHEBI:CHEBI:27573': 'silicon atom',
 'CHEBI:CHEBI:85990': 'panobinostat',
 'CHEBI:CHEBI:38155': 'phytosiderophore',
 'MESH:D010316': 'Particle Size',
 'HGNC:9498': 'PSAP',
 'MESH:D011565': 'Psoriasis',
 'MESH:D016715': 'Proteus Syndrome',
 'CHEBI:CHEBI:53281': 'poly(sulfone) macromolecule',
 'MESH:D065289': 'Pedicle Screws',
 'CHEBI:CHEBI:28017': 'starch',
 'MESH:D013502': 'General Surgery',
 'HGNC:10760': 'SET',
 'MESH:C566822': 'Perry Syndrome',
 'MESH:D013090': 'Spermatocyte

In [8]:
# Manual curation step: ground_with_gui receives the longforms, their counts and
# the groundings gathered so far; whatever it returns replaces grounding_map,
# names and pos_labels.
# NOTE(review): no_browser=True / port=8891 presumably serve the GUI on that port
# without opening a local browser — confirm against the adeft GUI docs.
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8891)

INFO: [2020-10-31 03:15:03] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.4/bio_ontology.pkl
ERROR: [2020-10-31 03:36:36] adeft.gui.ground - Exception on /ground_add [POST]
Traceback (most recent call last):
  File "/adeft/.virtualenvs/adeft_env/lib/python3.8/site-packages/flask/app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "/adeft/.virtualenvs/adeft_env/lib/python3.8/site-packages/flask/app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/adeft/.virtualenvs/adeft_env/lib/python3.8/site-packages/flask/app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/adeft/.virtualenvs/adeft_env/lib/python3.8/site-packages/flask/_compat.py", line 39, in reraise
    raise value
  File "/adeft/.virtualenvs/adeft_env/lib/python3.8/site-packages/flask/app.py", line 1950, in full_dispatch_request
    rv = self.dispatch_request()
  F

In [9]:
# Bundle the curated mappings into one list so they can be displayed together.
result = [grounding_map, names, pos_labels]

In [10]:
# Display the curated [grounding_map, names, pos_labels] bundle.
result

[{'p shermanii': 'ungrounded',
  'pachytene spermatocytes': 'ungrounded',
  'packaging signal': 'ungrounded',
  'paeonia suffruticosa': 'ungrounded',
  'pancreatic steatosis': 'MESH:D005234',
  'pancreatic stents': 'MESH:D015607',
  'panobinostat': 'CHEBI:CHEBI:85990',
  'paper sludge': 'ungrounded',
  'paradoxical sleep': 'ungrounded',
  'parenteral support': 'ungrounded',
  'parenting stress': 'ungrounded',
  'parkinsonian syndrome': 'MESH:D020734',
  'particle size': 'MESH:D010316',
  'partner services': 'ungrounded',
  'passive smoking': 'ungrounded',
  'passive stretching': 'ungrounded',
  'patients serum': 'MESH:D044967',
  'patients supervision': 'ungrounded',
  'patrinia scabiosaefolia': 'ungrounded',
  'pea starch': 'CHEBI:CHEBI:28017',
  'peak shortening': 'ungrounded',
  'peak shortening amplitude': 'ungrounded',
  'peak strain': 'ungrounded',
  'pedicle screw': 'ungrounded',
  'pelvic suspension': 'ungrounded',
  'penalized sieve': 'ungrounded',
  'pendred syndrome': 'DOID:

In [13]:
# Hard-coded curation result: rebinds grounding_map, names and pos_labels from a
# literal, so the rest of the notebook does not depend on the interactive GUI
# session.
# NOTE(review): this looks like a pasted copy of the `result` output displayed
# above, but that output lists 'patients serum': 'MESH:D044967', which is absent
# here — confirm the removal is intentional.
grounding_map, names, pos_labels = [{'p shermanii': 'ungrounded',
  'pachytene spermatocytes': 'ungrounded',
  'packaging signal': 'ungrounded',
  'paeonia suffruticosa': 'ungrounded',
  'pancreatic steatosis': 'MESH:D005234',
  'pancreatic stents': 'MESH:D015607',
  'panobinostat': 'CHEBI:CHEBI:85990',
  'paper sludge': 'ungrounded',
  'paradoxical sleep': 'ungrounded',
  'parenteral support': 'ungrounded',
  'parenting stress': 'ungrounded',
  'parkinsonian syndrome': 'MESH:D020734',
  'particle size': 'MESH:D010316',
  'partner services': 'ungrounded',
  'passive smoking': 'ungrounded',
  'passive stretching': 'ungrounded',
  'patients supervision': 'ungrounded',
  'patrinia scabiosaefolia': 'ungrounded',
  'pea starch': 'CHEBI:CHEBI:28017',
  'peak shortening': 'ungrounded',
  'peak shortening amplitude': 'ungrounded',
  'peak strain': 'ungrounded',
  'pedicle screw': 'ungrounded',
  'pelvic suspension': 'ungrounded',
  'penalized sieve': 'ungrounded',
  'pendred syndrome': 'DOID:DOID:0060744',
  'penicillin streptomycin': 'CHEBI:CHEBI:17076',
  'pentasaccharide': 'CHEBI:CHEBI:35369',
  'perceived stress': 'ungrounded',
  'perceptual speed': 'ungrounded',
  'performance status': 'ungrounded',
  'peri tumor stroma': 'ungrounded',
  'periplastid space': 'ungrounded',
  'peritoneal sclerosis': 'MESH:D012598',
  'permeability surface': 'ungrounded',
  'permeability surface area': 'ungrounded',
  'peroxydisulfate': 'CHEBI:CHEBI:29267',
  'perry syndrome': 'MESH:C566822',
  'persimmon skin': 'ungrounded',
  'persulfate': 'CHEBI:CHEBI:29267',
  'persulphate': 'CHEBI:CHEBI:29267',
  'petroleum substances': 'ungrounded',
  'pfeiffer syndrome': 'ungrounded',
  'phase sensitive': 'DOID:DOID:0060056',
  'phase singularity': 'ungrounded',
  'phase synchronization': 'ungrounded',
  'phatidylserine': 'CHEBI:CHEBI:18303',
  'phenelzine sulfate': 'CHEBI:CHEBI:8061',
  'phosphati dylserine': 'CHEBI:CHEBI:18303',
  'phosphatidyl l serine': 'CHEBI:CHEBI:18303',
  'phosphatidyl serine': 'CHEBI:CHEBI:18303',
  'phosphatidylserine': 'CHEBI:CHEBI:18303',
  'phosphatidyserine': 'CHEBI:CHEBI:18303',
  'phosphatydilserine': 'CHEBI:CHEBI:18303',
  'phospho l serine': 'CHEBI:CHEBI:18303',
  'phospho sulindac': 'ungrounded',
  'phospholipids': 'MESH:D010743',
  'phosphoromonothioate': 'ungrounded',
  'phosphorothioate': 'CHEBI:CHEBI:76674',
  'phosphoserine': 'MESH:D010768',
  'phosphotidylserine': 'CHEBI:CHEBI:18303',
  'photic stimulation': 'MESH:D010775',
  'photo sensitive': 'DOID:DOID:0060056',
  'photosensitiser': 'ungrounded',
  'photosensitizer': 'ungrounded',
  'photosynthesis': 'GO:GO:0015979',
  'photosystem': 'ungrounded',
  'physical': 'ungrounded',
  'physical stress': 'ungrounded',
  'phytosiderophores': 'CHEBI:CHEBI:38155',
  'phytosphingosine': 'CHEBI:CHEBI:46961',
  'phytosterols': 'MESH:D010840',
  'pictet spengler': 'ungrounded',
  'picture story': 'ungrounded',
  'pig slurry': 'ungrounded',
  'pigment screening': 'ungrounded',
  'piper sarmentosum': 'ungrounded',
  'pisa syndrome': 'ungrounded',
  'plant sterols': 'MESH:D010840',
  'plant sterols stanols': 'MESH:D010840',
  'plaque score': 'ungrounded',
  'plasma separation': 'ungrounded',
  'plastic stents': 'MESH:D015607',
  'platelet satellitism': 'ungrounded',
  'platelet supernatant': 'ungrounded',
  'platelet suspension': 'ungrounded',
  'pluripotent stem': 'ungrounded',
  'poly styrene': 'CHEBI:CHEBI:61642',
  'polymerization shrinkage': 'ungrounded',
  'polymerization stress': 'ungrounded',
  'polymersomes': 'ungrounded',
  'polysaccharide': 'CHEBI:CHEBI:18154',
  'polystyrene': 'CHEBI:CHEBI:61642',
  'polysulfide': 'MESH:C032915',
  'polysulfone': 'ungrounded',
  'pool size': 'ungrounded',
  'population spike': 'ungrounded',
  'porcine kidney stable': 'ungrounded',
  'porcine serum': 'ungrounded',
  'porcine skin': 'MESH:D012867',
  'porous silicon': 'ungrounded',
  'portion size': 'MESH:D064787',
  'posterior spiracle': 'ungrounded',
  'posterior stabilized': 'ungrounded',
  'potassium sorbate': 'CHEBI:CHEBI:77868',
  'potato starch': 'CHEBI:CHEBI:28017',
  'potential stimulus': 'ungrounded',
  'power stroke': 'ungrounded',
  'pragmatic skills': 'ungrounded',
  'pravastatin': 'MESH:D017035',
  'precipitation station': 'ungrounded',
  'predatory stress': 'ungrounded',
  'prednisolone': 'CHEBI:CHEBI:8378',
  'pregelatinized starch': 'CHEBI:CHEBI:28017',
  'pregnenolone sulfate': 'CHEBI:CHEBI:35420',
  'pregnenolone sulphate': 'CHEBI:CHEBI:35420',
  'premonitory symptoms': 'ungrounded',
  'prenatal restraint stress': 'ungrounded',
  'prenatal stress': 'ungrounded',
  'presenilin': 'HGNC:9508',
  'pressure support': 'ungrounded',
  'pretzel syndrome': 'ungrounded',
  'primary septum': 'ungrounded',
  'primary sulfonamide': 'CHEBI:CHEBI:35358',
  'primary surgery': 'MESH:D013502',
  'priming stimulus': 'ungrounded',
  'primitive streak': 'ungrounded',
  'priority set': 'ungrounded',
  'probe set': 'ungrounded',
  'process speed': 'ungrounded',
  'proline serine': 'ungrounded',
  'propane sultone': 'ungrounded',
  'propensity score': 'ungrounded',
  'propofol sedation': 'ungrounded',
  'propylene sulfide': 'MESH:C090687',
  'prosaposin': 'HGNC:9498',
  'prosegment': 'ungrounded',
  'prostacyclin synthase': 'HGNC:9603',
  'prostaglandin stratum': 'ungrounded',
  'prosystemin': 'ungrounded',
  'protamine sulfate': 'NCIT:C84123',
  'protamine sulphate': 'NCIT:C84123',
  'protein s': 'HGNC:9456',
  'protein signature': 'ungrounded',
  'protein substitute': 'ungrounded',
  'protein synthesis': 'ungrounded',
  'proteotoxic stress': 'ungrounded',
  'proteus syndrome': 'MESH:D016715',
  'ps': 'CHEBI:CHEBI:18303',
  'ps + clt δ i': 'ungrounded',
  'pseudostratification': 'ungrounded',
  'pseudosubstrate': 'ungrounded',
  'pseudosubstrate inhibitor peptide': 'ungrounded',
  'psoriasis': 'NCIT:C3346',
  'psychological stress': 'MESH:D013315',
  'psychosis spectrum': 'ungrounded',
  'pterostilbene': 'CHEBI:CHEBI:8630',
  'pulmonary sarcoidosis': 'MESH:D017565',
  'pulmonary sequestration': 'MESH:D001998',
  'pulmonary stenosis': 'MESH:D011666',
  'pulmonary surfactant': 'MESH:D011663',
  'pulmonic stenosis': 'ungrounded',
  'pulsatile shear': 'ungrounded',
  'pulsatile shear flow': 'ungrounded',
  'pulsatile shear stress': 'ungrounded',
  'pupil size': 'ungrounded',
  'purkinje system': 'ungrounded',
  'pyloric sphincter': 'ungrounded',
  'sera from patients': 'ungrounded',
  'surface area product': 'ungrounded',
  'surface product': 'ungrounded',
  'swiss chard powder': 'ungrounded'},
 {'MESH:D005234': 'Fatty Liver',
  'MESH:D015607': 'Stents',
  'CHEBI:CHEBI:85990': 'panobinostat',
  'MESH:D020734': 'Parkinsonian Disorders',
  'MESH:D010316': 'Particle Size',
  'CHEBI:CHEBI:28017': 'starch',
  'DOID:DOID:0060744': 'Pendred Syndrome',
  'CHEBI:CHEBI:17076': 'streptomycin',
  'CHEBI:CHEBI:35369': 'pentasaccharide',
  'MESH:D012598': 'Sclerosis',
  'CHEBI:CHEBI:29267': 'peroxydisulfate',
  'MESH:C566822': 'Perry Syndrome',
  'DOID:DOID:0060056': 'hypersensitivity reaction disease',
  'CHEBI:CHEBI:8061': 'Phenelzine sulfate',
  'CHEBI:CHEBI:18303': 'phosphatidyl-L-serine',
  'MESH:D010743': 'Phospholipids',
  'CHEBI:CHEBI:76674': 'phosphorothioate oligonucleotide',
  'MESH:D010768': 'Phosphoserine',
  'MESH:D010775': 'Photic Stimulation',
  'GO:GO:0015979': 'photosynthesis',
  'CHEBI:CHEBI:38155': 'phytosiderophore',
  'CHEBI:CHEBI:46961': 'phytosphingosine',
  'MESH:D010840': 'Phytosterols',
  'CHEBI:CHEBI:61642': 'polystyrene polymer',
  'CHEBI:CHEBI:18154': 'polysaccharide',
  'MESH:C032915': 'polysulfide',
  'MESH:D012867': 'Skin',
  'MESH:D064787': 'Portion Size',
  'CHEBI:CHEBI:77868': 'potassium sorbate',
  'MESH:D017035': 'Pravastatin',
  'CHEBI:CHEBI:8378': 'prednisolone',
  'CHEBI:CHEBI:35420': 'pregnenolone sulfate',
  'HGNC:9508': 'PSEN1',
  'CHEBI:CHEBI:35358': 'sulfonamide',
  'MESH:D013502': 'General Surgery',
  'MESH:C090687': 'propylene sulfide',
  'HGNC:9498': 'PSAP',
  'HGNC:9603': 'PTGIS',
  'NCIT:C84123': 'Protamine Sulfate',
  'HGNC:9456': 'PROS1',
  'MESH:D016715': 'Proteus Syndrome',
  'NCIT:C3346': 'Psoriasis',
  'MESH:D013315': 'Stress, Psychological',
  'CHEBI:CHEBI:8630': 'pterostilbene',
  'MESH:D017565': 'Sarcoidosis, Pulmonary',
  'MESH:D001998': 'Bronchopulmonary Sequestration',
  'MESH:D011666': 'Pulmonary Valve Stenosis',
  'MESH:D011663': 'Pulmonary Surfactants'},
 ['CHEBI:CHEBI:18154',
  'CHEBI:CHEBI:18303',
  'CHEBI:CHEBI:29267',
  'CHEBI:CHEBI:35420',
  'CHEBI:CHEBI:38155',
  'CHEBI:CHEBI:61642',
  'CHEBI:CHEBI:76674',
  'CHEBI:CHEBI:85990',
  'HGNC:9456',
  'HGNC:9498',
  'HGNC:9508',
  'HGNC:9603',
  'NCIT:C84123']]

In [15]:
# Longforms that are left out of the final grounding dict even when they have
# an entry in grounding_map.
excluded_longforms = ['ps', 'ps + clt δ i']

In [16]:
# For each shortform, keep the curated grounding of every mined longform that
# has an entry in grounding_map and is not explicitly excluded.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in longforms
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longforms in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs also creates a missing parent `results` directory and tolerates the
# directory already existing; the exists()/mkdir() pair failed on the former and
# was racy on the latter.
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [17]:
# Extra entities to collect training texts for, keyed by 'NAMESPACE:ID' strings;
# later cells index each value as a (name, contains) pair. Empty for this model.
additional_entities = {}

In [18]:
# Entities that have unambiguous agent texts, keyed by the label to assign;
# later cells index each value as a (name, agent_texts) pair. Empty for this model.
unambiguous_agent_texts = {}

In [19]:
# Label every collected text using the curated groundings.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the ids of the labeled texts by the label they received.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# PMIDs for which each additional entity is a major topic; each entity key is
# split once on ':' and the pieces are passed positionally.
entity_pmid_map = {}
for entity in additional_entities:
    entity_parts = entity.split(':', maxsplit=1)
    entity_pmid_map[entity] = set(
        get_pmids_for_entity(*entity_parts, major_topic=True)
    )

In [20]:
# Size of the PMID overlap for every ordered pair of additional entities
# (pairs of an entity with itself included).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [21]:
# Size of the overlap between the text ids of each corpus label and the PMIDs
# of each additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [22]:
# Display the entity/entity PMID overlap counts.
intersection1

[]

In [23]:
# Display the label/entity PMID overlap counts.
intersection2

[]

In [22]:
# Extend the corpus with texts for entities that have unambiguous agent texts.
# Each value of unambiguous_agent_texts is indexed as (name, agent_texts).
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts[1]:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip PMIDs already in the base texts or already fetched for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        # Pass only the agent-text list as `contains`, not the whole
        # (name, agent_texts) pair; this mirrors how the additional-entities
        # loop below unpacks its `contains` value.
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts[1])
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts for the additional entities, capped at 10000
# PMIDs per entity.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # random.sample draws without replacement; random.choices could return
        # the same PMID several times and yield fewer than 10000 distinct PMIDs.
        # NOTE(review): no seed is set for `random`, so the subsample differs
        # between runs.
        new_pmids = random.sample(new_pmids, 10000)
    _, contains = additional_entities[entity]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [23]:
# Register the display name (first element of each value) of every additional
# entity and every entity with unambiguous agent texts.
names.update({key: value[0] for key, value in additional_entities.items()})
names.update({key: value[0] for key, value in unambiguous_agent_texts.items()})

# All of those entities count as positive labels too. Sorting gives a stable
# order; list(set(...)) depended on string hashing and could change between
# kernel sessions.
pos_labels = sorted(set(pos_labels) | additional_entities.keys() |
                    unambiguous_agent_texts.keys())

In [24]:
%%capture

# Cross-validate a classifier for the shortforms on the labeled corpus.
# random_state is fixed at 1729 and the grid holds a single (C, max_features)
# combination, so only that one configuration is evaluated.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split the (text, label, id) corpus triples into three parallel tuples.
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-31 03:45:37] /adeft/PP/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-31 03:49:10] /adeft/PP/adeft/adeft/modeling/classify.py - Best f1 score of 0.9251886089264726 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [25]:
# Inspect the classifier's stats; the output below includes the label distribution.
classifier.stats

{'label_distribution': {'ungrounded': 1098,
  'CHEBI:CHEBI:18303': 1883,
  'CHEBI:CHEBI:18154': 115,
  'HGNC:9456': 86,
  'CHEBI:CHEBI:76674': 65,
  'NCIT:C84123': 38,
  'MESH:D010775': 4,
  'CHEBI:CHEBI:35420': 82,
  'CHEBI:CHEBI:35369': 4,
  'CHEBI:CHEBI:61642': 166,
  'MESH:D011663': 39,
  'MESH:D020734': 3,
  'CHEBI:CHEBI:8378': 5,
  'MESH:D010743': 6,
  'HGNC:9603': 3,
  'CHEBI:CHEBI:77868': 6,
  'HGNC:9508': 122,
  'NCIT:C3346': 10,
  'MESH:D010768': 4,
  'CHEBI:CHEBI:38155': 16,
  'MESH:D016715': 8,
  'MESH:D012867': 4,
  'MESH:D010840': 68,
  'DOID:DOID:0060744': 5,
  'CHEBI:CHEBI:46961': 6,
  'MESH:D013315': 25,
  'MESH:D017035': 3,
  'GO:GO:0015979': 4,
  'CHEBI:CHEBI:29267': 72,
  'CHEBI:CHEBI:85990': 11,
  'HGNC:9498': 6,
  'MESH:D064787': 3,
  'DOID:DOID:0060056': 5,
  'MESH:D010316': 16,
  'MESH:D015607': 8,
  'MESH:D011666': 5,
  'MESH:C090687': 2,
  'CHEBI:CHEBI:28017': 13,
  'MESH:C566822': 4,
  'CHEBI:CHEBI:17076': 5,
  'MESH:D012598': 1,
  'MESH:D013502': 7,
  'MESH:

In [27]:
# Combine the trained classifier with the curated groundings and names.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [28]:
# Save the disambiguator under model_name in the results directory.
disamb.dump(model_name, results_path)

In [29]:
# Print the model summary (the disambiguations it produces, as shown below).
print(disamb.info())

Disambiguation model for PS

Produces the disambiguations:
	Bronchopulmonary Sequestration	MESH:D001998
	Fatty Liver	MESH:D005234
	General Surgery	MESH:D013502
	PROS1*	HGNC:9456
	PSAP*	HGNC:9498
	PSEN1*	HGNC:9508
	PTGIS*	HGNC:9603
	Parkinsonian Disorders	MESH:D020734
	Particle Size	MESH:D010316
	Pendred Syndrome	DOID:DOID:0060744
	Perry Syndrome	MESH:C566822
	Phenelzine sulfate	CHEBI:CHEBI:8061
	Phospholipids	MESH:D010743
	Phosphoserine	MESH:D010768
	Photic Stimulation	MESH:D010775
	Phytosterols	MESH:D010840
	Portion Size	MESH:D064787
	Pravastatin	MESH:D017035
	Protamine Sulfate*	NCIT:C84123
	Proteus Syndrome	MESH:D016715
	Psoriasis	NCIT:C3346
	Pulmonary Surfactants	MESH:D011663
	Pulmonary Valve Stenosis	MESH:D011666
	Sarcoidosis, Pulmonary	MESH:D017565
	Sclerosis	MESH:D012598
	Skin	MESH:D012867
	Stents	MESH:D015607
	Stress, Psychological	MESH:D013315
	hypersensitivity reaction disease	DOID:DOID:0060056
	panobinostat*	CHEBI:CHEBI:85990
	pentasaccharide	CHEBI:CHEBI:35369
	peroxydisulfat

In [30]:
# NOTE(review): presumably uploads the model to S3 (inferred from the helper's
# name only) — confirm credentials and destination before re-running.
model_to_s3(disamb)

In [30]:
# Run the disambiguator over every collected text, in all_texts order.
preds = [disamb.disambiguate(text) for text in all_texts.values()]

In [31]:
# Keep the texts whose prediction has 'HGNC:10967' as its first element.
# This rebinds `texts`, which previously held the corpus texts used for training.
# NOTE(review): 'HGNC:10967' is not among the groundings curated for this model
# in the pasted names mapping above — probably a leftover from another notebook;
# confirm which label should be inspected here.
texts = [text for pred, text in zip(preds, all_texts.values()) if pred[0] == 'HGNC:10967']

In [36]:
# Spot-check the fourth matching text; raises IndexError when fewer than four
# texts matched the filter.
texts[3]

'The non-neuronal monoamine transporters (OCT1, OCT2, EMT, and PMAT) play a key role in the clearance of monoamines from extracellular compartments. In a previous report we described endometrial distribution and cyclic variation of the vesicular monoamine transporter (VMAT2) mRNA and the neuronal norepinephrine transporter (NET) mRNA. In the present study we used in situ hybridization, real-time PCR and immunohistochemistry to reveal tissue distribution and cyclic variation of mRNA for the non-neuronal monoamine transporters in the human endometrium and early pregnancy decidua. We found that non-neuronal monoamine transporters are predominantly expressed in the stroma. The plasma membrane monoamine transporter (PMAT) mRNA expression peaked in the proliferative phase, whereas the extra-neuronal monoamine transporter (EMT) mRNA expression peaked in the secretory phase. The organic cation transporter 2 (OCT2) mRNA expression was exclusively detected in few scattered stromal cells and OCT1