In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator, load_disambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [None]:
adeft_grounder = AdeftGrounder()

In [2]:
shortforms = ['CS']
model_name = ':'.join(sorted(escape_filename(shortform) for shortform in shortforms))
results_path = os.path.abspath(os.path.join('../..', 'results', model_name))

In [3]:
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms
    
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})
grounding_map = {}
names = {}
# for longform in combined_longforms:
#     groundings = adeft_grounder.ground(longform)
#     if groundings:
#         grounding = groundings[0]['grounding']
#         grounding_map[longform] = grounding
#         names[grounding] = groundings[0]['name']
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [6]:
list(zip(longforms, counts))

[('cigarette smoke', 1418),
 ('chondroitin sulfate', 615),
 ('chitosan', 521),
 ('conditioned stimulus', 422),
 ('citrate synthase', 271),
 ('corticosteroids', 178),
 ('cockayne syndrome', 97),
 ('cushing s syndrome', 96),
 ('cardiogenic shock', 93),
 ('corticosterone', 84),
 ('compressed sensing', 78),
 ('coronary sinus', 70),
 ('contact sensitivity', 68),
 ('cesarean section', 66),
 ('chondroitin sulphate', 65),
 ('circumsporozoite', 61),
 ('contrast sensitivity', 59),
 ('caesarean section', 54),
 ('cordyceps sinensis', 52),
 ('cyclic stretch', 49),
 ('conditioned stimuli', 48),
 ('cholesterol sulfate', 42),
 ('costello syndrome', 38),
 ('complex spike', 37),
 ('culture supernatant', 37),
 ('cushing syndrome', 33),
 ('calcium silicate', 32),
 ('chronic stress', 29),
 ('cowden syndrome', 27),
 ('cardiac sarcoidosis', 25),
 ('cold storage', 24),
 ('circumsporozoite protein', 21),
 ('crystalline silica', 20),
 ('congenital scoliosis', 19),
 ('cyclosporin', 18),
 ('central sensitivity', 

In [7]:
try:
    disamb = load_disambiguator(shortforms[0])
    for shortform, gm in disamb.grounding_dict.items():
        for longform, grounding in gm.items():
            grounding_map[longform] = grounding
    for grounding, name in disamb.names.items():
        names[grounding] = name
    pos_labels = disamb.pos_labels
except Exception:
    pass

In [7]:
names

{'CHEBI:CHEBI:37397': 'chondroitin sulfate',
 'CHEBI:CHEBI:16261': 'chitosan',
 'MESH:D000305': 'Adrenal Cortex Hormones',
 'MESH:D003057': 'Cockayne Syndrome',
 'MESH:D012770': 'Shock, Cardiogenic',
 'CHEBI:CHEBI:16827': 'corticosterone',
 'MESH:D054326': 'Coronary Sinus',
 'MESH:D003877': 'Dermatitis, Contact',
 'MESH:D002585': 'Cesarean Section',
 'MESH:D015350': 'Contrast Sensitivity',
 'CHEBI:CHEBI:41321': 'cholesterol sulfate',
 'MESH:D056685': 'Costello Syndrome',
 'MESH:D003480': 'Cushing Syndrome',
 'MESH:C031293': 'calcium silicate',
 'MESH:D001008': 'Anxiety Disorders',
 'MESH:D006223': 'Hamartoma Syndrome, Multiple',
 'DOID:DOID:13405': 'cardiac sarcoidosis',
 'CHEBI:CHEBI:36080': 'protein',
 'CHEBI:CHEBI:30563': 'silicon dioxide',
 'CHEBI:CHEBI:4031': 'cyclosporin A',
 'MESH:D044967': 'Serum',
 'MESH:D003398': 'Craniosynostoses',
 'MESH:D003444': 'Crush Syndrome',
 'GO:GO:0090398': 'cellular senescence',
 'EFO:0005135': 'strain',
 'HGNC:16288': 'TRIM9',
 'CHEBI:CHEBI:23051

In [8]:
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8891)

INFO: [2020-11-03 00:53:37] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.4/bio_ontology.pkl


In [9]:
result = [grounding_map, names, pos_labels]

In [10]:
result

[{'cabernet sauvignon': 'ungrounded',
  'cadmium sensitivity': 'ungrounded',
  'caesalpinia sappan': 'ungrounded',
  'caesarean section': 'ungrounded',
  'calcium score': 'ungrounded',
  'calcium silicate': 'ungrounded',
  'calcium sulfate': 'ungrounded',
  'calcium sulphate': 'ungrounded',
  'calf serum': 'ungrounded',
  'calsequestrin': 'ungrounded',
  'camelina seed': 'ungrounded',
  'camellia sinensis': 'ungrounded',
  'campaniform sensilla': 'ungrounded',
  'cancer stem': 'ungrounded',
  'canton s': 'ungrounded',
  'cantu syndrome': 'ungrounded',
  'cantú syndrome': 'ungrounded',
  'capparis spinosa l': 'ungrounded',
  'carbon spheres': 'ungrounded',
  'carcinoid syndrome': 'ungrounded',
  'carcinosarcomas': 'ungrounded',
  'cardiac sarcoidosis': 'ungrounded',
  'cardiac steroids': 'ungrounded',
  'cardiac surgery': 'ungrounded',
  'cardiogenic shock': 'NCIT:C50482',
  'cardiotonic steroids': 'ungrounded',
  'care staff member': 'ungrounded',
  'carnosol': 'ungrounded',
  'carotid

In [24]:
grounding_map, names, pos_labels = [{'cabernet sauvignon': 'ungrounded',
  'cadmium sensitivity': 'ungrounded',
  'caesalpinia sappan': 'ungrounded',
  'caesarean section': 'ungrounded',
  'calcium score': 'ungrounded',
  'calcium silicate': 'ungrounded',
  'calcium sulfate': 'ungrounded',
  'calcium sulphate': 'ungrounded',
  'calf serum': 'ungrounded',
  'calsequestrin': 'ungrounded',
  'camelina seed': 'ungrounded',
  'camellia sinensis': 'ungrounded',
  'campaniform sensilla': 'ungrounded',
  'cancer stem': 'ungrounded',
  'canton s': 'ungrounded',
  'cantu syndrome': 'ungrounded',
  'cantú syndrome': 'ungrounded',
  'capparis spinosa l': 'ungrounded',
  'carbon spheres': 'ungrounded',
  'carcinoid syndrome': 'ungrounded',
  'carcinosarcomas': 'ungrounded',
  'cardiac sarcoidosis': 'ungrounded',
  'cardiac steroids': 'ungrounded',
  'cardiac surgery': 'ungrounded',
  'cardiogenic shock': 'NCIT:C50482',
  'cardiotonic steroids': 'ungrounded',
  'care staff member': 'ungrounded',
  'carnosol': 'ungrounded',
  'carotid stenosis': 'ungrounded',
  'cartilage selected': 'ungrounded',
  'casparian strip': 'ungrounded',
  'cassava starch': 'ungrounded',
  'castanospermine': 'ungrounded',
  'castasterone': 'CHEBI:CHEBI:23051',
  'catalytic subunit': 'ungrounded',
  'cathepsin s': 'ungrounded',
  'cavernous sinus': 'ungrounded',
  'cecal slurry': 'ungrounded',
  'celiac sprue': 'ungrounded',
  'cells salvage': 'ungrounded',
  'cells separation': 'ungrounded',
  'cells sheets': 'ungrounded',
  'cells shortening': 'ungrounded',
  'cellular senescence': 'ungrounded',
  'cellulose sulfate': 'ungrounded',
  'cellulose sulphate': 'ungrounded',
  'central sensitivity': 'ungrounded',
  'central striatum': 'ungrounded',
  'centriolar satellites': 'ungrounded',
  'centroid size': 'ungrounded',
  'centrum semiovale': 'ungrounded',
  'ceramide synthase': 'ungrounded',
  'cerebral stroke': 'ungrounded',
  'cervical spondylosis': 'ungrounded',
  'cervical stimulation': 'ungrounded',
  'cesarean section': 'ungrounded',
  'charcoal strip': 'ungrounded',
  'charge separation': 'ungrounded',
  'charge solvation': 'ungrounded',
  'chebyshev semi iterative': 'ungrounded',
  'cheongsun': 'ungrounded',
  'chicken serum': 'ungrounded',
  'chikusetsu saponin iva': 'ungrounded',
  'chilling shock': 'ungrounded',
  'chinese spring': 'ungrounded',
  'chitosan': 'CHEBI:CHEBI:16261',
  'chitosan g stearate': 'ungrounded',
  'chlorella sorokiniana': 'ungrounded',
  'cholesterol': 'CHEBI:CHEBI:41321',
  'cholesterol sulfate': 'CHEBI:CHEBI:41321',
  'cholesteryl hemisuccinate tris salt': 'ungrounded',
  'cholesteryl sulfate': 'CHEBI:CHEBI:41321',
  'choline supplementation': 'ungrounded',
  'chondroitin 4 sulfate': 'CHEBI:CHEBI:37397',
  'chondroitin sulfate': 'CHEBI:CHEBI:37397',
  'chondroitin sulfate a sodium salt': 'CHEBI:CHEBI:37397',
  'chondroitin sulphate': 'CHEBI:CHEBI:37397',
  'chondrosarcoma': 'ungrounded',
  'chord and sgt1': 'ungrounded',
  'chorionic somatomammotropin': 'ungrounded',
  'chorismate synthase': 'ungrounded',
  'christianson syndrome': 'ungrounded',
  'chronic immobilization stress': 'ungrounded',
  'chronic restraint stress': 'ungrounded',
  'chronic schistosomiasis': 'ungrounded',
  'chronic stress': 'ungrounded',
  'chronological senescence': 'ungrounded',
  'ciclosporin': 'CHEBI:CHEBI:4031',
  'cigarette smoke': 'MESH:D000073865',
  'cigarette smoke exposure': 'MESH:D000073865',
  'cigarette smoke group': 'MESH:D000073865',
  'cigarette smoke inhaled': 'MESH:D000073865',
  'circular structure': 'ungrounded',
  'circumferential strain': 'ungrounded',
  'circumferential stress': 'ungrounded',
  'circumsporozoite': 'IP:IPR003067',
  'circumsporozoite protein': 'ungrounded',
  'cisplatin': 'ungrounded',
  'citrate synthase': 'HGNC:2422',
  'citrate synthetase': 'HGNC:2422',
  'citrulline synthase': 'ungrounded',
  'cleanliness score': 'ungrounded',
  'cleavage site': 'ungrounded',
  'cleavage stage': 'ungrounded',
  'clinical stage': 'ungrounded',
  'clinical supervision': 'ungrounded',
  'clonogenic survival': 'ungrounded',
  'cluster start': 'ungrounded',
  'cockayne s syndrome': 'NCIT:C9460',
  'cockayne syndrome': 'NCIT:C9460',
  'code switching': 'ungrounded',
  'cogan s syndrome': 'ungrounded',
  'cognitive skills': 'ungrounded',
  'cognitive stimulation': 'ungrounded',
  'cohen syndrome': 'ungrounded',
  'colchicine site': 'ungrounded',
  'cold stimulation': 'ungrounded',
  'cold storage': 'ungrounded',
  'cold stratification': 'ungrounded',
  'cold stress': 'ungrounded',
  'colistin sulfate': 'ungrounded',
  'collagen sponge': 'ungrounded',
  'collateral sensitivity': 'ungrounded',
  'colloidal silver': 'ungrounded',
  'combined score': 'ungrounded',
  'combined stress': 'ungrounded',
  'commercial strain': 'ungrounded',
  'common scab': 'ungrounded',
  'compartment syndrome': 'ungrounded',
  'compassion satisfaction': 'ungrounded',
  'compensatory sweating': 'ungrounded',
  'complement system': 'NCIT:C16459',
  'complete surgery': 'ungrounded',
  'complex spike': 'ungrounded',
  'composite score': 'ungrounded',
  'composted sewage sludge': 'ungrounded',
  'compressed sensing': 'ungrounded',
  'compressed stockings': 'ungrounded',
  'compressed strength': 'ungrounded',
  'condensate': 'ungrounded',
  'conditioned stimulation': 'ungrounded',
  'conditioned stimuli': 'ungrounded',
  'conditioned stimulus': 'ungrounded',
  'conditioned survival': 'ungrounded',
  'cone segment': 'ungrounded',
  'congenital scoliosis': 'ungrounded',
  'congenital syphilis': 'ungrounded',
  'conjunctival swab': 'ungrounded',
  'connecting sequence': 'ungrounded',
  'conscious sedation': 'ungrounded',
  'consensus sequence': 'ungrounded',
  'conserved spermatogenesis': 'ungrounded',
  'conserved surgery': 'ungrounded',
  'constant spring': 'ungrounded',
  'contact sensitivity': 'ungrounded',
  'continuous stimulation': 'ungrounded',
  'continuous stress loaded': 'ungrounded',
  'contrast sensitivity': 'ungrounded',
  'contrasuppressor': 'ungrounded',
  'control subjected': 'ungrounded',
  'conventional smear': 'ungrounded',
  'conversion surgery': 'ungrounded',
  'convulsive seizures': 'ungrounded',
  'cooking skills': 'ungrounded',
  'copper sulfate': 'ungrounded',
  'cordyceps sinensis': 'ungrounded',
  'cordyceps sinensis berk sacc': 'ungrounded',
  'core sequence': 'ungrounded',
  'core shell': 'ungrounded',
  'coriandrum sativum': 'ungrounded',
  'coriandrum sativum l': 'ungrounded',
  'corn silage': 'ungrounded',
  'corn silk': 'ungrounded',
  'corn starch': 'ungrounded',
  'corn stover': 'ungrounded',
  'corn straw': 'ungrounded',
  'corneal scarring': 'ungrounded',
  'corneal stiffness': 'ungrounded',
  'coronary sinus': 'ungrounded',
  'coronary stenosis': 'ungrounded',
  'corpus striatum': 'ungrounded',
  'cortical stiffness': 'ungrounded',
  'cortical stimulation': 'ungrounded',
  'corticospinal': 'ungrounded',
  'corticosteroids': 'MESH:D000305',
  'corticosterone': 'MESH:D000305',
  'cortisol': 'MESH:D000305',
  'costello syndrome': 'ungrounded',
  'costunolide': 'ungrounded',
  'coumaroyl serotonin': 'ungrounded',
  'cowan strain': 'ungrounded',
  'cowden syndrome': 'ungrounded',
  'craniosynostosis': 'ungrounded',
  'cromolyn sodium': 'ungrounded',
  'cronobacter sakazakii': 'ungrounded',
  'crouzon syndrome': 'ungrounded',
  'crush syndrome': 'ungrounded',
  'cryptogenic stroke': 'ungrounded',
  'crystalline silica': 'ungrounded',
  'crystallographic shear': 'ungrounded',
  'cs': 'ungrounded',
  'cs and with': 'ungrounded',
  'cuckoo search': 'ungrounded',
  'cue': 'ungrounded',
  'culture supernatant': 'ungrounded',
  'cumulative survival': 'ungrounded',
  'curdlan sulfate': 'ungrounded',
  'current smokers': 'ungrounded',
  'cuscutae semen': 'ungrounded',
  'cushing s syndrome': 'MESH:D003480',
  'cushing syndrome': 'MESH:D003480',
  'cyclic mechanism stretch': 'ungrounded',
  'cyclic strain': 'ungrounded',
  'cyclic stretch': 'ungrounded',
  'cycloserine': 'ungrounded',
  'cyclosporin': 'CHEBI:CHEBI:4031',
  'cyclosporin a': 'CHEBI:CHEBI:4031',
  'cynomorium songaricum rupr': 'ungrounded',
  'cysteamine': 'CHEBI:CHEBI:17141',
  'cysteine synthase': 'IP:IPR005856',
  'cytokine storm': 'HP:0033041',
  'cytoreductive surgery': 'ungrounded',
  'shark cartilage': 'ungrounded',
  'standard curcumin': 'ungrounded',
  'succinate acid': 'ungrounded',
  'sulfate c': 'ungrounded'},
 {'NCIT:C50482': 'Cardiogenic Shock',
  'CHEBI:CHEBI:23051': 'castasterone',
  'CHEBI:CHEBI:16261': 'chitosan',
  'CHEBI:CHEBI:41321': 'cholesterol sulfate',
  'CHEBI:CHEBI:37397': 'chondroitin sulfate',
  'CHEBI:CHEBI:4031': 'cyclosporin A',
  'MESH:D000073865': 'Cigarette Smoking',
  'IP:IPR003067': 'Plasmodium circumsporozoite protein',
  'HGNC:2422': 'CS',
  'NCIT:C9460': 'Cockayne Syndrome',
  'NCIT:C16459': 'Complement',
  'MESH:D000305': 'Adrenal Cortex Hormones',
  'MESH:D003480': 'Cushing Syndrome',
  'CHEBI:CHEBI:17141': 'cysteamine',
  'IP:IPR005856': 'Cysteine synthase',
  'HP:0033041': 'Cytokine storm'},
 ['CHEBI:CHEBI:16261',
  'CHEBI:CHEBI:37397',
  'CHEBI:CHEBI:4031',
  'CHEBI:CHEBI:41321',
  'HGNC:2422',
  'NCIT:C9460',
  'MESH:D000073865']]

In [25]:
excluded_longforms = ['cs', 'cs and with', 'cue']

In [26]:
grounding_dict = {shortform: {longform: grounding_map[longform] 
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

if not os.path.exists(results_path):
    os.mkdir(results_path)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [16]:
additional_entities = {}

In [17]:
unambiguous_agent_texts = {}

In [27]:
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts((text, pmid) for pmid, text in all_texts.items())
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

entity_pmid_map = {entity: set(get_pmids_for_entity(*entity.split(':', maxsplit=1),
                                                major_topic=True))for entity in additional_entities}

In [18]:
intersection1 = []
for entity1, pmids1 in entity_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection1.append((entity1, entity2, len(pmids1 & pmids2)))

In [19]:
intersection2 = []
for entity1, pmids1 in agent_text_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection2.append((entity1, entity2, len(set(pmids1) & pmids2)))

In [20]:
intersection1

[('HGNC:10967', 'HGNC:10967', 98)]

In [21]:
intersection2

[('GO:GO:0001837', 'HGNC:10967', 0),
 ('ungrounded', 'HGNC:10967', 0),
 ('HGNC:10967', 'HGNC:10967', 0),
 ('MESH:D055032', 'HGNC:10967', 0)]

In [22]:
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts[1]:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)
        
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        new_pmids = random.choices(new_pmids, k=10000)
    _, contains = additional_entities[entity]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [23]:
names.update({key: value[0] for key, value in additional_entities.items()})
names.update({key: value[0] for key, value in unambiguous_agent_texts.items()})
pos_labels = list(set(pos_labels) | additional_entities.keys() |
                  unambiguous_agent_texts.keys())

In [28]:
%%capture

classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-11-03 01:30:43] /adeft/PP/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-11-03 01:33:08] /adeft/PP/adeft/adeft/modeling/classify.py - Best f1 score of 0.8915153778216951 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [29]:
classifier.stats

{'label_distribution': {'ungrounded': 1596,
  'IP:IPR003067': 50,
  'CHEBI:CHEBI:4031': 31,
  'MESH:D000305': 215,
  'CHEBI:CHEBI:37397': 487,
  'NCIT:C9460': 83,
  'MESH:D000073865': 938,
  'HGNC:2422': 194,
  'CHEBI:CHEBI:16261': 349,
  'CHEBI:CHEBI:41321': 44,
  'MESH:D003480': 100,
  'CHEBI:CHEBI:23051': 11,
  'CHEBI:CHEBI:17141': 9,
  'NCIT:C16459': 13,
  'NCIT:C50482': 69,
  'IP:IPR005856': 11,
  'HP:0033041': 4},
 'f1': {'mean': 0.891515, 'std': 0.012587},
 'precision': {'mean': 0.919, 'std': 0.008972},
 'recall': {'mean': 0.872062, 'std': 0.019876},
 'CHEBI:CHEBI:23051': {'f1': {'mean': 0.826667, 'std': 0.149666},
  'pr': {'mean': 0.733333, 'std': 0.226078},
  'rc': {'mean': 1.0, 'std': 0.0}},
 'ungrounded': {'f1': {'mean': 0.872251, 'std': 0.012291},
  'pr': {'mean': 0.932333, 'std': 0.015923},
  'rc': {'mean': 0.819688, 'std': 0.016707}},
 'CHEBI:CHEBI:4031': {'f1': {'mean': 0.15873, 'std': 0.13166},
  'pr': {'mean': 0.095238, 'std': 0.078246},
  'rc': {'mean': 0.5, 'std': 0.

In [30]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [31]:
disamb.dump(model_name, results_path)

In [32]:
print(disamb.info())

Disambiguation model for CS

Produces the disambiguations:
	Adrenal Cortex Hormones	MESH:D000305
	CS*	HGNC:2422
	Cardiogenic Shock	NCIT:C50482
	Cigarette Smoking*	MESH:D000073865
	Cockayne Syndrome*	NCIT:C9460
	Complement	NCIT:C16459
	Cushing Syndrome	MESH:D003480
	Cysteine synthase	IP:IPR005856
	Cytokine storm	HP:0033041
	Plasmodium circumsporozoite protein	IP:IPR003067
	castasterone	CHEBI:CHEBI:23051
	chitosan*	CHEBI:CHEBI:16261
	cholesterol sulfate*	CHEBI:CHEBI:41321
	chondroitin sulfate*	CHEBI:CHEBI:37397
	cyclosporin A*	CHEBI:CHEBI:4031
	cysteamine	CHEBI:CHEBI:17141

Class level metrics:
--------------------
Grounding                          	Count	F1     
                         Ungrounded	1596	0.87225
                  Cigarette Smoking*	 938	0.95408
                chondroitin sulfate*	 487	 0.8502
                           chitosan*	 349	0.85749
            Adrenal Cortex Hormones	 215	0.67394
                                 CS*	 194	0.88741
                   Cushing Synd

In [33]:
model_to_s3(disamb)

In [30]:
preds = [disamb.disambiguate(text) for text in all_texts.values()]

In [31]:
texts = [text for pred, text in zip(preds, all_texts.values()) if pred[0] == 'HGNC:10967']

In [36]:
texts[3]

'The non-neuronal monoamine transporters (OCT1, OCT2, EMT, and PMAT) play a key role in the clearance of monoamines from extracellular compartments. In a previous report we described endometrial distribution and cyclic variation of the vesicular monoamine transporter (VMAT2) mRNA and the neuronal norepinephrine transporter (NET) mRNA. In the present study we used in situ hybridization, real-time PCR and immunohistochemistry to reveal tissue distribution and cyclic variation of mRNA for the non-neuronal monoamine transporters in the human endometrium and early pregnancy decidua. We found that non-neuronal monoamine transporters are predominantly expressed in the stroma. The plasma membrane monoamine transporter (PMAT) mRNA expression peaked in the proliferative phase, whereas the extra-neuronal monoamine transporter (EMT) mRNA expression peaked in the secretory phase. The organic cation transporter 2 (OCT2) mRNA expression was exclusively detected in few scattered stromal cells and OCT1