In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator, load_disambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
adeft_grounder = AdeftGrounder()

In [3]:
shortforms = ['GC']
model_name = ':'.join(sorted(escape_filename(shortform) for shortform in shortforms))
results_path = os.path.abspath(os.path.join('../..', 'results', model_name))

In [4]:
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms
    
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [5]:
list(zip(longforms, counts))

[('gastric cancer', 3741),
 ('glucocorticoid', 1505),
 ('germinal center', 850),
 ('granulosa cells', 219),
 ('gastric carcinoma', 171),
 ('guanylate cyclase', 162),
 ('guanylyl cyclase', 147),
 ('germinal centre', 71),
 ('gene conversion', 62),
 ('granule cells', 53),
 ('gas chromatography', 52),
 ('glucocerebrosidase', 36),
 ('glucosylceramide', 31),
 ('gustatory cortex', 29),
 ('glucocorticosteroid', 29),
 ('goblet cells', 28),
 ('glycol chitosan', 25),
 ('growth cone', 24),
 ('germ cells', 22),
 ('golgi complex', 20),
 ('glucocorticoid hormone', 18),
 ('genomic classifier', 14),
 ('glycated chitosan', 14),
 ('ganglion cells', 13),
 ('german cockroach', 12),
 ('guanine cytosine', 12),
 ('glassy carbon', 12),
 ('glycocholate', 12),
 ('gas chromatograph', 10),
 ('gallocatechin', 10),
 ('granger causality', 9),
 ('galacto chitosan', 9),
 ('gastric', 9),
 ('geometric center', 8),
 ('glottic cancer', 8),
 ('gonococcus', 8),
 ('gc', 7),
 ('garcinia cambogia', 7),
 ('glycol chitin', 7),
 (

In [6]:
try:
    disamb = load_disambiguator(shortforms[0])
    for shortform, gm in disamb.grounding_dict.items():
        for longform, grounding in gm.items():
            grounding_map[longform] = grounding
    for grounding, name in disamb.names.items():
        names[grounding] = name
    pos_labels = disamb.pos_labels
except Exception:
    pass

In [7]:
names

{'MESH:D013274': 'Stomach Neoplasms',
 'CHEBI:CHEBI:24261': 'glucocorticoid',
 'MESH:D018858': 'Germinal Center',
 'MESH:D006107': 'Granulosa Cells',
 'DOID:DOID:5517': 'stomach carcinoma',
 'MESH:D002849': 'Chromatography, Gas',
 'CHEBI:CHEBI:36500': 'glucosylceramide',
 'MESH:D020397': 'Goblet Cells',
 'MESH:C118638': 'glycol-chitosan',
 'GO:GO:0030426': 'growth cone',
 'MESH:D005854': 'Germ Cells',
 'GO:GO:0005794': 'Golgi apparatus',
 'CHEBI:CHEBI:24621': 'hormone',
 'MESH:C121252': 'glycated chitosan',
 'MESH:D020048': 'Blattellidae',
 'CHEBI:CHEBI:16040': 'cytosine',
 'CHEBI:CHEBI:27594': 'carbon atom',
 'CHEBI:CHEBI:29746': 'glycocholate',
 'CHEBI:CHEBI:68330': 'gallocatechin',
 'MESH:D015984': 'Causality',
 'CHEBI:CHEBI:16261': 'chitosan',
 'DOID:DOID:162': 'cancer',
 'MESH:D009344': 'Neisseria gonorrhoeae',
 'HGNC:4187': 'GC',
 'MESH:D029761': 'Garcinia cambogia',
 'MESH:C031610': 'glycolchitin',
 'GO:GO:0001652': 'granular component',
 'EFO:0001461': 'control',
 'MESH:D018302

In [8]:
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8891)

INFO: [2020-11-11 03:33:25] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.4/bio_ontology.pkl


In [9]:
result = [grounding_map, names, pos_labels]

In [10]:
result

[{'galacto chitosan': 'CHEBI:CHEBI:16261',
  'galactosylated chitosan': 'CHEBI:CHEBI:16261',
  'galla chinensis': 'ungrounded',
  'gallbladder cancer': 'DOID:DOID:3121',
  'gallocatechin': 'CHEBI:CHEBI:68330',
  'ganglion cells': 'ungrounded',
  'ganglion compartment': 'ungrounded',
  'garcinia cambogia': 'MESH:D029761',
  'gas chromatograph': 'ungrounded',
  'gas chromatography': 'ungrounded',
  'gastric': 'ungrounded',
  'gastric cancer': 'MESH:D013274',
  'gastric cancer tissue': 'MESH:D013274',
  'gastric carcinoma': 'MESH:D013274',
  'gastrocnemius': 'ungrounded',
  'gc': 'ungrounded',
  'gc b cells': 'MESH:D018858',
  'gc026': 'ungrounded',
  'gel consists': 'ungrounded',
  'gemcitabine': 'CHEBI:CHEBI:175901',
  'gemcitabine cisplatin': 'CHEBI:CHEBI:27899',
  'gemcitabine plus cisplatin': 'ungrounded',
  'gene conversion': 'ungrounded',
  'generation cells': 'ungrounded',
  'geniposide and chlorogenic acid': 'ungrounded',
  'genomic classifier': 'ungrounded',
  'genomic control':

In [11]:
grounding_map, names, pos_labels = [{'galacto chitosan': 'CHEBI:CHEBI:16261',
  'galactosylated chitosan': 'CHEBI:CHEBI:16261',
  'galla chinensis': 'ungrounded',
  'gallbladder cancer': 'DOID:DOID:3121',
  'gallocatechin': 'CHEBI:CHEBI:68330',
  'ganglion cells': 'ungrounded',
  'ganglion compartment': 'ungrounded',
  'garcinia cambogia': 'MESH:D029761',
  'gas chromatograph': 'ungrounded',
  'gas chromatography': 'ungrounded',
  'gastric': 'ungrounded',
  'gastric cancer': 'MESH:D013274',
  'gastric cancer tissue': 'MESH:D013274',
  'gastric carcinoma': 'MESH:D013274',
  'gastrocnemius': 'ungrounded',
  'gc': 'ungrounded',
  'gc b cells': 'MESH:D018858',
  'gc026': 'ungrounded',
  'gel consists': 'ungrounded',
  'gemcitabine': 'CHEBI:CHEBI:175901',
  'gemcitabine cisplatin': 'CHEBI:CHEBI:27899',
  'gemcitabine plus cisplatin': 'ungrounded',
  'gene conversion': 'ungrounded',
  'generation cells': 'ungrounded',
  'geniposide and chlorogenic acid': 'ungrounded',
  'genomic classifier': 'ungrounded',
  'genomic control': 'ungrounded',
  'gentisate catabolism': 'GO:GO:0009056',
  'geometric center': 'ungrounded',
  'geriatric consultation': 'ungrounded',
  'germ cells': 'MESH:D005854',
  'german cockroach': 'MESH:D020048',
  'germinal center': 'MESH:D018858',
  'germinal centre': 'MESH:D018858',
  'giant cells': 'MESH:D015726',
  'giant cluster': 'ungrounded',
  'gland cistern': 'ungrounded',
  'glass carbon': 'ungrounded',
  'glass ceramic': 'ungrounded',
  'glassy carbon': 'CHEBI:CHEBI:27594',
  'glioblastoma cells': 'ungrounded',
  'gliomatosis cerebri': 'MESH:D018302',
  'global coherence': 'ungrounded',
  'glottic cancer': 'ungrounded',
  'glucagon': 'CHEBI:CHEBI:5391',
  'glucocerebrosidase': 'HGNC:4177',
  'glucocorticoid': 'MESH:D005938',
  'glucocorticoid hormone': 'MESH:D005938',
  'glucocorticosteroid': 'MESH:D005938',
  'glucosylceramide': 'HGNC:4177',
  'glycaemic control': 'ungrounded',
  'glycated chitosan': 'MESH:C121252',
  'glycemic control': 'EFO:0001461',
  'glycerol carbon': 'ungrounded',
  'glycocalicin': 'MESH:C023592',
  'glycocholate': 'CHEBI:CHEBI:29746',
  'glycocholate acid': 'ungrounded',
  'glycol chitin': 'MESH:C031610',
  'glycol chitosan': 'MESH:C118638',
  'glycyrrhizin': 'CHEBI:CHEBI:15939',
  'gnetin c': 'MESH:C575740',
  'goblet cells': 'MESH:D020397',
  'golgi complex': 'GO:GO:0005794',
  'gomisin c': 'ungrounded',
  'gonococci': 'ungrounded',
  'gonococcus': 'MESH:D009344',
  'granger causality': 'ungrounded',
  'granular cells': 'ungrounded',
  'granular component': 'GO:GO:0001652',
  'granule cells': 'ungrounded',
  'granulosa cells': 'MESH:D006107',
  'graphene crystals': 'ungrounded',
  'gray crescent': 'ungrounded',
  'greater curvature': 'ungrounded',
  'greensporone c': 'ungrounded',
  'ground cover': 'ungrounded',
  'group positive control': 'ungrounded',
  'group specific component': 'HGNC:4187',
  'growth cone': 'ungrounded',
  'guanine and cytosine': 'ungrounded',
  'guanine cytosine': 'ungrounded',
  'guanylate cyclase': 'FPLX:GUCY',
  'guanylyl cyclase': 'FPLX:GUCY',
  'guard cells': 'ungrounded',
  'guchang capsule': 'ungrounded',
  'guiding catheter': 'ungrounded',
  'gustatory cortex': 'ungrounded',
  'gustatory insular cortex': 'ungrounded'},
 {'CHEBI:CHEBI:16261': 'chitosan',
  'DOID:DOID:3121': 'gallbladder cancer',
  'CHEBI:CHEBI:68330': 'gallocatechin',
  'MESH:D029761': 'Garcinia cambogia',
  'MESH:D013274': 'Stomach Neoplasms',
  'MESH:D018858': 'Germinal Center',
  'CHEBI:CHEBI:175901': 'gemcitabine',
  'CHEBI:CHEBI:27899': 'cisplatin',
  'GO:GO:0009056': 'catabolic process',
  'MESH:D005854': 'Germ Cells',
  'MESH:D020048': 'Blattellidae',
  'MESH:D015726': 'Giant Cells',
  'CHEBI:CHEBI:27594': 'carbon atom',
  'MESH:D018302': 'Neoplasms, Neuroepithelial',
  'CHEBI:CHEBI:5391': 'glucagon',
  'HGNC:4177': 'GBA',
  'MESH:D005938': 'Glucocorticoids',
  'MESH:C121252': 'glycated chitosan',
  'EFO:0001461': 'control',
  'MESH:C023592': 'glycocalicin',
  'CHEBI:CHEBI:29746': 'glycocholate',
  'MESH:C031610': 'glycolchitin',
  'MESH:C118638': 'glycol-chitosan',
  'CHEBI:CHEBI:15939': 'glycyrrhizinic acid',
  'MESH:C575740': 'gnetin C',
  'MESH:D020397': 'Goblet Cells',
  'GO:GO:0005794': 'Golgi apparatus',
  'MESH:D009344': 'Neisseria gonorrhoeae',
  'GO:GO:0001652': 'granular component',
  'MESH:D006107': 'Granulosa Cells',
  'HGNC:4187': 'GC',
  'FPLX:GUCY': 'GUCY'},
 ['FPLX:GUCY', 'HGNC:4177', 'HGNC:4187', 'MESH:D005938']]

In [17]:
excluded_longforms = ['gc']

In [18]:
grounding_dict = {shortform: {longform: grounding_map[longform] 
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

if not os.path.exists(results_path):
    os.mkdir(results_path)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [19]:
additional_entities = {'HGNC:4187': ['GC', ['GC']]}

In [20]:
unambiguous_agent_texts = {}

In [21]:
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts((text, pmid) for pmid, text in all_texts.items())
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

entity_pmid_map = {entity: set(get_pmids_for_entity(*entity.split(':', maxsplit=1),
                                                major_topic=True))for entity in additional_entities}

In [22]:
intersection1 = []
for entity1, pmids1 in entity_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection1.append((entity1, entity2, len(pmids1 & pmids2)))

In [23]:
intersection2 = []
for entity1, pmids1 in agent_text_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection2.append((entity1, entity2, len(set(pmids1) & pmids2)))

In [24]:
intersection1

[('HGNC:4187', 'HGNC:4187', 1095)]

In [25]:
intersection2

[('FPLX:GUCY', 'HGNC:4187', 0),
 ('MESH:D005938', 'HGNC:4187', 0),
 ('MESH:D006107', 'HGNC:4187', 0),
 ('MESH:D018858', 'HGNC:4187', 0),
 ('GO:GO:0005794', 'HGNC:4187', 0),
 ('ungrounded', 'HGNC:4187', 0),
 ('HGNC:4177', 'HGNC:4187', 0),
 ('MESH:C023592', 'HGNC:4187', 0),
 ('MESH:D020397', 'HGNC:4187', 0),
 ('GO:GO:0001652', 'HGNC:4187', 0),
 ('CHEBI:CHEBI:29746', 'HGNC:4187', 0),
 ('MESH:D005854', 'HGNC:4187', 0),
 ('MESH:D015726', 'HGNC:4187', 0),
 ('MESH:D013274', 'HGNC:4187', 0),
 ('CHEBI:CHEBI:27594', 'HGNC:4187', 0),
 ('CHEBI:CHEBI:16261', 'HGNC:4187', 0),
 ('MESH:D009344', 'HGNC:4187', 0),
 ('EFO:0001461', 'HGNC:4187', 0),
 ('CHEBI:CHEBI:15939', 'HGNC:4187', 0),
 ('MESH:C121252', 'HGNC:4187', 0),
 ('CHEBI:CHEBI:68330', 'HGNC:4187', 0),
 ('MESH:C118638', 'HGNC:4187', 0),
 ('DOID:DOID:3121', 'HGNC:4187', 0),
 ('MESH:D018302', 'HGNC:4187', 0),
 ('MESH:D020048', 'HGNC:4187', 0),
 ('CHEBI:CHEBI:27899', 'HGNC:4187', 0),
 ('MESH:C575740', 'HGNC:4187', 0),
 ('CHEBI:CHEBI:5391', 'HGNC:41

In [26]:
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts[1]:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)
        
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        new_pmids = random.choices(new_pmids, k=10000)
    _, contains = additional_entities[entity]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [27]:
names.update({key: value[0] for key, value in additional_entities.items()})
names.update({key: value[0] for key, value in unambiguous_agent_texts.items()})
pos_labels = list(set(pos_labels) | additional_entities.keys() |
                  unambiguous_agent_texts.keys())

In [28]:
%%capture

classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-11-11 03:42:33] /adeft/PP/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-11-11 03:46:50] /adeft/PP/adeft/adeft/modeling/classify.py - Best f1 score of 0.951602605088884 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [29]:
classifier.stats

{'label_distribution': {'FPLX:GUCY': 260,
  'MESH:D005938': 1307,
  'MESH:D006107': 183,
  'MESH:D018858': 665,
  'GO:GO:0005794': 19,
  'ungrounded': 303,
  'HGNC:4177': 50,
  'MESH:C023592': 6,
  'MESH:D020397': 23,
  'GO:GO:0001652': 7,
  'CHEBI:CHEBI:29746': 7,
  'MESH:D005854': 11,
  'MESH:D015726': 4,
  'MESH:D013274': 2560,
  'CHEBI:CHEBI:27594': 9,
  'CHEBI:CHEBI:16261': 6,
  'MESH:D009344': 7,
  'EFO:0001461': 5,
  'CHEBI:CHEBI:15939': 2,
  'MESH:C121252': 9,
  'CHEBI:CHEBI:68330': 9,
  'MESH:C118638': 13,
  'DOID:DOID:3121': 4,
  'MESH:D018302': 5,
  'MESH:D020048': 10,
  'CHEBI:CHEBI:27899': 2,
  'MESH:C575740': 3,
  'CHEBI:CHEBI:5391': 2,
  'MESH:D029761': 6,
  'HGNC:4187': 240,
  'GO:GO:0009056': 1,
  'MESH:C031610': 1,
  'CHEBI:CHEBI:175901': 5},
 'f1': {'mean': 0.951603, 'std': 0.002927},
 'precision': {'mean': 0.947045, 'std': 0.006468},
 'recall': {'mean': 0.957998, 'std': 0.006278},
 'HGNC:4177': {'f1': {'mean': 0.841228, 'std': 0.071954},
  'pr': {'mean': 0.76, 'std'

In [30]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [31]:
disamb.dump(model_name, results_path)

In [32]:
print(disamb.info())

Disambiguation model for GC

Produces the disambiguations:
	Blattellidae	MESH:D020048
	GBA*	HGNC:4177
	GC*	HGNC:4187
	GUCY*	FPLX:GUCY
	Garcinia cambogia	MESH:D029761
	Germ Cells	MESH:D005854
	Germinal Center	MESH:D018858
	Giant Cells	MESH:D015726
	Glucocorticoids*	MESH:D005938
	Goblet Cells	MESH:D020397
	Golgi apparatus	GO:GO:0005794
	Granulosa Cells	MESH:D006107
	Neisseria gonorrhoeae	MESH:D009344
	Neoplasms, Neuroepithelial	MESH:D018302
	Stomach Neoplasms	MESH:D013274
	carbon atom	CHEBI:CHEBI:27594
	catabolic process	GO:GO:0009056
	chitosan	CHEBI:CHEBI:16261
	cisplatin	CHEBI:CHEBI:27899
	control	EFO:0001461
	gallbladder cancer	DOID:DOID:3121
	gallocatechin	CHEBI:CHEBI:68330
	gemcitabine	CHEBI:CHEBI:175901
	glucagon	CHEBI:CHEBI:5391
	glycated chitosan	MESH:C121252
	glycocalicin	MESH:C023592
	glycocholate	CHEBI:CHEBI:29746
	glycol-chitosan	MESH:C118638
	glycolchitin	MESH:C031610
	glycyrrhizinic acid	CHEBI:CHEBI:15939
	gnetin C	MESH:C575740
	granular component	GO:GO:0001652

Class level

In [33]:
model_to_s3(disamb)

In [30]:
preds = [disamb.disambiguate(text) for text in all_texts.values()]

In [31]:
texts = [text for pred, text in zip(preds, all_texts.values()) if pred[0] == 'HGNC:10967']

In [36]:
texts[3]

'The non-neuronal monoamine transporters (OCT1, OCT2, EMT, and PMAT) play a key role in the clearance of monoamines from extracellular compartments. In a previous report we described endometrial distribution and cyclic variation of the vesicular monoamine transporter (VMAT2) mRNA and the neuronal norepinephrine transporter (NET) mRNA. In the present study we used in situ hybridization, real-time PCR and immunohistochemistry to reveal tissue distribution and cyclic variation of mRNA for the non-neuronal monoamine transporters in the human endometrium and early pregnancy decidua. We found that non-neuronal monoamine transporters are predominantly expressed in the stroma. The plasma membrane monoamine transporter (PMAT) mRNA expression peaked in the proliferative phase, whereas the extra-neuronal monoamine transporter (EMT) mRNA expression peaked in the secretory phase. The organic cation transporter 2 (OCT2) mRNA expression was exclusively detected in few scattered stromal cells and OCT1