In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator, load_disambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
adeft_grounder = AdeftGrounder()

In [3]:
shortforms = ['EC']
model_name = ':'.join(sorted(escape_filename(shortform) for shortform in shortforms))
results_path = os.path.abspath(os.path.join('../..', 'results', model_name))

In [4]:
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms
    
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [5]:
list(zip(longforms, counts))

[('endothelial cells', 4110),
 ('endometrial cancer', 443),
 ('epicatechin', 243),
 ('entorhinal cortex', 236),
 ('esophageal cancer', 232),
 ('embryonal carcinoma', 231),
 ('endometrial carcinoma', 175),
 ('excitation contraction', 163),
 ('enterochromaffin', 128),
 ('elemental carbon', 111),
 ('electrical conductivity', 90),
 ('epithelial cells', 80),
 ('extracellular', 77),
 ('endocannabinoid', 68),
 ('epicutaneous', 55),
 ('evening complex', 55),
 ('epidermal cells', 49),
 ('elongation complex', 46),
 ('effortful control', 43),
 ('enriched condition', 42),
 ('escherichia coli', 36),
 ('esophageal carcinoma', 35),
 ('esterified cholesterol', 33),
 ('ethyl carbamate', 32),
 ('elite control', 32),
 ('eyes closed', 29),
 ('eccentric contraction', 29),
 ('electrochemical', 28),
 ('electrocoagulation', 27),
 ('energy charge', 25),
 ('electronic cigarette', 23),
 ('ethylcellulose', 22),
 ('ethyl cellulose', 20),
 ('enzyme commission', 18),
 ('ethylene carbon', 18),
 ('emergency contracept

In [7]:
try:
    disamb = load_disambiguator(shortforms[0])
    for shortform, gm in disamb.grounding_dict.items():
        for longform, grounding in gm.items():
            grounding_map[longform] = grounding
    for grounding, name in disamb.names.items():
        names[grounding] = name
    pos_labels = disamb.pos_labels
except Exception:
    pass

In [8]:
names

{'MESH:D002477': 'Cells',
 'DOID:DOID:2871': 'endometrial carcinoma',
 'MESH:D018728': 'Entorhinal Cortex',
 'MESH:D000077277': 'Esophageal Squamous Cell Carcinoma',
 'MESH:D018236': 'Carcinoma, Embryonal',
 'CHEBI:CHEBI:33415': 'elemental carbon',
 'MESH:D004553': 'Electric Conductivity',
 'GO:GO:0005576': 'extracellular region',
 'CHEBI:CHEBI:67197': 'endocannabinoid',
 'MESH:D000078404': 'Epidermal Cells',
 'EFO:0001461': 'control',
 'MESH:D004194': 'Disease',
 'MESH:D004926': 'Escherichia coli',
 'HP:HP:0011459': 'Esophageal carcinoma',
 'CHEBI:CHEBI:16113': 'cholesterol',
 'CHEBI:CHEBI:17967': 'urethane',
 'MESH:D004564': 'Electrocoagulation',
 'MESH:D003267': 'Contraception',
 'MESH:D018269': 'Carcinoma, Endometrioid',
 'MESH:D004759': 'Cell, Enterochromaffin',
 'MESH:D020895': 'Enterocytes',
 'CHEBI:CHEBI:16526': 'carbon dioxide',
 'HP:HP:0030645': 'Central',
 'MESH:D003233': 'Conjunctivitis, Allergic',
 'MESH:D056344': 'Executive Function',
 'FPLX:Cadherin': 'Cadherin',
 'MESH:

In [13]:
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8891)

In [14]:
result = [grounding_map, names, pos_labels]

In [15]:
result

[{'complex environment': 'ungrounded',
  'cyclophosphamide treated group': 'NCIT:C63421',
  'e cadherin': 'FPLX:Cadherin',
  'e cigarette': 'ungrounded',
  'e coli': 'MESH:D004926',
  'ec': 'ungrounded',
  'eccentric': 'ungrounded',
  'eccentric contraction': 'ungrounded',
  'eccentric muscle contraction': 'ungrounded',
  'echis carinatus': 'ungrounded',
  'ecklonia cava': 'ungrounded',
  'ectopic calcification': 'HP:HP:0010766',
  'eddy current': 'ungrounded',
  'effector cells': 'ungrounded',
  'effects concentrations': 'ungrounded',
  'effects connectivity': 'ungrounded',
  'effortful control': 'ungrounded',
  'ehrlich carcinoma': 'ungrounded',
  'eigenvector central': 'ungrounded',
  'electrical conductivity': 'ungrounded',
  'electrocautery': 'MESH:D004564',
  'electrochemical': 'ungrounded',
  'electrochemistry': 'ungrounded',
  'electrochromic': 'ungrounded',
  'electrocoagulation': 'ungrounded',
  'electronic cigarette': 'ungrounded',
  'electrostatic complex': 'ungrounded',
  

In [40]:
grounding_map, names, pos_labels = [{'complex environment': 'ungrounded',
  'cyclophosphamide treated group': 'NCIT:C63421',
  'e cadherin': 'FPLX:Cadherin',
  'e cigarette': 'ungrounded',
  'e coli': 'MESH:D004926',
  'ec': 'ungrounded',
  'eccentric': 'ungrounded',
  'eccentric contraction': 'ungrounded',
  'eccentric muscle contraction': 'ungrounded',
  'echis carinatus': 'ungrounded',
  'ecklonia cava': 'ungrounded',
  'ectopic calcification': 'HP:HP:0010766',
  'eddy current': 'ungrounded',
  'effector cells': 'ungrounded',
  'effects concentrations': 'ungrounded',
  'effects connectivity': 'ungrounded',
  'effortful control': 'ungrounded',
  'ehrlich carcinoma': 'ungrounded',
  'eigenvector central': 'ungrounded',
  'electrical conductivity': 'ungrounded',
  'electrocautery': 'MESH:D004564',
  'electrochemical': 'ungrounded',
  'electrochemistry': 'ungrounded',
  'electrochromic': 'ungrounded',
  'electrocoagulation': 'ungrounded',
  'electronic cigarette': 'ungrounded',
  'electrostatic complex': 'ungrounded',
  'elemental carbon': 'ungrounded',
  'elevated co 2': 'ungrounded',
  'elevated co2': 'ungrounded',
  'elite control': 'ungrounded',
  'elongation complex': 'ungrounded',
  'embryogenic calli': 'ungrounded',
  'embryogenic callus': 'ungrounded',
  'embryonal carcinoma': 'MESH:D018236',
  'embryonal carcinoma cells': 'MESH:D054278',
  'emergency contraception': 'MESH:D003281',
  'empathic concern': 'ungrounded',
  'emulsifying capacity': 'ungrounded',
  'emulsifying concentrations': 'ungrounded',
  'enclomiphene': 'MESH:D064695',
  'endemic control': 'ungrounded',
  'endocannabinoid': 'MESH:D063388',
  'endocardial cushion': 'MESH:D054089',
  'endochondral': 'ungrounded',
  'endocytoscopy': 'ungrounded',
  'endogenous cannabinoid': 'MESH:D063388',
  'endometrial adenocarcinoma': 'DOID:DOID:2871',
  'endometrial cancer': 'DOID:DOID:2871',
  'endometrial carcinoma': 'DOID:DOID:2871',
  'endometrioid carcinoma': 'DOID:DOID:2871',
  'endothelial cells': 'MESH:D042783',
  'endothelial cells s': 'MESH:D042783',
  'endothelium cells': 'MESH:D042783',
  'energy charge': 'ungrounded',
  'enkephalin containing': 'ungrounded',
  'enriched': 'ungrounded',
  'enriched condition': 'ungrounded',
  'ensemble coding': 'ungrounded',
  'enteric coated': 'ungrounded',
  'enterochromaffin': 'MESH:D004759',
  'enterochromaffin cells': 'MESH:D004759',
  'enterocyte': 'ungrounded',
  'entorhinal cortex': 'ungrounded',
  'entorhinal cortical': 'ungrounded',
  'environmental complex': 'ungrounded',
  'enzyme classification': 'MESH:D002965',
  'enzyme coding': 'ungrounded',
  'enzyme commission': 'ungrounded',
  'eosinophilic cytoplasm': 'ungrounded',
  'epicardial': 'ungrounded',
  'epicatechin': 'CHEBI:CHEBI:90',
  'epicutaneous': 'ungrounded',
  'epicutaneous sensitivity': 'ungrounded',
  'epidermal cells': 'MESH:D000078404',
  'epigallocatechin': 'CHEBI:CHEBI:90',
  'epirubicin plus cyclophosphamide': 'NCIT:C63421',
  'epithelial cells': 'MESH:D004847',
  'erythroid colony': 'ungrounded',
  'esat 6 cfp 10': 'ungrounded',
  'escherichia coli': 'MESH:D004926',
  'escherichia coli k12 mg1655': 'MESH:D004926',
  'escherichia coli o157 h7': 'MESH:D004926',
  'esophageal cancer': 'HP:HP:0011459',
  'esophageal carcinoma': 'HP:HP:0011459',
  'esophagus cancer': 'HP:HP:0011459',
  'essence of chicken': 'ungrounded',
  'esterified cholesterol': 'ungrounded',
  'esterified collagen': 'ungrounded',
  'ethnic conflicts': 'ungrounded',
  'ethyl carbamate': 'ungrounded',
  'ethyl cellulose': 'ungrounded',
  'ethyl chloride': 'CHEBI:CHEBI:47554',
  'ethylcellulose': 'ungrounded',
  'ethylene carbon': 'ungrounded',
  'eucaloric feeding': 'ungrounded',
  'eucheuma cottonii': 'ungrounded',
  'euchromatin': 'GO:GO:0000791',
  'euro collins': 'ungrounded',
  'european commission': 'ungrounded',
  'euthermic control': 'ungrounded',
  'evaluate condition': 'ungrounded',
  'evening complex': 'ungrounded',
  'excitation contraction': 'ungrounded',
  'excretory cells': 'ungrounded',
  'executive capacity': 'ungrounded',
  'executive control': 'MESH:D056344',
  'exercise capacity': 'ungrounded',
  'exercise control': 'ungrounded',
  'expanded criteria': 'ungrounded',
  'expectancy challenge': 'ungrounded',
  'experimental allergic conjunctivitis': 'MESH:D003233',
  'experimental colitis': 'MESH:D003092',
  'experimental immune mediated blepharoconjunctivitis': 'DOID:DOID:2456',
  'expression coherence': 'ungrounded',
  'extended consciousness': 'ungrounded',
  'external capsule': 'MESH:D066271',
  'extracellular': 'ungrounded',
  'extracellular cadherin': 'FPLX:Cadherin',
  'extracellular calcium binding': 'ungrounded',
  'extracellular domain': 'ungrounded',
  'extracorporeal circulating': 'ungrounded',
  'eyes closed': 'ungrounded',
  'vascular endothelium': 'MESH:D004730'},
 {'NCIT:C63421': 'Cyclophosphamide/Epirubicin Regimen',
  'FPLX:Cadherin': 'Cadherin',
  'MESH:D004926': 'Escherichia coli',
  'HP:HP:0010766': 'Ectopic calcification',
  'MESH:D004564': 'Electrocoagulation',
  'MESH:D018236': 'Carcinoma, Embryonal',
  'MESH:D054278': 'Embryonal Carcinoma Stem Cells',
  'MESH:D003281': 'Contraceptives, Postcoital',
  'MESH:D064695': 'Enclomiphene',
  'MESH:D063388': 'Endocannabinoids',
  'MESH:D054089': 'Endocardial Cushions',
  'DOID:DOID:2871': 'endometrial carcinoma',
  'MESH:D042783': 'Endothelial Cells',
  'MESH:D004759': 'Cell, Enterochromaffin',
  'MESH:D002965': 'Classification',
  'CHEBI:CHEBI:90': '(-)-epicatechin',
  'MESH:D000078404': 'Epidermal Cells',
  'MESH:D004847': 'Epithelial Cells',
  'HP:HP:0011459': 'Esophageal carcinoma',
  'CHEBI:CHEBI:47554': 'chloroethane',
  'GO:GO:0000791': 'euchromatin',
  'MESH:D056344': 'Executive Function',
  'MESH:D003233': 'Conjunctivitis, Allergic',
  'MESH:D003092': 'Colitis',
  'DOID:DOID:2456': 'blepharoconjunctivitis',
  'MESH:D066271': 'External Capsule',
  'MESH:D004730': 'Endothelium, Vascular'},
 ['CHEBI:CHEBI:90',
  'HP:HP:0011459'
  'DOID:DOID:2871',
  'FPLX:Cadherin',
  'MESH:D004759',
  'MESH:D016889',
  'MESH:D018236',
  'MESH:D042783',
  'MESH:D063388']]

In [41]:
excluded_longforms = ['ec']

In [42]:
grounding_dict = {shortform: {longform: grounding_map[longform] 
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

if not os.path.exists(results_path):
    os.mkdir(results_path)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [43]:
additional_entities = {}

In [44]:
unambiguous_agent_texts = {}

In [45]:
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts((text, pmid) for pmid, text in all_texts.items())
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

entity_pmid_map = {entity: set(get_pmids_for_entity(*entity.split(':', maxsplit=1),
                                                major_topic=True))for entity in additional_entities}

In [18]:
intersection1 = []
for entity1, pmids1 in entity_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection1.append((entity1, entity2, len(pmids1 & pmids2)))

In [19]:
intersection2 = []
for entity1, pmids1 in agent_text_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection2.append((entity1, entity2, len(set(pmids1) & pmids2)))

In [20]:
intersection1

[('HGNC:10967', 'HGNC:10967', 98)]

In [21]:
intersection2

[('GO:GO:0001837', 'HGNC:10967', 0),
 ('ungrounded', 'HGNC:10967', 0),
 ('HGNC:10967', 'HGNC:10967', 0),
 ('MESH:D055032', 'HGNC:10967', 0)]

In [22]:
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts[1]:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)
        
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        new_pmids = random.choices(new_pmids, k=10000)
    _, contains = additional_entities[entity]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [23]:
names.update({key: value[0] for key, value in additional_entities.items()})
names.update({key: value[0] for key, value in unambiguous_agent_texts.items()})
pos_labels = list(set(pos_labels) | additional_entities.keys() |
                  unambiguous_agent_texts.keys())

In [46]:
%%capture

classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-11-03 02:22:55] /adeft/PP/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-11-03 02:26:55] /adeft/PP/adeft/adeft/modeling/classify.py - Best f1 score of 0.9593306972252437 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [47]:
classifier.stats

{'label_distribution': {'MESH:D004759': 100,
  'MESH:D018236': 193,
  'DOID:DOID:2871': 445,
  'MESH:D042783': 3354,
  'ungrounded': 1134,
  'MESH:D000078404': 48,
  'MESH:D004847': 61,
  'HP:HP:0011459': 193,
  'MESH:D054278': 3,
  'MESH:D004926': 40,
  'MESH:D064695': 2,
  'CHEBI:CHEBI:90': 177,
  'MESH:D004730': 4,
  'MESH:D066271': 9,
  'FPLX:Cadherin': 11,
  'DOID:DOID:2456': 6,
  'MESH:D054089': 7,
  'MESH:D003281': 10,
  'MESH:D056344': 7,
  'MESH:D003233': 14,
  'MESH:D063388': 57,
  'MESH:D002965': 4,
  'MESH:D003092': 2,
  'NCIT:C63421': 3,
  'GO:GO:0000791': 3,
  'MESH:D004564': 4,
  'HP:HP:0010766': 1,
  'CHEBI:CHEBI:47554': 1},
 'f1': {'mean': 0.959331, 'std': 0.002216},
 'precision': {'mean': 0.952091, 'std': 0.004441},
 'recall': {'mean': 0.968141, 'std': 0.003832},
 'CHEBI:CHEBI:90': {'f1': {'mean': 0.866468, 'std': 0.026799},
  'pr': {'mean': 0.808571, 'std': 0.048948},
  'rc': {'mean': 0.93697, 'std': 0.039436}},
 'HP:HP:0010766': {'f1': {'mean': 0.0, 'std': 0.0},
  '

In [48]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [49]:
disamb.dump(model_name, results_path)

In [50]:
print(disamb.info())

Disambiguation model for EC

Produces the disambiguations:
	(-)-epicatechin*	CHEBI:CHEBI:90
	Cadherin*	FPLX:Cadherin
	Carcinoma, Embryonal*	MESH:D018236
	Cell, Enterochromaffin*	MESH:D004759
	Classification	MESH:D002965
	Colitis	MESH:D003092
	Conjunctivitis, Allergic	MESH:D003233
	Contraceptives, Postcoital	MESH:D003281
	Cyclophosphamide/Epirubicin Regimen	NCIT:C63421
	Ectopic calcification	HP:HP:0010766
	Electrocoagulation	MESH:D004564
	Embryonal Carcinoma Stem Cells	MESH:D054278
	Enclomiphene	MESH:D064695
	Endocannabinoids*	MESH:D063388
	Endocardial Cushions	MESH:D054089
	Endothelial Cells*	MESH:D042783
	Endothelium, Vascular	MESH:D004730
	Epidermal Cells	MESH:D000078404
	Epithelial Cells	MESH:D004847
	Escherichia coli	MESH:D004926
	Esophageal carcinoma	HP:HP:0011459
	Executive Function	MESH:D056344
	External Capsule	MESH:D066271
	blepharoconjunctivitis	DOID:DOID:2456
	chloroethane	CHEBI:CHEBI:47554
	endometrial carcinoma	DOID:DOID:2871
	euchromatin	GO:GO:0000791

Class level metrics

In [51]:
model_to_s3(disamb)

In [30]:
preds = [disamb.disambiguate(text) for text in all_texts.values()]

In [31]:
texts = [text for pred, text in zip(preds, all_texts.values()) if pred[0] == 'HGNC:10967']

In [36]:
texts[3]

'The non-neuronal monoamine transporters (OCT1, OCT2, EMT, and PMAT) play a key role in the clearance of monoamines from extracellular compartments. In a previous report we described endometrial distribution and cyclic variation of the vesicular monoamine transporter (VMAT2) mRNA and the neuronal norepinephrine transporter (NET) mRNA. In the present study we used in situ hybridization, real-time PCR and immunohistochemistry to reveal tissue distribution and cyclic variation of mRNA for the non-neuronal monoamine transporters in the human endometrium and early pregnancy decidua. We found that non-neuronal monoamine transporters are predominantly expressed in the stroma. The plasma membrane monoamine transporter (PMAT) mRNA expression peaked in the proliferative phase, whereas the extra-neuronal monoamine transporter (EMT) mRNA expression peaked in the secretory phase. The organic cation transporter 2 (OCT2) mRNA expression was exclusively detected in few scattered stromal cells and OCT1