In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used by the mining cell (adeft_grounder.ground(longform)) to propose an
# initial grounding and name for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) this notebook builds a disambiguation model for.
shortforms = ['AC']

# The model name is the sorted, filename-escaped shortforms joined with ':'.
escaped_shortforms = sorted(map(escape_filename, shortforms))
model_name = ':'.join(escaped_shortforms)

# Output directory for this model, resolved against the current working directory.
results_dir = os.path.join('../..', 'results', model_name)
results_path = os.path.abspath(results_dir)

In [4]:
# --- Mine candidate longforms for every shortform --------------------------
# miners:    shortform -> AdeftMiner that has processed that shortform's texts
# all_texts: pmid -> plaintext, accumulated over all shortforms
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Sample WITHOUT replacement. random.choices draws with replacement, so it
        # returned duplicate PMIDs and fewer than 10000 distinct texts.
        # NOTE(review): the sample is unseeded, so the subset differs between runs.
        pmids = random.sample(pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Discard empty or near-empty texts (5 characters or fewer).
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# --- Keep only longforms with enough support -------------------------------
# Rows are (longform, count, score); a row is kept when count*score > 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# --- Merge counts across shortforms ----------------------------------------
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# --- Propose an automatic grounding for each longform ----------------------
# Only the first result from the grounder is used; longforms with no result are
# left out of grounding_map.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Parallel tuples ordered by descending count. zip(*[]) yields nothing to unpack,
# so fall back to empty tuples when no longform passed the filter.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()
pos_labels = []

In [5]:
# Show each mined longform with its combined count, most frequent first.
list(zip(longforms, counts))

[('adenylyl cyclase', 1135),
 ('adenylate cyclase', 809),
 ('activation carbon', 133),
 ('adenocarcinoma', 121),
 ('auditory cortex', 108),
 ('alternating current', 82),
 ('anchor cell', 75),
 ('apoptotic cell', 59),
 ('anterior chamber', 58),
 ('acid ceramidase', 58),
 ('allergic conjunctivitis', 48),
 ('adjuvant chemotherapy', 47),
 ('activation charcoal', 46),
 ('articular cartilage', 42),
 ('accessory cell', 39),
 ('acromioclavicular', 32),
 ('arrhythmogenic cardiomyopathy', 32),
 ('antioxidant capacity', 27),
 ('antrodia camphorata', 26),
 ('anticoagulation', 26),
 ('and cyclophosphamide', 25),
 ('amylose content', 24),
 ('attenuation correction', 24),
 ('abdominal circumference', 23),
 ('acylcarnitine', 22),
 ('aconitine', 22),
 ('anthocyanins', 20),
 ('anterior commissure', 19),
 ('acute cholecystitis', 19),
 ('aberrant crypts', 19),
 ('air conducted', 19),
 ('astrocytes', 19),
 ('antrodia cinnamomea', 18),
 ('anthracycline', 18),
 ('adherent cell', 18),
 ('aortic calcification'

In [6]:
# Hand the automatic groundings to the adeft grounding GUI for manual curation; the
# returned values replace grounding_map, names and pos_labels.
# NOTE(review): no_browser=True and port=8890 are hard-coded, presumably for a
# remote/headless session. Confirm before running in another environment.
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8890)

INFO: [2020-10-10 02:11:11] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [7]:
# Bundle the curated groundings into one list; presumably so the displayed repr can
# be pasted back into the notebook as a literal. Confirm.
result = [grounding_map, names, pos_labels]

In [8]:
# Display the [grounding_map, names, pos_labels] bundle.
result

[{'a cinnamomea mycelia': 'TAXONOMY:2696576',
  'abdominal circumference': 'ungrounded',
  'abdominal compression': 'ungrounded',
  'aberrant crypts': 'ungrounded',
  'absolute configuration': 'ungrounded',
  'absorptive capacity': 'ungrounded',
  'ac': 'ungrounded',
  'acacetin': 'MESH:C023717',
  'acalculous cholecystitis': 'MESH:D042101',
  'acceleration': 'ungrounded',
  'acceleration capacity': 'ungrounded',
  'accessory cell': 'MESH:D000938',
  'acclimation': 'MESH:D000064',
  'accuracy': 'ungrounded',
  'acebutolol': 'CHEBI:CHEBI:2379',
  'acenocoumarol': 'CHEBI:CHEBI:53766',
  'acetaldehyde': 'CHEBI:CHEBI:15343',
  'acetate': 'CHEBI:CHEBI:30089',
  'acetoin': 'CHEBI:CHEBI:15688',
  'acetone': 'CHEBI:CHEBI:15347',
  'acetylcholine': 'CHEBI:CHEBI:15355',
  'acetylcysteine': 'CHEBI:CHEBI:22198',
  'acid': 'ungrounded',
  'acid ceramidase': 'MESH:D055573',
  'acid cluster': 'ungrounded',
  'aconitine': 'CHEBI:CHEBI:2430',
  'acrolein': 'CHEBI:CHEBI:15368',
  'acromioclavicular': 'u

In [19]:
# Hard-coded [grounding_map, names, pos_labels]. The entries visible in the saved
# `result` output above match the start of this literal, so this appears to be a
# pasted copy of the curated GUI result. Presumably it lets the notebook be re-run
# without the interactive grounding step; confirm.
#   grounding_map: longform -> grounding string ('ungrounded' where no entity was assigned)
#   names:         grounding string -> display name
#   pos_labels:    groundings later passed to AdeftClassifier as pos_labels
# NOTE(review): this cell ran as In [19], out of order with its neighbours.
grounding_map, names, pos_labels = [{'a cinnamomea mycelia': 'TAXONOMY:2696576',
  'abdominal circumference': 'ungrounded',
  'abdominal compression': 'ungrounded',
  'aberrant crypts': 'ungrounded',
  'absolute configuration': 'ungrounded',
  'absorptive capacity': 'ungrounded',
  'ac': 'ungrounded',
  'acacetin': 'MESH:C023717',
  'acalculous cholecystitis': 'MESH:D042101',
  'acceleration': 'ungrounded',
  'acceleration capacity': 'ungrounded',
  'accessory cell': 'MESH:D000938',
  'acclimation': 'MESH:D000064',
  'accuracy': 'ungrounded',
  'acebutolol': 'CHEBI:CHEBI:2379',
  'acenocoumarol': 'CHEBI:CHEBI:53766',
  'acetaldehyde': 'CHEBI:CHEBI:15343',
  'acetate': 'CHEBI:CHEBI:30089',
  'acetoin': 'CHEBI:CHEBI:15688',
  'acetone': 'CHEBI:CHEBI:15347',
  'acetylcholine': 'CHEBI:CHEBI:15355',
  'acetylcysteine': 'CHEBI:CHEBI:22198',
  'acid': 'ungrounded',
  'acid ceramidase': 'MESH:D055573',
  'acid cluster': 'ungrounded',
  'aconitine': 'CHEBI:CHEBI:2430',
  'acrolein': 'CHEBI:CHEBI:15368',
  'acromioclavicular': 'ungrounded',
  'acrylamide': 'CHEBI:CHEBI:28619',
  'actin cheilitis': 'ungrounded',
  'activation': 'ungrounded',
  'activation carbon': 'NCIT:C77524',
  'activation charcoal': 'NCIT:C77524',
  'activation control': 'ungrounded',
  'activation count': 'ungrounded',
  'acupuncture': 'MESH:D026881',
  'acute cholangitis': 'DOID:DOID:14271',
  'acute cholecystitis': 'MESH:D041881',
  'acute cocaine': 'CHEBI:CHEBI:27958',
  'acyl': 'ungrounded',
  'acylcarnitine': 'MESH:C116917',
  'addition concerns': 'ungrounded',
  'adenocarcinoma': 'MESH:D000230',
  'adenylate cyclase': 'FPLX:ADCY',
  'adenylcyclase': 'ungrounded',
  'adenylyl cyclase': 'FPLX:ADCY',
  'adenylylcyclase': 'ungrounded',
  'adf cofilin': 'FPLX:Cofilin',
  'adherent cell': 'ungrounded',
  'adherent culture': 'ungrounded',
  'adhesive capsulitis': 'ungrounded',
  'adjuvant chemotherapy': 'MESH:D017024',
  'adrenal crisis': 'ungrounded',
  'adrenocortical': 'ungrounded',
  'adriamycin cyclophosphamide': 'NCIT:C63415',
  'affected commitment': 'ungrounded',
  'after calving': 'ungrounded',
  'aggregation coefficient': 'ungrounded',
  'air conditions': 'ungrounded',
  'air conducted': 'ungrounded',
  'alcohol cirrhosis': 'MESH:D008104',
  'alcohol consumption': 'MESH:D000428',
  'alginate chitosan': 'CHEBI:CHEBI:16261',
  'all cases': 'ungrounded',
  'allergic colitis': 'MESH:D003092',
  'allergic conjunctivitis': 'MESH:D003233',
  'alocasia cucullata': 'ungrounded',
  'alternating current': 'alternating_current',
  'aluminum chloride': 'CHEBI:CHEBI:30114',
  'amacrine cell': 'MESH:D025042',
  'aminochrome': 'MESH:C091584',
  'ammonium chloride': 'MESH:D000643',
  'amniocentesis': 'MESH:D000649',
  'amoxicillin clavulanate': 'CHEBI:CHEBI:487869',
  'amylose content': 'ungrounded',
  'anaerobic capacity': 'ungrounded',
  'ananas comosus': 'ungrounded',
  'ananas comosus l leaves': 'ungrounded',
  'anchor cell': 'anchor_cell',
  'and cyclophosphamide': 'NCIT:C63415',
  'anemia of cancer': 'ungrounded',
  'animal cap': 'ungrounded',
  'anterior chamber': 'MESH:D000867',
  'anterior cingulate': 'ungrounded',
  'anterior cingulate cortex': 'ungrounded',
  'anterior commissure': 'ungrounded',
  'anthocyanins': 'CHEBI:CHEBI:38697',
  'anthocyanins complex': 'ungrounded',
  'anthracenecarboxylate': 'ungrounded',
  'anthracycline': 'NCIT:C63415',
  'anthracycline plus cyclophosphamide': 'NCIT:C63415',
  'anticholinergic': 'ungrounded',
  'anticoagulation': 'ungrounded',
  'antioxidant capacity': 'ungrounded',
  'antrodia camphorata': 'TAXONOMY:2696576',
  'antrodia cinnamomea': 'TAXONOMY:2696576',
  'aorta constriction': 'ungrounded',
  'aortic calcification': 'MESH:C562942',
  'aortic calcium': 'MESH:C562942',
  'aortic coarctation': 'MESH:D001017',
  'aortic compliance': 'ungrounded',
  'aortic constriction': 'ungrounded',
  'apatite cement': 'ungrounded',
  'apical constriction': 'GO:GO:0003383',
  'apoptotic cell': 'apoptotic_cell',
  'appraisal committee': 'ungrounded',
  'arachnoid cysts': 'MESH:D016080',
  'aralia continentalis kitagawa': 'ungrounded',
  'arctigenin': 'MESH:C071942',
  'arm circumference': 'ungrounded',
  'arrhythmogenic cardiomyopathy': 'ungrounded',
  'artemisia capillaris': 'ungrounded',
  'arterial calcification': 'HP:HP:0003207',
  'arterial compliance': 'ungrounded',
  'arthritic control': 'ungrounded',
  'articular cartilage': 'MESH:D002358',
  'articular chondrocytes': 'ungrounded',
  'ascending colon': 'MESH:D044682',
  'assembly compartment': 'ungrounded',
  'assisted cough': 'ungrounded',
  'associated commissure': 'ungrounded',
  'astrocytes': 'MESH:D001253',
  'atherogenic coefficient': 'ungrounded',
  'atorvastatin calcium': 'CHEBI:CHEBI:50686',
  'attention control': 'ungrounded',
  'attenuation correction': 'ungrounded',
  'atypical carcinoid': 'MESH:D002276',
  'auditory cortex': 'MESH:D001303',
  'auditory cortical': 'ungrounded',
  'augmentation cystoplasty': 'ungrounded',
  'autochaperone': 'ungrounded',
  'autoclaved': 'ungrounded',
  'available chlorine': 'MESH:D002713',
  'awake craniotomy': 'MESH:D003399',
  'axon compartment': 'ungrounded'},
 {'TAXONOMY:2696576': 'Taiwanofungus camphoratus',
  'MESH:C023717': 'acacetin',
  'MESH:D042101': 'Acalculous Cholecystitis',
  'MESH:D000938': 'Antigen-Presenting Cells',
  'MESH:D000064': 'Acclimatization',
  'CHEBI:CHEBI:2379': 'acebutolol',
  'CHEBI:CHEBI:53766': 'acenocoumarol',
  'CHEBI:CHEBI:15343': 'acetaldehyde',
  'CHEBI:CHEBI:30089': 'acetate',
  'CHEBI:CHEBI:15688': 'acetoin',
  'CHEBI:CHEBI:15347': 'acetone',
  'CHEBI:CHEBI:15355': 'acetylcholine',
  'CHEBI:CHEBI:22198': 'acetylcysteine',
  'MESH:D055573': 'Acid Ceramidase',
  'CHEBI:CHEBI:2430': 'aconitine',
  'CHEBI:CHEBI:15368': 'acrolein',
  'CHEBI:CHEBI:28619': 'acrylamide',
  'NCIT:C77524': 'Activated Charcoal',
  'MESH:D026881': 'Acupuncture',
  'DOID:DOID:14271': 'acute cholangitis',
  'MESH:D041881': 'Cholecystitis, Acute',
  'CHEBI:CHEBI:27958': 'cocaine',
  'MESH:C116917': 'acylcarnitine',
  'MESH:D000230': 'Adenocarcinoma',
  'FPLX:ADCY': 'ADCY',
  'FPLX:Cofilin': 'Cofilin',
  'MESH:D017024': 'Chemotherapy, Adjuvant',
  'NCIT:C63415': 'AC Regimen',
  'MESH:D008104': 'Liver Cirrhosis, Alcoholic',
  'MESH:D000428': 'Alcohol Drinking',
  'CHEBI:CHEBI:16261': 'chitosan',
  'MESH:D003092': 'Colitis',
  'MESH:D003233': 'Conjunctivitis, Allergic',
  'alternating_current': 'alternating_current',
  'CHEBI:CHEBI:30114': 'aluminium trichloride',
  'MESH:D025042': 'Amacrine Cells',
  'MESH:C091584': 'aminochrome 1',
  'MESH:D000643': 'Ammonium Chloride',
  'MESH:D000649': 'Amniocentesis',
  'CHEBI:CHEBI:487869': 'clavulanate',
  'anchor_cell': 'anchor_cell',
  'MESH:D000867': 'Anterior Chamber',
  'CHEBI:CHEBI:38697': 'anthocyanin',
  'MESH:C562942': 'Aortic Valve, Calcification of',
  'MESH:D001017': 'Aortic Coarctation',
  'GO:GO:0003383': 'apical constriction',
  'apoptotic_cell': 'apoptotic_cell',
  'MESH:D016080': 'Arachnoid Cysts',
  'MESH:C071942': 'arctigenin',
  'HP:HP:0003207': 'Arterial calcification',
  'MESH:D002358': 'Cartilage, Articular',
  'MESH:D044682': 'Colon, Ascending',
  'MESH:D001253': 'Astrocytes',
  'CHEBI:CHEBI:50686': 'atorvastatin calcium',
  'MESH:D002276': 'Carcinoid Tumor',
  'MESH:D001303': 'Auditory Cortex',
  'MESH:D002713': 'Chlorine',
  'MESH:D003399': 'Craniotomy'},
 ['FPLX:ADCY',
  'MESH:D000230',
  'MESH:D000867',
  'MESH:D000938',
  'MESH:D002358',
  'MESH:D003233',
  'MESH:D017024',
  'MESH:D055573',
  'NCIT:C63415',
  'NCIT:C77524',
  'TAXONOMY:2696576',
  'CHEBI:CHEBI:2430',
  'FPLX:Cofilin',
  'MESH:D041881',
  'MESH:C562942']]

In [10]:
# Longforms to leave out when grounding_dict is built in the next cell; an empty
# list keeps every longform that has an entry in grounding_map.
excluded_longforms = []

In [11]:
# Build the per-shortform grounding dictionary: shortform -> {longform: grounding}.
# A longform is kept only if it has an entry in grounding_map and is not listed in
# excluded_longforms.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs also creates missing parent directories (e.g. a not-yet-existing
# `results` folder), and exist_ok=True avoids the check-then-create race that
# os.path.exists + os.mkdir had.
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [12]:
# Extra entities whose PMIDs (fetched with major_topic=True) supply additional
# labelled training texts. Keys are split on the first ':' before being passed to
# get_pmids_for_entity. Empty here, so no extra texts are fetched.
additional_entities = {}

In [13]:
# Maps an entity label to a collection of agent texts; texts retrieved for those
# agent texts are appended to the corpus under that label. Empty here.
unambiguous_agent_texts = {}

In [14]:
# Build the labelled corpus from every collected text. Iterating the corpus yields
# (text, label, id) triples, where the id is the PMID supplied with each text.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# label -> list of PMIDs whose text received that label.
agent_text_pmid_map = defaultdict(list)
for _text, label, text_id in corpus:
    agent_text_pmid_map[label].append(text_id)

# entity -> set of PMIDs returned for that entity with major_topic=True.
# Each key is split on its first ':' and the pieces are passed positionally.
entity_pmid_map = {}
for entity in additional_entities:
    entity_pmid_map[entity] = set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )

In [15]:
# Size of the PMID overlap for every ordered pair of additional entities
# (self-pairs included), as (entity1, entity2, n_shared) tuples.
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [16]:
# Size of the PMID overlap between each corpus label and each additional entity,
# as (label, entity, n_shared) tuples. The label's PMID list is converted to a set
# before intersecting.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [17]:
# Display the entity-vs-entity PMID overlaps.
# NOTE(review): the saved output lists an HGNC entity although additional_entities
# is empty in this notebook. This looks like stale output from an earlier kernel
# state; re-run to confirm.
intersection1

[('HGNC:17981', 'HGNC:17981', 29)]

In [18]:
# Display the label-vs-entity PMID overlaps.
# NOTE(review): the saved output does not match the empty additional_entities above
# and looks stale; re-run to confirm.
intersection2

[('ungrounded', 'HGNC:17981', 0)]

In [20]:
# --- Add texts for unambiguous agent texts ---------------------------------
# For each entity, fetch texts for its agent texts and append them to the corpus
# labelled with that entity. PMIDs already in all_texts, or already used for the
# same entity, are skipped.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# --- Add texts for additional entities -------------------------------------
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample WITHOUT replacement. random.choices draws with replacement and so
        # returned duplicate PMIDs, shrinking the effective sample.
        new_pmids = random.sample(new_pmids, k=10000)
    # NOTE(review): this `contains` list is hard-coded and looks left over from a
    # notebook for a different shortform. It does not depend on `entity` or on this
    # notebook's shortforms. Harmless while additional_entities is empty; replace it
    # with the relevant synonyms before adding any entity.
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=['RTCA', 'RTCD1', 'RPC', 'RTC1', 'RTC'])
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [None]:
# Register display names for the additional entities. The original line referenced
# the misspelled name `additional_entitie`, which raises NameError.
# NOTE(review): assumes additional_entities maps grounding -> display name, the same
# shape as `names`. Confirm before populating it.
names.update(additional_entities)

In [20]:
%%capture

# Split the corpus of (text, label, pmid) triples into three parallel tuples.
texts, labels, pmids = zip(*corpus)

# Single-point parameter grid: the search evaluates exactly one configuration.
param_grid = {'C': [100.0], 'max_features': [10000]}

# Cross-validate with 5 folds on 5 parallel jobs; random_state is fixed at 1729.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-10 02:59:53] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-10 03:02:32] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9358074955051074 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [21]:
classifier.stats

{'label_distribution': {'FPLX:ADCY': 1644,
  'CHEBI:CHEBI:15355': 3,
  'ungrounded': 410,
  'MESH:D001253': 8,
  'NCIT:C77524': 148,
  'MESH:D000867': 47,
  'CHEBI:CHEBI:15347': 5,
  'CHEBI:CHEBI:22198': 8,
  'MESH:D000938': 38,
  'CHEBI:CHEBI:28619': 3,
  'alternating_current': 61,
  'CHEBI:CHEBI:15368': 6,
  'CHEBI:CHEBI:30089': 3,
  'anchor_cell': 52,
  'CHEBI:CHEBI:53766': 4,
  'MESH:D008104': 5,
  'MESH:D001017': 3,
  'CHEBI:CHEBI:2430': 18,
  'MESH:D000428': 4,
  'MESH:D000230': 83,
  'MESH:D000643': 8,
  'MESH:C116917': 13,
  'NCIT:C63415': 37,
  'MESH:D002276': 7,
  'MESH:D055573': 44,
  'CHEBI:CHEBI:2379': 3,
  'MESH:D042101': 2,
  'MESH:D001303': 62,
  'MESH:D003233': 29,
  'MESH:D000064': 4,
  'FPLX:Cofilin': 10,
  'CHEBI:CHEBI:15343': 3,
  'MESH:C023717': 3,
  'CHEBI:CHEBI:16261': 5,
  'apoptotic_cell': 40,
  'MESH:D002358': 24,
  'HP:HP:0003207': 10,
  'MESH:D044682': 4,
  'DOID:DOID:14271': 5,
  'TAXONOMY:2696576': 29,
  'MESH:D026881': 4,
  'GO:GO:0003383': 4,
  'CHEBI:C

In [22]:
# Combine the trained classifier with the grounding dictionary and display names
# into a single disambiguator object.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [24]:
# Save the disambiguator under results_path using model_name (presumably writes the
# model files to disk; confirm against AdeftDisambiguator.dump).
# NOTE(review): executed as In [24], after the info cell below (In [23]).
disamb.dump(model_name, results_path)

In [23]:
# Print the disambiguator's text summary. In the saved output, the groundings
# marked with '*' appear to be the ones listed in pos_labels.
print(disamb.info())

Disambiguation model for AC

Produces the disambiguations:
	AC Regimen*	NCIT:C63415
	ADCY*	FPLX:ADCY
	Acalculous Cholecystitis	MESH:D042101
	Acclimatization	MESH:D000064
	Acid Ceramidase*	MESH:D055573
	Activated Charcoal*	NCIT:C77524
	Acupuncture	MESH:D026881
	Adenocarcinoma*	MESH:D000230
	Alcohol Drinking	MESH:D000428
	Amacrine Cells	MESH:D025042
	Ammonium Chloride	MESH:D000643
	Amniocentesis	MESH:D000649
	Anterior Chamber*	MESH:D000867
	Antigen-Presenting Cells*	MESH:D000938
	Aortic Coarctation	MESH:D001017
	Aortic Valve, Calcification of*	MESH:C562942
	Arachnoid Cysts	MESH:D016080
	Arterial calcification	HP:HP:0003207
	Astrocytes	MESH:D001253
	Auditory Cortex	MESH:D001303
	Carcinoid Tumor	MESH:D002276
	Cartilage, Articular*	MESH:D002358
	Chemotherapy, Adjuvant*	MESH:D017024
	Chlorine	MESH:D002713
	Cholecystitis, Acute*	MESH:D041881
	Cofilin*	FPLX:Cofilin
	Colitis	MESH:D003092
	Colon, Ascending	MESH:D044682
	Conjunctivitis, Allergic*	MESH:D003233
	Craniotomy	MESH:D003399
	Liver Cirrh

In [25]:
# Hand the finished disambiguator to model_to_s3 (adeft_indra.model_building.s3).
# NOTE(review): presumably uploads to S3 and needs network access and credentials.
# Confirm before re-running.
model_to_s3(disamb)