In [42]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [43]:
# Grounder instance; its .ground(longform) method is called later in the
# notebook to propose a grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [1]:
# Shortforms handled by this model. The model name is the ':'-joined, sorted
# list of their filename-escaped forms, and results are written to
# ../../results/<model_name> (resolved to an absolute path).
shortforms = ['OA']
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

NameError: name 'escape_filename' is not defined

In [45]:
# For every shortform: fetch the PMIDs that mention it, pull their plaintexts,
# feed the texts to an AdeftMiner, and remember the texts in all_texts.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Cap at 10000 PMIDs. random.sample draws WITHOUT replacement;
        # random.choices (used here before) draws with replacement and so
        # could return the same PMID several times, silently shrinking the
        # number of distinct texts.
        # NOTE(review): no random seed is set, so the subsample differs
        # between runs.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop entries whose text has length 5 or less.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

In [46]:
# Per shortform, keep only the mined (longform, count, score) rows whose
# count*score exceeds 2.
longform_dict = {
    shortform: [
        (longform, count, score)
        for longform, count, score
        in miners[shortform].get_longforms(weight_decay_param=0.001)
        if count*score > 2
    ]
    for shortform in shortforms
}

# Pool the counts over all shortforms; a longform that shows up under several
# shortforms has its counts added together.
combined_longforms = Counter()
for rows in longform_dict.values():
    combined_longforms.update(dict((longform, count) for longform, count, _ in rows))

# Pre-fill the grounding map and name map with the grounder's first result for
# each longform; longforms for which the grounder returns nothing are skipped.
grounding_map, names = {}, {}
for longform in combined_longforms:
    suggestions = adeft_grounder.ground(longform)
    if not suggestions:
        continue
    first = suggestions[0]
    grounding = first['grounding']
    grounding_map[longform] = grounding
    names[grounding] = first['name']

# Parallel tuples of longforms and pooled counts, most common first.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [47]:
# Show the (longform, count) pairs that will be sent for curation.
# NOTE(review): the saved output of this cell lists longforms such as
# 'thymidylate synthase', which do not match shortforms = ['OA']; it appears
# to be stale output from an earlier run -- re-run to refresh.
list(zip(longforms, counts))

[('thymidylate synthase', 819),
 ('tourette syndrome', 176),
 ('turner syndrome', 134),
 ('trophoblast stem', 127),
 ('tobacco smoke', 100),
 ('transition state', 69),
 ('trans sialidase', 61),
 ('timothy syndrome', 42),
 ('takotsubo syndrome', 40),
 ('tourette s syndrome', 40),
 ('tensile strength', 34),
 ('test stimulus', 31),
 ('tuberous sclerosis', 31),
 ('thymidylate synthetase', 31),
 ('testosterone', 30),
 ('template switching', 28),
 ('temporal summation', 28),
 ('temperature sensitive', 26),
 ('total solids', 21),
 ('tea saponins', 21),
 ('transferrin saturation', 20),
 ('tumor suppressor', 19),
 ('turner s syndrome', 19),
 ('toona sinensis', 18),
 ('tocopherol succinate', 16),
 ('tactile stimulation', 16),
 ('tail suspension', 15),
 ('transcribed strand', 15),
 ('thiostrepton', 14),
 ('trimethoprim sulfamethoxazole', 13),
 ('tokishakuyakusan', 13),
 ('target sequence', 12),
 ('tocopheryl succinate', 12),
 ('total saponins', 12),
 ('tractus solitarius', 11),
 ('thermal sensati

In [None]:
# Pass the automatically proposed groundings to adeft's grounding GUI and
# rebind grounding_map, names and pos_labels to what it returns.
# The GUI is served on port 8890 with no_browser=True.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

In [23]:
# Bundle the three curation outputs into one list: [grounding_map, names, pos_labels].
result = [grounding_map, names, pos_labels]

In [24]:
# Display the bundled curation result.
result

[{'and orotic': 'ungrounded',
  'oa': 'ungrounded',
  'obstructive apnea': 'MESH:D001049',
  'obstructive azoospermia': 'HP:HP:0011962',
  'obtusilactone a': 'MESH:C551476',
  'ocadaic acid': 'MESH:D019319',
  'occipital artery': 'ungrounded',
  'occupational asthma': 'MESH:D059366',
  'ocean acidification': 'ocean_acidification',
  'ochratoxin a': 'CHEBI:CHEBI:7719',
  'oct 1 activity': 'HGNC:9212',
  'octa acid': 'ungrounded',
  'octanoic': 'CHEBI:CHEBI:28837',
  'octanoic acid': 'CHEBI:CHEBI:28837',
  'octopamine': 'CHEBI:CHEBI:17134',
  'octopaminergic': 'ungrounded',
  'ocular albinism': 'MESH:D016117',
  'oesophageal adenocarcinoma': 'DOID:DOID:4914',
  'of assembly': 'ungrounded',
  'offset analgesia': 'MESH:D000698',
  'okadaic acid': 'MESH:D019319',
  'older adults': 'ungrounded',
  'oleamide': 'CHEBI:CHEBI:116314',
  'oleanolic': 'CHEBI:CHEBI:37659',
  'oleanolic acid': 'CHEBI:CHEBI:37659',
  'oleate': 'ungrounded',
  'oleic acid': 'CHEBI:CHEBI:16196',
  'oleuropein aglycone'

In [35]:
# Hard-coded curation result, unpacked into:
#   grounding_map: longform -> grounding id (or 'ungrounded')
#   names:         grounding id -> display name
#   pos_labels:    list of grounding ids passed to the classifier as pos_labels
# NOTE(review): this looks like the displayed `result` pasted back in so the
# notebook can be re-run without the interactive GUI step -- confirm.
grounding_map, names, pos_labels = [{'and orotic': 'ungrounded',
  'oa': 'ungrounded',
  'obstructive apnea': 'MESH:D001049',
  'obstructive azoospermia': 'HP:HP:0011962',
  'obtusilactone a': 'MESH:C551476',
  'ocadaic acid': 'MESH:D019319',
  'occipital artery': 'ungrounded',
  'occupational asthma': 'MESH:D059366',
  'ocean acidification': 'ocean_acidification',
  'ochratoxin a': 'CHEBI:CHEBI:7719',
  'oct 1 activity': 'HGNC:9212',
  'octa acid': 'ungrounded',
  'octanoic': 'CHEBI:CHEBI:28837',
  'octanoic acid': 'CHEBI:CHEBI:28837',
  'octopamine': 'CHEBI:CHEBI:17134',
  'octopaminergic': 'ungrounded',
  'ocular albinism': 'MESH:D016117',
  'oesophageal adenocarcinoma': 'DOID:DOID:4914',
  'of assembly': 'ungrounded',
  'offset analgesia': 'MESH:D000698',
  'okadaic acid': 'MESH:D019319',
  'older adults': 'ungrounded',
  'oleamide': 'CHEBI:CHEBI:116314',
  'oleanolic': 'CHEBI:CHEBI:37659',
  'oleanolic acid': 'CHEBI:CHEBI:37659',
  'oleate': 'ungrounded',
  'oleic acid': 'CHEBI:CHEBI:16196',
  'oleuropein aglycone': 'CHEBI:CHEBI:139162',
  'oligomycin a': 'CHEBI:CHEBI:28285',
  'ongoing activity': 'ungrounded',
  'open abdomen': 'ungrounded',
  'open access': 'ungrounded',
  'open angle': 'ungrounded',
  'open appendectomy': 'MESH:D001062',
  'open arm': 'ungrounded',
  'ophthalmic artery': 'MESH:D009880',
  'optic ataxia': 'HP:HP:0031868',
  'optoacoustic': 'ungrounded',
  'oral aboral': 'ungrounded',
  'oral anticoagulation': 'ungrounded',
  'oral appliance': 'ungrounded',
  'orbital atherectomy': 'MESH:D017073',
  'orexin a': 'CHEBI:CHEBI:80319',
  'organic acid': 'CHEBI:CHEBI:64709',
  'organic acid anions': 'CHEBI:CHEBI:64709',
  'organic aerosol': 'ungrounded',
  'organic anions': 'CHEBI:CHEBI:25696',
  'organizational ambidexterity': 'ungrounded',
  'oriented attachment': 'ungrounded',
  'ornithine l aspartate': 'CHEBI:CHEBI:29991',
  'orolingual angioedema': 'MESH:D000799',
  'oropharyngeal aspiration': 'ungrounded',
  'orotic acid': 'CHEBI:CHEBI:16742',
  'oroxylin a': 'CHEBI:CHEBI:61668',
  'orsellinic acid': 'MESH:C501612',
  'osmotic adjustment': 'ungrounded',
  'osteo arthritis': 'MESH:D010003',
  'osteoactivin': 'UP:Q9QZF6',
  'osteoarthritic': 'MESH:D010003',
  'osteoarthritis': 'MESH:D010003',
  'osteoarthrosis': 'MESH:D010003',
  'ovalbumin': 'MESH:D010047',
  'overall accuracy': 'ungrounded',
  'overall adjustment acupuncture': 'MESH:D026881',
  'oxalic acid': 'CHEBI:CHEBI:16995',
  'oxaloacetate': 'CHEBI:CHEBI:16452',
  'oxidative addition': 'ungrounded',
  'oxonic acid': 'CHEBI:CHEBI:30863'},
 {'MESH:D001049': 'Apnea',
  'HP:HP:0011962': 'Obstructive azoospermia',
  'MESH:C551476': 'obtusilactone A',
  'MESH:D019319': 'Okadaic Acid',
  'MESH:D059366': 'Asthma, Occupational',
  'ocean_acidification': 'ocean_acidification',
  'CHEBI:CHEBI:7719': 'ochratoxin A',
  'HGNC:9212': 'POU2F1',
  'CHEBI:CHEBI:28837': 'octanoic acid',
  'CHEBI:CHEBI:17134': 'octopamine',
  'MESH:D016117': 'Albinism, Ocular',
  'DOID:DOID:4914': 'esophagus adenocarcinoma',
  'MESH:D000698': 'Analgesia',
  'CHEBI:CHEBI:116314': 'oleamide',
  'CHEBI:CHEBI:37659': 'oleanolic acid',
  'CHEBI:CHEBI:16196': 'oleic acid',
  'CHEBI:CHEBI:139162': 'oleuropein aglycone',
  'CHEBI:CHEBI:28285': 'oligomycin A',
  'MESH:D001062': 'Appendectomy',
  'MESH:D009880': 'Ophthalmic Artery',
  'HP:HP:0031868': 'Optic ataxia',
  'MESH:D017073': 'Atherectomy',
  'CHEBI:CHEBI:80319': 'Orexin A',
  'CHEBI:CHEBI:64709': 'organic acid',
  'CHEBI:CHEBI:25696': 'organic anion',
  'CHEBI:CHEBI:29991': 'L-aspartate(1-)',
  'MESH:D000799': 'Angioedema',
  'CHEBI:CHEBI:16742': 'orotic acid',
  'CHEBI:CHEBI:61668': 'oroxylin A',
  'MESH:C501612': 'orsellinic acid',
  'MESH:D010003': 'Osteoarthritis',
  'UP:Q9QZF6': 'Osteoactivin',
  'MESH:D010047': 'Ovalbumin',
  'MESH:D026881': 'Acupuncture',
  'CHEBI:CHEBI:16995': 'oxalic acid',
  'CHEBI:CHEBI:16452': 'oxaloacetate(2-)',
  'CHEBI:CHEBI:30863': '5-azaorotic acid'},
 ['CHEBI:CHEBI:16196',
  'CHEBI:CHEBI:16742',
  'CHEBI:CHEBI:16995',
  'CHEBI:CHEBI:17134',
  'CHEBI:CHEBI:28837',
  'CHEBI:CHEBI:37659',
  'MESH:D010003',
  'MESH:D010047',
  'MESH:D019319',
  'MESH:D059366',
  'UP:Q9QZF6']]

In [26]:
# Longforms listed here are left out of grounding_dict when it is built from
# grounding_map; empty means nothing is excluded.
excluded_longforms = []

In [27]:
# For each shortform, map every mined longform that has an entry in
# grounding_map (and is not in excluded_longforms) to its grounding.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) creates missing parent directories as well (os.mkdir
# fails if ../../results does not exist yet) and does not raise if the
# directory is already there, so no separate existence check is needed.
os.makedirs(results_path, exist_ok=True)
# Save [grounding_dict, names, pos_labels] as JSON next to the model output.
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [28]:
# Extra entities, keyed by 'NAMESPACE:ID' strings (each key is split on the
# first ':' and passed to get_pmids_for_entity further down). Empty here, so
# no entity-based texts are added to the corpus.
additional_entities = {}

In [29]:
# Maps an entity label to an iterable of agent texts; texts fetched for those
# agent texts are added to the corpus under that label. Empty here.
unambiguous_agent_texts = {}

In [30]:
# Label the collected texts with AdeftLabeler; the resulting corpus is
# iterated as (text, label, id) triples.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the ids of the labeled texts by label.
agent_text_pmid_map = defaultdict(list)
for _, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# For each additional entity ('NAMESPACE:ID'), the set of PMIDs returned by
# get_pmids_for_entity with major_topic=True.
entity_pmid_map = {}
for entity in additional_entities:
    namespace_and_id = entity.split(':', maxsplit=1)
    entity_pmid_map[entity] = set(
        get_pmids_for_entity(*namespace_and_id, major_topic=True)
    )

In [15]:
# PMID overlap for every ordered pair of additional entities (including each
# entity with itself): (entity1, entity2, number of shared PMIDs).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [16]:
# PMID overlap between every corpus label and every additional entity:
# (label, entity, number of shared PMIDs).
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [17]:
# Display the entity-vs-entity PMID overlaps.
# NOTE(review): the saved output mentions 'HGNC:17981', but additional_entities
# is empty in this notebook, so that output appears to be stale -- re-run.
intersection1

[('HGNC:17981', 'HGNC:17981', 29)]

In [18]:
# Display the label-vs-entity PMID overlaps.
# NOTE(review): like the cell above, the saved output refers to 'HGNC:17981'
# and appears to be stale.
intersection2

[('ungrounded', 'HGNC:17981', 0)]

In [20]:
# Extend the corpus with texts for unambiguous agent texts: every text fetched
# for one of an entity's agent texts is labeled with that entity.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip PMIDs already in all_texts or already used for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts for the additional entities.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # random.sample draws without replacement; random.choices (used here
        # before) draws with replacement and could repeat PMIDs.
        new_pmids = random.sample(new_pmids, k=10000)
    # NOTE(review): this hard-coded `contains` list ('RTCA', 'RTCD1', ...) is
    # unrelated to the 'OA' shortform and looks like a leftover from another
    # model's notebook. It has no effect while additional_entities is empty,
    # but must be replaced with the right agent texts before adding entities.
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=['RTCA', 'RTCD1', 'RPC', 'RTC1', 'RTC'])
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [None]:
# Merge the additional entities into the names map. The original referenced
# `additional_entitie` (missing trailing "s"), which raises NameError.
# NOTE(review): assumes additional_entities maps grounding ids to display
# names, like `names` does -- confirm.
names.update(additional_entities)

In [36]:
%%capture

classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-08 01:55:00] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-08 01:59:11] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9435155110007478 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [37]:
# Display the classifier's stats after cross-validation (the saved output
# shows the label distribution and f1/precision figures).
classifier.stats

{'label_distribution': {'MESH:D010047': 105,
  'MESH:D010003': 2816,
  'MESH:D019319': 764,
  'CHEBI:CHEBI:7719': 26,
  'CHEBI:CHEBI:16196': 567,
  'CHEBI:CHEBI:16742': 29,
  'CHEBI:CHEBI:28837': 21,
  'CHEBI:CHEBI:17134': 84,
  'CHEBI:CHEBI:16452': 8,
  'CHEBI:CHEBI:16995': 38,
  'CHEBI:CHEBI:37659': 251,
  'MESH:D001049': 4,
  'ungrounded': 127,
  'MESH:D059366': 54,
  'CHEBI:CHEBI:116314': 3,
  'CHEBI:CHEBI:29991': 3,
  'CHEBI:CHEBI:61668': 16,
  'CHEBI:CHEBI:25696': 3,
  'UP:Q9QZF6': 18,
  'CHEBI:CHEBI:28285': 3,
  'MESH:D009880': 6,
  'CHEBI:CHEBI:64709': 26,
  'DOID:DOID:4914': 2,
  'MESH:D016117': 3,
  'ocean_acidification': 80,
  'HP:HP:0031868': 3,
  'MESH:C501612': 3,
  'MESH:D000799': 2,
  'CHEBI:CHEBI:80319': 4,
  'CHEBI:CHEBI:30863': 4,
  'HP:HP:0011962': 5,
  'MESH:D000698': 4,
  'CHEBI:CHEBI:139162': 5,
  'MESH:D001062': 2,
  'HGNC:9212': 1,
  'MESH:D017073': 6,
  'MESH:C551476': 1,
  'MESH:D026881': 1},
 'f1': {'mean': 0.943516, 'std': 0.005057},
 'precision': {'mean': 

In [38]:
# Wrap the classifier, the per-shortform grounding dict and the names map in
# an AdeftDisambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [40]:
# Write the disambiguator out under results_path using model_name.
# NOTE(review): the exact files/layout written are decided by
# AdeftDisambiguator.dump -- see the adeft documentation.
disamb.dump(model_name, results_path)

In [39]:
# Print the model summary (the saved output lists the disambiguations the
# model produces, with some labels marked by '*').
print(disamb.info())

Disambiguation model for OA

Produces the disambiguations:
	5-azaorotic acid	CHEBI:CHEBI:30863
	Acupuncture	MESH:D026881
	Albinism, Ocular	MESH:D016117
	Analgesia	MESH:D000698
	Angioedema	MESH:D000799
	Apnea	MESH:D001049
	Appendectomy	MESH:D001062
	Asthma, Occupational*	MESH:D059366
	Atherectomy	MESH:D017073
	L-aspartate(1-)	CHEBI:CHEBI:29991
	Obstructive azoospermia	HP:HP:0011962
	Okadaic Acid*	MESH:D019319
	Ophthalmic Artery	MESH:D009880
	Optic ataxia	HP:HP:0031868
	Orexin A	CHEBI:CHEBI:80319
	Osteoactivin*	UP:Q9QZF6
	Osteoarthritis*	MESH:D010003
	Ovalbumin*	MESH:D010047
	POU2F1	HGNC:9212
	esophagus adenocarcinoma	DOID:DOID:4914
	obtusilactone A	MESH:C551476
	ocean_acidification	ocean_acidification
	ochratoxin A	CHEBI:CHEBI:7719
	octanoic acid*	CHEBI:CHEBI:28837
	octopamine*	CHEBI:CHEBI:17134
	oleamide	CHEBI:CHEBI:116314
	oleanolic acid*	CHEBI:CHEBI:37659
	oleic acid*	CHEBI:CHEBI:16196
	oleuropein aglycone	CHEBI:CHEBI:139162
	oligomycin A	CHEBI:CHEBI:28285
	organic acid	CHEBI:CHEBI:6

In [41]:
# Hand the finished disambiguator to adeft_indra's model_to_s3.
# NOTE(review): presumably uploads the model to S3 (judging by the name) and
# so needs valid credentials/network access -- confirm before running.
model_to_s3(disamb)