In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used further down to propose a grounding and name for each mined longform
# via adeft_grounder.ground(longform).
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) this model disambiguates.
shortforms = ['TS']
# Model name: the filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of the per-model results directory, two levels above the working directory.
results_dir_relative = os.path.join('../..', 'results', model_name)
results_path = os.path.abspath(results_dir_relative)

In [4]:
# For every shortform: fetch the pmids whose agent text is the shortform, pull
# their plaintexts, and feed them to an AdeftMiner to discover longforms.
# `all_texts` accumulates pmid -> text across all shortforms for later labeling.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Cap the workload at 10000 articles. Sample WITHOUT replacement:
        # random.choices draws with replacement, so it can return the same
        # pmid several times and silently yield fewer distinct articles.
        # NOTE(review): the sample is unseeded, so the subset differs between runs.
        pmids = random.sample(pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop texts of five characters or fewer.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

In [5]:
# Collect the mined longforms per shortform, keeping only rows whose
# count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms(weight_decay_param=0.001)
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# Sum the counts of each longform across all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Seed the grounding map and names with the first grounding the grounder
# returns for each longform; longforms with no grounding are left out.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Parallel tuples of longforms and counts, most frequent first.
# zip(*[]) yields nothing to unpack, so the case where no longform survived
# the filter is handled explicitly instead of raising ValueError.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()
pos_labels = []

In [6]:
# Show each mined longform next to its combined count, in the most-common-first
# order produced in the previous cell.
list(zip(longforms, counts))

[('thymidylate synthase', 819),
 ('tourette syndrome', 176),
 ('turner syndrome', 134),
 ('trophoblast stem', 127),
 ('tobacco smoke', 100),
 ('transition state', 69),
 ('trans sialidase', 61),
 ('timothy syndrome', 42),
 ('takotsubo syndrome', 40),
 ('tourette s syndrome', 40),
 ('tensile strength', 34),
 ('test stimulus', 31),
 ('tuberous sclerosis', 31),
 ('thymidylate synthetase', 31),
 ('testosterone', 30),
 ('template switching', 28),
 ('temporal summation', 28),
 ('temperature sensitive', 26),
 ('total solids', 21),
 ('tea saponins', 21),
 ('transferrin saturation', 20),
 ('tumor suppressor', 19),
 ('turner s syndrome', 19),
 ('toona sinensis', 18),
 ('tocopherol succinate', 16),
 ('tactile stimulation', 16),
 ('tail suspension', 15),
 ('transcribed strand', 15),
 ('thiostrepton', 14),
 ('trimethoprim sulfamethoxazole', 13),
 ('tokishakuyakusan', 13),
 ('target sequence', 12),
 ('tocopheryl succinate', 12),
 ('total saponins', 12),
 ('tractus solitarius', 11),
 ('thermal sensati

In [48]:
# Manual curation step: hand the automatically proposed groundings to the Adeft
# grounding GUI and rebind grounding_map, names and pos_labels to what it returns.
# NOTE(review): presumably serves the GUI on port 8890 without opening a browser —
# confirm against the adeft.gui documentation.
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8890)

In [49]:
# Bundle the curated grounding map, names and positive labels into one list
# so they can be displayed together in the next cell.
result = [grounding_map, names, pos_labels]

In [50]:
# Display the curated [grounding_map, names, pos_labels] bundle.
result

[{'solitary tract': 'ungrounded',
  'suppressor t cells': 'ungrounded',
  't suppressor': 'ungrounded',
  'tactile stimulation': 'ungrounded',
  'tafro syndrome': 'ungrounded',
  'tail suspension': 'ungrounded',
  'takotsubo syndrome': 'MESH:D054549',
  'talaporfin sodium': 'ungrounded',
  'tape stripping': 'ungrounded',
  'tapioca starch': 'ungrounded',
  'tardive syndrome': 'ungrounded',
  'target sequence': 'ungrounded',
  'target strand': 'ungrounded',
  'tea saponins': 'ungrounded',
  'technical skills': 'ungrounded',
  'temperature sensitive': 'ungrounded',
  'temperature shift': 'ungrounded',
  'template strand': 'ungrounded',
  'template switching': 'ungrounded',
  'temporal summation': 'ungrounded',
  'tensile strength': 'MESH:D013718',
  'terminal spikelet': 'ungrounded',
  'test stimulus': 'ungrounded',
  'testosterone': 'CHEBI:CHEBI:17347',
  'tetanic stimulation': 'ungrounded',
  'the striatum': 'ungrounded',
  'thermal sensation': 'ungrounded',
  'thin section': 'unground

In [7]:
# Hard-coded curation results: [grounding_map, names, pos_labels].
#   grounding_map: longform -> grounding id, or 'ungrounded'
#   names:         grounding id -> standard name (each id listed exactly once)
#   pos_labels:    grounding ids treated as positive labels by the classifier
# NOTE(review): appears to be pasted from the displayed `result` of the GUI
# curation cell so the notebook can run without the GUI — confirm.
grounding_map, names, pos_labels = [{'solitary tract': 'ungrounded',
  'suppressor t cells': 'ungrounded',
  't suppressor': 'ungrounded',
  'tactile stimulation': 'ungrounded',
  'tafro syndrome': 'ungrounded',
  'tail suspension': 'ungrounded',
  'takotsubo syndrome': 'MESH:D054549',
  'talaporfin sodium': 'ungrounded',
  'tape stripping': 'ungrounded',
  'tapioca starch': 'ungrounded',
  'tardive syndrome': 'ungrounded',
  'target sequence': 'ungrounded',
  'target strand': 'ungrounded',
  'tea saponins': 'ungrounded',
  'technical skills': 'ungrounded',
  'temperature sensitive': 'ungrounded',
  'temperature shift': 'ungrounded',
  'template strand': 'ungrounded',
  'template switching': 'ungrounded',
  'temporal summation': 'ungrounded',
  'tensile strength': 'MESH:D013718',
  'terminal spikelet': 'ungrounded',
  'test stimulus': 'ungrounded',
  'testosterone': 'CHEBI:CHEBI:17347',
  'tetanic stimulation': 'ungrounded',
  'the striatum': 'ungrounded',
  'thermal sensation': 'ungrounded',
  'thin section': 'ungrounded',
  'thiostrepton': 'CHEBI:CHEBI:29693',
  'thiosulfate': 'CHEBI:CHEBI:26977',
  'threonine synthase': 'ungrounded',
  'threshold selector': 'ungrounded',
  'threshold shift': 'ungrounded',
  'threshold suspend': 'ungrounded',
  'thrombospondin': 'FPLX:THBS',
  'thymidilate synthase': 'HGNC:12361',
  'thymidine synthase': 'HGNC:12361',
  'thymidylate synthase': 'HGNC:12361',
  'thymidylate synthetase': 'HGNC:12361',
  'thymostimulin': 'MESH:C031407',
  'thyroid storm': 'MESH:D013958',
  'thyrotoxic storm': 'MESH:D013958',
  'timothy syndrome': 'DOID:DOID:0060173',
  'tissue specific': 'ungrounded',
  'tobacco smoke': 'MESH:D014028',
  'tobacco smoke exposure': 'MESH:D014028',
  'tocopherol succinate': 'CHEBI:CHEBI:135821',
  'tocopheryl hemisuccinate': 'CHEBI:CHEBI:135821',
  'tocopheryl succinate': 'CHEBI:CHEBI:135821',
  'tokishakuyakusan': 'MESH:C413220',
  'toona sinensis': 'ungrounded',
  'tooth sensitive': 'ungrounded',
  'top strand': 'ungrounded',
  'total salvage': 'ungrounded',
  'total saponins': 'CHEBI:CHEBI:26605',
  'total score': 'ungrounded',
  'total solids': 'ungrounded',
  'total soy saponins': 'ungrounded',
  'total starch': 'ungrounded',
  'total sugar': 'MESH:D000073893',
  'tourette s syndrome': 'MESH:D005879',
  'tourette syndrome': 'MESH:D005879',
  'toxicity stress': 'ungrounded',
  'tpa + sb': 'CHEBI:CHEBI:30513',
  'tracheal stenosis': 'MESH:D014135',
  'tractus solitarius': 'ungrounded',
  'trailing sif': 'ungrounded',
  'training status': 'ungrounded',
  'training stimuli': 'ungrounded',
  'trans sialidase': 'IP:IPR008377',
  'trans splicing': 'MESH:D020040',
  'trans stilbene': 'CHEBI:CHEBI:36007',
  'transcribed strand': 'ungrounded',
  'transcription sites': 'ungrounded',
  'transcription slippage': 'ungrounded',
  'transferrin saturation': 'ungrounded',
  'transition state': 'ungrounded',
  'transmural stimulation': 'ungrounded',
  'transseptal': 'ungrounded',
  'transsulfuration': 'GO:GO:0019346',
  'traumatic stress': 'ungrounded',
  'treated with endoscopic surgery': 'ungrounded',
  'triangularis sterni': 'ungrounded',
  'triceps skinfold': 'ungrounded',
  'triceps surae': 'ungrounded',
  'triclosan': 'CHEBI:CHEBI:164200',
  'trigeminovascular system': 'ungrounded',
  'trimethoprim sulfamethoxazole': 'CHEBI:CHEBI:3770',
  'tripartite synapses': 'ungrounded',
  'troglitazone sulfate': 'MESH:C549564',
  'trophectoderm stem': 'ungrounded',
  'trophoblast stem': 'BTO:BTO:0004774',
  'trophoblast stem cells': 'BTO:BTO:0004774',
  'tropical sprue': 'MESH:D013182',
  'tryptophan synthase': 'MESH:D014367',
  'ts65dn': 'ungrounded',
  'tuberous sclerosis': 'MESH:D014402',
  'tuberous shape': 'ungrounded',
  'tumor size': 'EFO:0004134',
  'tumor sphere': 'ungrounded',
  'tumor spheroids': 'ungrounded',
  'tumor stiffness': 'ungrounded',
  'tumor supernatant': 'ungrounded',
  'tumor suppressing': 'ungrounded',
  'tumor suppressor': 'MESH:D016147',
  'turbulence slope': 'ungrounded',
  'turner s syndrome': 'MESH:D014424',
  'turner syndrome': 'MESH:D014424'},
 {'MESH:D054549': 'Takotsubo Cardiomyopathy',
  'MESH:D013718': 'Tensile Strength',
  'CHEBI:CHEBI:17347': 'testosterone',
  'CHEBI:CHEBI:29693': 'thiostrepton',
  'CHEBI:CHEBI:26977': 'thiosulfate',
  'FPLX:THBS': 'THBS',
  'HGNC:12361': 'TYMS',
  'MESH:C031407': 'thymostimulin',
  'MESH:D013958': 'Thyroid Crisis',
  'DOID:DOID:0060173': 'Timothy syndrome',
  'MESH:D014028': 'Tobacco Smoke Pollution',
  'CHEBI:CHEBI:135821': 'tocopherol succinate',
  'MESH:C413220': 'toki-shakuyaku-san',
  'CHEBI:CHEBI:26605': 'saponin',
  'MESH:D000073893': 'Sugars',
  'MESH:D005879': 'Tourette Syndrome',
  'CHEBI:CHEBI:30513': 'antimony atom',
  'MESH:D014135': 'Tracheal Stenosis',
  'IP:IPR008377': 'Trypanosome sialidase',
  'MESH:D020040': 'Trans-Splicing',
  'CHEBI:CHEBI:36007': 'trans-stilbene',
  'GO:GO:0019346': 'transsulfuration',
  'CHEBI:CHEBI:164200': 'triclosan',
  'CHEBI:CHEBI:3770': 'co-trimoxazole',
  'MESH:C549564': 'troglitazone sulfate',
  'BTO:BTO:0004774': 'trophoblast stem cell',
  'MESH:D013182': 'Sprue, Tropical',
  'MESH:D014367': 'Tryptophan Synthase',
  'MESH:D014402': 'Tuberous Sclerosis',
  'EFO:0004134': 'tumor size',
  'MESH:D016147': 'Genes, Tumor Suppressor',
  'MESH:D014424': 'Turner Syndrome'},
 ['BTO:BTO:0004774',
  'CHEBI:CHEBI:135821',
  'MESH:C413220',
  'DOID:DOID:0060173',
  'FPLX:THBS',
  'HGNC:12361',
  'IP:IPR008377',
  'MESH:D005879',
  'MESH:D054549',
  'MESH:D014028',
  'MESH:D014402',
  'MESH:D014424']]

In [8]:
# Longforms to leave out of grounding_dict when it is built in the next cell
# (none for this model).
excluded_longforms = []

In [9]:
# Per-shortform grounding dict: each mined longform that has a curated grounding
# and is not explicitly excluded maps to that grounding. The inner loop variable
# is named longform_rows so it does not shadow the notebook-level `longforms`.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longform_rows
                              if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longform_rows in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs also creates a missing parent 'results' directory and is a no-op when
# the directory already exists, avoiding the exists()/mkdir() race.
os.makedirs(results_path, exist_ok=True)
# Persist the preliminary grounding info as JSON next to the model artifacts.
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [10]:
# Extra entities (keyed by 'NAMESPACE:ID' strings, split on the first ':' below)
# whose entity-tagged pmids supplement the corpus; empty for this model.
additional_entities = {}

In [11]:
# entity -> iterable of agent texts; texts containing them are added to the
# corpus labeled with that entity further down. Empty for this model.
unambiguous_agent_texts = {}

In [12]:
# Label every collected text using the curated grounding dict; the labeler is
# fed (text, pmid) pairs and the corpus is iterated below as (text, label, id) triples.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# label -> list of ids of the corpus entries carrying that label.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# additional entity -> set of pmids returned for it with major_topic=True.
# The entity string is split once on ':' and passed as positional arguments.
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in additional_entities
}

In [15]:
# Size of the pmid overlap for every ordered pair of additional entities
# (self-pairs included).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [16]:
# Size of the pmid overlap between each corpus label and each additional entity.
# agent_text_pmid_map holds lists, so they are converted to sets before intersecting.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [17]:
# Display the entity-vs-entity pmid overlap counts.
intersection1

[('HGNC:17981', 'HGNC:17981', 29)]

In [18]:
# Display the label-vs-entity pmid overlap counts.
intersection2

[('ungrounded', 'HGNC:17981', 0)]

In [20]:
# Extend the corpus with texts for unambiguous agent texts, labeled with their
# entity. pmids already in all_texts, or already used for the same entity, are skipped.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts tagged with each additional entity, skipping
# pmids that are already part of the corpus sources above.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample WITHOUT replacement: random.choices draws with replacement and
        # could add the same pmid to the selection more than once.
        new_pmids = random.sample(new_pmids, k=10000)
    # NOTE(review): this `contains` list is hard-coded and does not mention this
    # notebook's shortform; it looks carried over from another model. Confirm the
    # intended filter terms before using additional_entities here.
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=['RTCA', 'RTCD1', 'RPC', 'RTC1', 'RTC'])
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [None]:
# Merge the additional entities into the names mapping. The previous spelling
# `additional_entitie` was an undefined name and raised NameError.
names.update(additional_entities)

In [13]:
%%capture

# Build the classifier for the shortforms with the curated positive labels and a
# fixed random_state, then run 5-fold cross-validation over the parameter grid.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
# Single-point grid: only one value each for C and max_features is searched.
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split the corpus triples into parallel tuples; note this rebinds the
# notebook-level name `pmids`.
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-23 03:16:36] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-23 03:17:43] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9325885036715059 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [14]:
# Display the statistics stored on the classifier after cross-validation.
classifier.stats

{'label_distribution': {'ungrounded': 376,
  'IP:IPR008377': 41,
  'CHEBI:CHEBI:135821': 31,
  'HGNC:12361': 644,
  'MESH:C413220': 12,
  'MESH:D005879': 147,
  'FPLX:THBS': 8,
  'MESH:D014028': 80,
  'GO:GO:0019346': 3,
  'MESH:C031407': 8,
  'CHEBI:CHEBI:26977': 4,
  'CHEBI:CHEBI:17347': 23,
  'MESH:D014424': 119,
  'CHEBI:CHEBI:36007': 3,
  'CHEBI:CHEBI:164200': 4,
  'MESH:D014402': 23,
  'BTO:BTO:0004774': 73,
  'DOID:DOID:0060173': 27,
  'CHEBI:CHEBI:3770': 6,
  'MESH:D014135': 3,
  'MESH:D000073893': 4,
  'MESH:D016147': 14,
  'MESH:C549564': 2,
  'CHEBI:CHEBI:29693': 6,
  'MESH:D013958': 4,
  'MESH:D013718': 23,
  'MESH:D054549': 24,
  'BTO:0004774': 4,
  'MESH:D013182': 4,
  'MESH:D014367': 5,
  'MESH:D020040': 1,
  'CHEBI:CHEBI:26605': 7,
  'EFO:0004134': 2,
  'CHEBI:CHEBI:30513': 2},
 'f1': {'mean': 0.927329, 'std': 0.016484},
 'precision': {'mean': 0.936942, 'std': 0.012091},
 'recall': {'mean': 0.930041, 'std': 0.017814},
 'MESH:D020040': {'f1': {'mean': 0.0, 'std': 0.0},
 

In [14]:
# Combine the trained classifier with the grounding dict and names into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [15]:
# Write the disambiguator under the model name into the results directory.
disamb.dump(model_name, results_path)

In [16]:
# Print the disambiguator's summary; print() is used so the tabs and newlines in
# the returned text render as a table rather than as an escaped string repr.
print(disamb.info())

Disambiguation model for TS

Produces the disambiguations:
	Genes, Tumor Suppressor	MESH:D016147
	Sprue, Tropical	MESH:D013182
	Sugars	MESH:D000073893
	THBS*	FPLX:THBS
	TYMS*	HGNC:12361
	Takotsubo Cardiomyopathy*	MESH:D054549
	Tensile Strength	MESH:D013718
	Thyroid Crisis	MESH:D013958
	Timothy syndrome*	DOID:DOID:0060173
	Tobacco Smoke Pollution*	MESH:D014028
	Tourette Syndrome*	MESH:D005879
	Tracheal Stenosis	MESH:D014135
	Trans-Splicing	MESH:D020040
	Trypanosome sialidase*	IP:IPR008377
	Tryptophan Synthase	MESH:D014367
	Tuberous Sclerosis*	MESH:D014402
	Turner Syndrome*	MESH:D014424
	antimony atom	CHEBI:CHEBI:30513
	co-trimoxazole	CHEBI:CHEBI:3770
	saponin	CHEBI:CHEBI:26605
	testosterone	CHEBI:CHEBI:17347
	thiostrepton	CHEBI:CHEBI:29693
	thiosulfate	CHEBI:CHEBI:26977
	thymostimulin	MESH:C031407
	tocopherol succinate*	CHEBI:CHEBI:135821
	toki-shakuyaku-san*	MESH:C413220
	trans-stilbene	CHEBI:CHEBI:36007
	transsulfuration	GO:GO:0019346
	triclosan	CHEBI:CHEBI:164200
	troglitazone sulfat

In [17]:
# Upload the finished disambiguator with the adeft_indra S3 helper.
# NOTE(review): presumably needs configured AWS credentials and network access — confirm.
model_to_s3(disamb)