In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder instance; its .ground(longform) method is called later in this
# notebook to propose an initial grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortforms this model disambiguates.
shortforms = ['RT', 'RTs']
# The model name is the sorted, filename-escaped shortforms joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of the per-model results directory (two levels up, under 'results').
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [4]:
# --- Mine candidate longforms for every shortform ---------------------------
miners = {}
all_texts = {}
for shortform in shortforms:
    fetched_texts = get_plaintexts_for_pmids(get_pmids_for_agent_text(shortform),
                                             contains=shortforms)
    # Drop texts of 5 characters or fewer.
    usable_texts = {pmid: text for pmid, text in fetched_texts.items()
                    if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(usable_texts.values())
    all_texts.update(usable_texts)

# Keep only (longform, count, score) rows whose count * score exceeds 2.
longform_dict = {
    shortform: [(longform, count, score)
                for longform, count, score in miners[shortform].get_longforms()
                if count*score > 2]
    for shortform in shortforms
}

# Sum longform counts across all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update(dict((longform, count)
                                   for longform, count, score in longform_rows))

# --- Propose an automatic grounding for each longform ------------------------
grounding_map = {}
names = {}
for longform in combined_longforms:
    candidates = adeft_grounder.ground(longform)
    if not candidates:
        continue
    # Only the first candidate returned by the grounder is used.
    top_candidate = candidates[0]
    grounding_map[longform] = top_candidate['grounding']
    names[top_candidate['grounding']] = top_candidate['name']

# Parallel tuples of longforms and counts, most frequent first.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [8]:
# Hand the automatic groundings to the curation GUI (served on port 8890
# without opening a browser) and take back the curated versions.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

In [9]:
# Bundle the three curation outputs into a single list.
result = [grounding_map, names, pos_labels]

In [10]:
# Echo the curation result. NOTE(review): the literal assigned to
# (grounding_map, names, pos_labels) further down appears to be a paste of
# this output - confirm.
result

[{'r tomentosum': 'ungrounded',
  'radiation': 'MESH:D011878',
  'radiation alone': 'MESH:D011878',
  'radiation therapists': 'ungrounded',
  'radiation therapy': 'MESH:D011878',
  'radiation treatment': 'MESH:D011878',
  'radical transfer': 'ungrounded',
  'radical transport': 'ungrounded',
  'radio therapy': 'MESH:D011878',
  'radiotherapy': 'MESH:D011878',
  'radiotherapy treatment': 'MESH:D011878',
  'radix trichosanthis': 'MESH:C034634',
  'rainbow trout': 'MESH:D017686',
  'rats': 'MESH:D051381',
  'ray tracing': 'ungrounded',
  're verse transcriptase': 'ungrounded',
  'reaction time': 'MESH:D011930',
  'readthrough': 'ungrounded',
  'readthrough protein': 'ungrounded',
  'real time': 'ungrounded',
  'realthiol': 'ungrounded',
  'recalcification time': 'ungrounded',
  'recognition threshold': 'ungrounded',
  'recovery time': 'ungrounded',
  'rectal temperature': 'MESH:D001831',
  'recurrence tonsillitis': 'MESH:D014069',
  'reduced tillage': 'ungrounded',
  'regenerated tissue':

In [5]:
# Hard-coded curation result, unpacked into three names:
#   grounding_map - longform text -> grounding identifier (or 'ungrounded')
#   names         - grounding identifier -> display name
#   pos_labels    - groundings later passed to AdeftClassifier as pos_labels
# NOTE(review): presumably pasted from the GUI curation output so the notebook
# can be re-run without the interactive step - confirm.
grounding_map, names, pos_labels = [{'r tomentosum': 'ungrounded',
  'radiation': 'MESH:D011878',
  'radiation alone': 'MESH:D011878',
  'radiation therapists': 'ungrounded',
  'radiation therapy': 'MESH:D011878',
  'radiation treatment': 'MESH:D011878',
  'radical transfer': 'ungrounded',
  'radical transport': 'ungrounded',
  'radio therapy': 'MESH:D011878',
  'radiotherapy': 'MESH:D011878',
  'radiotherapy treatment': 'MESH:D011878',
  'radix trichosanthis': 'MESH:C034634',
  'rainbow trout': 'MESH:D017686',
  'rats': 'MESH:D051381',
  'ray tracing': 'ungrounded',
  're verse transcriptase': 'ungrounded',
  'reaction time': 'MESH:D011930',
  'readthrough': 'ungrounded',
  'readthrough protein': 'ungrounded',
  'real time': 'ungrounded',
  'realthiol': 'ungrounded',
  'recalcification time': 'ungrounded',
  'recognition threshold': 'ungrounded',
  'recovery time': 'ungrounded',
  'rectal temperature': 'MESH:D001831',
  'recurrence tonsillitis': 'MESH:D014069',
  'reduced tillage': 'ungrounded',
  'regenerated tissue': 'ungrounded',
  'rehabilitative training': 'ungrounded',
  'relaxation time': 'ungrounded',
  'relaxation training': 'ungrounded',
  'renal transplantation': 'MESH:D016030',
  'renieramycin t': 'MESH:C000627233',
  'repetitive thought': 'ungrounded',
  'repolarization time': 'ungrounded',
  'residence time': 'ungrounded',
  'residential transience': 'ungrounded',
  'resistance': 'MESH:D055070',
  'resistance exercise': 'MESH:D055070',
  'resistance exercise training': 'MESH:D055070',
  'resistance training': 'MESH:D055070',
  'respiratory therapist': 'ungrounded',
  'respiratory therapists': 'ungrounded',
  'respiratory tract': 'ungrounded',
  'respiratory triggered': 'ungrounded',
  'response time': 'MESH:D011930',
  'resting tension': 'ungrounded',
  'restriction type': 'ungrounded',
  'retention time': 'ungrounded',
  'reticular': 'MESH:D013788',
  'reticular thalamic': 'MESH:D013788',
  'reticular thalamus': 'MESH:D013788',
  'retinal thickness': 'ungrounded',
  'reverse transcribed': 'GO:GO:0006351',
  'reverse transcriptase': 'MESH:D012194',
  'reverse transcriptase activity': 'GO:GO:0006351',
  'reverse transcriptases': 'MESH:D012194',
  'reverse transcription': 'GO:GO:0006351',
  'rhabdoid tumor': 'MESH:D018335',
  'rhabdoid tumors': 'MESH:D018335',
  'rhamnosyltransferase': 'ungrounded',
  'ribotype': 'ungrounded',
  'richter s transformation': 'ungrounded',
  'richter transformation': 'ungrounded',
  'ricin toxin': 'CHEBI:CHEBI:8852',
  'ricin toxoid': 'CHEBI:CHEBI:8852',
  'rise time': 'ungrounded',
  'rivastigmine tartrate': 'CHEBI:CHEBI:64358',
  'room temperature': 'room_temperature',
  'root time': 'ungrounded',
  'root tip': 'ungrounded',
  'rotary tillage': 'ungrounded',
  'rotenone': 'CHEBI:CHEBI:28201',
  'rt': 'ungrounded',
  'russian tarragon': 'ungrounded',
  'rutin': 'CHEBI:CHEBI:28527',
  'thalamic reticular nucleus': 'UBERON:UBERON:0001903'},
 {'MESH:D011878': 'Radiotherapy',
  'MESH:C034634': 'radix Trichosanthis',
  'MESH:D017686': 'Oncorhynchus mykiss',
  'MESH:D051381': 'Rats',
  'MESH:D011930': 'Reaction Time',
  'MESH:D001831': 'Body Temperature',
  'MESH:D014069': 'Tonsillitis',
  'MESH:D016030': 'Kidney Transplantation',
  'MESH:C000627233': 'renieramycin T',
  'MESH:D055070': 'Resistance Training',
  'MESH:D013788': 'Thalamus',
  'MESH:D012194': 'RNA-Directed DNA Polymerase',
  'GO:GO:0006351': 'transcription, DNA-templated',
  'MESH:D018335': 'Rhabdoid Tumor',
  'CHEBI:CHEBI:8852': 'Ricin',
  'CHEBI:CHEBI:64358': 'rivastigmine tartrate',
  'room_temperature': 'room_temperature',
  'CHEBI:CHEBI:28201': 'rotenone',
  'CHEBI:CHEBI:28527': 'rutin',
  'UBERON:UBERON:0001903': 'thalamic reticular nucleus'},
 ['CHEBI:CHEBI:8852',
  'GO:GO:0006351',
  'MESH:D001831',
  'MESH:D011878',
  'MESH:D011930',
  'MESH:D012194',
  'MESH:D013788',
  'MESH:D018335',
  'MESH:D055070']]

In [6]:
# Longforms left out of grounding_dict when it is built from grounding_map.
excluded_longforms = ['rt']

In [7]:
# Build the per-shortform grounding dictionary: for every shortform keep only
# the longforms that were mined for it, that have an entry in grounding_map,
# and that are not listed in excluded_longforms.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in longform_rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) also creates a missing parent 'results' directory
# and does not fail if the directory already exists (the previous
# exists()/mkdir() pair raised in both of those situations).
os.makedirs(results_path, exist_ok=True)
# Persist the preliminary grounding info as JSON next to the model artifacts.
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [8]:
# Extra entity identifiers appended to pos_labels when building the list of
# entities whose PMIDs are looked up; empty for this model.
additional_entities = []

In [9]:
# Maps an entity label to a list of agent texts; the corpus-augmentation cell
# fetches texts for those agent texts and labels them with the entity.
# Empty for this model.
unambiguous_agent_texts = {}

In [10]:
# Label every collected text using the curated grounding dictionary.
labeler = AdeftLabeler(grounding_dict)
text_pmid_pairs = ((text, pmid) for pmid, text in all_texts.items())
corpus = labeler.build_from_texts(text_pmid_pairs)

# Group the ids of labeled corpus entries by the label they received.
agent_text_pmid_map = defaultdict(list)
for _text, label, text_id in corpus:
    agent_text_pmid_map[label].append(text_id)

# For each entity of interest, look up the PMIDs returned for it with
# major_topic=True. Each identifier is split on its first ':' and the
# pieces are passed as positional arguments.
entities = pos_labels + additional_entities
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in entities
}

In [17]:
# Pairwise PMID overlap between every ordered pair of entities
# (each entity is also paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [18]:
# PMID overlap between each corpus label (from agent_text_pmid_map) and each
# entity (from entity_pmid_map).
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [19]:
# Show the entity/entity PMID overlaps.
# NOTE(review): the saved output lists identifiers (e.g. 'CHEBI:CHEBI:47344',
# 'HGNC:9600') that are not in this notebook's pos_labels - it looks stale;
# re-run before relying on it.
intersection1

[('CHEBI:CHEBI:47344', 'CHEBI:CHEBI:47344', 0),
 ('CHEBI:CHEBI:47344', 'CHEBI:CHEBI:5130', 0),
 ('CHEBI:CHEBI:47344', 'HGNC:9600', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D005189', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D005193', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D005454', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D008164', 0),
 ('CHEBI:CHEBI:47344', 'MESH:D014760', 0),
 ('CHEBI:CHEBI:47344', 'NCIT:C32623', 0),
 ('CHEBI:CHEBI:5130', 'CHEBI:CHEBI:47344', 0),
 ('CHEBI:CHEBI:5130', 'CHEBI:CHEBI:5130', 1314),
 ('CHEBI:CHEBI:5130', 'HGNC:9600', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D005189', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D005193', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D005454', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D008164', 0),
 ('CHEBI:CHEBI:5130', 'MESH:D014760', 0),
 ('CHEBI:CHEBI:5130', 'NCIT:C32623', 0),
 ('HGNC:9600', 'CHEBI:CHEBI:47344', 0),
 ('HGNC:9600', 'CHEBI:CHEBI:5130', 0),
 ('HGNC:9600', 'HGNC:9600', 0),
 ('HGNC:9600', 'MESH:D005189', 0),
 ('HGNC:9600', 'MESH:D005193', 0),
 ('HGNC:9600', 'MESH:D005454', 0),
 ('HGN

In [75]:
# Show the label/entity PMID overlaps.
# NOTE(review): the saved output lists identifiers (e.g. 'CHEBI:CHEBI:27812',
# 'FPLX:Mechanosensitive_ion_channels') that are not in this notebook's
# pos_labels - it looks stale; re-run before relying on it.
intersection2

[('CHEBI:CHEBI:27812', 'CHEBI:CHEBI:27812', 0),
 ('CHEBI:CHEBI:27812', 'CHEBI:CHEBI:7035', 0),
 ('CHEBI:CHEBI:27812', 'FPLX:Mechanosensitive_ion_channels', 0),
 ('CHEBI:CHEBI:27812', 'HGNC:7321', 0),
 ('CHEBI:CHEBI:27812', 'MESH:D000080364', 0),
 ('CHEBI:CHEBI:27812', 'MESH:D000604', 0),
 ('CHEBI:CHEBI:27812', 'MESH:D059630', 0),
 ('MESH:D059630', 'CHEBI:CHEBI:27812', 0),
 ('MESH:D059630', 'CHEBI:CHEBI:7035', 0),
 ('MESH:D059630', 'FPLX:Mechanosensitive_ion_channels', 0),
 ('MESH:D059630', 'HGNC:7321', 0),
 ('MESH:D059630', 'MESH:D000080364', 0),
 ('MESH:D059630', 'MESH:D000604', 0),
 ('MESH:D059630', 'MESH:D059630', 2261),
 ('ungrounded', 'CHEBI:CHEBI:27812', 0),
 ('ungrounded', 'CHEBI:CHEBI:7035', 0),
 ('ungrounded', 'FPLX:Mechanosensitive_ion_channels', 0),
 ('ungrounded', 'HGNC:7321', 0),
 ('ungrounded', 'MESH:D000080364', 0),
 ('ungrounded', 'MESH:D000604', 0),
 ('ungrounded', 'MESH:D059630', 0),
 ('MESH:D000604', 'CHEBI:CHEBI:27812', 0),
 ('MESH:D000604', 'CHEBI:CHEBI:7035', 0),


In [41]:
# Show the positive labels.
# NOTE(review): the saved output contains a fused entry
# 'GO:GO:0006351MESH:D001831', which differs from the pos_labels literal
# assigned earlier in this notebook - looks like output from an older run.
pos_labels

['CHEBI:CHEBI:8852',
 'GO:GO:0006351MESH:D001831',
 'MESH:D011878',
 'MESH:D011930',
 'MESH:D012194',
 'MESH:D013788',
 'MESH:D018335',
 'MESH:D055070']

In [91]:
# Augment the labeled corpus with extra texts.
# NOTE: this cell extends `corpus` in place, so running it more than once
# appends the same entries again.

# Pass 1: for each entity, pull texts for its unambiguous agent texts and
# label them with that entity. PMIDs already in all_texts, or already used
# for this entity, are skipped.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Pass 2: for each entity, pull texts for the PMIDs in entity_pmid_map that
# have not been used yet, capped at 10000 PMIDs per entity.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # random.sample draws WITHOUT replacement; random.choices (used
        # previously) draws with replacement and could return the same PMID
        # several times, shrinking the effective sample.
        new_pmids = random.sample(new_pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(new_pmids)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [11]:
%%capture
# The cell magic above must stay on the first line of the cell.

# Train the disambiguation classifier with a fixed random_state.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
# Single-point grid: one value each for C and max_features.
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split the (text, label, pmid) corpus triples into three parallel tuples.
texts, labels, pmids = zip(*corpus)
# 5-fold cross-validation over param_grid with n_jobs=5.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-22 04:34:47] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-22 04:37:38] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9459187271183287 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [12]:
# Display the statistics recorded on the classifier after cross-validation.
classifier.stats

{'label_distribution': {'MESH:D012194': 1028,
  'MESH:D014069': 2,
  'MESH:D001831': 26,
  'MESH:D011878': 2416,
  'MESH:D055070': 240,
  'GO:GO:0006351': 212,
  'MESH:D011930': 444,
  'MESH:D016030': 19,
  'ungrounded': 142,
  'room_temperature': 97,
  'CHEBI:CHEBI:8852': 11,
  'MESH:D013788': 14,
  'UBERON:UBERON:0001903': 2,
  'MESH:D017686': 2,
  'MESH:D018335': 16,
  'CHEBI:CHEBI:28527': 6,
  'CHEBI:CHEBI:28201': 3,
  'MESH:C034634': 2,
  'MESH:D051381': 1,
  'MESH:C000627233': 2,
  'CHEBI:CHEBI:64358': 3},
 'f1': {'mean': 0.945919, 'std': 0.001781},
 'precision': {'mean': 0.938616, 'std': 0.003213},
 'recall': {'mean': 0.955071, 'std': 0.002234},
 'GO:GO:0006351': {'f1': {'mean': 0.670333, 'std': 0.016399},
  'pr': {'mean': 0.674419, 'std': 0.028349},
  'rc': {'mean': 0.669703, 'std': 0.045657}},
 'ungrounded': {'f1': {'mean': 0.678965, 'std': 0.02787},
  'pr': {'mean': 0.598276, 'std': 0.044733},
  'rc': {'mean': 0.788797, 'std': 0.032671}},
 'MESH:C000627233': {'f1': {'mean': 0

In [13]:
# Combine the trained classifier with the grounding dictionary and the
# grounding -> name map into a disambiguator object.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [14]:
# Dump the disambiguator using the model name and the results directory
# (presumably writes the model files under results_path - confirm against adeft).
disamb.dump(model_name, results_path)

In [15]:
# Print the disambiguator's summary text (the info() string contains tabs and
# newlines, so print() is used rather than echoing the repr).
print(disamb.info())

Disambiguation model for RT, and RTs

Produces the disambiguations:
	Body Temperature*	MESH:D001831
	Kidney Transplantation	MESH:D016030
	Oncorhynchus mykiss	MESH:D017686
	RNA-Directed DNA Polymerase*	MESH:D012194
	Radiotherapy*	MESH:D011878
	Rats	MESH:D051381
	Reaction Time*	MESH:D011930
	Resistance Training*	MESH:D055070
	Rhabdoid Tumor*	MESH:D018335
	Ricin*	CHEBI:CHEBI:8852
	Thalamus*	MESH:D013788
	Tonsillitis	MESH:D014069
	radix Trichosanthis	MESH:C034634
	renieramycin T	MESH:C000627233
	rivastigmine tartrate	CHEBI:CHEBI:64358
	room_temperature	room_temperature
	rotenone	CHEBI:CHEBI:28201
	rutin	CHEBI:CHEBI:28527
	thalamic reticular nucleus	UBERON:UBERON:0001903
	transcription, DNA-templated*	GO:GO:0006351

Class level metrics:
--------------------
Grounding                   	Count	F1     
                Radiotherapy*	2416	0.98143
 RNA-Directed DNA Polymerase*	1028	 0.9277
               Reaction Time*	 444	0.97756
         Resistance Training*	 240	0.90433
transcription, DNA-tem

In [16]:
# Hand the finished disambiguator to model_to_s3
# (presumably uploads the model to S3 - confirm against adeft_indra).
model_to_s3(disamb)