In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder instance; its .ground(longform) method is called later in this
# notebook to propose a preliminary grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [9]:
# Shortform(s) this disambiguation model is built for.
shortforms = ['TM']
# Model name: the filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of <two levels up>/results/<model_name>.
results_path = os.path.abspath(os.path.join('../..', 'results', model_name))

In [10]:
# Collect texts mentioning each shortform and mine candidate longforms from them.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Sample WITHOUT replacement. random.choices draws with replacement,
        # which produces duplicate PMIDs and so fewer than 10000 distinct texts.
        pmids = random.sample(pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Discard texts of five characters or fewer.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Per shortform, keep only longforms whose count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# Accumulate longform counts over all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})
# Seed the grounding map with the first grounding the grounder returns, if any.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']
# zip(*[]) yields nothing to unpack, so fall back to empty tuples when no
# longform survived the filter instead of raising ValueError.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()
pos_labels = []

In [5]:
# Display each mined longform next to its count.
[*zip(longforms, counts)]

[('transmembrane', 761),
 ('thrombomodulin', 685),
 ('tunicamycin', 649),
 ('trabecular meshwork', 555),
 ('tamoxifen', 124),
 ('tropomyosin', 111),
 ('tympanic membrane', 87),
 ('tetrathiomolybdate', 78),
 ('thalassemia major', 36),
 ('transition metal', 35),
 ('transcendental meditation', 24),
 ('tectorial membrane', 23),
 ('tail moment', 20),
 ('timolol maleate', 19),
 ('transverse magnetic', 19),
 ('turn motif', 18),
 ('thematic mapper', 18),
 ('traditional medicine', 16),
 ('trans membrane', 16),
 ('tumor microenvironment', 15),
 ('transcriptional mutagenesis', 14),
 ('thermal manipulation', 14),
 ('tenebrio molitor', 13),
 ('target module', 12),
 ('triple mutant', 12),
 ('transverse myelitis', 12),
 ('thimerosal', 11),
 ('telemedicine', 10),
 ('treadmill', 10),
 ('transmembrane region', 10),
 ('testicular microlithiasis', 9),
 ('total mastectomy', 8),
 ('traditional medicare', 8),
 ('transmembrane glycoprotein', 8),
 ('testicular macrophages', 7),
 ('tautomycin', 7),
 ('transmemb

In [11]:
# Pass the mined longforms and the preliminary groundings to adeft's grounding
# GUI (no browser is opened; port 8890). The three returned values replace
# the preliminary grounding_map, names and pos_labels.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

INFO: [2020-10-03 02:48:48] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [12]:
# Bundle the grounding map, names and positive labels into one list so they
# can be displayed together.
result = [grounding_map, names, pos_labels]

In [13]:
# Echo the bundle. The literal assigned to grounding_map/names/pos_labels
# later in this notebook begins with the same entries as this saved output.
result

[{'tail moment': 'ungrounded',
  'tamoxifen': 'CHEBI:CHEBI:41774',
  'target module': 'ungrounded',
  'target mutagenesis': 'ungrounded',
  'tautomycin': 'CHEBI:CHEBI:9414',
  'tectorial membrane': 'MESH:D013680',
  'telemedicine': 'MESH:D017216',
  'telemonitoring': 'ungrounded',
  'telmisartan': 'CHEBI:CHEBI:9434',
  'temozolomide': 'CHEBI:CHEBI:72564',
  'temperature': 'MESH:D013696',
  'tenebrio molitor': 'ungrounded',
  'tensile modulus': 'ungrounded',
  'test meal': 'MESH:D062407',
  'testicular macrophages': 'MESH:D008264',
  'testicular microlithiasis': 'MESH:C566478',
  'tetramethrin': 'CHEBI:CHEBI:39397',
  'tetrathiomolybdate': 'MESH:C020809',
  'text mining': 'MESH:D057225',
  'thalassaemia major': 'ungrounded',
  'thalassemia major': 'NCIT:C129699',
  'thematic mapper': 'ungrounded',
  'thermal manipulation': 'ungrounded',
  'thermal manipulation during embryogenesis': 'ungrounded',
  'thimerosal': 'ungrounded',
  'thioglycolate medium': 'ungrounded',
  'thiophanate methyl

In [15]:
# Hard-coded snapshot of the curated groundings, so the notebook can proceed
# without re-running the interactive grounding step.

# Longform -> grounding ('ungrounded' when no grounding was assigned).
grounding_map = {
    'tail moment': 'ungrounded',
    'tamoxifen': 'CHEBI:CHEBI:41774',
    'target module': 'ungrounded',
    'target mutagenesis': 'ungrounded',
    'tautomycin': 'CHEBI:CHEBI:9414',
    'tectorial membrane': 'MESH:D013680',
    'telemedicine': 'MESH:D017216',
    'telemonitoring': 'ungrounded',
    'telmisartan': 'CHEBI:CHEBI:9434',
    'temozolomide': 'CHEBI:CHEBI:72564',
    'temperature': 'MESH:D013696',
    'tenebrio molitor': 'ungrounded',
    'tensile modulus': 'ungrounded',
    'test meal': 'MESH:D062407',
    'testicular macrophages': 'MESH:D008264',
    'testicular microlithiasis': 'MESH:C566478',
    'tetramethrin': 'CHEBI:CHEBI:39397',
    'tetrathiomolybdate': 'MESH:C020809',
    'text mining': 'MESH:D057225',
    'thalassaemia major': 'ungrounded',
    'thalassemia major': 'NCIT:C129699',
    'thematic mapper': 'ungrounded',
    'thermal manipulation': 'ungrounded',
    'thermal manipulation during embryogenesis': 'ungrounded',
    'thimerosal': 'ungrounded',
    'thioglycolate medium': 'ungrounded',
    'thiophanate methyl': 'CHEBI:CHEBI:35014',
    'thoracic myelopathy': 'MESH:D013118',
    'threshold measured': 'ungrounded',
    'thrombomodulin': 'HGNC:11784',
    'thrombotic microangiopathy': 'MESH:D057049',
    'thylakoid membrane': 'GO:GO:0042651',
    'thymic macrophages': 'MESH:D008264',
    'timolol maleate': 'CHEBI:CHEBI:9600',
    'tissue matrix': 'ungrounded',
    'tm': 'ungrounded',
    'tooth mousse': 'ungrounded',
    'tooth movement': 'ungrounded',
    'torus mandibularis': 'ungrounded',
    'total mastectomy': 'MESH:D015413',
    'total membrane': 'ungrounded',
    'total memory': 'ungrounded',
    'toxic metal': 'ungrounded',
    'trabecular mesh work': 'ungrounded',
    'trabecular meshwork': 'MESH:D014129',
    'trabecular meshwork cells': 'ungrounded',
    'trabecular metal': 'ungrounded',
    'trace metal': 'ungrounded',
    'trace minerals': 'ungrounded',
    'traditional medicare': 'MESH:D006278',
    'traditional medicine': 'MESH:D008519',
    'trans membrane': 'GO:GO:0016021',
    'transcendental meditation': 'MESH:D019122',
    'transcriptional mutagenesis': 'MESH:D016296',
    'transgenic mice': 'MESH:D008822',
    'transition memory': 'ungrounded',
    'transition metal': 'CHEBI:CHEBI:27081',
    'transmembrane': 'GO:GO:0016021',
    'transmembrane envelope': 'ungrounded',
    'transmembrane glycoprotein': 'IP:IPR005071',
    'transmembrane helical': 'ungrounded',
    'transmembrane helix': 'ungrounded',
    'transmembrane protein': 'ungrounded',
    'transmembrane region': 'GO:GO:0016021',
    'transmembrane subunit': 'ungrounded',
    'transmembrane unit': 'ungrounded',
    'transmethylation': 'ungrounded',
    'transverse magnetic': 'ungrounded',
    'transverse myelitis': 'MESH:D009187',
    'trapezius muscle': 'ungrounded',
    'treadmill': 'ungrounded',
    'treatment model': 'ungrounded',
    'trimebutine maleate': 'CHEBI:CHEBI:32261',
    'triple mutant': 'ungrounded',
    'triple mutations': 'ungrounded',
    'tropomyosin': 'MESH:D014335',
    'tuber melanosporum': 'ungrounded',
    'tuberculous meningitis': 'ungrounded',
    'tuberomammillary': 'ungrounded',
    'tuberomammillary nucleus': 'ungrounded',
    'tumor markers': 'ungrounded',
    'tumor microenvironment': 'MESH:D059016',
    'tunicamycin': 'CHEBI:CHEBI:29699',
    'turn motif': 'ungrounded',
    'tympanic membrane': 'MESH:D014432',
}

# Grounding -> standard name.
names = {
    'CHEBI:CHEBI:41774': 'tamoxifen',
    'CHEBI:CHEBI:9414': 'Tautomycin',
    'MESH:D013680': 'Tectorial Membrane',
    'MESH:D017216': 'Telemedicine',
    'CHEBI:CHEBI:9434': 'telmisartan',
    'CHEBI:CHEBI:72564': 'temozolomide',
    'MESH:D013696': 'Temperature',
    'MESH:D062407': 'Meals',
    'MESH:D008264': 'Macrophages',
    'MESH:C566478': 'Testicular Microlithiasis',
    'CHEBI:CHEBI:39397': 'tetramethrin',
    'MESH:C020809': 'tetrathiomolybdate',
    'MESH:D057225': 'Data Mining',
    'NCIT:C129699': 'Beta Thalassemia Major',
    'CHEBI:CHEBI:35014': 'thiophanate-methyl',
    'MESH:D013118': 'Spinal Cord Diseases',
    'HGNC:11784': 'THBD',
    'MESH:D057049': 'Thrombotic Microangiopathies',
    'GO:GO:0042651': 'thylakoid membrane',
    'CHEBI:CHEBI:9600': '(S)-timolol maleate',
    'MESH:D015413': 'Mastectomy, Simple',
    'MESH:D014129': 'Trabecular Meshwork',
    'MESH:D006278': 'Medicare',
    'MESH:D008519': 'Medicine, Traditional',
    'GO:GO:0016021': 'integral component of membrane',
    'MESH:D019122': 'Meditation',
    'MESH:D016296': 'Mutagenesis',
    'MESH:D008822': 'Mice, Transgenic',
    'CHEBI:CHEBI:27081': 'transition element atom',
    'IP:IPR005071': 'Transmembrane glycoprotein',
    'MESH:D009187': 'Myelitis',
    'CHEBI:CHEBI:32261': 'Trimebutine maleate',
    'MESH:D014335': 'Tropomyosin',
    'MESH:D059016': 'Tumor Microenvironment',
    'CHEBI:CHEBI:29699': 'tunicamycin',
    'MESH:D014432': 'Tympanic Membrane',
}

# Groundings passed to the classifier as its positive labels.
pos_labels = [
    'CHEBI:CHEBI:27081',
    'CHEBI:CHEBI:29699',
    'CHEBI:CHEBI:41774',
    'CHEBI:CHEBI:9600',
    'GO:GO:0016021',
    'HGNC:11784',
    'MESH:C020809',
    'MESH:D013680',
    'MESH:D014129',
    'MESH:D014335',
    'MESH:D014432',
    'MESH:D019122',
    'NCIT:C129699',
]

In [16]:
# Longforms listed here are left out of grounding_dict when it is built later
# in this notebook (none are excluded for this model).
excluded_longforms = []

In [17]:
# For each shortform, map its mined longforms to their curated groundings,
# skipping longforms without a grounding entry and those explicitly excluded.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs also creates a missing parent 'results' directory, and
# exist_ok=True removes the check-then-create race of exists() + mkdir().
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [18]:
# Extra entities to fetch training texts for (none for this model). Each key
# is split on its first ':' and the parts are passed to get_pmids_for_entity.
additional_entities = {}

In [19]:
# Maps an entity label to a collection of agent texts; texts fetched for those
# agent texts are appended to the corpus under that label (none for this model).
unambiguous_agent_texts = {}

In [20]:
# Build the labelled corpus from every collected text using grounding_dict.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the corpus ids by the label each text received.
agent_text_pmid_map = defaultdict(list)
for _text, label, pmid in corpus:
    agent_text_pmid_map[label].append(pmid)

# For every additional entity, collect the PMIDs returned for it
# (major_topic=True); the key is split on its first ':' into call arguments.
entity_pmid_map = {}
for entity in additional_entities:
    entity_pmids = get_pmids_for_entity(*entity.split(':', maxsplit=1),
                                        major_topic=True)
    entity_pmid_map[entity] = set(entity_pmids)

In [21]:
# Number of shared PMIDs for every ordered pair of additional entities
# (each entity is also paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [22]:
# Number of PMIDs shared between each corpus label and each additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [17]:
# NOTE(review): additional_entities is empty in this notebook, so a fresh run
# should display an empty list; the saved output looks stale — re-run to confirm.
intersection1

[('HGNC:17981', 'HGNC:17981', 29)]

In [18]:
# NOTE(review): with additional_entities empty, entity_pmid_map has no entries
# and this list should be empty; the saved output looks stale — re-run to confirm.
intersection2

[('ungrounded', 'HGNC:17981', 0)]

In [20]:
# Extend the corpus with texts for entities that have their own agent texts.
# NOTE: this cell appends to `corpus` in place, so re-running it adds the same
# rows again; rebuild `corpus` first when re-executing.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip PMIDs already collected for the shortforms or already used
        # for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts for the additional entities.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample WITHOUT replacement. random.choices draws with replacement,
        # which produces duplicate PMIDs and so fewer than 10000 distinct texts.
        new_pmids = random.sample(new_pmids, k=10000)
    # NOTE(review): this hard-coded `contains` list does not relate to the
    # shortforms of this model and looks left over from another notebook;
    # confirm the intended filter before adding any additional_entities.
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=['RTCA', 'RTCD1', 'RPC', 'RTC1', 'RTC'])
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [None]:
# Merge the additional entities into names. The original referenced the
# undefined name `additional_entitie`, which raised NameError.
# Assumes additional_entities maps grounding -> name, like names — TODO confirm.
names.update(additional_entities)

In [23]:
%%capture

# Split the corpus rows into parallel sequences of texts, labels and PMIDs.
texts, labels, pmids = zip(*corpus)

# Single-point grid: only C=100.0 with max_features=10000 is evaluated.
param_grid = dict(C=[100.0], max_features=[10000])

# Fixed random_state; 5-fold cross-validation run with 5 jobs.
classifier = AdeftClassifier(
    shortforms,
    pos_labels=pos_labels,
    random_state=1729,
)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-03 03:11:08] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-03 03:13:14] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9403659888554319 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [24]:
# Show the statistics recorded on the classifier; the saved output lists the
# label distribution plus mean/std of f1, precision and recall.
classifier.stats

{'label_distribution': {'MESH:D014335': 101,
  'CHEBI:CHEBI:29699': 411,
  'MESH:D019122': 21,
  'HGNC:11784': 559,
  'GO:GO:0016021': 617,
  'MESH:D013696': 4,
  'IP:IPR005071': 7,
  'MESH:D014432': 61,
  'ungrounded': 216,
  'MESH:D013680': 16,
  'MESH:C020809': 57,
  'CHEBI:CHEBI:9414': 6,
  'NCIT:C129699': 27,
  'MESH:D014129': 390,
  'MESH:D009187': 10,
  'CHEBI:CHEBI:72564': 3,
  'CHEBI:CHEBI:9600': 15,
  'MESH:D008822': 2,
  'CHEBI:CHEBI:41774': 85,
  'MESH:D016296': 10,
  'MESH:C566478': 7,
  'MESH:D008519': 10,
  'MESH:D008264': 5,
  'MESH:D057049': 3,
  'MESH:D017216': 7,
  'CHEBI:CHEBI:27081': 33,
  'GO:GO:0042651': 2,
  'CHEBI:CHEBI:35014': 3,
  'CHEBI:CHEBI:32261': 3,
  'MESH:D057225': 3,
  'MESH:D015413': 5,
  'MESH:D006278': 4,
  'CHEBI:CHEBI:9434': 3,
  'MESH:D013118': 4,
  'MESH:D059016': 8,
  'CHEBI:CHEBI:39397': 1,
  'MESH:D062407': 1},
 'f1': {'mean': 0.940366, 'std': 0.007089},
 'precision': {'mean': 0.938339, 'std': 0.006991},
 'recall': {'mean': 0.948192, 'std': 

In [25]:
# Combine the trained classifier with the grounding dictionary and the names
# into a single disambiguator object.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [27]:
# Dump the disambiguator under model_name into results_path
# (presumably writes the model files to disk — confirm against adeft's docs).
disamb.dump(model_name, results_path)

In [26]:
# Print the model summary. In the visible saved output, the entries marked
# with '*' are exactly the groundings listed in pos_labels.
print(disamb.info())

Disambiguation model for TM

Produces the disambiguations:
	(S)-timolol maleate*	CHEBI:CHEBI:9600
	Beta Thalassemia Major*	NCIT:C129699
	Data Mining	MESH:D057225
	Macrophages	MESH:D008264
	Mastectomy, Simple	MESH:D015413
	Meals	MESH:D062407
	Medicare	MESH:D006278
	Medicine, Traditional	MESH:D008519
	Meditation*	MESH:D019122
	Mice, Transgenic	MESH:D008822
	Mutagenesis	MESH:D016296
	Myelitis	MESH:D009187
	Spinal Cord Diseases	MESH:D013118
	THBD*	HGNC:11784
	Tautomycin	CHEBI:CHEBI:9414
	Tectorial Membrane*	MESH:D013680
	Telemedicine	MESH:D017216
	Temperature	MESH:D013696
	Testicular Microlithiasis	MESH:C566478
	Thrombotic Microangiopathies	MESH:D057049
	Trabecular Meshwork*	MESH:D014129
	Transmembrane glycoprotein	IP:IPR005071
	Trimebutine maleate	CHEBI:CHEBI:32261
	Tropomyosin*	MESH:D014335
	Tumor Microenvironment	MESH:D059016
	Tympanic Membrane*	MESH:D014432
	integral component of membrane*	GO:GO:0016021
	tamoxifen*	CHEBI:CHEBI:41774
	telmisartan	CHEBI:CHEBI:9434
	temozolomide	CHEBI:CHE

In [28]:
# Pass the disambiguator to model_to_s3 (from adeft_indra.model_building.s3).
# Presumably uploads the model to S3 and needs credentials — confirm before re-running.
model_to_s3(disamb)