In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder instance; used further down to propose a grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) this model disambiguates.
shortforms = ['NS']
# Model name: the filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Output directory for this model, resolved relative to the current working directory.
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [4]:
# Mine candidate longforms for each shortform from the texts that mention it.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Cap the number of pmids. Sample WITHOUT replacement: random.choices
        # draws with replacement, so it could return the same pmid several
        # times and yield fewer than 10000 distinct texts.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop texts whose length is 5 or less.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Per shortform, keep only the (longform, count, score) rows with count * score > 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# Sum the longform counts across all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Seed the grounding map with the grounder's first result for each longform;
# longforms for which the grounder returns nothing are left out of the map.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']
# Longforms and their counts as parallel tuples, most frequent first.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [5]:
# Display each mined longform with its combined count, most frequent first.
list(zip(longforms, counts))

[('nephrotic syndrome', 235),
 ('normal saline', 173),
 ('noonan syndrome', 157),
 ('nucleostemin', 114),
 ('neurospheres', 69),
 ('novelty seeking', 61),
 ('nigella sativa', 59),
 ('nonstructural', 58),
 ('non structural', 54),
 ('non specific', 36),
 ('natural suppressor', 27),
 ('netherton syndrome', 27),
 ('nociceptive specific', 24),
 ('neural stem', 23),
 ('nonspecific', 20),
 ('neuroserpin', 20),
 ('normal salt', 16),
 ('nanospheres', 16),
 ('nerve stimulation', 14),
 ('non silencing', 14),
 ('nutrient solution', 13),
 ('negative staining', 13),
 ('nanoshells', 13),
 ('nervous system', 12),
 ('normal serum', 11),
 ('nanosuspension', 11),
 ('nanostructures', 11),
 ('nigella sativa l', 11),
 ('nanosilver', 10),
 ('non structural protein', 10),
 ('nasal swab', 8),
 ('nerve sparing', 8),
 ('nedocromil sodium', 8),
 ('nevus sebaceous', 7),
 ('neck suction', 7),
 ('not significantly', 7),
 ('nearshore', 7),
 ('nicolau syndrome', 7),
 ('neural subset', 6),
 ('neostriatum', 6),
 ('nonsy

In [6]:
# Hand the proposed groundings to adeft's ground_with_gui; the three returned
# values replace grounding_map, names and pos_labels.
# NOTE(review): no_browser=True / port=8890 presumably serve the GUI on that port
# without opening a local browser -- confirm against the adeft documentation.
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8890)

INFO: [2020-10-09 03:52:43] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [7]:
# Bundle the three curation outputs into one list so they can be displayed together.
result = [grounding_map, names, pos_labels]

In [8]:
# Display the bundled [grounding_map, names, pos_labels].
result

[{'increased slightly': 'ungrounded',
  'n terminal segment': 'ungrounded',
  'nano scaffold': 'ungrounded',
  'nano silica': 'ungrounded',
  'nano sio 2': 'ungrounded',
  'nanoshells': 'MESH:D057145',
  'nanosilver': 'ungrounded',
  'nanospheres': 'ungrounded',
  'nanosponges': 'ungrounded',
  'nanostructures': 'MESH:D049329',
  'nanosuspension': 'ungrounded',
  'nanosystems': 'ungrounded',
  'naproxen sodium': 'CHEBI:CHEBI:7477',
  'narrative self': 'ungrounded',
  'narrow spiking': 'ungrounded',
  'nasal swab': 'ungrounded',
  'native speakers': 'ungrounded',
  'native speakers of english': 'ungrounded',
  'natural almond skin': 'ungrounded',
  'natural sequence peptide': 'ungrounded',
  'natural suppressor': 'NCIT:C129908',
  'nearshore': 'ungrounded',
  'neck suction': 'ungrounded',
  'nedocromil sodium': 'CHEBI:CHEBI:7493',
  'negative selective': 'ungrounded',
  'negative staining': 'MESH:D016624',
  'nelumbinis semen': 'MESH:C501480',
  'neostriatum': 'MESH:D017072',
  'nephrom

In [9]:
# Hard-coded curation result, unpacked as (grounding_map, names, pos_labels).
# This assignment overwrites whatever ground_with_gui returned above.
# NOTE(review): near-duplicate longforms are grounded inconsistently here --
# 'nevus sebaceous' is 'ungrounded' while 'nevus sebaceus' maps to HP:HP:0025511, and
# 'nigella sativa linn' is 'ungrounded' while 'nigella sativa' / 'nigella sativa l'
# map to MESH:D031881. Confirm this is intended.
# First element: grounding_map, longform -> grounding string ('ungrounded' when none was assigned).
grounding_map, names, pos_labels = [{'increased slightly': 'ungrounded',
  'n terminal segment': 'ungrounded',
  'nano scaffold': 'ungrounded',
  'nano silica': 'ungrounded',
  'nano sio 2': 'ungrounded',
  'nanoshells': 'MESH:D057145',
  'nanosilver': 'ungrounded',
  'nanospheres': 'ungrounded',
  'nanosponges': 'ungrounded',
  'nanostructures': 'MESH:D049329',
  'nanosuspension': 'ungrounded',
  'nanosystems': 'ungrounded',
  'naproxen sodium': 'CHEBI:CHEBI:7477',
  'narrative self': 'ungrounded',
  'narrow spiking': 'ungrounded',
  'nasal swab': 'ungrounded',
  'native speakers': 'ungrounded',
  'native speakers of english': 'ungrounded',
  'natural almond skin': 'ungrounded',
  'natural sequence peptide': 'ungrounded',
  'natural suppressor': 'NCIT:C129908',
  'nearshore': 'ungrounded',
  'neck suction': 'ungrounded',
  'nedocromil sodium': 'CHEBI:CHEBI:7493',
  'negative selective': 'ungrounded',
  'negative staining': 'MESH:D016624',
  'nelumbinis semen': 'MESH:C501480',
  'neostriatum': 'MESH:D017072',
  'nephrometry score': 'ungrounded',
  'nephrotic syndrome': 'MESH:D009404',
  'nerve sparing': 'ungrounded',
  'nerve stimulation': 'ungrounded',
  'nervous system': 'MESH:D009420',
  'netherton syndrome': 'MESH:D056770',
  'neural stem': 'NCIT:C12985',
  'neural subset': 'ungrounded',
  'neuroactive steroid': 'ungrounded',
  'neuroserpin': 'MESH:C100531',
  'neurospheres': 'neurospheres',
  'neurosteroids': 'MESH:D000081227',
  'nevus sebaceous': 'ungrounded',
  'nevus sebaceus': 'HP:HP:0025511',
  'new similar': 'ungrounded',
  'nicolau syndrome': 'MESH:D065148',
  'nigella sativa': 'MESH:D031881',
  'nigella sativa l': 'MESH:D031881',
  'nigella sativa linn': 'ungrounded',
  'night session': 'ungrounded',
  'nitrogen + straw addition': 'ungrounded',
  'nitrosative stress': 'ungrounded',
  'no serum': 'ungrounded',
  'nociceptive specific': 'ungrounded',
  'nodding syndrome': 'MESH:D064128',
  'nodular sclerosis': 'ungrounded',
  'non selective': 'ungrounded',
  'non sensitive': 'ungrounded',
  'non significantly': 'ungrounded',
  'non silencing': 'ungrounded',
  'non silencing sirna': 'ungrounded',
  'non smokers': 'MESH:D000078405',
  'non specific': 'ungrounded',
  'non structural': 'MESH:D017361',
  'non structural protein': 'MESH:D017361',
  'non surgerized': 'ungrounded',
  'non synonymous': 'ungrounded',
  'nonsense': 'ungrounded',
  'nonspecific': 'ungrounded',
  'nonstructural': 'MESH:D017361',
  'nonsynonymous': 'ungrounded',
  'noonan syndrome': 'MESH:D009634',
  'normal saline': 'MESH:D000077330',
  'normal salt': 'MESH:D000077330',
  'normal scar fibroblasts': 'ungrounded',
  'normal sera': 'ungrounded',
  'normal serum': 'ungrounded',
  'normal serum pool': 'ungrounded',
  'normal skin': 'ungrounded',
  'normal subjects': 'ungrounded',
  'nosocomial sepsis': 'ungrounded',
  'not significantly': 'ungrounded',
  'notch signaling': 'NCIT:C91520',
  'novasil': 'ungrounded',
  'novelty seeking': 'MESH:D005106',
  'ns': 'ungrounded',
  'ns 398': 'CHEBI:CHEBI:73458',
  'nucleosides': 'CHEBI:CHEBI:33838',
  'nucleostemin': 'HGNC:29931',
  'nutrient solution': 'ungrounded',
  'nutritional solution': 'ungrounded',
  'nutritional status': 'MESH:D009752',
  'nutritional supplement': 'CHEBI:CHEBI:50733'},
 # Second element: names, grounding string -> display name.
 {'MESH:D057145': 'Nanoshells',
  'MESH:D049329': 'Nanostructures',
  'CHEBI:CHEBI:7477': 'naproxen sodium',
  'NCIT:C129908': 'Myeloid-Derived Suppressor Cell',
  'CHEBI:CHEBI:7493': 'nedocromil sodium',
  'MESH:D016624': 'Negative Staining',
  'MESH:C501480': 'Nelumbinis Semen',
  'MESH:D017072': 'Neostriatum',
  'MESH:D009404': 'Nephrotic Syndrome',
  'MESH:D009420': 'Nervous System',
  'MESH:D056770': 'Netherton Syndrome',
  'NCIT:C12985': 'Neural Stem Cell',
  'MESH:C100531': 'neuroserpin',
  'neurospheres': 'neurospheres',
  'MESH:D000081227': 'Neurosteroids',
  'HP:HP:0025511': 'Nevus sebaceus',
  'MESH:D065148': 'Nicolau Syndrome',
  'MESH:D031881': 'Nigella sativa',
  'MESH:D064128': 'Nodding Syndrome',
  'MESH:D000078405': 'Non-Smokers',
  'MESH:D017361': 'Viral Nonstructural Proteins',
  'MESH:D009634': 'Noonan Syndrome',
  'MESH:D000077330': 'Saline Solution',
  'NCIT:C91520': 'Notch Signaling Pathway',
  'MESH:D005106': 'Exploratory Behavior',
  'CHEBI:CHEBI:73458': 'NS-398',
  'CHEBI:CHEBI:33838': 'nucleoside',
  'HGNC:29931': 'GNL3',
  'MESH:D009752': 'Nutritional Status',
  'CHEBI:CHEBI:50733': 'nutraceutical'},
 # Third element: pos_labels, the groundings passed to AdeftClassifier as pos_labels.
 ['HGNC:29931',
  'MESH:C100531',
  'MESH:D000077330',
  'MESH:D005106',
  'MESH:D009404',
  'MESH:D009634',
  'MESH:D017361',
  'MESH:D031881',
  'MESH:D056770',
  'NCIT:C12985',
  'NCIT:C129908']]

In [10]:
# Longforms to leave out of grounding_dict even though they appear in grounding_map.
excluded_longforms = ['ns']

In [11]:
# Per-shortform grounding dict: each mined longform that has an entry in
# grounding_map and is not explicitly excluded, mapped to its grounding.
# The inner loop variable is named longform_rows so it does not shadow the
# notebook-level `longforms` tuple.
grounding_dict = {
    shortform: {longform: grounding_map[longform]
                for longform, _, _ in longform_rows
                if longform in grounding_map
                and longform not in excluded_longforms}
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) also creates a missing parent 'results' directory and
# does not fail if the directory already exists; os.mkdir raised in both cases.
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [12]:
# Extra entities to fetch training texts for. Keys are later split on the first ':'
# and passed to get_pmids_for_entity, so they are expected to look like 'NAMESPACE:ID'.
# NOTE(review): the values are merged into `names` further down, so they are
# presumably display names -- confirm. Empty for this model.
additional_entities = {}

In [13]:
# Maps an entity label to an iterable of agent texts; texts fetched for those agent
# texts are added to the corpus under that entity label. Empty for this model.
unambiguous_agent_texts = {}

In [14]:
# Label every collected text using the curated grounding dict; the labeler is
# fed (text, pmid) pairs and the corpus is iterated as (text, label, id) rows.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the ids of the labeled texts by label.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# For each additional entity, the set of pmids returned by get_pmids_for_entity
# (called with the entity string split on its first ':' and major_topic=True).
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in additional_entities
}

In [15]:
# Pairwise pmid overlap between the additional entities (includes each entity
# paired with itself, and both orderings of every pair).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [16]:
# pmid overlap between each corpus label and each additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [17]:
# Display the (entity1, entity2, overlap size) triples between additional entities.
intersection1

[('HGNC:17981', 'HGNC:17981', 29)]

In [18]:
# Display the (label, entity, overlap size) triples between corpus labels and additional entities.
intersection2

[('ungrounded', 'HGNC:17981', 0)]

In [20]:
# Extend the corpus with texts for unambiguous agent texts, labeled with their entity.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip pmids already in the mined texts or already used for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts for the additional entities.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample WITHOUT replacement: random.choices draws with replacement and
        # could add the same pmid to the sample more than once.
        new_pmids = random.sample(new_pmids, k=10000)
    # FIXME(review): this hard-coded synonym list does not relate to this
    # notebook's shortform ('NS') and looks copied from a different model. It is
    # only reached when additional_entities is non-empty; replace it with the
    # synonyms of `entity` before adding any additional entities.
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=['RTCA', 'RTCD1', 'RPC', 'RTC1', 'RTC'])
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [None]:
# Merge the additional entities into the grounding -> name map.
# (Fixes a NameError: the variable was misspelled as `additional_entitie`.)
names.update(additional_entities)

In [15]:
%%capture

# Cross-validate a classifier for these shortforms on the labeled corpus.
# random_state is fixed; the grid holds a single point (C=100.0, max_features=10000)
# and is evaluated with 5 folds using 5 parallel jobs.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
# corpus rows are (text, label, pmid) triples; note this rebinds the notebook-level name `pmids`.
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-09 04:11:28] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-09 04:12:08] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.8856780183646145 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [16]:
# Display the classifier's stats (label distribution plus overall and per-label scores).
classifier.stats

{'label_distribution': {'ungrounded': 211,
  'NCIT:C129908': 22,
  'CHEBI:CHEBI:7493': 7,
  'MESH:D009404': 189,
  'MESH:D017361': 98,
  'MESH:D017072': 4,
  'MESH:D000077330': 134,
  'MESH:D009420': 9,
  'MESH:D005106': 40,
  'CHEBI:CHEBI:7477': 3,
  'MESH:D009634': 111,
  'MESH:D031881': 54,
  'neurospheres': 35,
  'CHEBI:CHEBI:33838': 3,
  'HGNC:29931': 77,
  'MESH:C501480': 3,
  'NCIT:C12985': 14,
  'MESH:D056770': 22,
  'CHEBI:CHEBI:73458': 3,
  'MESH:D000081227': 5,
  'MESH:D057145': 9,
  'MESH:D016624': 7,
  'HP:HP:0025511': 4,
  'MESH:C100531': 13,
  'MESH:D000078405': 4,
  'MESH:D009752': 3,
  'MESH:D064128': 4,
  'CHEBI:CHEBI:50733': 4,
  'MESH:D065148': 4,
  'MESH:D049329': 6,
  'NCIT:C91520': 1},
 'f1': {'mean': 0.885678, 'std': 0.026309},
 'precision': {'mean': 0.883623, 'std': 0.028771},
 'recall': {'mean': 0.896756, 'std': 0.023664},
 'MESH:C501480': {'f1': {'mean': 0.0, 'std': 0.0},
  'pr': {'mean': 0.0, 'std': 0.0},
  'rc': {'mean': 0.0, 'std': 0.0}},
 'MESH:D031881': 

In [17]:
# Combine the trained classifier with the grounding dict and names into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [19]:
# Dump the disambiguator under model_name at results_path.
# NOTE(review): presumably writes the model files to disk -- confirm against the adeft documentation.
disamb.dump(model_name, results_path)

In [18]:
# Print the disambiguator's summary of the groundings it can produce.
print(disamb.info())

Disambiguation model for NS

Produces the disambiguations:
	Exploratory Behavior*	MESH:D005106
	GNL3*	HGNC:29931
	Myeloid-Derived Suppressor Cell*	NCIT:C129908
	NS-398	CHEBI:CHEBI:73458
	Nanoshells	MESH:D057145
	Nanostructures	MESH:D049329
	Negative Staining	MESH:D016624
	Nelumbinis Semen	MESH:C501480
	Neostriatum	MESH:D017072
	Nephrotic Syndrome*	MESH:D009404
	Nervous System	MESH:D009420
	Netherton Syndrome*	MESH:D056770
	Neural Stem Cell*	NCIT:C12985
	Neurosteroids	MESH:D000081227
	Nevus sebaceus	HP:HP:0025511
	Nicolau Syndrome	MESH:D065148
	Nigella sativa*	MESH:D031881
	Nodding Syndrome	MESH:D064128
	Non-Smokers	MESH:D000078405
	Noonan Syndrome*	MESH:D009634
	Notch Signaling Pathway	NCIT:C91520
	Nutritional Status	MESH:D009752
	Saline Solution*	MESH:D000077330
	Viral Nonstructural Proteins*	MESH:D017361
	naproxen sodium	CHEBI:CHEBI:7477
	nedocromil sodium	CHEBI:CHEBI:7493
	neuroserpin*	MESH:C100531
	neurospheres	neurospheres
	nucleoside	CHEBI:CHEBI:33838
	nutraceutical	CHEBI:CHEBI:5

In [20]:
# NOTE(review): presumably uploads the model to S3 (external side effect; needs
# credentials and network) -- confirm before re-running this cell.
model_to_s3(disamb)