In [3]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [4]:
# Grounder whose .ground() method is used below to propose a grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [5]:
# Shortforms covered by this model. The model name is the sorted, escaped
# shortforms joined with ':'.
shortforms = ['DSS']
escaped_shortforms = sorted(map(escape_filename, shortforms))
model_name = ':'.join(escaped_shortforms)
# Absolute path of ../../results/<model_name>, resolved against the working directory.
relative_results_dir = os.path.join('../..', 'results', model_name)
results_path = os.path.abspath(relative_results_dir)

In [6]:
# --- Mine candidate longforms for each shortform --------------------------------
max_pmids = 10000  # cap on the number of PMIDs mined per shortform
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > max_pmids:
        # Sample WITHOUT replacement. random.choices() draws with replacement,
        # so it returned duplicate PMIDs and silently shrank the set of texts.
        pmids = random.sample(list(pmids), max_pmids)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop (near-)empty texts.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Keep only longforms whose count * score exceeds 1.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 1]
    longform_dict[shortform] = longforms

# Sum longform counts across all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# --- Propose an initial grounding for each longform -----------------------------
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        # Use the first result returned by the grounder.
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Longforms and counts as parallel tuples, most frequent first. Guard the
# empty case: zip(*[]) yields nothing and the two-name unpack would raise.
ranked_longforms = combined_longforms.most_common()
longforms, counts = zip(*ranked_longforms) if ranked_longforms else ((), ())
pos_labels = []

In [7]:
# Show each mined longform paired with its count.
list(zip(longforms, counts))

[('dextran sulfate sodium', 1642),
 ('dextran sodium sulfate', 1161),
 ('disease specific survival', 468),
 ('dextran sulphate sodium', 208),
 ('dextran sodium sulphate', 200),
 ('dengue shock syndrome', 96),
 ('dahl salt sensitive', 39),
 ('danshensu', 38),
 ('dextran sulfate sodium salt', 33),
 ('decision support system', 20),
 ('disuccinimidyl suberate', 20),
 ('danggui shaoyao san', 17),
 ('disease specific', 11),
 ('daidzein sulfonic sodium', 9),
 ('dss', 8),
 ('dioctyl sodium sulfosuccinate', 7),
 ('dimethyl 4 silapentane 1 sulfonic acid', 7),
 ('dss administration group', 7),
 ('depression symptoms severe', 5),
 ('depression stigma scale', 5),
 ('digit symbol substitution', 4),
 ('dss induced colitis group', 4),
 ('dark soy sauce', 3),
 ('demographic surveillance site', 3),
 ('disubstituted sulfonamides', 3),
 ('dejerine sottas syndrome', 3),
 ('déjérine sottas syndrome', 3),
 ('disuccinyl suberate', 3),
 ('dextan sodium sulfate', 3),
 ('sodium dextran sulfate', 3),
 ('dahl salt

In [9]:
# Pass the automatic groundings to adeft's GUI; its return value replaces
# grounding_map, names and pos_labels.
# NOTE(review): no_browser=True and port=8891 are hard-coded — presumably for a
# remote/headless session; confirm before reusing elsewhere.
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8891)

INFO: [2020-10-30 04:37:07] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.4/bio_ontology.pkl


In [10]:
# Bundle the three grounding outputs into one list.
result = [grounding_map, names, pos_labels]

In [11]:
# Display the bundled [grounding_map, names, pos_labels].
result

[{'dahl salt sensitive': 'EFO:0001350',
  'dahl salt sensitive rats': 'EFO:0001350',
  'daidzein sulfonic sodium': 'ungrounded',
  'daily symptoms score': 'ungrounded',
  'dang gui shao yao san': 'ungrounded',
  'danggui shaoyao san': 'MESH:C531150',
  'danshensu': 'MESH:C035055',
  'dark soy sauce': 'ungrounded',
  'dark spot syndrome': 'ungrounded',
  'decision support system': 'ungrounded',
  'dejerine sottas syndrome': 'DOID:0050540',
  'demographic surveillance site': 'ungrounded',
  'dengue shock syndrome': 'DOID:DOID:0050125',
  'density of spikelets per spike': 'ungrounded',
  'depression stigma scale': 'ungrounded',
  'depression symptoms severe': 'ungrounded',
  'desulfococcus': 'ungrounded',
  'dewatered sewage sludge': 'ungrounded',
  'dextan sodium sulfate': 'CHEBI:CHEBI:34674',
  'dextran sodium sulfate': 'CHEBI:CHEBI:34674',
  'dextran sodium sulphate': 'CHEBI:CHEBI:34674',
  'dextran sulfate salt': 'ungrounded',
  'dextran sulfate sodium': 'CHEBI:CHEBI:34674',
  'dextra

In [31]:
# Hard-coded grounding results, unpacked from a three-element list:
#   grounding_map: longform -> grounding (or 'ungrounded')
#   names:         grounding -> name
#   pos_labels:    list of groundings
# NOTE(review): DOID groundings appear in two forms below ('DOID:0050540' and
# 'DOID:DOID:0050125') — confirm which form downstream consumers expect.
grounding_map, names, pos_labels = [{'dahl salt sensitive': 'EFO:0001350',
  'dahl salt sensitive rats': 'EFO:0001350',
  'daidzein sulfonic sodium': 'ungrounded',
  'daily symptoms score': 'ungrounded',
  'dang gui shao yao san': 'ungrounded',
  'danggui shaoyao san': 'MESH:C531150',
  'danshensu': 'MESH:C035055',
  'dark soy sauce': 'ungrounded',
  'dark spot syndrome': 'ungrounded',
  'decision support system': 'ungrounded',
  'dejerine sottas syndrome': 'DOID:0050540',
  'demographic surveillance site': 'ungrounded',
  'dengue shock syndrome': 'DOID:DOID:0050125',
  'density of spikelets per spike': 'ungrounded',
  'depression stigma scale': 'ungrounded',
  'depression symptoms severe': 'ungrounded',
  'desulfococcus': 'ungrounded',
  'dewatered sewage sludge': 'ungrounded',
  'dextan sodium sulfate': 'CHEBI:CHEBI:34674',
  'dextran sodium sulfate': 'CHEBI:CHEBI:34674',
  'dextran sodium sulphate': 'CHEBI:CHEBI:34674',
  'dextran sulfate salt': 'ungrounded',
  'dextran sulfate sodium': 'CHEBI:CHEBI:34674',
  'dextran sulfate sodium 5000': 'ungrounded',
  'dextran sulfate sodium colitis': 'CHEBI:CHEBI:34674',
  'dextran sulfate sodium salt': 'CHEBI:CHEBI:34674',
  'dextran sulfate sodium sulfate': 'CHEBI:CHEBI:34674',
  'dextran sulphate sodium': 'CHEBI:CHEBI:34674',
  'dextran sulphate sodium salt': 'CHEBI:CHEBI:34674',
  'dextransulfate sodium': 'CHEBI:CHEBI:34674',
  'dextrose sulfate sodium': 'CHEBI:CHEBI:34674',
  'differentiation stress sensitive': 'ungrounded',
  'digit symbol substitution': 'ungrounded',
  'dimethyl 2 silapentane 5 sulfonic': 'ungrounded',
  'dimethyl 4 silapentane 1 sulfonic acid': 'ungrounded',
  'dioctyl sodium sulfosuccinate': 'ungrounded',
  'disease specific': 'disease_specific_survival',
  'disease specific death': 'ungrounded',
  'disease specific survival': 'disease_specific_survival',
  'disturbed shear stress': 'ungrounded',
  'disubstituted sulfonamides': 'ungrounded',
  'disuccimidyl suberate': 'ungrounded',
  'disuccinimidyl suberate': 'MESH:C019358',
  'disuccinyl suberate': 'CHEBI:CHEBI:132953',
  'dosage sensitive sex reversal': 'HGNC:7960',
  'dsr and salt sensitive': 'ungrounded',
  'dss': 'ungrounded',
  'dss administration group': 'ungrounded',
  'dss alone': 'ungrounded',
  'dss colitis mice': 'ungrounded',
  'dss group': 'ungrounded',
  'dss in drinking water': 'ungrounded',
  'dss induced colitis group': 'ungrounded',
  'dss treatment': 'ungrounded',
  'duplex stainless steel': 'ungrounded',
  'déjérine sottas syndrome': 'DOID:0050540',
  'sodium dextran sulfate': 'CHEBI:CHEBI:34674',
  'sodium dextran sulphate': 'CHEBI:CHEBI:34674'},
 # names: grounding -> name
 {'EFO:0001350': 'Dahl salt-sensitive',
  'MESH:C531150': 'danggui-shaoyao-san',
  'MESH:C035055': '3,4-dihydroxyphenyllactic acid',
  'DOID:0050540': 'Charcot-Marie-Tooth disease type 3',
  'DOID:DOID:0050125': 'dengue shock syndrome',
  'CHEBI:CHEBI:34674': 'dextran sulfate',
  'disease_specific_survival': 'disease_specific_survival',
  'MESH:C019358': 'disuccinimidyl suberate',
  'CHEBI:CHEBI:132953': 'suberate',
  'HGNC:7960': 'NR0B1'},
 # pos_labels
 ['CHEBI:CHEBI:34674', 'DOID:DOID:0050125', 'EFO:0001350', 'HGNC:7960']]

In [32]:
# Longforms to leave out when the per-shortform grounding dict is built.
excluded_longforms = [
    'dss',
    'dss alone',
    'dsr and salt sensitive',
    'dss administration group',
    'dss in drinking water',
    'dss colitis mice',
    'dss group',
    'dss induced colitis group',
    'dss treatment',
]

In [33]:
# For each shortform, map its mined longforms to their groundings, skipping
# longforms that have no grounding or are explicitly excluded.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) also creates missing parent directories and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(results_path, exist_ok=True)
# Save the preliminary grounding info as JSON in the results directory.
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [34]:
# Extra entities keyed by grounding. Each value is [name, [strings]]; the name is
# later added to `names` and the string list is passed as `contains` when texts are fetched.
additional_entities = {
    'HGNC:7960': ['NR0B1', ['DSS', 'sex reversal']],
}

In [35]:
# Unambiguous agent texts keyed by grounding. Each value is [name, [agent texts]];
# the agent texts are later used to look up additional PMIDs for the entity.
unambiguous_agent_texts = {
    'HGNC:7960': ['NR0B1', ['dosage sensitive sex reversal']],
}

In [36]:
# Build the corpus of (text, label, id) triples from the mined texts.
labeler = AdeftLabeler(grounding_dict)
text_id_pairs = ((text, pmid) for pmid, text in all_texts.items())
corpus = labeler.build_from_texts(text_id_pairs)

# Collect the ids of the corpus texts under each label.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# PMIDs per additional entity. The key is split once on ':' and the pieces are
# passed as the leading positional arguments of get_pmids_for_entity.
entity_pmid_map = {}
for entity in additional_entities:
    id_parts = entity.split(':', maxsplit=1)
    entity_pmid_map[entity] = set(get_pmids_for_entity(*id_parts, major_topic=True))

In [20]:
# Size of the PMID overlap for every ordered pair of additional entities
# (each entity is also paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [21]:
# Size of the PMID overlap between each corpus label and each additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [22]:
# Display (entity, entity, overlap size) for the additional entities.
intersection1

[('HGNC:7960', 'HGNC:7960', 233)]

In [23]:
# Display (corpus label, additional entity, overlap size).
intersection2

[('ungrounded', 'HGNC:7960', 0),
 ('EFO:0001350', 'HGNC:7960', 0),
 ('CHEBI:CHEBI:34674', 'HGNC:7960', 0),
 ('MESH:C019358', 'HGNC:7960', 0),
 ('DOID:0050540', 'HGNC:7960', 0),
 ('CHEBI:CHEBI:132953', 'HGNC:7960', 0),
 ('disease_specific_survival', 'HGNC:7960', 0),
 ('HGNC:3055', 'HGNC:7960', 0),
 ('DOID:DOID:0050125', 'HGNC:7960', 0),
 ('MESH:C531150', 'HGNC:7960', 0),
 ('MESH:C035055', 'HGNC:7960', 0)]

[('HGNC:7321', ['Musculin', 'musculin'])]

In [37]:
# --- Extend the corpus with texts found via unambiguous agent texts -------------
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    # Each value is [name, [agent texts]]; only the list of agent texts is used here.
    search_texts = agent_texts[1]
    for agent_text in search_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip PMIDs already in the mined texts or already handled for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        # Pass the flat list of agent-text strings as `contains`. The original
        # passed the whole [name, [agent texts]] pair, i.e. a nested list, unlike
        # the loop below, which passes a flat list of strings.
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=search_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# --- Extend the corpus with texts for the additional entities -------------------
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample WITHOUT replacement; random.choices() draws with replacement
        # and would return duplicate PMIDs.
        new_pmids = random.sample(new_pmids, 10000)
    # Index instead of unpacking into `_`, which would shadow IPython's
    # last-output variable at notebook top level.
    contains = additional_entities[entity][1]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [38]:
# Register the name of every additional / unambiguous entity under its grounding.
for extra_entities in (additional_entities, unambiguous_agent_texts):
    names.update({key: value[0] for key, value in extra_entities.items()})

# Add those groundings to the existing positive labels.
label_pool = set(pos_labels) | additional_entities.keys() | unambiguous_agent_texts.keys()
pos_labels = list(label_pool)

In [39]:
%%capture

# Run a 5-fold cross-validated grid search over a single-point parameter grid,
# with a fixed random_state.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split the corpus triples into three parallel tuples.
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-30 04:45:59] /adeft/PP/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-30 04:46:55] /adeft/PP/adeft/adeft/modeling/classify.py - Best f1 score of 0.9847838250645605 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [40]:
# Display the classifier's stats: label distribution plus overall and per-label metrics.
classifier.stats

{'label_distribution': {'ungrounded': 69,
  'EFO:0001350': 26,
  'CHEBI:CHEBI:34674': 2201,
  'MESH:C019358': 17,
  'DOID:0050540': 6,
  'CHEBI:CHEBI:132953': 2,
  'disease_specific_survival': 304,
  'HGNC:7960': 74,
  'DOID:DOID:0050125': 72,
  'MESH:C531150': 14,
  'MESH:C035055': 31},
 'f1': {'mean': 0.984784, 'std': 0.002354},
 'precision': {'mean': 0.974168, 'std': 0.003291},
 'recall': {'mean': 0.995789, 'std': 0.002657},
 'MESH:C531150': {'f1': {'mean': 0.18, 'std': 0.222711},
  'pr': {'mean': 0.133333, 'std': 0.163299},
  'rc': {'mean': 0.3, 'std': 0.4}},
 'DOID:DOID:0050125': {'f1': {'mean': 0.985696, 'std': 0.017537},
  'pr': {'mean': 0.972381, 'std': 0.03386},
  'rc': {'mean': 1.0, 'std': 0.0}},
 'MESH:C035055': {'f1': {'mean': 0.50987, 'std': 0.181122},
  'pr': {'mean': 0.390476, 'std': 0.174704},
  'rc': {'mean': 0.85, 'std': 0.2}},
 'DOID:0050540': {'f1': {'mean': 0.533333, 'std': 0.452155},
  'pr': {'mean': 0.5, 'std': 0.447214},
  'rc': {'mean': 0.6, 'std': 0.489898}},


In [41]:
# Combine the classifier, the grounding dict and the names into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [42]:
# Dump the disambiguator, passing the model name and the results directory.
disamb.dump(model_name, results_path)

In [43]:
# Print the model summary: produced disambiguations and class-level metrics.
print(disamb.info())

Disambiguation model for DSS

Produces the disambiguations:
	3,4-dihydroxyphenyllactic acid	MESH:C035055
	Charcot-Marie-Tooth disease type 3	DOID:0050540
	Dahl salt-sensitive*	EFO:0001350
	NR0B1*	HGNC:7960
	danggui-shaoyao-san	MESH:C531150
	dengue shock syndrome*	DOID:DOID:0050125
	dextran sulfate*	CHEBI:CHEBI:34674
	disease_specific_survival	disease_specific_survival
	disuccinimidyl suberate	MESH:C019358
	suberate	CHEBI:CHEBI:132953

Class level metrics:
--------------------
Grounding                         	Count	F1     
                   dextran sulfate*	2201	0.98452
         disease_specific_survival	 304	0.99019
                             NR0B1*	  74	    1.0
             dengue shock syndrome*	  72	 0.9857
                        Ungrounded	  69	0.67521
    3,4-dihydroxyphenyllactic acid	  31	0.50987
               Dahl salt-sensitive*	  26	 0.9596
           disuccinimidyl suberate	  17	   0.72
               danggui-shaoyao-san	  14	   0.18
Charcot-Marie-Tooth disease type 3

In [44]:
# Print the model summary again.
# NOTE(review): same call as the preceding cell — likely a leftover duplicate; confirm and remove.
print(disamb.info())

Disambiguation model for DSS

Produces the disambiguations:
	3,4-dihydroxyphenyllactic acid	MESH:C035055
	Charcot-Marie-Tooth disease type 3	DOID:0050540
	Dahl salt-sensitive*	EFO:0001350
	NR0B1*	HGNC:7960
	danggui-shaoyao-san	MESH:C531150
	dengue shock syndrome*	DOID:DOID:0050125
	dextran sulfate*	CHEBI:CHEBI:34674
	disease_specific_survival	disease_specific_survival
	disuccinimidyl suberate	MESH:C019358
	suberate	CHEBI:CHEBI:132953

Class level metrics:
--------------------
Grounding                         	Count	F1     
                   dextran sulfate*	2201	0.98452
         disease_specific_survival	 304	0.99019
                             NR0B1*	  74	    1.0
             dengue shock syndrome*	  72	 0.9857
                        Ungrounded	  69	0.67521
    3,4-dihydroxyphenyllactic acid	  31	0.50987
               Dahl salt-sensitive*	  26	 0.9596
           disuccinimidyl suberate	  17	   0.72
               danggui-shaoyao-san	  14	   0.18
Charcot-Marie-Tooth disease type 3

In [45]:
# Hand the disambiguator to adeft_indra's model_to_s3 helper
# (presumably uploads the model to S3 — confirm destination before re-running).
model_to_s3(disamb)

In [46]:
# Disambiguate every mined text with the new model.
preds = disamb.disambiguate(all_texts.values())

In [47]:
# Keep (prediction, text) pairs whose predicted grounding starts with 'HGNC'.
p = [
    (prediction, source_text)
    for prediction, source_text in zip(preds, all_texts.values())
    if prediction[0].startswith('HGNC')
]

In [48]:
# Inspect the second HGNC-labeled prediction together with its text.
p[1]

(('HGNC:7960',
  'NR0B1',
  {'CHEBI:CHEBI:132953': 8.985212953745439e-05,
   'CHEBI:CHEBI:34674': 0.0006598091069091671,
   'DOID:0050540': 0.0005837659340991881,
   'DOID:DOID:0050125': 0.000138460032754039,
   'EFO:0001350': 0.00013381204155976588,
   'HGNC:7960': 0.9971199665398686,
   'MESH:C019358': 0.0002539218312002947,
   'MESH:C035055': 0.0001576945133235649,
   'MESH:C531150': 8.814552731259378e-05,
   'disease_specific_survival': 0.00013873044530958862,
   'ungrounded': 0.0006358418981258324}),
 'Evidence that a gene located in the X chromosome might be involved in gonadal differentiation comes from the finding that duplication of Xp in association with a normal Y chromosome containing an intact SRY, can result in male-to-female sex reversal ( Bardoni et al., 1993; Am et al., 1994 ). This dosage-sensitive sex reversal locus (DSS) was mapped to a region on Xp21 adjacent to the adrenal hypoplasia congenital locus (AHC) ( Bardoni et al., 1994 ). Although DSS can interfere with 