In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used further down to propose a grounding and a name for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) this disambiguation model is built for.
shortforms = ['GL']

# Model name: filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))

# Absolute path of this model's results directory, two levels above the
# current working directory.
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [4]:
# Mine candidate longforms for every shortform from the texts that mention it.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Cap at 10000 PMIDs, sampling WITHOUT replacement. random.choices
        # samples with replacement, which yields duplicate PMIDs and hence
        # fewer than 10000 distinct texts.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop texts whose length is 5 or less.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Keep only the longforms for which count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# Pool the longform counts of all shortforms into a single counter.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Propose a grounding (and its name) for each longform the grounder can match.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        # Take the first grounding returned (presumably the best match -- TODO confirm).
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Longforms and their counts, most frequent first. Unpacking zip(*[]) raises
# ValueError, so fall back to empty tuples when no longform survived filtering.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()
pos_labels = []

In [5]:
# Show each mined longform next to its count.
list(zip(longforms, counts))

[('bronchoalveolar lavage', 1661),
 ('broncho alveolar lavage', 64),
 ('bronchial alveolar lavage', 40),
 ('bronchoalveolar', 26),
 ('bioartificial liver', 22),
 ('blood alcohol levels', 20),
 ('british anti lewisite', 9),
 ('bronchoalveolar fluid', 5),
 ('betaine aldehyde', 3),
 ('breath alcohol levels', 3),
 ('benzaldehyde lyase', 3),
 ('bronchoalveolar larvage', 3),
 ('bile salt activity lipase', 3),
 ('bronchio alveolar lavage', 3),
 ('brochoalveolar lavage', 3),
 ('bronchalveolar lavage', 3),
 ('bronchial lavage', 3),
 ('bronchiolar lavage', 3)]

In [5]:
# Call adeft's ground_with_gui (no_browser=True, port=8890), seeded with the
# automatically proposed groundings, names and positive labels. Its return
# values overwrite grounding_map, names and pos_labels.
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, pos_labels=pos_labels, no_browser=True, port=8890)

INFO: [2020-09-28 12:09:18] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [6]:
# Bundle the grounding map, names and positive labels into one list for display.
result = [grounding_map, names, pos_labels]

In [7]:
# Display the bundled grounding map, names and positive labels.
result

[{'bl green': 'ungrounded',
  'gaba contained liposomes': 'ungrounded',
  'galactono 1 4 lactone': 'ungrounded',
  'galiellalactone': 'MESH:C416043',
  'gamma lactone': 'CHEBI:CHEBI:37581',
  'ganoderma lucidum': 'MESH:D020940',
  'gastric lipase': 'HGNC:1863',
  'gastrocnemius lateralis': 'UBERON_0011908',
  'gate loop': 'ungrounded',
  'gaussia luciferase': 'MI_2219',
  'germ line': 'MESH:D005854',
  'germinal layer': 'ungrounded',
  'germline': 'MESH:D005854',
  'gi and load': 'ungrounded',
  'gl': 'ungrounded',
  'glargine': 'MESH:D000069036',
  'glibenclamide': 'CHEBI:CHEBI:5441',
  'global layer': 'ungrounded',
  'glomerular layer': 'UBERON_0005377',
  'glossopharyngeal': 'MESH:D005930',
  'glucan': 'CHEBI:CHEBI:37163',
  'glucolipotoxic': 'ungrounded',
  'glucosaminidase': 'PF:PF01832',
  'glucose': 'CHEBI:CHEBI:17234',
  'glucosinolate': 'CHEBI:CHEBI:24279',
  'glycaemic load': 'MESH:D000067940',
  'glycemic load': 'MESH:D000067940',
  'glycerol': 'CHEBI:CHEBI:17754',
  'glycog

In [19]:
# Hard-coded grounding information (presumably pasted from the displayed
# `result` of the grounding step -- verify before re-curating).

# Longform -> grounding; 'ungrounded' marks longforms without a grounding.
grounding_map = {
    'bl green': 'ungrounded',
    'gaba contained liposomes': 'ungrounded',
    'galactono 1 4 lactone': 'ungrounded',
    'galiellalactone': 'MESH:C416043',
    'gamma lactone': 'CHEBI:CHEBI:37581',
    'ganoderma lucidum': 'MESH:D020940',
    'gastric lipase': 'HGNC:1863',
    'gastrocnemius lateralis': 'UBERON_0011908',
    'gate loop': 'ungrounded',
    'gaussia luciferase': 'MI_2219',
    'germ line': 'MESH:D005854',
    'germinal layer': 'ungrounded',
    'germline': 'MESH:D005854',
    'gi and load': 'ungrounded',
    'gl': 'ungrounded',
    'glargine': 'MESH:D000069036',
    'glibenclamide': 'CHEBI:CHEBI:5441',
    'global layer': 'ungrounded',
    'glomerular layer': 'UBERON_0005377',
    'glossopharyngeal': 'MESH:D005930',
    'glucan': 'CHEBI:CHEBI:37163',
    'glucolipotoxic': 'ungrounded',
    'glucosaminidase': 'PF:PF01832',
    'glucose': 'CHEBI:CHEBI:17234',
    'glucosinolate': 'CHEBI:CHEBI:24279',
    'glycaemic load': 'MESH:D000067940',
    'glycemic load': 'MESH:D000067940',
    'glycerol': 'CHEBI:CHEBI:17754',
    'glycogen binding': 'GO:GO:2001069',
    'glycogenolysis': 'MESH:D050261',
    'glycolipid': 'CHEBI:CHEBI:33563',
    'glycolysis': 'GO:GO:0006096',
    'glycyrrhizin': 'CHEBI:CHEBI:15939',
    'gold lotion': 'ungrounded',
    'goloco': 'ungrounded',
    'grain length': 'grain_length',
    'granular layer': 'ungrounded',
    'granular lymphocytes': 'ungrounded',
    'granulosa luteal': 'MESH:D008184',
    'granulosa lutein': 'MESH:D008184',
    'green light': 'ungrounded',
    'green luminescence': 'ungrounded',
    'growth light': 'ungrounded',
    'gugulipid': 'ungrounded',
}

# Grounding -> standard name.
names = {
    'MESH:C416043': 'galiellalactone',
    'CHEBI:CHEBI:37581': 'gamma-lactone',
    'MESH:D020940': 'Reishi',
    'HGNC:1863': 'CES1',
    'UBERON_0011908': 'gastrocnemius lateralis',
    'MI_2219': 'gaussia luciferase protein tag',
    'MESH:D005854': 'Germ Cells',
    'MESH:D000069036': 'Insulin Glargine',
    'CHEBI:CHEBI:5441': 'glyburide',
    'UBERON_0005377': 'olfactory bulb glomerular layer',
    'MESH:D005930': 'Glossopharyngeal Nerve',
    'CHEBI:CHEBI:37163': 'glucan',
    'PF:PF01832': 'Glucosaminidase',
    'CHEBI:CHEBI:17234': 'glucose',
    'CHEBI:CHEBI:24279': 'glucosinolate',
    'MESH:D000067940': 'Glycemic Load',
    'CHEBI:CHEBI:17754': 'glycerol',
    'GO:GO:2001069': 'glycogen binding',
    'MESH:D050261': 'Glycogenolysis',
    'CHEBI:CHEBI:33563': 'glycolipid',
    'GO:GO:0006096': 'glycolytic process',
    'CHEBI:CHEBI:15939': 'glycyrrhizinic acid',
    'grain_length': 'grain_length',
    'MESH:D008184': 'Luteal Cells',
}

# Groundings later passed to AdeftClassifier as pos_labels.
pos_labels = [
    'CHEBI:CHEBI:15939',
    'MESH:D000067940',
    'MESH:D005854',
]

In [9]:
# Longforms to leave out of grounding_dict when it is built below.
excluded_longforms = ['gl']

In [10]:
# Per-shortform grounding map: restrict grounding_map to the longforms mined
# for that shortform, skipping any longform listed in excluded_longforms.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) creates missing parent directories too and avoids the
# check-then-create race of os.path.exists followed by os.mkdir.
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [11]:
# Extra entities, as 'NAMESPACE:ID' strings, whose PMIDs are fetched as
# additional labeled training texts further down.
additional_entities = []

In [12]:
# Maps an entity label to agent texts; texts found for those agent texts are
# added to the corpus under that label further down.
unambiguous_agent_texts = {}

In [13]:
# Build the labeled corpus of (text, label, id) triples from (text, pmid) pairs.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the corpus ids by the label they were assigned.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# PMIDs per additional entity; each entity string is split once on ':' and the
# two parts are passed to get_pmids_for_entity with major_topic=True.
entity_pmid_map = {
    entity: set(get_pmids_for_entity(*entity.split(':', 1), major_topic=True))
    for entity in additional_entities
}

In [19]:
# Size of the PMID overlap for every ordered pair of additional entities
# (including each entity paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [20]:
# Size of the PMID overlap between every corpus label and every additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [21]:
# Display the entity-vs-entity PMID overlap counts.
intersection1

[('HGNC:1848', 'HGNC:1848', 86)]

In [22]:
# Display the label-vs-entity PMID overlap counts.
intersection2

[('MESH:D018893', 'HGNC:1848', 0),
 ('HGNC:1848', 'HGNC:1848', 1),
 ('ungrounded', 'HGNC:1848', 0),
 ('CHEBI:CHEBI:64198', 'HGNC:1848', 0),
 ('MESH:D019164', 'HGNC:1848', 0),
 ('MESH:C059416', 'HGNC:1848', 0),
 ('CHEBI:CHEBI:15710', 'HGNC:1848', 0)]

[('HGNC:7321', ['Musculin', 'musculin'])]

In [23]:
# Extend the corpus with texts for agent texts declared unambiguous for an entity.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip PMIDs already among the mined texts or already used for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts for each additional entity, labeled with that entity.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Cap at 10000 PMIDs, sampling WITHOUT replacement. random.choices
        # samples with replacement, which would add the same PMID more than once.
        new_pmids = random.sample(new_pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(new_pmids)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [20]:
%%capture

# Cross-validate a classifier for the shortforms on the labeled corpus.
# random_state=1729 fixes the classifier's random seed.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
# Grid with a single point, so the "grid search" evaluates exactly one setting.
param_grid = {'C': [100.0], 'max_features': [10000]}
# Split the corpus triples into parallel tuples of texts, labels and pmids.
texts, labels, pmids = zip(*corpus)
# cv=5 folds; n_jobs=5 is presumably the number of parallel workers -- TODO confirm.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-09-28 12:59:20] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-09-28 12:59:34] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9168576034317801 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [21]:
# Display the statistics stored on the classifier: the label distribution plus
# overall and per-label f1 / precision / recall (see the output below).
classifier.stats

{'label_distribution': {'CHEBI:CHEBI:5441': 3,
  'CHEBI:CHEBI:15939': 143,
  'ungrounded': 54,
  'HGNC:1863': 3,
  'CHEBI:CHEBI:17234': 3,
  'GO:GO:0006096': 4,
  'GO:GO:2001069': 2,
  'MESH:D008184': 9,
  'MESH:D005854': 26,
  'MESH:D005930': 6,
  'CHEBI:CHEBI:33563': 4,
  'MESH:D020940': 13,
  'CHEBI:CHEBI:37581': 3,
  'MESH:D050261': 3,
  'MESH:D000067940': 188,
  'CHEBI:CHEBI:17754': 11,
  'CHEBI:CHEBI:37163': 3,
  'MI_2219': 4,
  'UBERON_0005377': 8,
  'grain_length': 14,
  'MESH:C416043': 6,
  'MESH:D000069036': 4,
  'PF:PF01832': 3,
  'UBERON_0011908': 4,
  'CHEBI:CHEBI:24279': 2},
 'f1': {'mean': 0.916858, 'std': 0.009065},
 'precision': {'mean': 0.86328, 'std': 0.017201},
 'recall': {'mean': 0.988772, 'std': 0.01053},
 'CHEBI:CHEBI:17234': {'f1': {'mean': 0.0, 'std': 0.0},
  'pr': {'mean': 0.0, 'std': 0.0},
  'rc': {'mean': 0.0, 'std': 0.0}},
 'MESH:D000069036': {'f1': {'mean': 0.4, 'std': 0.489898},
  'pr': {'mean': 0.4, 'std': 0.489898},
  'rc': {'mean': 0.4, 'std': 0.489898

In [22]:
# Combine the classifier with the grounding dict and names into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [24]:
# Dump the disambiguator, passing the model name and the results directory
# (presumably writes the model files under results_path -- TODO confirm).
disamb.dump(model_name, results_path)

In [23]:
# Print the model summary: the disambiguations it produces and class-level metrics.
print(disamb.info())

Disambiguation model for GL

Produces the disambiguations:
	CES1	HGNC:1863
	Germ Cells*	MESH:D005854
	Glossopharyngeal Nerve	MESH:D005930
	Glucosaminidase	PF:PF01832
	Glycemic Load*	MESH:D000067940
	Glycogenolysis	MESH:D050261
	Insulin Glargine	MESH:D000069036
	Luteal Cells	MESH:D008184
	Reishi	MESH:D020940
	galiellalactone	MESH:C416043
	gamma-lactone	CHEBI:CHEBI:37581
	gastrocnemius lateralis	UBERON_0011908
	gaussia luciferase protein tag	MI_2219
	glucan	CHEBI:CHEBI:37163
	glucose	CHEBI:CHEBI:17234
	glucosinolate	CHEBI:CHEBI:24279
	glyburide	CHEBI:CHEBI:5441
	glycerol	CHEBI:CHEBI:17754
	glycogen binding	GO:GO:2001069
	glycolipid	CHEBI:CHEBI:33563
	glycolytic process	GO:GO:0006096
	glycyrrhizinic acid*	CHEBI:CHEBI:15939
	grain_length	grain_length
	olfactory bulb glomerular layer	UBERON_0005377

Class level metrics:
--------------------
Grounding                      	Count	F1     
                  Glycemic Load*	188	0.97655
            glycyrrhizinic acid*	143	0.82998
                

In [25]:
# Hand the disambiguator to adeft_indra's model_to_s3 helper (presumably uploads
# the model to S3; the implementation is not visible here -- TODO confirm).
model_to_s3(disamb)