In [2]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [3]:
adeft_grounder = AdeftGrounder()

In [4]:
shortforms = ['PAH', 'PAHs'] 
model_name = ':'.join(sorted(escape_filename(shortform) for shortform in shortforms))
results_path = os.path.abspath(os.path.join('../..', 'results', model_name))

In [5]:
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms
    
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']
longforms, counts = zip(*combined_longforms.most_common())

In [16]:
grounding_map, names, pos_labels = ground_with_gui(longforms, counts, 
                                                   grounding_map=grounding_map,
                                                   names=names, no_browser=True, port=8891)

In [17]:
excluded_longforms = ['ph']

In [18]:
grounding_dict = {shortform: {longform: grounding_map[longform] 
                              for longform, _, _ in longforms if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longforms in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

if not os.path.exists(results_path):
    os.mkdir(results_path)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [6]:
grounding_dict, names, pos_labels = [{'PAH': {'pulmonary arterial hypertension': 'MESH:D000081029',
   'polycyclic aromatic hydrocarbons': 'CHEBI:CHEBI:33848',
   'p aminohippurate': 'CHEBI:CHEBI:104011',
   'phenylalanine hydroxylase': 'HGNC:8582',
   'poly allylamine hydrochloride': 'CHEBI:CHEBI:53305',
   'p aminohippurate acid': 'CHEBI:CHEBI:104011',
   'para aminohippurate': 'CHEBI:CHEBI:104011',
   'polyaromatic hydrocarbons': 'CHEBI:CHEBI:33848',
   'para aminohippurate acid': 'CHEBI:CHEBI:104011',
   'pulmonary hypertension': 'MESH:D000081029',
   'polyallylamine': 'CHEBI:CHEBI:53305',
   'predicted adult height': 'ungrounded',
   'polynuclear aromatic hydrocarbons': 'CHEBI:CHEBI:33848',
   'polyallylamine hydrochloride': 'CHEBI:CHEBI:53305',
   'poly aromatic hydrocarbons': 'CHEBI:CHEBI:33848',
   'pregnancy associated hypertension': 'MESH:D046110',
   'paired amphipathic helix': 'IP:IPR003822',
   'pa hypertension': 'MESH:D000081029',
   'perillaldehyde': 'MESH:C033342',
   'poly allylamine': 'CHEBI:CHEBI:53305',
   'p amino hippurate': 'CHEBI:CHEBI:104011',
   'ofp aminohippurate': 'CHEBI:CHEBI:104011',
   'post atrophic hyperplasia': 'ungrounded',
   'pathological hypertrophy': 'ungrounded',
   'paraaminohippurate': 'CHEBI:CHEBI:104011',
   'paraaminohippurate acid': 'CHEBI:CHEBI:104011'},
  'PAHs': {'polycyclic aromatic hydrocarbons': 'CHEBI:CHEBI:33848'}},
 {'CHEBI:CHEBI:104011': 'p-aminohippuric acid',
  'MESH:D000081029': 'Pulmonary Arterial Hypertension',
  'IP:IPR003822': 'Paired amphipathic helix',
  'MESH:C033342': 'perillaldehyde',
  'HGNC:8582': 'PAH',
  'CHEBI:CHEBI:53305': 'poly(allylamine hydrochloride)',
  'CHEBI:CHEBI:33848': 'polycyclic arene',
  'MESH:D046110': 'Hypertension, Pregnancy-Induced'},
 ['CHEBI:CHEBI:104011',
  'CHEBI:CHEBI:33848',
  'CHEBI:CHEBI:53305',
  'HGNC:8582',
  'IP:IPR003822',
  'MESH:C033342',
  'MESH:D000081029',
  'MESH:D046110']]

In [7]:
additional_entities = []

In [8]:
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts((text, pmid) for pmid, text in all_texts.items())
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

entities = pos_labels + additional_entities
entity_pmid_map = {entity: get_pmids_for_entity(*entity.split(':', maxsplit=1),
                                                major_topic=True) for entity in entities}

In [9]:
intersection1 = []
for entity1, pmids1 in entity_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection1.append((entity1, entity2, len(set(pmids1) & set(pmids2))))

In [10]:
intersection2 = []
for entity1, pmids1 in agent_text_pmid_map.items():
    for entity2, pmids2 in entity_pmid_map.items():
        intersection2.append((entity1, entity2, len(set(pmids1) & set(pmids2))))

In [11]:
intersection1

[('CHEBI:CHEBI:104011', 'CHEBI:CHEBI:104011', 922),
 ('CHEBI:CHEBI:104011', 'CHEBI:CHEBI:33848', 0),
 ('CHEBI:CHEBI:104011', 'CHEBI:CHEBI:53305', 0),
 ('CHEBI:CHEBI:104011', 'HGNC:8582', 0),
 ('CHEBI:CHEBI:104011', 'IP:IPR003822', 0),
 ('CHEBI:CHEBI:104011', 'MESH:C033342', 0),
 ('CHEBI:CHEBI:104011', 'MESH:D000081029', 0),
 ('CHEBI:CHEBI:104011', 'MESH:D046110', 0),
 ('CHEBI:CHEBI:33848', 'CHEBI:CHEBI:104011', 0),
 ('CHEBI:CHEBI:33848', 'CHEBI:CHEBI:33848', 100000),
 ('CHEBI:CHEBI:33848', 'CHEBI:CHEBI:53305', 0),
 ('CHEBI:CHEBI:33848', 'HGNC:8582', 0),
 ('CHEBI:CHEBI:33848', 'IP:IPR003822', 0),
 ('CHEBI:CHEBI:33848', 'MESH:C033342', 0),
 ('CHEBI:CHEBI:33848', 'MESH:D000081029', 0),
 ('CHEBI:CHEBI:33848', 'MESH:D046110', 0),
 ('CHEBI:CHEBI:53305', 'CHEBI:CHEBI:104011', 0),
 ('CHEBI:CHEBI:53305', 'CHEBI:CHEBI:33848', 0),
 ('CHEBI:CHEBI:53305', 'CHEBI:CHEBI:53305', 0),
 ('CHEBI:CHEBI:53305', 'HGNC:8582', 0),
 ('CHEBI:CHEBI:53305', 'IP:IPR003822', 0),
 ('CHEBI:CHEBI:53305', 'MESH:C033342'

In [12]:
intersection2

[('CHEBI:CHEBI:104011', 'CHEBI:CHEBI:104011', 101),
 ('CHEBI:CHEBI:104011', 'CHEBI:CHEBI:33848', 2),
 ('CHEBI:CHEBI:104011', 'CHEBI:CHEBI:53305', 0),
 ('CHEBI:CHEBI:104011', 'HGNC:8582', 0),
 ('CHEBI:CHEBI:104011', 'IP:IPR003822', 0),
 ('CHEBI:CHEBI:104011', 'MESH:C033342', 0),
 ('CHEBI:CHEBI:104011', 'MESH:D000081029', 0),
 ('CHEBI:CHEBI:104011', 'MESH:D046110', 0),
 ('HGNC:8582', 'CHEBI:CHEBI:104011', 0),
 ('HGNC:8582', 'CHEBI:CHEBI:33848', 0),
 ('HGNC:8582', 'CHEBI:CHEBI:53305', 0),
 ('HGNC:8582', 'HGNC:8582', 45),
 ('HGNC:8582', 'IP:IPR003822', 0),
 ('HGNC:8582', 'MESH:C033342', 0),
 ('HGNC:8582', 'MESH:D000081029', 0),
 ('HGNC:8582', 'MESH:D046110', 0),
 ('CHEBI:CHEBI:33848', 'CHEBI:CHEBI:104011', 0),
 ('CHEBI:CHEBI:33848', 'CHEBI:CHEBI:33848', 541),
 ('CHEBI:CHEBI:33848', 'CHEBI:CHEBI:53305', 0),
 ('CHEBI:CHEBI:33848', 'HGNC:8582', 0),
 ('CHEBI:CHEBI:33848', 'IP:IPR003822', 0),
 ('CHEBI:CHEBI:33848', 'MESH:C033342', 0),
 ('CHEBI:CHEBI:33848', 'MESH:D000081029', 0),
 ('CHEBI:CHEBI

In [15]:
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys())
    if len(new_pmids) > 10000:
        new_pmids = random.choices(new_pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(new_pmids)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [16]:
%%capture

classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
param_grid = {'C': [100.0], 'max_features': [10000]}
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-09-17 11:53:06] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-09-17 11:57:39] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9856359225261817 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [17]:
classifier.stats

{'label_distribution': {'CHEBI:CHEBI:104011': 1839,
  'HGNC:8582': 1364,
  'CHEBI:CHEBI:33848': 10729,
  'MESH:D000081029': 1585,
  'ungrounded': 8,
  'IP:IPR003822': 4,
  'CHEBI:CHEBI:53305': 55,
  'MESH:C033342': 4,
  'MESH:D046110': 2},
 'f1': {'mean': 0.985636, 'std': 0.002058},
 'precision': {'mean': 0.985679, 'std': 0.001841},
 'recall': {'mean': 0.986459, 'std': 0.001774},
 'HGNC:8582': {'f1': {'mean': 0.976118, 'std': 0.008131},
  'pr': {'mean': 0.961148, 'std': 0.015462},
  'rc': {'mean': 0.991713, 'std': 0.004956}},
 'CHEBI:CHEBI:53305': {'f1': {'mean': 0.617328, 'std': 0.184484},
  'pr': {'mean': 0.490909, 'std': 0.195824},
  'rc': {'mean': 0.946429, 'std': 0.065854}},
 'CHEBI:CHEBI:33848': {'f1': {'mean': 0.99027, 'std': 0.00129},
  'pr': {'mean': 0.995992, 'std': 0.001126},
  'rc': {'mean': 0.984614, 'std': 0.001649}},
 'IP:IPR003822': {'f1': {'mean': 0.333333, 'std': 0.421637},
  'pr': {'mean': 0.4, 'std': 0.489898},
  'rc': {'mean': 0.3, 'std': 0.4}},
 'CHEBI:CHEBI:10401

In [18]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [20]:
model_to_s3(disamb)

In [19]:
print(disamb.info())

Disambiguation model for PAH, and PAHs

Produces the disambiguations:
	Hypertension, Pregnancy-Induced*	MESH:D046110
	PAH*	HGNC:8582
	Paired amphipathic helix*	IP:IPR003822
	Pulmonary Arterial Hypertension*	MESH:D000081029
	p-aminohippuric acid*	CHEBI:CHEBI:104011
	perillaldehyde*	MESH:C033342
	poly(allylamine hydrochloride)*	CHEBI:CHEBI:53305
	polycyclic arene*	CHEBI:CHEBI:33848

Class level metrics:
--------------------
Grounding                      	Count	F1     
               polycyclic arene*	10729	0.99027
           p-aminohippuric acid*	 1839	0.98528
Pulmonary Arterial Hypertension*	 1585	0.98082
                            PAH*	 1364	0.97612
 poly(allylamine hydrochloride)*	   55	0.61733
                     Ungrounded	    8	0.33333
       Paired amphipathic helix*	    4	0.33333
                 perillaldehyde*	    4	    0.0
Hypertension, Pregnancy-Induced*	    2	    0.0

Weighted Metrics:
-----------------
	F1 score:	0.98564
	Precision:	0.98568
	Recall:		0.98646

* Positive 