In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator, load_disambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder instance used further down to propose an initial grounding
# (via its .ground() method) for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) covered by this model.
shortforms = ['RAS']

# Model name: the filename-escaped shortforms, sorted and joined with ':'.
escaped_shortforms = sorted(map(escape_filename, shortforms))
model_name = ':'.join(escaped_shortforms)

# Absolute output directory for this model (../../results/<model_name>,
# resolved against the current working directory).
relative_results_dir = os.path.join('../..', 'results', model_name)
results_path = os.path.abspath(relative_results_dir)

In [None]:
# Mine candidate longforms for each shortform, then pre-populate groundings
# for the interactive curation step.
#
# Produces: miners, all_texts, longform_dict, grounding_map, names,
# longforms, counts, pos_labels.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Cap the corpus at 10000 *distinct* PMIDs. random.sample draws without
        # replacement; random.choices draws with replacement and would return
        # duplicate PMIDs, silently shrinking the set of texts fetched.
        pmids = random.sample(pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop (near-)empty texts.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Keep only the longforms whose count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longforms = miners[shortform].get_longforms()
    longforms = [(longform, count, score) for longform, count, score in longforms
                 if count*score > 2]
    longform_dict[shortform] = longforms

# Sum the counts of each longform across all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Seed the grounding map and names with the first grounding returned for each
# longform; longforms with no grounding are left out.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Parallel tuples of longforms and counts, most frequent first.
# zip(*[]) cannot be unpacked into two names, so handle the case where no
# longform passed the filter explicitly.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()
pos_labels = []

In [None]:
# Display the mined longforms paired with their counts, most frequent first.
list(zip(longforms, counts))

In [6]:
# Run the grounding GUI on the mined longforms, starting from the
# automatically proposed groundings; its return value replaces
# grounding_map, names and pos_labels.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

INFO: [2020-09-25 14:47:31] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [7]:
# Bundle the curated grounding map, names and positive labels for display.
result = [grounding_map, names, pos_labels]

In [8]:
# Display the curated [grounding_map, names, pos_labels] bundle.
result

[{'benzaldehyde lyase': 'MESH:C059416',
  'betaine aldehyde': 'CHEBI:CHEBI:15710',
  'bile salt activity lipase': 'HGNC:1848',
  'bioartificial liver': 'MESH:D019164',
  'blood alcohol levels': 'ungrounded',
  'breath alcohol levels': 'ungrounded',
  'british anti lewisite': 'CHEBI:CHEBI:64198',
  'brochoalveolar lavage': 'MESH:D018893',
  'bronchalveolar lavage': 'MESH:D018893',
  'bronchial alveolar lavage': 'MESH:D018893',
  'bronchial lavage': 'MESH:D018893',
  'bronchio alveolar lavage': 'MESH:D018893',
  'bronchiolar lavage': 'MESH:D018893',
  'broncho alveolar lavage': 'MESH:D018893',
  'bronchoalveolar': 'MESH:D018893',
  'bronchoalveolar fluid': 'MESH:D018893',
  'bronchoalveolar larvage': 'MESH:D018893',
  'bronchoalveolar lavage': 'MESH:D018893'},
 {'MESH:C059416': 'benzaldehyde lyase',
  'CHEBI:CHEBI:15710': 'betaine aldehyde',
  'HGNC:1848': 'CEL',
  'MESH:D019164': 'Liver, Artificial',
  'CHEBI:CHEBI:64198': 'dimercaprol',
  'MESH:D018893': 'Bronchoalveolar Lavage'},
 ['H

In [None]:
# Hard-coded curation result: (longform -> grounding, grounding -> name,
# positive labels). Running this cell overrides whatever the interactive
# grounding cell above returned.
# NOTE(review): presumably pasted from the GUI output so the notebook can be
# re-run without the interactive step -- confirm.
grounding_map, names, pos_labels = ({'radial artery spasm': 'ungrounded',
  'radiation attenuated plasmodium sporozoites': 'MESH:D034101',
  'radiation attenuated sporozoites': 'MESH:D034101',
  'radix angelica sinensis': 'MESH:D029971',
  'ras': 'FPLX:RAS',
  'ras alone': 'FPLX:RAS',
  'ras g12v': 'FPLX:RAS',
  'rasv12': 'FPLX:RAS',
  'rat sarcoma': 'FPLX:RAS',
  'rat sarcoma oncogene': 'FPLX:RAS',
  'rat sarcoma viral oncogene': 'FPLX:RAS',
  'rat sarcoma virus': 'FPLX:RAS',
  'recirculating aquaculture system': 'ungrounded',
  'recurrent aphthous stomatitis': 'MESH:D013281',
  'reflectance anisotropy spectroscopy': 'ungrounded',
  'regulatory activation score': 'ungrounded',
  'related allele signaling': 'ungrounded',
  'renal artery stenosis': 'ungrounded',
  'renin ang system': 'MESH:D012084',
  'renin angiotensin': 'MESH:D012084',
  'renin angiotensin aldosterone': 'MESH:D012084',
  'renin angiotensin aldosterone system': 'MESH:D012084',
  'renin angiotensin ang system': 'MESH:D012084',
  'renin angiotensin system': 'MESH:D012084',
  'renin angiotensin system inhibitors': 'ungrounded',
  'renin angiotensinsystem': 'MESH:D012084',
  'renin − angiotensin system': 'MESH:D012084',
  'reninangiotensin system': 'MESH:D012084',
  'rennin angiotensin system': 'MESH:D012084',
  'resistance associated substitution': 'ungrounded',
  'restrictive allograft syndrome': 'ungrounded',
  'reticular activation system': 'ungrounded',
  'retinoic acid syndrome': 'ungrounded',
  'retroviruses associated dna sequences': 'ungrounded',
  'rhythmic auditory stimulation': 'MESH:D000161',
  'ribi adjuvant system': 'ungrounded',
  'robot assisted surgery': 'ungrounded'},
 {'MESH:D034101': 'Sporozoites',
  'MESH:D029971': 'Angelica sinensis',
  'FPLX:RAS': 'RAS',
  'MESH:D013281': 'Stomatitis, Aphthous',
  'MESH:D012084': 'Renin-Angiotensin System',
  'MESH:D000161': 'Acoustic Stimulation'},
 ['FPLX:RAS',
  'MESH:D000161',
  'MESH:D012084',
  'MESH:D013281',
  'MESH:D029971',
  'MESH:D034101'])

In [None]:
# Longforms listed here are left out of grounding_dict when it is built below.
excluded_longforms = []

In [None]:
# For each shortform, keep the curated grounding of every mined longform that
# has an entry in grounding_map and is not explicitly excluded.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in longform_rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) also creates a missing parent 'results' directory
# and avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(results_path, exist_ok=True)

# Save the preliminary grounding information next to the model files.
info_path = os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json')
with open(info_path, 'w') as f:
    json.dump(result, f)

In [None]:
# Extra entities to add as labels. Keys are 'NAMESPACE:ID' strings (split on
# the first ':' when looking up PMIDs); later cells read each value as a pair
# whose first item is the entity name and whose second item is passed as
# `contains` when fetching texts.
# NOTE(review): the saved outputs further down mention 'HGNC:13974', which an
# empty dict cannot produce -- the outputs appear to come from a run where
# this was populated. Confirm the intended value before re-running.
additional_entities = {}

In [None]:
# Entities with extra agent texts used to pull more training texts. Keys are
# the entity labels; later cells read value[0] as the entity name and iterate
# value[1] as the agent texts to look up.
unambiguous_agent_texts = {}

In [None]:
# Build the labeled corpus of (text, label, id) triples from the mined texts.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the ids of the labeled texts by the label they received.
agent_text_pmid_map = defaultdict(list)
for _text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# PMIDs returned by get_pmids_for_entity(..., major_topic=True) for each
# additional entity; the key is split on its first ':' to form the
# positional arguments.
entity_pmid_map = {}
for entity in additional_entities:
    entity_pmids = get_pmids_for_entity(*entity.split(':', maxsplit=1),
                                        major_topic=True)
    entity_pmid_map[entity] = set(entity_pmids)

In [19]:
# Size of the PMID overlap for every ordered pair of additional entities
# (including each entity paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [20]:
# Size of the PMID overlap between each label found in the mined corpus and
# each additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [21]:
# Display the entity-vs-entity PMID overlap counts.
intersection1

[('HGNC:13974', 'HGNC:13974', 74)]

In [22]:
# Display the corpus-label-vs-entity PMID overlap counts.
intersection2

[('MESH:D013281', 'HGNC:13974', 0),
 ('MESH:D012084', 'HGNC:13974', 0),
 ('ungrounded', 'HGNC:13974', 0),
 ('MESH:D029971', 'HGNC:13974', 0),
 ('MESH:D034101', 'HGNC:13974', 0),
 ('FPLX:RAS', 'HGNC:13974', 0),
 ('MESH:D000161', 'HGNC:13974', 0)]

In [23]:
# Extend the corpus with texts for unambiguous agent texts and for the
# additional entities, skipping PMIDs that are already in use.
#
# NOTE: this cell appends to `corpus` in place, so running it twice adds the
# same texts twice; re-run the cell that builds `corpus` first.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    # agent_texts is a pair: [0] is the entity name, [1] the agent texts.
    for agent_text in agent_texts[1]:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        # Pass only the agent texts as `contains` (not the whole
        # (name, texts) pair), mirroring how the additional_entities loop
        # below unpacks its `contains` value.
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts[1])
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Cap at 10000 distinct PMIDs. random.sample draws without
        # replacement; random.choices would return duplicate PMIDs.
        new_pmids = random.sample(new_pmids, k=10000)
    _, contains = additional_entities[entity]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [24]:
# Register the display name (first item of each value) of every extra entity,
# additional entities first, then unambiguous agent texts.
for extra_entities in (additional_entities, unambiguous_agent_texts):
    names.update({key: value[0] for key, value in extra_entities.items()})

# Every extra entity also becomes a positive label.
pos_labels = list(
    set(pos_labels)
    | additional_entities.keys()
    | unambiguous_agent_texts.keys()
)

In [None]:
%%capture

# Split the corpus triples into parallel tuples of texts, labels and ids.
texts, labels, pmids = zip(*corpus)

# Single-point grid: one value each for C and max_features.
param_grid = {'C': [100.0], 'max_features': [10000]}

# Run 5-fold cross-validation with 5 parallel jobs; random_state is fixed.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

In [26]:
# Display the classifier's stats attribute (in the saved output: label
# distribution plus overall and per-label precision / recall / F1).
classifier.stats

{'label_distribution': {'MESH:D013281': 40,
  'MESH:D012084': 1898,
  'ungrounded': 128,
  'MESH:D029971': 10,
  'MESH:D034101': 32,
  'FPLX:RAS': 62,
  'MESH:D000161': 4,
  'HGNC:13974': 11},
 'f1': {'mean': 0.972077, 'std': 0.009956},
 'precision': {'mean': 0.964013, 'std': 0.011602},
 'recall': {'mean': 0.982507, 'std': 0.00773},
 'MESH:D034101': {'f1': {'mean': 1.0, 'std': 0.0},
  'pr': {'mean': 1.0, 'std': 0.0},
  'rc': {'mean': 1.0, 'std': 0.0}},
 'ungrounded': {'f1': {'mean': 0.838516, 'std': 0.053776},
  'pr': {'mean': 0.742154, 'std': 0.075404},
  'rc': {'mean': 0.97, 'std': 0.04}},
 'MESH:D012084': {'f1': {'mean': 0.982399, 'std': 0.005503},
  'pr': {'mean': 0.997893, 'std': 0.00197},
  'rc': {'mean': 0.967429, 'std': 0.010168}},
 'FPLX:RAS': {'f1': {'mean': 0.85562, 'std': 0.093808},
  'pr': {'mean': 0.784615, 'std': 0.155683},
  'rc': {'mean': 0.964615, 'std': 0.043947}},
 'MESH:D013281': {'f1': {'mean': 0.900952, 'std': 0.057649},
  'pr': {'mean': 0.825, 'std': 0.1},
  'rc

In [27]:
# Combine the trained classifier with the grounding dictionary and the names
# into a single disambiguator object.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [28]:
# Dump the disambiguator under model_name into results_path.
# NOTE(review): presumably writes the model files to disk -- confirm format.
disamb.dump(model_name, results_path)

In [29]:
# Print the disambiguator's text summary (disambiguations and metrics).
print(disamb.info())

Disambiguation model for RAS

Produces the disambiguations:
	Acoustic Stimulation*	MESH:D000161
	Angelica sinensis*	MESH:D029971
	NELFE*	HGNC:13974
	RAS*	FPLX:RAS
	Renin-Angiotensin System*	MESH:D012084
	Sporozoites*	MESH:D034101
	Stomatitis, Aphthous*	MESH:D013281

Class level metrics:
--------------------
Grounding               	Count	F1     
Renin-Angiotensin System*	1898	 0.9824
              Ungrounded	 128	0.83852
                     RAS*	  62	0.85562
    Stomatitis, Aphthous*	  40	0.90095
             Sporozoites*	  32	    1.0
                   NELFE*	  11	    0.8
       Angelica sinensis*	  10	0.26667
    Acoustic Stimulation*	   4	    0.4

Weighted Metrics:
-----------------
	F1 score:	0.97208
	Precision:	0.96401
	Recall:		0.98251

* Positive labels
See Docstring for explanation



In [30]:
# Hand the finished disambiguator to model_to_s3.
# NOTE(review): presumably uploads the model to S3 and needs credentials -- confirm.
model_to_s3(disamb)