In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used further down (adeft_grounder.ground) to propose an initial
# grounding and name for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) disambiguated by this model.
shortforms = ['MR']
# Model name: the filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of this model's results directory (../../results/<model_name>).
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [4]:
# Mine candidate longforms for every shortform from texts that mention it.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Cap the corpus at 10000 *distinct* articles. random.sample draws
        # without replacement, whereas random.choices can return the same
        # PMID several times and silently shrink the effective sample.
        # NOTE(review): the draw is unseeded, so the subsample differs
        # between runs.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Discard entries whose text is five characters or shorter.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Keep only longforms whose count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longform_dict[shortform] = [
        (longform, count, score)
        for longform, count, score in miners[shortform].get_longforms()
        if count*score > 2
    ]

# Merge the per-shortform counts into a single Counter keyed by longform.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, score
                               in longform_rows})

# Seed the grounding map and name map with the grounder's first suggestion for
# each longform; longforms with no suggestion are left out of grounding_map.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Longforms and their counts, most frequent first. Unpacking zip(*[]) raises
# ValueError, so fall back to empty tuples when nothing passed the threshold.
if combined_longforms:
    longforms, counts = zip(*combined_longforms.most_common())
else:
    longforms, counts = (), ()
pos_labels = []

In [5]:
# Show the mined longforms paired with their counts, most frequent first.
list(zip(longforms, counts))

[('mineralocorticoid receptor', 1445),
 ('magnetic resonance', 1123),
 ('mitral regurgitation', 432),
 ('mannose receptor', 336),
 ('methionine restriction', 104),
 ('mineralocorticoid', 97),
 ('magnetoresistance', 50),
 ('milk replacer', 47),
 ('mendelian randomization', 43),
 ('mental retardation', 39),
 ('metabolic rate', 32),
 ('muscarinic receptor', 24),
 ('mre11 rad50', 21),
 ('modified release', 20),
 ('mitral valve regurgitation', 17),
 ('medial rectus', 15),
 ('maillard reaction', 14),
 ('membrane raft', 14),
 ('median raphe', 14),
 ('mitochondria rich', 11),
 ('metabolic ratio', 11),
 ('mineralcorticoid receptor', 11),
 ('mortality rate', 10),
 ('median raphe nucleus', 10),
 ('myogenic response', 9),
 ('meal replacer', 9),
 ('mendelian randomisation', 8),
 ('midbrain raphe', 8),
 ('middle region', 7),
 ('mitotic recombinant', 7),
 ('master regulation', 7),
 ('medical reconciliation', 7),
 ('metastatic relapse', 6),
 ('measles rubella', 6),
 ('mc receptor', 6),
 ('mixed realit

In [6]:
# Hand the automatic groundings to the adeft grounding GUI for manual review.
# The call returns the updated grounding map, name map and positive labels,
# which replace the initial values. The GUI is served on port 8890 with
# no_browser=True.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

INFO: [2020-10-03 02:12:14] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [7]:
# Bundle the curated grounding map, name map and positive labels into one list.
result = [grounding_map, names, pos_labels]

In [8]:
# Echo the curated result; the following cell hard-codes this same literal.
result

[{'magnetic reduction': 'ungrounded',
  'magnetic resonance': 'MESH:D008279',
  'magnetic resonance imaging': 'MESH:D008279',
  'magneto resistant': 'ungrounded',
  'magnetoresistance': 'ungrounded',
  'magnetorheological': 'ungrounded',
  'maillard reaction': 'MESH:D015416',
  'maintenance rituximab': 'CHEBI:CHEBI:64357',
  'malarial retinopathy': 'ungrounded',
  'mammary rudiment': 'ungrounded',
  'mandelate racemase': 'ungrounded',
  'manikya rasa': 'ungrounded',
  'mannose receptor': 'FPLX:MRC',
  'mannose resistant': 'ungrounded',
  'marital relationship': 'ungrounded',
  'master regulation': 'ungrounded',
  'maximal respiration': 'ungrounded',
  'mc receptor': 'HGNC:7979',
  'meal replacer': 'ungrounded',
  'mean survival time ratio': 'ungrounded',
  'measles rubella': 'ungrounded',
  'medial rectus': 'ungrounded',
  'median raphe': 'ungrounded',
  'median raphe nucleus': 'MESH:D011903',
  'medical reconciliation': 'ungrounded',
  'medical records': 'MESH:D008499',
  'membrane ra

In [9]:
# Hard-coded copy of the curated result: [grounding_map, names, pos_labels].
#   grounding_map: longform -> grounding ('ungrounded' or a 'NAMESPACE:ID' string)
#   names:         grounding -> display name
#   pos_labels:    groundings passed to the classifier as pos_labels
# Presumably pasted from the GUI output so later cells can be rerun without
# repeating the manual curation step - confirm before editing by hand.
grounding_map, names, pos_labels = [{'magnetic reduction': 'ungrounded',
  'magnetic resonance': 'MESH:D008279',
  'magnetic resonance imaging': 'MESH:D008279',
  'magneto resistant': 'ungrounded',
  'magnetoresistance': 'ungrounded',
  'magnetorheological': 'ungrounded',
  'maillard reaction': 'MESH:D015416',
  'maintenance rituximab': 'CHEBI:CHEBI:64357',
  'malarial retinopathy': 'ungrounded',
  'mammary rudiment': 'ungrounded',
  'mandelate racemase': 'ungrounded',
  'manikya rasa': 'ungrounded',
  'mannose receptor': 'FPLX:MRC',
  'mannose resistant': 'ungrounded',
  'marital relationship': 'ungrounded',
  'master regulation': 'ungrounded',
  'maximal respiration': 'ungrounded',
  'mc receptor': 'HGNC:7979',
  'meal replacer': 'ungrounded',
  'mean survival time ratio': 'ungrounded',
  'measles rubella': 'ungrounded',
  'medial rectus': 'ungrounded',
  'median raphe': 'ungrounded',
  'median raphe nucleus': 'MESH:D011903',
  'medical reconciliation': 'ungrounded',
  'medical records': 'MESH:D008499',
  'membrane raft': 'GO:GO:0045121',
  'mendelian randomisation': 'MESH:D057182',
  'mendelian randomization': 'MESH:D057182',
  'mental retardation': 'MESH:D008607',
  'mesenteric': 'ungrounded',
  'met restriction': 'ungrounded',
  'metabolic rate': 'ungrounded',
  'metabolic ratio': 'ungrounded',
  'metastatic relapse': 'ungrounded',
  'methicillin resistant': 'ungrounded',
  'methionine restriction': 'NCIT:C154555',
  'methyl red': 'CHEBI:CHEBI:49770',
  'midbody ring': 'GO:GO:0090543',
  'midbrain raphe': 'MESH:D066267',
  'middle region': 'ungrounded',
  'milk replacer': 'ungrounded',
  'mineralcorticoid': 'HGNC:7979',
  'mineralcorticoid receptor': 'HGNC:7979',
  'mineralocorticoid': 'HGNC:7979',
  'mineralocorticoid receptor': 'HGNC:7979',
  'mineralocorticosteroid receptor': 'HGNC:7979',
  'minireplicon': 'ungrounded',
  'minor response': 'ungrounded',
  'miroestrol': 'CHEBI:CHEBI:6949',
  'mitochondria rich': 'ungrounded',
  'mitochondrial restoration': 'ungrounded',
  'mitotic rate': 'ungrounded',
  'mitotic recombinant': 'ungrounded',
  'mitoxantrone': 'CHEBI:CHEBI:50729',
  'mitral regurgitation': 'MESH:D008944',
  'mitral valve regurgitation': 'MESH:D008944',
  'mixed reality': 'MESH:D000081411',
  'mixis rate': 'ungrounded',
  'mobile relay': 'ungrounded',
  'modified release': 'ungrounded',
  'molar refractivity': 'ungrounded',
  'molecular remission': 'ungrounded',
  'molecular replacer': 'ungrounded',
  'morphinone reductase': 'ungrounded',
  'morphological response': 'ungrounded',
  'mortality rate': 'MESH:D009026',
  'mother reared': 'ungrounded',
  'motor resonance': 'ungrounded',
  'mre11 rad50': 'FPLX:MRN_complex',
  'multiple regression': 'ungrounded',
  'muscarinic receptor': 'FPLX:CHRM',
  'myogenic response': 'ungrounded',
  'myricetin 3 rhamnoside': 'ungrounded',
  'resistant tme': 'ungrounded'},
 {'MESH:D008279': 'Magnetic Resonance Imaging',
  'MESH:D015416': 'Maillard Reaction',
  'CHEBI:CHEBI:64357': 'rituximab',
  'FPLX:MRC': 'MRC',
  'HGNC:7979': 'NR3C2',
  'MESH:D011903': 'Raphe Nuclei',
  'MESH:D008499': 'Medical Records',
  'GO:GO:0045121': 'membrane raft',
  'MESH:D057182': 'Mendelian Randomization Analysis',
  'MESH:D008607': 'Intellectual Disability',
  'NCIT:C154555': 'Methionine-Restricted Diet',
  'CHEBI:CHEBI:49770': 'methyl red',
  'GO:GO:0090543': 'Flemming body',
  'MESH:D066267': 'Midbrain Raphe Nuclei',
  'CHEBI:CHEBI:6949': 'Miroestrol',
  'CHEBI:CHEBI:50729': 'mitoxantrone',
  'MESH:D008944': 'Mitral Valve Insufficiency',
  'MESH:D000081411': 'Augmented Reality',
  'MESH:D009026': 'Mortality',
  'FPLX:MRN_complex': 'MRN_complex',
  'FPLX:CHRM': 'CHRM'},
 ['FPLX:CHRM',
  'FPLX:MRC',
  'FPLX:MRN_complex',
  'HGNC:7979',
  'MESH:D008279',
  'MESH:D008607',
  'MESH:D008944',
  'MESH:D057182',
  'NCIT:C154555']]

In [10]:
# Longforms to leave out of the grounding dictionary built in the next cell;
# none are excluded here.
excluded_longforms = []

In [11]:
# For each shortform, map its mined longforms to their curated grounding,
# skipping longforms that have no entry in grounding_map or were excluded.
grounding_dict = {shortform: {longform: grounding_map[longform]
                              for longform, _, _ in longform_rows
                              if longform in grounding_map
                              and longform not in excluded_longforms}
                  for shortform, longform_rows in longform_dict.items()}
result = [grounding_dict, names, pos_labels]

# makedirs also creates missing parent directories (os.mkdir would fail when
# the parent 'results' directory does not exist yet), and exist_ok=True avoids
# the race between a separate existence check and the creation.
os.makedirs(results_path, exist_ok=True)
# Persist the preliminary grounding info as JSON next to the model output.
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [12]:
# Extra entities to gather training texts for. Keys are 'NAMESPACE:ID' strings:
# each key is split on its first ':' and passed to get_pmids_for_entity below.
# Empty for this model.
additional_entities = {}

In [13]:
# Maps an entity label to a collection of agent texts; further down, texts
# found for those agent texts are added to the corpus under that label.
# Empty for this model.
unambiguous_agent_texts = {}

In [14]:
# Label every collected text via the grounding dictionary; the labeler is fed
# (text, pmid) pairs and the corpus is iterated as (text, label, id) triples.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (plaintext, text_id) for text_id, plaintext in all_texts.items()
)

# Collect, for each label, the ids of the texts that received it.
agent_text_pmid_map = defaultdict(list)
for text, label, id_ in corpus:
    agent_text_pmid_map[label].append(id_)

# For each additional entity, the set of PMIDs returned for it with
# major_topic=True; the key is split on its first ':' into the two leading
# positional arguments of get_pmids_for_entity.
entity_pmid_map = {
    entity: set(get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True))
    for entity in additional_entities
}

In [15]:
# Size of the PMID overlap for every ordered pair of additional entities
# (each entity is also paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [16]:
# Size of the PMID overlap between each corpus label and each additional entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [17]:
# Inspect the entity-vs-entity PMID overlaps (empty when there are no additional entities).
intersection1

[]

In [18]:
# Inspect the label-vs-entity PMID overlaps (empty when there are no additional entities).
intersection2

[]

In [20]:
# Extend the corpus with texts for agent texts that name an entity
# unambiguously. Within one entity, a PMID is fetched at most once, and PMIDs
# already present in all_texts are skipped.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts for the additional entities, skipping PMIDs that
# are already in all_texts or were used in the loop above.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample without replacement so the cap yields 10000 distinct PMIDs;
        # random.choices samples with replacement and can repeat PMIDs.
        new_pmids = random.sample(new_pmids, k=10000)
    # NOTE(review): this 'contains' list is hard-coded and does not depend on
    # `entity`; the terms look unrelated to the 'MR' shortform and were
    # possibly carried over from another model's notebook. Confirm the intended
    # terms before populating additional_entities.
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=['RTCA', 'RTCD1', 'RPC', 'RTC1', 'RTC'])
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [None]:
# Merge the additional entities into the grounding -> name map. The original
# line referenced the misspelled name `additional_entitie`, which raises
# NameError.
names.update(additional_entities)

In [19]:
%%capture

# Classifier for the shortforms, constructed with the curated positive labels
# and a fixed random_state.
classifier = AdeftClassifier(
    shortforms,
    pos_labels=pos_labels,
    random_state=1729,
)
# Single-point grid: one value each for C and max_features.
param_grid = dict(C=[100.0], max_features=[10000])
# Split the (text, label, pmid) corpus triples into parallel tuples.
texts, labels, pmids = zip(*corpus)
# Run classifier.cv over param_grid with cv=5 and n_jobs=5.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-03 02:39:14] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-03 02:40:44] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.976864242773471 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [20]:
# Show the statistics recorded on the classifier: label distribution plus
# overall and per-label f1 / precision / recall.
classifier.stats

{'label_distribution': {'ungrounded': 243,
  'HGNC:7979': 1231,
  'MESH:D008279': 980,
  'FPLX:CHRM': 21,
  'MESH:D008944': 320,
  'MESH:D011903': 9,
  'MESH:D008607': 34,
  'MESH:D066267': 5,
  'MESH:D009026': 7,
  'FPLX:MRC': 252,
  'MESH:D015416': 11,
  'NCIT:C154555': 65,
  'FPLX:MRN_complex': 14,
  'GO:GO:0045121': 12,
  'CHEBI:CHEBI:64357': 4,
  'CHEBI:CHEBI:50729': 3,
  'MESH:D008499': 2,
  'CHEBI:CHEBI:49770': 5,
  'CHEBI:CHEBI:6949': 4,
  'MESH:D057182': 34,
  'GO:GO:0090543': 2,
  'MESH:D000081411': 4},
 'f1': {'mean': 0.976864, 'std': 0.004349},
 'precision': {'mean': 0.972227, 'std': 0.003904},
 'recall': {'mean': 0.983057, 'std': 0.003554},
 'CHEBI:CHEBI:49770': {'f1': {'mean': 0.0, 'std': 0.0},
  'pr': {'mean': 0.0, 'std': 0.0},
  'rc': {'mean': 0.0, 'std': 0.0}},
 'MESH:D008944': {'f1': {'mean': 0.989098, 'std': 0.006177},
  'pr': {'mean': 0.990625, 'std': 0.007655},
  'rc': {'mean': 0.987692, 'std': 0.011513}},
 'MESH:D057182': {'f1': {'mean': 1.0, 'std': 0.0},
  'pr': 

In [21]:
# Combine the trained classifier, the grounding dictionary and the
# grounding -> name map into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [22]:
# Dump the disambiguator, passing the model name and the results directory.
# NOTE(review): presumably writes the model files under results_path - confirm
# the exact layout against the adeft documentation.
disamb.dump(model_name, results_path)

In [23]:
# Print the model summary: the disambiguations it produces and class-level metrics.
print(disamb.info())

Disambiguation model for MR

Produces the disambiguations:
	Augmented Reality	MESH:D000081411
	CHRM*	FPLX:CHRM
	Flemming body	GO:GO:0090543
	Intellectual Disability*	MESH:D008607
	MRC*	FPLX:MRC
	MRN_complex*	FPLX:MRN_complex
	Magnetic Resonance Imaging*	MESH:D008279
	Maillard Reaction	MESH:D015416
	Medical Records	MESH:D008499
	Mendelian Randomization Analysis*	MESH:D057182
	Methionine-Restricted Diet*	NCIT:C154555
	Midbrain Raphe Nuclei	MESH:D066267
	Miroestrol	CHEBI:CHEBI:6949
	Mitral Valve Insufficiency*	MESH:D008944
	Mortality	MESH:D009026
	NR3C2*	HGNC:7979
	Raphe Nuclei	MESH:D011903
	membrane raft	GO:GO:0045121
	methyl red	CHEBI:CHEBI:49770
	mitoxantrone	CHEBI:CHEBI:50729
	rituximab	CHEBI:CHEBI:64357

Class level metrics:
--------------------
Grounding                       	Count	F1     
                           NR3C2*	1231	0.98114
      Magnetic Resonance Imaging*	 980	0.98679
      Mitral Valve Insufficiency*	 320	 0.9891
                             MRC*	 252	0.96481
       

In [29]:
# model_to_s3(disamb)