In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Instantiate the grounder once; it is reused later in the notebook to propose
# a grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [9]:
# Shortforms covered by this disambiguation model.
shortforms = ['PM', 'PMs']
# Model name: the filename-escaped shortforms, sorted and joined with ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of this model's output directory under ../../results.
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [10]:
# For each shortform, fetch texts that mention it and run an AdeftMiner over
# them.  `miners` maps shortform -> miner; `all_texts` maps PMID -> text across
# all shortforms.
miners = dict()
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    if len(pmids) > 10000:
        # Cap the workload by sampling WITHOUT replacement.  random.choices
        # draws with replacement, which produces duplicate PMIDs and hence
        # fewer than 10000 distinct texts once results are keyed by PMID.
        pmids = random.sample(list(pmids), k=10000)
    text_dict = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop texts of 5 characters or fewer.
    text_dict = {pmid: text for pmid, text in text_dict.items() if len(text) > 5}
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(text_dict.values())
    all_texts.update(text_dict)

# Per shortform, keep only the mined (longform, count, score) rows whose
# count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longforms = [
        (longform, count, score)
        for longform, count, score in miners[shortform].get_longforms()
        if count*score > 2
    ]
    longform_dict[shortform] = longforms

# Add up the counts of each longform across all shortforms.
combined_longforms = Counter()
for longform_rows in longform_dict.values():
    row_counts = dict(
        (longform, count) for longform, count, score in longform_rows
    )
    combined_longforms.update(row_counts)
# Propose an initial grounding for every mined longform.  `grounding_map` maps
# longform -> grounding and `names` maps grounding -> name; longforms for which
# the grounder returns nothing are left out of both.
grounding_map = {}
names = {}
for longform in combined_longforms:
    groundings = adeft_grounder.ground(longform)
    if groundings:
        # Only the first returned grounding is used.
        grounding = groundings[0]['grounding']
        grounding_map[longform] = grounding
        names[grounding] = groundings[0]['name']

# Unpacking zip(*[]) fails with an opaque "not enough values to unpack"
# ValueError; raise the same exception type with an actionable message.
if not combined_longforms:
    raise ValueError(
        f'No longforms were mined for shortforms {shortforms}; '
        'there is nothing to ground.'
    )
# Longforms and their combined counts, ordered from most to least frequent.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [11]:
# Display every mined longform with its combined count, most frequent first.
list(zip(longforms, counts))

[('particulate matter', 1565),
 ('plasma membrane', 1354),
 ('peritoneal macrophages', 99),
 ('prospective memory', 84),
 ('polymyositis', 64),
 ('powdery mildew', 58),
 ('peritrophic matrix', 53),
 ('pyridoxamine', 49),
 ('placental malaria', 39),
 ('papillary muscle', 31),
 ('peritrophic membrane', 31),
 ('peritoneal membrane', 25),
 ('pulmonary macrophages', 24),
 ('poor metabolizer', 22),
 ('phosphoramide mustard', 20),
 ('pacemaker', 19),
 ('pectoralis major', 18),
 ('purple membrane', 17),
 ('premutation', 16),
 ('pristimerin', 16),
 ('permethrin', 15),
 ('pm2 5', 15),
 ('peritoneal metastases', 12),
 ('peritoneal metastasis', 12),
 ('pneumococcal meningitis', 12),
 ('purmorphamine', 12),
 ('polygonum multiflorum', 11),
 ('particles matter', 11),
 ('pueraria mirifica', 10),
 ('particular matter', 10),
 ('puromycin', 10),
 ('premotor cortex', 10),
 ('protein malnutrition', 9),
 ('perfect match', 9),
 ('pig manure', 9),
 ('pomace', 9),
 ('phleomycin', 9),
 ('polygonum multiflorum t

In [12]:
# Hand the proposed groundings to the grounding GUI; its return values replace
# grounding_map, names and pos_labels.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8890,
)

INFO: [2020-09-25 13:31:22] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.2/bio_ontology.pkl


In [13]:
# Bundle the three outputs of the grounding step so they can be inspected together.
result = [grounding_map, names, pos_labels]

In [14]:
# Display [grounding_map, names, pos_labels] as returned by the grounding step.
result

[{'5p mimics': 'ungrounded',
  'matter air pollution': 'MESH:D052638',
  'p mirifica': 'ungrounded',
  'pacemaker': 'MESH:D010138',
  'palatal myoclonus': 'ungrounded',
  'palmultang': 'ungrounded',
  'papillary muscle': 'MESH:D010210',
  'paramagnetic': 'ungrounded',
  'paraxial mesoderm': 'MESH:D008648',
  'parental monitoring': 'ungrounded',
  'partial meniscectomy': 'ungrounded',
  'particles mass': 'ungrounded',
  'particles matter': 'MESH:D052638',
  'particular matter': 'MESH:D052638',
  'particulate matter': 'MESH:D052638',
  'passive movement': 'ungrounded',
  'pasteurella multocida': 'MESH:D016979',
  'pathological myopia': 'MESH:D047728',
  'pato de minas': 'ungrounded',
  'pearl millet': 'ungrounded',
  'peat moss': 'ungrounded',
  'pectoralis major': 'ungrounded',
  'pectoralis major muscle': 'ungrounded',
  'pectoralis minor': 'ungrounded',
  'penicillamine thiol maleimide': 'ungrounded',
  'penicillium marneffei': 'ungrounded',
  'perfect match': 'ungrounded',
  'pergoli

In [70]:
# Hard-coded grounding_map (longform -> grounding), names (grounding -> name)
# and pos_labels.  The beginning of this literal matches the displayed output
# of `result` above; presumably it was pasted from that output so the notebook
# can be re-run without the interactive GUI step -- confirm before editing.
grounding_map, names, pos_labels = [{'5p mimics': 'ungrounded',
  'matter air pollution': 'MESH:D052638',
  'p mirifica': 'ungrounded',
  'pacemaker': 'MESH:D010138',
  'palatal myoclonus': 'ungrounded',
  'palmultang': 'ungrounded',
  'papillary muscle': 'MESH:D010210',
  'paramagnetic': 'ungrounded',
  'paraxial mesoderm': 'MESH:D008648',
  'parental monitoring': 'ungrounded',
  'partial meniscectomy': 'ungrounded',
  'particles mass': 'ungrounded',
  'particles matter': 'MESH:D052638',
  'particular matter': 'MESH:D052638',
  'particulate matter': 'MESH:D052638',
  'passive movement': 'ungrounded',
  'pasteurella multocida': 'MESH:D016979',
  'pathological myopia': 'MESH:D047728',
  'pato de minas': 'ungrounded',
  'pearl millet': 'ungrounded',
  'peat moss': 'ungrounded',
  'pectoralis major': 'ungrounded',
  'pectoralis major muscle': 'ungrounded',
  'pectoralis minor': 'ungrounded',
  'penicillamine thiol maleimide': 'ungrounded',
  'penicillium marneffei': 'ungrounded',
  'perfect match': 'ungrounded',
  'pergolide mesylate': 'CHEBI:CHEBI:8021',
  'perinatal mortality': 'MESH:D054238',
  'peripheral monocytes': 'MESH:D009000',
  'peripheral mononuclear': 'ungrounded',
  'peritoneal macrophages': 'MESH:D017737',
  'peritoneal membrane': 'MESH:D010537',
  'peritoneal metastases': 'ungrounded',
  'peritoneal metastasis': 'ungrounded',
  'peritrophic matrix': 'peritrophic_matrix',
  'peritrophic membrane': 'peritrophic_matrix',
  'permanent magnet': 'MESH:D059346',
  'permethrin': 'CHEBI:CHEBI:34911',
  'peru margin': 'ungrounded',
  'pesticide mixture': 'ungrounded',
  'petit manseng': 'ungrounded',
  'petrosaspongiolide m': 'MESH:C118309',
  'pharyngeal mesoderm': 'ungrounded',
  'pharyngeal muscle': 'ungrounded',
  'phase module': 'ungrounded',
  'pheomelanin': 'MESH:C018362',
  'philip morris': 'ungrounded',
  'philip morris usa': 'ungrounded',
  'phleomycin': 'CHEBI:CHEBI:75044',
  'phonological memory': 'ungrounded',
  'phosphamide mustard': 'CHEBI:CHEBI:8163',
  'phosphomycoketide': 'ungrounded',
  'phosphoramide mustard': 'CHEBI:CHEBI:8163',
  'physical mixture': 'ungrounded',
  'pig manure': 'MESH:D008372',
  'pit mud': 'ungrounded',
  'placental macrophages': 'ungrounded',
  'placental malaria': 'MESH:D008288',
  'plantamajoside': 'CHEBI:CHEBI:8256',
  'plasma membrane': 'GO:GO:0005886',
  'plasma membrane enriched fractions': 'ungrounded',
  'plasma membrane fractions': 'ungrounded',
  'plasma membranes': 'GO:GO:0005886',
  'plasmalemma': 'GO:GO:0005886',
  'plasmamembrane': 'ungrounded',
  'plasmepsins': 'ungrounded',
  'platelet membrane': 'ungrounded',
  'pleural malignancy': 'ungrounded',
  'pm2 5': 'ungrounded',
  'pneumococcal meningitis': 'ungrounded',
  'pneumomediastinum': 'MESH:D008478',
  'poloxamer morphine': 'ungrounded',
  'polygonum multiflorum': 'MESH:D000068676',
  'polygonum multiflorum thunb': 'ungrounded',
  'polymannuronic acid': 'ungrounded',
  'polymeric micelles': 'ungrounded',
  'polymyositis': 'MESH:D017285',
  'polyphenol mixture': 'ungrounded',
  'pomace': 'ungrounded',
  'poor metabolizer': 'ungrounded',
  'population mixed': 'ungrounded',
  'pore module': 'ungrounded',
  'post mortem': 'ungrounded',
  'postmenopausal': 'EFO:0002721',
  'postmortem': 'ungrounded',
  'poultry manure': 'MESH:D008372',
  'powder metallurgy': 'ungrounded',
  'powdery mildew': 'ungrounded',
  'prechordal mesoderm': 'ungrounded',
  'precision medicine': 'ungrounded',
  'pregnancy morbidity': 'ungrounded',
  'premixed insulin': 'ungrounded',
  'premotor': 'ungrounded',
  'premotor cortex': 'ungrounded',
  'premutation': 'ungrounded',
  'primary memory': 'ungrounded',
  'primary microcephaly': 'DOID:DOID:0070297',
  'primary microglia': 'MESH:D017628',
  'primary microglial': 'ungrounded',
  'primary myoblasts': 'ungrounded',
  'pringle maneuver': 'ungrounded',
  'pristimerin': 'CHEBI:CHEBI:8416',
  'probiotics mixture': 'ungrounded',
  'progressive micrographia': 'HP:HP:0031908',
  'prometaphase': 'ungrounded',
  'promethazine': 'CHEBI:CHEBI:8461',
  'propamocarb': 'CHEBI:CHEBI:82033',
  'proposed method': 'ungrounded',
  'prospective memory': 'MESH:D061212',
  'protein malnutrition': 'MESH:D044342',
  'protein meal': 'ungrounded',
  'psychomotor': 'ungrounded',
  'pterocarpus marsupium': 'ungrounded',
  'pueraria mirifica': 'ungrounded',
  'pulmonary macrophages': 'MESH:D016676',
  'pupillary membrane': 'ungrounded',
  'purmorphamine': 'CHEBI:CHEBI:63053',
  'puromycin': 'CHEBI:CHEBI:17939',
  'purple membrane': 'MESH:D018717',
  'purulent meningitis': 'ungrounded',
  'pyrenyl maleimide': 'CHEBI:CHEBI:16072',
  'pyridoxamine': 'CHEBI:CHEBI:16410'},
 {'MESH:D052638': 'Particulate Matter',
  'MESH:D010138': 'Pacemaker, Artificial',
  'MESH:D010210': 'Papillary Muscles',
  'MESH:D008648': 'Mesoderm',
  'MESH:D016979': 'Pasteurella multocida',
  'MESH:D047728': 'Myopia, Degenerative',
  'CHEBI:CHEBI:8021': 'pergolide mesylate',
  'MESH:D054238': 'Perinatal Mortality',
  'MESH:D009000': 'Monocytes',
  'MESH:D017737': 'Macrophages, Peritoneal',
  'MESH:D010537': 'Peritoneum',
  'peritrophic_matrix': 'peritrophic_matrix',
  'MESH:D059346': 'Magnets',
  'CHEBI:CHEBI:34911': 'permethrin',
  'MESH:C118309': 'petrosaspongiolide m',
  'MESH:C018362': 'pheomelanin',
  'CHEBI:CHEBI:75044': 'phleomycin',
  'CHEBI:CHEBI:8163': 'phosphoramide mustard',
  'MESH:D008372': 'Manure',
  'MESH:D008288': 'Malaria',
  'CHEBI:CHEBI:8256': 'Plantamajoside',
  'GO:GO:0005886': 'plasma membrane',
  'MESH:D008478': 'Mediastinal Emphysema',
  'MESH:D000068676': 'Fallopia multiflora',
  'MESH:D017285': 'Polymyositis',
  'EFO:0002721': 'postmenopausal',
  'DOID:DOID:0070297': 'primary microcephaly',
  'MESH:D017628': 'Microglia',
  'CHEBI:CHEBI:8416': 'Pristimerin',
  'HP:HP:0031908': 'Micrographia',
  'CHEBI:CHEBI:8461': 'promethazine',
  'CHEBI:CHEBI:82033': 'propamocarb',
  'MESH:D061212': 'Memory, Episodic',
  'MESH:D044342': 'Malnutrition',
  'MESH:D016676': 'Macrophages, Alveolar',
  'CHEBI:CHEBI:63053': 'purmorphamine',
  'CHEBI:CHEBI:17939': 'puromycin',
  'MESH:D018717': 'Purple Membrane',
  'CHEBI:CHEBI:16072': 'maleimide',
  'CHEBI:CHEBI:16410': 'pyridoxamine'},
 ['CHEBI:CHEBI:16410',
  'CHEBI:CHEBI:8163',
  'GO:GO:0005886',
  'MESH:D061212',
  'MESH:D008288',
  'MESH:D017285',
  'MESH:D010210',
  'MESH:D010537',
  'MESH:D017737',
  'MESH:D018717',
  'MESH:D052638']]


In [34]:
# Longforms listed here are omitted when grounding_dict is built from grounding_map.
excluded_longforms = []

In [71]:
# Per shortform, map each of its mined longforms to its curated grounding,
# skipping longforms that have no entry in grounding_map or that are listed
# in excluded_longforms.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _, _ in longform_rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs(exist_ok=True) also creates a missing parent `results` directory
# (os.mkdir would raise FileNotFoundError) and avoids the check-then-create
# race of os.path.exists() followed by os.mkdir().
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [58]:
# Extra entities whose PMIDs are looked up via get_pmids_for_entity and whose
# entries are later merged into `names`.  Keys are split on the first ':' before
# the lookup, so they presumably look like 'NAMESPACE:ID' with the name as the
# value -- TODO confirm.
additional_entities = {}

In [59]:
# Maps an entity label to a collection of agent texts; texts fetched for those
# agent texts are later appended to the corpus labeled with that entity.
unambiguous_agent_texts = {}

In [72]:
# Build the labeled corpus from the collected texts; each corpus row unpacks
# as (text, label, pmid).
labeler = AdeftLabeler(grounding_dict)
text_pmid_pairs = ((text, pmid) for pmid, text in all_texts.items())
corpus = labeler.build_from_texts(text_pmid_pairs)

# Group the PMIDs of the labeled texts by the label they received.
agent_text_pmid_map = defaultdict(list)
for _text, label, pmid in corpus:
    agent_text_pmid_map[label].append(pmid)

# For every additional entity, the set of PMIDs returned by
# get_pmids_for_entity with major_topic=True.
entity_pmid_map = {
    entity: set(
        get_pmids_for_entity(*entity.split(':', maxsplit=1), major_topic=True)
    )
    for entity in additional_entities
}

In [22]:
# Size of the PMID overlap for every ordered pair of additional entities
# (each entity is also paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [23]:
# Size of the PMID overlap between every corpus label and every additional
# entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [24]:
# Display the (entity1, entity2, overlap size) triples among additional entities.
intersection1

[('HGNC:9337', 'HGNC:9337', 26)]

In [25]:
# Display the (corpus label, additional entity, overlap size) triples.
intersection2

[('MESH:D017737', 'HGNC:9337', 0),
 ('MESH:D018717', 'HGNC:9337', 0),
 ('GO:GO:0005886', 'HGNC:9337', 0),
 ('CHEBI:CHEBI:75044', 'HGNC:9337', 0),
 ('ungrounded', 'HGNC:9337', 0),
 ('MESH:D044342', 'HGNC:9337', 0),
 ('MESH:D010210', 'HGNC:9337', 0),
 ('CHEBI:CHEBI:17939', 'HGNC:9337', 0),
 ('MESH:D016676', 'HGNC:9337', 0),
 ('MESH:D017285', 'HGNC:9337', 0),
 ('CHEBI:CHEBI:8163', 'HGNC:9337', 0),
 ('MESH:D016979', 'HGNC:9337', 0),
 ('CHEBI:CHEBI:16410', 'HGNC:9337', 0),
 ('MESH:D010138', 'HGNC:9337', 0),
 ('CHEBI:CHEBI:8461', 'HGNC:9337', 0),
 ('CHEBI:CHEBI:34911', 'HGNC:9337', 0),
 ('MESH:D009000', 'HGNC:9337', 0),
 ('CHEBI:CHEBI:16072', 'HGNC:9337', 0),
 ('MESH:D052638', 'HGNC:9337', 0),
 ('peritrophic_matrix', 'HGNC:9337', 0),
 ('EFO:0002721', 'HGNC:9337', 0),
 ('MESH:C118309', 'HGNC:9337', 0),
 ('MESH:D008372', 'HGNC:9337', 0),
 ('CHEBI:CHEBI:8021', 'HGNC:9337', 0),
 ('MESH:D010537', 'HGNC:9337', 0),
 ('MESH:D061212', 'HGNC:9337', 0),
 ('MESH:D047728', 'HGNC:9337', 0),
 ('MESH:D00828

[('HGNC:7321', ['Musculin', 'musculin'])]

In [39]:
# Extend the corpus with texts for unambiguous agent texts, labeled with their
# entity.  PMIDs already in all_texts, or already used for the same entity,
# are skipped.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    used_pmids = set()
    for agent_text in agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts for the additional entities, skipping PMIDs that
# are already in all_texts or were used in the loop above.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample WITHOUT replacement: random.choices draws with replacement and
        # would return duplicate PMIDs, so fewer than 10000 distinct texts
        # would be fetched.
        new_pmids = random.sample(new_pmids, k=10000)
    text_dict = get_plaintexts_for_pmids(new_pmids)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items()])

In [40]:
# Merge the additional entities into `names` (mutates `names` in place).
names.update(additional_entities)

In [73]:
%%capture

# Cross-validate a classifier on the labeled corpus.  random_state is fixed at
# 1729.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
# The grid holds a single candidate for each hyperparameter.
param_grid = {'C': [100.0], 'max_features': [10000]}
# Each corpus row is a (text, label, pmid) triple; split the rows into columns.
texts, labels, pmids = zip(*corpus)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-09-25 14:31:51] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-09-25 14:34:37] /adeft/PythonRepos/adeft/adeft/modeling/classify.py - Best f1 score of 0.9518464825891503 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [42]:
# Display the classifier's stats (the visible output includes a label
# distribution and an f1 entry).
classifier.stats

{'label_distribution': {'MESH:D017737': 92,
  'MESH:D018717': 16,
  'GO:GO:0005886': 1025,
  'CHEBI:CHEBI:75044': 4,
  'ungrounded': 300,
  'MESH:D044342': 7,
  'MESH:D010210': 27,
  'CHEBI:CHEBI:17939': 8,
  'MESH:D016676': 14,
  'MESH:D017285': 50,
  'CHEBI:CHEBI:8163': 20,
  'MESH:D016979': 4,
  'CHEBI:CHEBI:16410': 34,
  'MESH:D010138': 17,
  'CHEBI:CHEBI:8461': 4,
  'CHEBI:CHEBI:34911': 13,
  'MESH:D009000': 5,
  'CHEBI:CHEBI:16072': 5,
  'MESH:D052638': 1175,
  'peritrophic_matrix': 57,
  'EFO:0002721': 4,
  'MESH:C118309': 3,
  'MESH:D008372': 11,
  'CHEBI:CHEBI:8021': 3,
  'MESH:D010537': 16,
  'MESH:D061212': 55,
  'MESH:D047728': 5,
  'MESH:D008288': 19,
  'CHEBI:CHEBI:63053': 6,
  'MESH:D008648': 4,
  'CHEBI:CHEBI:8416': 9,
  'DOID:DOID:0070297': 3,
  'MESH:D059346': 3,
  'MESH:D017628': 3,
  'MESH:D008478': 2,
  'MESH:C018362': 2,
  'MESH:D054238': 3,
  'CHEBI:CHEBI:8256': 2,
  'MESH:D000068676': 7,
  'CHEBI:CHEBI:82033': 3,
  'HP:HP:0031908': 1,
  'HGNC:9337': 26},
 'f1': 

In [74]:
# Combine the classifier with the grounding dictionary and the grounding -> name map.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [76]:
# Dump the disambiguator under model_name into results_path.
disamb.dump(model_name, results_path)

In [75]:
# Print the summary text returned by disamb.info().
print(disamb.info())

Disambiguation model for PM, and PMs

Produces the disambiguations:
	Fallopia multiflora	MESH:D000068676
	Macrophages, Alveolar	MESH:D016676
	Macrophages, Peritoneal*	MESH:D017737
	Magnets	MESH:D059346
	Malaria*	MESH:D008288
	Malnutrition	MESH:D044342
	Manure	MESH:D008372
	Mediastinal Emphysema	MESH:D008478
	Memory, Episodic*	MESH:D061212
	Mesoderm	MESH:D008648
	Microglia	MESH:D017628
	Micrographia	HP:HP:0031908
	Monocytes	MESH:D009000
	Myopia, Degenerative	MESH:D047728
	Pacemaker, Artificial	MESH:D010138
	Papillary Muscles*	MESH:D010210
	Particulate Matter*	MESH:D052638
	Pasteurella multocida	MESH:D016979
	Perinatal Mortality	MESH:D054238
	Peritoneum*	MESH:D010537
	Plantamajoside	CHEBI:CHEBI:8256
	Polymyositis*	MESH:D017285
	Pristimerin	CHEBI:CHEBI:8416
	Purple Membrane*	MESH:D018717
	maleimide	CHEBI:CHEBI:16072
	pergolide mesylate	CHEBI:CHEBI:8021
	peritrophic_matrix	peritrophic_matrix
	permethrin	CHEBI:CHEBI:34911
	petrosaspongiolide m	MESH:C118309
	pheomelanin	MESH:C018362
	phleomy

In [77]:
# Pass the disambiguator to model_to_s3 (adeft_indra.model_building.s3);
# presumably this uploads the model to S3 -- confirm destination and
# credentials before running.
model_to_s3(disamb)