In [1]:
import os
import json
import pickle
import random
from collections import defaultdict, Counter

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator, load_disambiguator


from adeft_indra.ground.ground import AdeftGrounder
from adeft_indra.model_building.s3 import model_to_s3
from adeft_indra.model_building.escape import escape_filename
from adeft_indra.db.content import get_pmids_for_agent_text, get_pmids_for_entity, \
    get_plaintexts_for_pmids

In [2]:
# Grounder used further down to propose a grounding for each mined longform.
adeft_grounder = AdeftGrounder()

In [3]:
# Shortform(s) handled by this model.
shortforms = ['PA']
# The model name is the sorted, filename-escaped shortforms joined by ':'.
model_name = ':'.join(sorted(map(escape_filename, shortforms)))
# Absolute path of the directory that receives this model's artefacts.
results_path = os.path.abspath(
    os.path.join('../..', 'results', model_name)
)

In [4]:
# Step 1: for every shortform, fetch the texts that mention it and feed them
# to a miner that proposes candidate longforms.
miners = {}
all_texts = {}
for shortform in shortforms:
    pmids = get_pmids_for_agent_text(shortform)
    fetched_texts = get_plaintexts_for_pmids(pmids, contains=shortforms)
    # Drop texts of five characters or fewer.
    text_dict = {pmid: text for pmid, text in fetched_texts.items()
                 if len(text) > 5}
    miner = AdeftMiner(shortform)
    miners[shortform] = miner
    miner.process_texts(text_dict.values())
    all_texts.update(text_dict)

# Step 2: keep only the longforms whose count * score exceeds 2.
longform_dict = {}
for shortform in shortforms:
    longform_dict[shortform] = [
        (longform, count, score)
        for longform, count, score in miners[shortform].get_longforms()
        if count*score > 2
    ]

# Step 3: pool the counts of the surviving longforms across all shortforms.
combined_longforms = Counter()
for rows in longform_dict.values():
    combined_longforms.update({longform: count for longform, count, _score in rows})

# Step 4: take the first grounding the grounder returns for each longform, if any.
grounding_map = {}
names = {}
for longform in combined_longforms:
    candidates = adeft_grounder.ground(longform)
    if not candidates:
        continue
    best = candidates[0]
    grounding = best['grounding']
    grounding_map[longform] = grounding
    names[grounding] = best['name']

# Parallel tuples of longforms and counts, most frequent first.
longforms, counts = zip(*combined_longforms.most_common())
pos_labels = []

In [5]:
# Display the mined longforms paired with their pooled counts.
list(zip(longforms, counts))

[('physical activity', 1877),
 ('phosphatidic acid', 1711),
 ('protective antigen', 531),
 ('pulmonary artery', 348),
 ('plasminogen activity', 328),
 ('palmitic', 309),
 ('pseudomonas aeruginosa', 276),
 ('photoacoustic', 243),
 ('primary aldosteronism', 123),
 ('positive affect', 99),
 ('proanthocyanidin', 87),
 ('platelet aggregation', 75),
 ('peptide amphiphile', 69),
 ('polyamine', 62),
 ('passive avoidance', 54),
 ('phosphatidic', 45),
 ('perinatal asphyxia', 39),
 ('patchouli alcohol', 35),
 ('plasma aldosteronism', 34),
 ('protein a', 32),
 ('phenylacetate', 32),
 ('polyamide', 30),
 ('puromycin aminonucleoside', 27),
 ('p aeruginosa', 24),
 ('pilocytic astrocytoma', 22),
 ('polyacrylamide', 22),
 ('procainamide', 22),
 ('acid protein', 22),
 ('phthalic anhydride', 21),
 ('pituitary adenoma', 21),
 ('posterior anterior', 20),
 ('pyrrolizidine alkaloids', 19),
 ('phonological awareness', 19),
 ('propionic acidemia', 19),
 ('power amplifier', 18),
 ('penicillamine', 18),
 ('pheny

In [6]:
# Reuse curation from a previously built model for the first shortform when
# one can be loaded. This stays best-effort: any failure leaves the automatic
# groundings as they are, but the reason is reported rather than hidden.
try:
    disamb = load_disambiguator(shortforms[0])
    # Groundings and names from the loaded model override the automatic ones.
    for previous_map in disamb.grounding_dict.values():
        grounding_map.update(previous_map)
    names.update(disamb.names)
    pos_labels = disamb.pos_labels
except Exception as err:
    print(f'No previous disambiguator reused: {err!r}')

In [7]:
# Display the grounding -> name mapping collected so far.
names

{'MESH:D015444': 'Exercise',
 'CHEBI:CHEBI:16337': 'phosphatidic acid',
 'MESH:D011651': 'Pulmonary Artery',
 'MESH:D011550': 'Pseudomonas aeruginosa',
 'MESH:D006929': 'Hyperaldosteronism',
 'MESH:D000339': 'Affect',
 'CHEBI:CHEBI:26267': 'proanthocyanidin',
 'GO:GO:0070527': 'platelet aggregation',
 'CHEBI:CHEBI:59941': 'amphiphile',
 'CHEBI:CHEBI:88061': 'polyamine',
 'MESH:D001238': 'Asphyxia Neonatorum',
 'CHEBI:CHEBI:7940': 'patchouli alcohol',
 'HGNC:23458': 'CALHM3',
 'CHEBI:CHEBI:18401': 'phenylacetate',
 'CHEBI:CHEBI:51953': 'polyamide',
 'CHEBI:CHEBI:42839': "3'-amino-3'-deoxy-N(6),N(6)-dimethyladenosine",
 'MESH:D001254': 'Astrocytoma',
 'MESH:C016679': 'polyacrylamide',
 'CHEBI:CHEBI:8428': 'procainamide',
 'CHEBI:CHEBI:36605': 'phthalic anhydride',
 'DOID:DOID:3829': 'pituitary adenoma',
 'CHEBI:CHEBI:64948': 'pyrrolizidine alkaloid',
 'MESH:D001364': 'Awareness',
 'MESH:D056693': 'Propionic Acidemia',
 'CHEBI:CHEBI:50868': 'penicillamine',
 'CHEBI:CHEBI:28044': 'phenylal

In [8]:
# Open the curation GUI seeded with the current groundings, names and
# positive labels; the three curated values it returns replace them.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    counts,
    grounding_map=grounding_map,
    names=names,
    pos_labels=pos_labels,
    no_browser=True,
    port=8891,
)

INFO: [2020-10-31 04:07:46] indra.ontology.bio.ontology - Loading INDRA bio ontology from cache at /home/ubuntu/.indra/bio_ontology/1.4/bio_ontology.pkl
  dump_cookie(
  dump_cookie(
  dump_cookie(
  dump_cookie(
  dump_cookie(
  dump_cookie(


In [9]:
# Collect the grounding map, names and positive labels into one list.
result = [grounding_map, names, pos_labels]

In [10]:
# Display the collected grounding information.
result

[{'acid protein': 'ungrounded',
  'alveolar pressure': 'MESH:D011312',
  'area production': 'ungrounded',
  'p aeruginosa': 'TAXONOMY:287',
  'pa': 'ungrounded',
  'pa alone': 'ungrounded',
  'pa rats treated with vehicle': 'ungrounded',
  'paclitaxel': 'CHEBI:CHEBI:45863',
  'paeoniflorin': 'CHEBI:CHEBI:7889',
  'paeonol': 'ungrounded',
  'paired': 'ungrounded',
  'paired associated': 'ungrounded',
  'palmitic': 'CHEBI:CHEBI:15756',
  'palmitic acid 16 0': 'ungrounded',
  'palmitic treated cells': 'MESH:D002477',
  'panama': 'MESH:D010176',
  'pancreas': 'MESH:D010179',
  'pancreatic': 'ungrounded',
  'pancreatic acinar': 'ungrounded',
  'pancreatic adenocarcinoma': 'HP:HP:0006725',
  'panduratin a': 'CHEBI:CHEBI:66725',
  'panic attacks': 'MESH:D016584',
  'para aortic': 'ungrounded',
  'paracetamol': 'CHEBI:CHEBI:46195',
  'paraformaldehyde': 'CHEBI:CHEBI:31962',
  'pararosaniline': 'CHEBI:CHEBI:87663',
  'parathion': 'CHEBI:CHEBI:27928',
  'parathyroid adenoma': 'MESH:D010282',
  '

In [12]:
# Curated grounding information written out as a literal, so running this cell
# assigns the same three values regardless of earlier interactive steps:
#   1. longform -> grounding ('ungrounded' where no grounding is assigned),
#   2. grounding -> name,
#   3. list of groundings assigned to pos_labels.
# Presumably a hand-edited copy of the displayed `result` — TODO confirm.
# NOTE(review): the names dict below contains MESH:D002477, MESH:D010176 and
# MESH:D010414, but no longform in the map is grounded to them — confirm
# whether those entries should be removed.
grounding_map, names, pos_labels = [{'acid protein': 'ungrounded',
  'alveolar pressure': 'MESH:D011312',
  'area production': 'ungrounded',
  'p aeruginosa': 'TAXONOMY:287',
  'pa': 'ungrounded',
  'pa alone': 'ungrounded',
  'pa rats treated with vehicle': 'ungrounded',
  'paclitaxel': 'CHEBI:CHEBI:45863',
  'paeoniflorin': 'CHEBI:CHEBI:7889',
  'paeonol': 'ungrounded',
  'paired': 'ungrounded',
  'paired associated': 'ungrounded',
  'palmitic': 'CHEBI:CHEBI:15756',
  'palmitic acid 16 0': 'CHEBI:CHEBI:15756',
  'palmitic treated cells': 'ungrounded',
  'panama': 'ungrounded',
  'pancreas': 'MESH:D010179',
  'pancreatic': 'ungrounded',
  'pancreatic acinar': 'ungrounded',
  'pancreatic adenocarcinoma': 'HP:HP:0006725',
  'panduratin a': 'CHEBI:CHEBI:66725',
  'panic attacks': 'MESH:D016584',
  'para aortic': 'ungrounded',
  'paracetamol': 'CHEBI:CHEBI:46195',
  'paraformaldehyde': 'CHEBI:CHEBI:31962',
  'pararosaniline': 'CHEBI:CHEBI:87663',
  'parathion': 'CHEBI:CHEBI:27928',
  'parathyroid adenoma': 'MESH:D010282',
  'parenchymal arterioles': 'ungrounded',
  'parthenogenetic activity': 'ungrounded',
  'parthenolide': 'CHEBI:CHEBI:7939',
  'particle associated': 'ungrounded',
  'particle attached': 'ungrounded',
  'parvalbumin': 'HGNC:9704',
  'passive avoidance': 'ungrounded',
  'patchouli alcohol': 'ungrounded',
  'pathological age': 'ungrounded',
  'pcaggs haul42wt wt haul42pa': 'ungrounded',
  'peak acceleration': 'ungrounded',
  'peanut allergy': 'MESH:D021183',
  'pediococcus acidilactici': 'MESH:D000070016',
  'penicillamine': 'CHEBI:CHEBI:50868',
  'penicillin acylase': 'ungrounded',
  'penitrem a': 'ungrounded',
  'pennsylvania': 'ungrounded',
  'pepstatin a': 'MESH:C031375',
  'peptide amphiphile': 'ungrounded',
  'peptide aptamer': 'CHEBI:CHEBI:140491',
  'perillyl alcohol': 'CHEBI:CHEBI:15420',
  'perinatal asphyxia': 'ungrounded',
  'periostin binding dna aptamer': 'ungrounded',
  'peripheral available': 'ungrounded',
  'permeability coefficient of albumin': 'ungrounded',
  'pernicious anemia': 'MESH:D000752',
  'peters anomaly': 'DOID:DOID:0060673',
  'phagocytic activity': 'ungrounded',
  'pharyngeal arch': 'HGNC:24094',
  'phase angle': 'ungrounded',
  'phase array': 'ungrounded',
  'phenotype appearance': 'ungrounded',
  'phentolamine': 'CHEBI:CHEBI:8081',
  'phenylacetate': 'ungrounded',
  'phenylalanine': 'CHEBI:CHEBI:28044',
  'pheophorbide a': 'CHEBI:CHEBI:38257',
  'phonemic awareness': 'MESH:D001364',
  'phonological awareness': 'ungrounded',
  'phosphate': 'CHEBI:CHEBI:26020',
  'phosphatidic': 'CHEBI:CHEBI:16337',
  'phosphatidic acid': 'CHEBI:CHEBI:16337',
  'phosphonoacetate': 'CHEBI:CHEBI:15732',
  'phosphoramidon': 'ungrounded',
  'photoacoustic': 'ungrounded',
  'photoacoustic imaging': 'ungrounded',
  'photoactivatable': 'ungrounded',
  'phthalic anhydride': 'ungrounded',
  'phyllanthus amarus': 'ungrounded',
  'physical abuse': 'MESH:D000066550',
  'physical activity': 'MESH:D015444',
  'physical aggression': 'GO:GO:0002118',
  'physician assistant': 'MESH:D010823',
  'physiological apoptosis': 'GO:GO:0006915',
  'picolinamide': 'CHEBI:CHEBI:8200',
  'piericidin a': 'CHEBI:CHEBI:138511',
  'pilocytic astrocytoma': 'ungrounded',
  'pituitary adenoma': 'DOID:DOID:3829',
  'pituitary adrenal': 'ungrounded',
  'pituitary apoplexy': 'MESH:D010899',
  'placebo analgesia': 'MESH:D000698',
  'placental abruption': 'MESH:D000037',
  'plas amino ®': 'ungrounded',
  'plasma aldosteronism': 'ungrounded',
  'plasma aldosteronism concentrations': 'ungrounded',
  'plasmapheresis': 'MESH:D010956',
  'plasminogen activity': 'GO:GO:0031639',
  'platelet activity': 'ungrounded',
  'platelet aggregation': 'ungrounded',
  'plectranthus amboinicus': 'ungrounded',
  'pleomorphic adenoma': 'MESH:D008949',
  'podophyllotoxin acetate': 'MESH:C000613972',
  'polar assortant': 'ungrounded',
  'poly l arginine': 'CHEBI:CHEBI:53534',
  'polyacetylene': 'CHEBI:CHEBI:60552',
  'polyacrylamide': 'ungrounded',
  'polyacrylate': 'CHEBI:CHEBI:51134',
  'polyadenylation': 'MESH:D026723',
  'polyamide': 'ungrounded',
  'polyamine': 'CHEBI:CHEBI:88061',
  'polyanhydride': 'ungrounded',
  'polymerase': 'ungrounded',
  'polyphenolic acetate': 'CHEBI:CHEBI:30089',
  'positive affect': 'ungrounded',
  'postaddition': 'ungrounded',
  'posterior anterior': 'ungrounded',
  'posterior approach': 'ungrounded',
  'posterior atrophy': 'MESH:D001284',
  'potassium aspartate': 'CHEBI:CHEBI:132943',
  'potential asthmatics': 'ungrounded',
  'power amplifier': 'ungrounded',
  'pra aldosteronism': 'MESH:D006929',
  'praeruptorin a': 'MESH:C441070',
  'preadipocyte': 'ungrounded',
  'prealbumin': 'HGNC:12405',
  'predictive ability': 'ungrounded',
  'predictive accuracy': 'ungrounded',
  'prednisolone acetate': 'CHEBI:CHEBI:8380',
  'preferentially attached': 'ungrounded',
  'pregnanolone': 'MESH:D011280',
  'premature adrenarche': 'HP:HP:0012412',
  'prenatal androgenized': 'ungrounded',
  'primary aldosteronism': 'MESH:D006929',
  'primary angioplasty': 'MESH:D017130',
  'primary hyperaldosteronism': 'MESH:D006929',
  'principal axes registration': 'ungrounded',
  'prior authors': 'ungrounded',
  'prism adaptation': 'ungrounded',
  'prismatic adaptation': 'ungrounded',
  'proaerolysin': 'MESH:C107096',
  'proanthocyanidin': 'CHEBI:CHEBI:26267',
  'procainamide': 'ungrounded',
  'process aconitum jaluense': 'ungrounded',
  'prolonged smoking abstinence': 'ungrounded',
  'propagation angle': 'ungrounded',
  'propargyl alcohol': 'MESH:C028255',
  'prophylactic antibiotic': 'CHEBI:CHEBI:33281',
  'propionibacterium acnes': 'MESH:D011425',
  'propionic acidemia': 'ungrounded',
  'propionic aciduria': 'MESH:D056693',
  'propylamine': 'CHEBI:CHEBI:39870',
  'prostatic abscess': 'ungrounded',
  'protease activity': 'ungrounded',
  'proteasome activity': 'ungrounded',
  'protective ag': 'CHEBI:CHEBI:27569',
  'protective agents': 'CHEBI:CHEBI:50267',
  'protective antigen': 'MESH:C030325',
  'protein a': 'UP:P02976',
  'protein with the': 'ungrounded',
  'protocatechuic aldehyde': 'CHEBI:CHEBI:50205',
  'proton affinity': 'ungrounded',
  'proximal axon': 'GO:GO:0030424',
  'psammaplysene a': 'MESH:C501142',
  'pseudoaneurysm': 'MESH:D017541',
  'pseudomonas aeruginosa': 'TAXONOMY:287',
  'psoriatic arthritis': 'MESH:D015535',
  'pulmonary': 'ungrounded',
  'pulmonary artery': 'MESH:D011651',
  'pulmonary aspergillosis': 'MESH:D055732',
  'pulse alone': 'ungrounded',
  'pulse amplitude': 'ungrounded',
  'pulse artifact': 'MESH:D016477',
  'puromycin aminonucleoside': 'ungrounded',
  'pyridylaminated': 'ungrounded',
  'pyronaridine artesunate': 'CHEBI:CHEBI:63918',
  'pyrrolizidine alkaloids': 'CHEBI:CHEBI:64948'},
 {'MESH:D011312': 'Pressure',
  'TAXONOMY:287': 'Pseudomonas aeruginosa',
  'CHEBI:CHEBI:45863': 'paclitaxel',
  'CHEBI:CHEBI:7889': 'paeoniflorin',
  'CHEBI:CHEBI:15756': 'hexadecanoic acid',
  'MESH:D002477': 'Cells',
  'MESH:D010176': 'Panama',
  'MESH:D010179': 'Pancreas',
  'HP:HP:0006725': 'Pancreatic adenocarcinoma',
  'CHEBI:CHEBI:66725': 'Panduratin A',
  'MESH:D016584': 'Panic Disorder',
  'CHEBI:CHEBI:46195': 'paracetamol',
  'CHEBI:CHEBI:31962': 'paraformaldehyde macromolecule',
  'CHEBI:CHEBI:87663': 'pararosaniline',
  'CHEBI:CHEBI:27928': 'parathion',
  'MESH:D010282': 'Parathyroid Neoplasms',
  'CHEBI:CHEBI:7939': 'parthenolide',
  'HGNC:9704': 'PVALB',
  'MESH:D021183': 'Peanut Hypersensitivity',
  'MESH:D000070016': 'Pediococcus acidilactici',
  'CHEBI:CHEBI:50868': 'penicillamine',
  'MESH:D010414': 'Pennsylvania',
  'MESH:C031375': 'pepstatin',
  'CHEBI:CHEBI:140491': 'peptide aptamer',
  'CHEBI:CHEBI:15420': 'perillyl alcohol',
  'MESH:D000752': 'Anemia, Pernicious',
  'DOID:DOID:0060673': 'Peters anomaly',
  'HGNC:24094': 'ZBTB8OS',
  'CHEBI:CHEBI:8081': 'phentolamine',
  'CHEBI:CHEBI:28044': 'phenylalanine',
  'CHEBI:CHEBI:38257': 'pheophorbide a',
  'MESH:D001364': 'Awareness',
  'CHEBI:CHEBI:26020': 'phosphate',
  'CHEBI:CHEBI:16337': 'phosphatidic acid',
  'CHEBI:CHEBI:15732': 'phosphonoacetic acid',
  'MESH:D000066550': 'Physical Abuse',
  'MESH:D015444': 'Exercise',
  'GO:GO:0002118': 'aggressive behavior',
  'MESH:D010823': 'Physician Assistants',
  'GO:GO:0006915': 'apoptotic process',
  'CHEBI:CHEBI:8200': 'picolinamide',
  'CHEBI:CHEBI:138511': 'piericidin A',
  'DOID:DOID:3829': 'pituitary adenoma',
  'MESH:D010899': 'Pituitary Apoplexy',
  'MESH:D000698': 'Analgesia',
  'MESH:D000037': 'Abruptio Placentae',
  'MESH:D010956': 'Plasmapheresis',
  'GO:GO:0031639': 'plasminogen activation',
  'MESH:D008949': 'Adenoma, Pleomorphic',
  'MESH:C000613972': 'podophyllotoxin acetate',
  'CHEBI:CHEBI:53534': 'poly(L-arginine) macromolecule',
  'CHEBI:CHEBI:60552': 'polyacetylene polymer',
  'CHEBI:CHEBI:51134': 'acrylic macromolecule',
  'MESH:D026723': 'Polyadenylation',
  'CHEBI:CHEBI:88061': 'polyamine',
  'CHEBI:CHEBI:30089': 'acetate',
  'MESH:D001284': 'Atrophy',
  'CHEBI:CHEBI:132943': 'aspartate',
  'MESH:D006929': 'Hyperaldosteronism',
  'MESH:C441070': 'praeruptorin A',
  'HGNC:12405': 'TTR',
  'CHEBI:CHEBI:8380': 'Prednisolone acetate',
  'MESH:D011280': 'Pregnanolone',
  'HP:HP:0012412': 'Premature adrenarche',
  'MESH:D017130': 'Angioplasty',
  'MESH:C107096': 'proaerolysin',
  'CHEBI:CHEBI:26267': 'proanthocyanidin',
  'MESH:C028255': 'propargyl alcohol',
  'CHEBI:CHEBI:33281': 'antimicrobial agent',
  'MESH:D011425': 'Propionibacterium acnes',
  'MESH:D056693': 'Propionic Acidemia',
  'CHEBI:CHEBI:39870': 'propylamine',
  'CHEBI:CHEBI:27569': 'arabinogalactan',
  'CHEBI:CHEBI:50267': 'protective agent',
  'MESH:C030325': 'anthrax toxin',
  'UP:P02976': 'spa',
  'CHEBI:CHEBI:50205': '3,4-dihydroxybenzaldehyde',
  'GO:GO:0030424': 'axon',
  'MESH:C501142': 'psammaplysene A',
  'MESH:D017541': 'Aneurysm, False',
  'MESH:D015535': 'Arthritis, Psoriatic',
  'MESH:D011651': 'Pulmonary Artery',
  'MESH:D055732': 'Pulmonary Aspergillosis',
  'MESH:D016477': 'Artifacts',
  'CHEBI:CHEBI:63918': 'artesunate',
  'CHEBI:CHEBI:64948': 'pyrrolizidine alkaloid'},
 ['CHEBI:CHEBI:15756',
  'CHEBI:CHEBI:16337',
  'CHEBI:CHEBI:26267',
  'CHEBI:CHEBI:88061',
  'GO:GO:0031639',
  'MESH:C030325']]

In [13]:
# Longforms left out of grounding_dict when it is built from grounding_map.
excluded_longforms = ['pa', 'pa alone', 'pa rats treated with vehicle']

In [14]:
# Per-shortform grounding dictionary: the curated grounding of every longform
# mined for that shortform, skipping longforms that have no entry in
# grounding_map or that are listed in excluded_longforms.
grounding_dict = {
    shortform: {
        longform: grounding_map[longform]
        for longform, _count, _score in longform_rows
        if longform in grounding_map and longform not in excluded_longforms
    }
    for shortform, longform_rows in longform_dict.items()
}
result = [grounding_dict, names, pos_labels]

# makedirs also creates missing parent directories (e.g. a fresh checkout
# without a results/ folder) and does not fail if the directory already exists.
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, f'{model_name}_preliminary_grounding_info.json'), 'w') as f:
    json.dump(result, f)

In [15]:
# Extra entities to collect training texts for, keyed by an ID that later
# cells split once on ':'. Each value is read as a pair: element 0 is used as
# the entity's name and element 1 as the `contains` filter when fetching texts.
additional_entities = {}

In [16]:
# Entities with their own agent texts, keyed by the label used in the corpus.
# Later cells read element 0 of each value as the entity's name and iterate
# element 1 as the agent texts to look up.
unambiguous_agent_texts = {}

In [17]:
# Build the labelled corpus from the gathered texts; entries are iterated
# below as (text, label, pmid) triples.
labeler = AdeftLabeler(grounding_dict)
corpus = labeler.build_from_texts(
    (text, pmid) for pmid, text in all_texts.items()
)

# Group the PMIDs of the corpus entries by label.
agent_text_pmid_map = defaultdict(list)
for _text, label, pmid in corpus:
    agent_text_pmid_map[label].append(pmid)

# For every additional entity, the set of PMIDs returned for it with
# major_topic=True. The entity key is split once on ':' into the positional
# arguments of the lookup.
entity_pmid_map = {}
for entity in additional_entities:
    id_parts = entity.split(':', maxsplit=1)
    entity_pmid_map[entity] = set(get_pmids_for_entity(*id_parts, major_topic=True))

In [18]:
# Size of the PMID overlap for every ordered pair of additional entities
# (each entity is also paired with itself).
intersection1 = [
    (entity1, entity2, len(pmids1 & pmids2))
    for entity1, pmids1 in entity_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [19]:
# Size of the PMID overlap between every corpus label and every additional
# entity.
intersection2 = [
    (entity1, entity2, len(set(pmids1) & pmids2))
    for entity1, pmids1 in agent_text_pmid_map.items()
    for entity2, pmids2 in entity_pmid_map.items()
]

In [20]:
# Display the overlaps between additional entities.
# NOTE(review): additional_entities is empty in this notebook, so this list
# should be empty; the recorded output appears to come from a different run —
# confirm and re-execute.
intersection1

[('HGNC:10967', 'HGNC:10967', 98)]

In [21]:
# Display the overlaps between corpus labels and additional entities.
# NOTE(review): additional_entities is empty in this notebook, so this list
# should be empty; the recorded output appears to come from a different run —
# confirm and re-execute.
intersection2

[('GO:GO:0001837', 'HGNC:10967', 0),
 ('ungrounded', 'HGNC:10967', 0),
 ('HGNC:10967', 'HGNC:10967', 0),
 ('MESH:D055032', 'HGNC:10967', 0)]

In [18]:
# Extend the corpus with texts for entities that have their own agent texts.
all_used_pmids = set()
for entity, agent_texts in unambiguous_agent_texts.items():
    # Each value is indexed as (name, agent texts). Only the agent texts are
    # passed as the content filter, mirroring how the additional-entities loop
    # below passes element 1 of its pair.
    entity_agent_texts = agent_texts[1]
    used_pmids = set()
    for agent_text in entity_agent_texts:
        pmids = set(get_pmids_for_agent_text(agent_text))
        # Skip PMIDs already in the base corpus or already used for this entity.
        new_pmids = list(pmids - all_texts.keys() - used_pmids)
        text_dict = get_plaintexts_for_pmids(new_pmids, contains=entity_agent_texts)
        corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])
        used_pmids.update(new_pmids)
    all_used_pmids.update(used_pmids)

# Extend the corpus with texts for the additional entities.
for entity, pmids in entity_pmid_map.items():
    new_pmids = list(set(pmids) - all_texts.keys() - all_used_pmids)
    if len(new_pmids) > 10000:
        # Sample without replacement: random.choices draws with replacement,
        # which yields duplicate PMIDs and fewer than 10000 distinct texts.
        new_pmids = random.sample(new_pmids, 10000)
    _name, contains = additional_entities[entity]
    text_dict = get_plaintexts_for_pmids(new_pmids, contains=contains)
    corpus.extend([(text, entity, pmid) for pmid, text in text_dict.items() if len(text) >= 5])

In [19]:
# Register the names of the additional and unambiguous entities (element 0 of
# each value) and treat all of them as positive labels.
names.update({key: value[0] for key, value in additional_entities.items()})
names.update({key: value[0] for key, value in unambiguous_agent_texts.items()})
# sorted() gives a deterministic order; a plain list(set(...)) depends on
# string hashing and can differ between kernel sessions.
pos_labels = sorted(set(pos_labels) | additional_entities.keys() |
                    unambiguous_agent_texts.keys())

In [20]:
%%capture

# Split the corpus triples into parallel tuples.
texts, labels, pmids = zip(*corpus)

# Grid with a single point, so cross-validation evaluates one configuration.
param_grid = {'C': [100.0], 'max_features': [10000]}

# Fixed random_state for the classifier; 5-fold CV run with 5 jobs.
classifier = AdeftClassifier(shortforms, pos_labels=pos_labels, random_state=1729)
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=5)

INFO: [2020-10-31 04:28:11] /adeft/PP/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-10-31 04:35:53] /adeft/PP/adeft/adeft/modeling/classify.py - Best f1 score of 0.9354901091546155 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [21]:
# Display the statistics stored on the classifier (the recorded output starts
# with the per-label distribution).
classifier.stats

{'label_distribution': {'MESH:C030325': 375,
  'GO:GO:0031639': 301,
  'ungrounded': 884,
  'CHEBI:CHEBI:15732': 3,
  'HGNC:12405': 9,
  'UP:P02976': 27,
  'MESH:D011651': 294,
  'MESH:D000752': 8,
  'CHEBI:CHEBI:8081': 3,
  'CHEBI:CHEBI:16337': 1335,
  'CHEBI:CHEBI:15756': 201,
  'CHEBI:CHEBI:88061': 51,
  'TAXONOMY:287': 166,
  'MESH:D010956': 3,
  'MESH:D016584': 7,
  'CHEBI:CHEBI:50868': 14,
  'MESH:D006929': 100,
  'MESH:D011312': 4,
  'MESH:D015535': 6,
  'CHEBI:CHEBI:64948': 18,
  'CHEBI:CHEBI:28044': 8,
  'MESH:D011425': 8,
  'CHEBI:CHEBI:46195': 4,
  'MESH:D000037': 6,
  'CHEBI:CHEBI:53534': 2,
  'MESH:D011280': 2,
  'CHEBI:CHEBI:27928': 3,
  'MESH:D008949': 7,
  'MESH:D015444': 1197,
  'CHEBI:CHEBI:31962': 2,
  'HGNC:9704': 3,
  'CHEBI:CHEBI:15420': 11,
  'MESH:D017130': 4,
  'MESH:D010282': 6,
  'CHEBI:CHEBI:26267': 61,
  'CHEBI:CHEBI:45863': 6,
  'MESH:D026723': 3,
  'MESH:D010823': 4,
  'CHEBI:CHEBI:26020': 10,
  'CHEBI:CHEBI:60552': 3,
  'CHEBI:CHEBI:30089': 4,
  'MESH:C1

In [22]:
# Combine the trained classifier with the grounding dictionary and the
# grounding -> name mapping into a disambiguator.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [23]:
# Dump the disambiguator, passing the model name and the results directory.
disamb.dump(model_name, results_path)

In [24]:
# Print the disambiguator's summary (the list of disambiguations it produces).
print(disamb.info())

Disambiguation model for PA

Produces the disambiguations:
	3,4-dihydroxybenzaldehyde	CHEBI:CHEBI:50205
	Abruptio Placentae	MESH:D000037
	Adenoma, Pleomorphic	MESH:D008949
	Analgesia	MESH:D000698
	Anemia, Pernicious	MESH:D000752
	Aneurysm, False	MESH:D017541
	Angioplasty	MESH:D017130
	Arthritis, Psoriatic	MESH:D015535
	Artifacts	MESH:D016477
	Atrophy	MESH:D001284
	Awareness	MESH:D001364
	Cells	MESH:D002477
	Exercise	MESH:D015444
	Hyperaldosteronism	MESH:D006929
	PVALB	HGNC:9704
	Panama	MESH:D010176
	Pancreas	MESH:D010179
	Pancreatic adenocarcinoma	HP:HP:0006725
	Panduratin A	CHEBI:CHEBI:66725
	Panic Disorder	MESH:D016584
	Parathyroid Neoplasms	MESH:D010282
	Peanut Hypersensitivity	MESH:D021183
	Pediococcus acidilactici	MESH:D000070016
	Pennsylvania	MESH:D010414
	Peters anomaly	DOID:DOID:0060673
	Physical Abuse	MESH:D000066550
	Physician Assistants	MESH:D010823
	Pituitary Apoplexy	MESH:D010899
	Plasmapheresis	MESH:D010956
	Polyadenylation	MESH:D026723
	Prednisolone acetate	CHEBI:CHEBI:8

In [25]:
# Hand the finished disambiguator to model_to_s3.
# NOTE(review): presumably uploads the model to S3, a side effect outside the
# notebook — confirm before re-running.
model_to_s3(disamb)

In [30]:
# Run the disambiguator over every gathered text, in all_texts order.
preds = list(map(disamb.disambiguate, all_texts.values()))

In [31]:
# Keep the texts whose prediction has 'HGNC:10967' as its first element
# (presumably the predicted grounding — TODO confirm).
# NOTE(review): 'HGNC:10967' does not appear among the groundings curated in
# this notebook, so this filter looks left over from another model — confirm
# the intended grounding. This also rebinds `texts` from the training cell.
texts = [text for pred, text in zip(preds, all_texts.values()) if pred[0] == 'HGNC:10967']

In [36]:
# Inspect one of the selected texts (raises IndexError if fewer than four
# texts were selected).
texts[3]

'The non-neuronal monoamine transporters (OCT1, OCT2, EMT, and PMAT) play a key role in the clearance of monoamines from extracellular compartments. In a previous report we described endometrial distribution and cyclic variation of the vesicular monoamine transporter (VMAT2) mRNA and the neuronal norepinephrine transporter (NET) mRNA. In the present study we used in situ hybridization, real-time PCR and immunohistochemistry to reveal tissue distribution and cyclic variation of mRNA for the non-neuronal monoamine transporters in the human endometrium and early pregnancy decidua. We found that non-neuronal monoamine transporters are predominantly expressed in the stroma. The plasma membrane monoamine transporter (PMAT) mRNA expression peaked in the proliferative phase, whereas the extra-neuronal monoamine transporter (EMT) mRNA expression peaked in the secretory phase. The organic cation transporter 2 (OCT2) mRNA expression was exclusively detected in few scattered stromal cells and OCT1