In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator, load_disambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [2]:
# Shortform(s) to disambiguate, and the genes whose Entrez-linked papers
# supply additional labeled training text.
shortforms = ['HIR']
genes = ['KCNJ4', 'INSR']
# Family name (FPLX namespace) -> member gene symbols; empty for this shortform.
families = {}
# One grounding per entry of `genes`, in the same order: the two lists are
# zipped together when the Entrez texts are collected.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # Each member gene needs its own copy of the family grounding so that
    # `genes` and `groundings` stay the same length and stay aligned.
    groundings.extend([f'FPLX:{family}'] * len(members))
# Indexed by gene symbol to obtain that gene's PMIDs.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
# Gather labeled training texts from the papers listed for each gene.
#   entrez_texts: (extracted text, grounding) pairs
#   entrez_refs:  every content key returned for those papers
entrez_texts, entrez_refs = [], set()
for gene, grounding in zip(genes, groundings):
    if gene not in all_pmids:
        # No PMIDs recorded for this gene, so there is nothing to fetch.
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    # Skip empty content entries; label the rest with the gene's grounding.
    entrez_texts += [(universal_extract_text(text), grounding)
                     for text in content.values() if text]
    entrez_refs.update(content.keys())

In [4]:
# For each shortform, fetch the texts behind statements whose agent text
# matches it, skip references already collected in entrez_refs, and feed the
# remaining texts to an AdeftMiner.
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    # NOTE: shortform_texts is rebound on every iteration; later cells see
    # only the texts of the last shortform processed.
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items()
                       if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # set.union() returns a new set and leaves the receiver unchanged, so
    # the texts have to be merged in place for all_texts to accumulate.
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [6]:
# Scored candidates returned by the 'HIR' miner's top() call.
top = miners['HIR'].top()

In [7]:
# Longform list returned by the 'HIR' miner's get_longforms() call; this is
# the starting point for the manual corrections.
longforms0 = miners['HIR'].get_longforms()

In [8]:
# Display the scored candidates for manual inspection.
top

[('hepatic ischemia reperfusion', 16.666666666666664),
 ('the human insulin receptor', 12.0),
 ('human insulin receptor', 10.0),
 ('mab against the human insulin receptor', 9.23076923076923),
 ('reperfusion', 6.916666666666668),
 ('response', 6.363636363636363),
 ('risk', 4.444444444444445),
 ('monoclonal antibody mab against the human insulin receptor',
  3.6666666666666665),
 ('high intermediate risk', 3.6),
 ('high irradiation response', 3.2),
 ('hepatic irradiation', 3.0),
 ('high infertility risk', 3.0),
 ('hypoxia induced relaxation', 3.0),
 ('of human insulin receptor', 3.0),
 ('e abbreviations glycycoumarin gc geissoschizine methyl ether gm hirsuteine hte hirsutine',
  3),
 ('histone regulatory', 2.8),
 ('geissoschizine methyl ether gm hirsuteine hte hirsutine', 2.8),
 ('hypersensitive induced reaction', 2.5),
 ('preparative hepatic irradiation', 2.333333333333333),
 ('regulator', 2.0),
 ('hepatic ischaemia reperfusion', 2.0),
 ('high immune response', 2.0),
 ('the histone regu

In [9]:
# Display the miner's longform list so it can be compared against `top`.
longforms0

[('hepatic ischemia reperfusion', 16.666666666666664),
 ('the human insulin receptor', 12.0),
 ('response', 6.363636363636363),
 ('risk', 4.444444444444445),
 ('hepatic irradiation', 3.0),
 ('hypoxia induced relaxation', 3.0),
 ('e abbreviations glycycoumarin gc geissoschizine methyl ether gm hirsuteine hte hirsutine',
  3),
 ('histone regulatory', 2.8),
 ('hypersensitive induced reaction', 2.5),
 ('regulator', 2.0),
 ('a heat induced radiolabeling', 2.0),
 ('reconstruction', 1.5),
 ('high intensity running', 1.3333333333333333)]

In [10]:
# Manual correction: keep every entry of longforms0 except the one at index 1.
longforms = [entry for position, entry in enumerate(longforms0) if position != 1]

In [12]:
# Manual correction: add the candidate at index 2 of `top`.
# NOTE: this mutates longforms in place, so re-running the cell adds it again.
longforms.append(top[2])

In [13]:
# Order by score, highest first; entries with equal scores keep their
# current relative order.
longforms.sort(key=lambda pair: pair[1], reverse=True)

In [14]:
# Display the corrected, score-ordered longform list.
longforms

[('hepatic ischemia reperfusion', 16.666666666666664),
 ('human insulin receptor', 10.0),
 ('response', 6.363636363636363),
 ('risk', 4.444444444444445),
 ('hepatic irradiation', 3.0),
 ('hypoxia induced relaxation', 3.0),
 ('e abbreviations glycycoumarin gc geissoschizine methyl ether gm hirsuteine hte hirsutine',
  3),
 ('histone regulatory', 2.8),
 ('hypersensitive induced reaction', 2.5),
 ('regulator', 2.0),
 ('a heat induced radiolabeling', 2.0),
 ('reconstruction', 1.5),
 ('high intensity running', 1.3333333333333333)]

In [15]:
# Split the (longform, score) pairs into two parallel tuples.
# NOTE: `longforms` is rebound from a list of pairs to a tuple of strings, so
# this cell cannot be re-run on its own result.
longforms, scores = zip(*longforms)

In [16]:
# Look up a grounding for each longform with gilda_ground. Longforms whose
# lookup raises are printed and skipped; longforms whose result has a falsy
# first element are left out of the map.
grounding_map = {}
for longform in longforms:
    try:
        grounding = gilda_ground(longform)
    except Exception:
        # Best effort: report the failing longform and move on.
        print(longform)
    else:
        if grounding[0]:
            grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'

In [18]:
# Load the previously saved 'HIR' disambiguator from ../results.
HIR_disamb = load_disambiguator('HIR', path='../results')



In [20]:
# Display the grounding -> name mapping stored in the saved model.
HIR_disamb.names

{'MESH:D015427': 'Reperfusion Injury', 'HGNC:6091': 'INSR'}

In [22]:
# Run ground_with_gui on the longforms and scores, seeded with the saved
# model's grounding map and names.
# NOTE(review): the grounding_map built with gilda_ground is not passed here;
# the saved model's map is used instead — confirm this is intended.
result = ground_with_gui(longforms, scores, grounding_map=HIR_disamb.grounding_dict['HIR'],
                         names=HIR_disamb.names)

In [24]:
# Unpack the grounding result. This rebinds grounding_map, replacing the
# dict built from gilda_ground.
grounding_map, names, pos_labels = result

In [25]:
# Add the name for HGNC:6265 (KCNJ4) by hand.
names['HGNC:6265'] = 'KCNJ4'

In [26]:
# Add HGNC:6265 to the positive labels. The membership check keeps the cell
# idempotent: re-running it does not append a duplicate entry.
if 'HGNC:6265' not in pos_labels:
    pos_labels.append('HGNC:6265')

In [27]:
# Re-pack the edited grounding map, names and positive labels into one tuple.
result = grounding_map, names, pos_labels

In [28]:
# Display the final (grounding_map, names, pos_labels) tuple.
result

({'a heat induced radiolabeling': 'ungrounded',
  'e abbreviations glycycoumarin gc geissoschizine methyl ether gm hirsuteine hte hirsutine': 'ungrounded',
  'hepatic irradiation': 'ungrounded',
  'hepatic ischemia reperfusion': 'MESH:D015427',
  'high intensity running': 'ungrounded',
  'histone regulatory': 'ungrounded',
  'human insulin receptor': 'HGNC:6091',
  'hypersensitive induced reaction': 'ungrounded',
  'hypoxia induced relaxation': 'ungrounded',
  'reconstruction': 'ungrounded',
  'regulator': 'ungrounded',
  'response': 'ungrounded',
  'risk': 'ungrounded'},
 {'MESH:D015427': 'Reperfusion Injury',
  'HGNC:6091': 'INSR',
  'HGNC:6265': 'KCNJ4'},
 ['HGNC:6091', 'MESH:D015427', 'HGNC:6265'])

In [29]:
# Hard-coded grounding result (presumably pasted from the displayed `result`
# so the values are pinned in the notebook — confirm).

# Longform -> grounding; 'ungrounded' marks longforms without a grounding.
grounding_map = {
    'a heat induced radiolabeling': 'ungrounded',
    'e abbreviations glycycoumarin gc geissoschizine methyl ether gm hirsuteine hte hirsutine': 'ungrounded',
    'hepatic irradiation': 'ungrounded',
    'hepatic ischemia reperfusion': 'MESH:D015427',
    'high intensity running': 'ungrounded',
    'histone regulatory': 'ungrounded',
    'human insulin receptor': 'HGNC:6091',
    'hypersensitive induced reaction': 'ungrounded',
    'hypoxia induced relaxation': 'ungrounded',
    'reconstruction': 'ungrounded',
    'regulator': 'ungrounded',
    'response': 'ungrounded',
    'risk': 'ungrounded',
}

# Grounding -> standard name.
names = {
    'MESH:D015427': 'Reperfusion Injury',
    'HGNC:6091': 'INSR',
    'HGNC:6265': 'KCNJ4',
}

# Groundings passed to the classifier as its positive labels.
pos_labels = ['HGNC:6091', 'MESH:D015427', 'HGNC:6265']

In [30]:
# Wrap the longform -> grounding map in a dict keyed by shortform, the form
# handed to AdeftLabeler and AdeftDisambiguator.
grounding_dict = {'HIR': grounding_map}

In [31]:
# Classifier for the 'HIR' shortform, constructed with the positive labels.
classifier = AdeftClassifier('HIR', pos_labels)

In [32]:
# Hyperparameter values to try during cross-validation.
param_grid = dict(C=[10.0, 100.0], max_features=[1000, 10000])

In [33]:
# Labeler built from the shortform-keyed grounding dict.
labeler = AdeftLabeler(grounding_dict)

In [34]:
# Build the labeled corpus from the mined texts, then append the
# (text, grounding) pairs gathered from the Entrez-linked papers.
corpus = labeler.build_from_texts(shortform_texts)
corpus.extend(entrez_texts)

In [35]:
# Split the corpus pairs into parallel tuples of texts and labels.
texts, labels = zip(*corpus)

In [36]:
# Cross-validate over param_grid with cv=5 and n_jobs=8.
# NOTE(review): no random seed is set anywhere in the visible notebook —
# confirm whether the reported scores are reproducible.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-19 14:45:42] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [10.0, 100.0], 'max_features': [1000, 10000]}
INFO: [2019-12-19 14:46:30] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.975740189131273 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 1000}


In [37]:
# Display the statistics recorded on the classifier.
classifier.stats

{'label_distribution': {'HGNC:6091': 593,
  'ungrounded': 36,
  'MESH:D015427': 15,
  'HGNC:6265': 36},
 'f1': {'mean': 0.975740189131273, 'std': 0.012512247371838816},
 'precision': {'mean': 0.9799997587377456, 'std': 0.004034874113382134},
 'recall': {'mean': 0.9737022957662493, 'std': 0.022027914431823326}}

In [38]:
# Bundle the classifier with the grounding dict and names.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [39]:
# Run the disambiguator over the mined texts. These are the same texts the
# training corpus was built from, so this is a sanity check, not held-out
# evaluation.
preds = disamb.disambiguate(shortform_texts)

In [40]:
# Keep the texts whose prediction has 'HGNC:6265' as its first element.
a = [doc for doc, prediction in zip(shortform_texts, preds)
     if prediction[0] == 'HGNC:6265']

In [43]:
# Spot-check one of the texts predicted as HGNC:6265.
a[1]

'The effects of different extracellular cations or organic Ca(2+)-channel modulators on complement-induced changes in intracellular Ca2+ and cell death have been investigated in the transfected NIH-3T3 HIR 3.5 cell line, which overexpresses the human insulin receptor. Cells were incubated with mouse anti-(human insulin receptor) monoclonal antibodies before exposure to rabbit or human serum (sources of heterologous complement). Changes in intracellular Ca2+ were complement-dependent (measured by influx of 45Ca), as was cytotoxicity (monitored by leakage of lactate dehydrogenase into the culture supernatant). Addition of a dihydropyridine Ca(2+)-channel antagonist (nifedipine) or some bivalent inorganic cations caused inhibition of 45Ca entry via a novel channel distinct from endogenous voltage-gated Ca2+ channels. Nifedipine decreased, but conversely the addition of a phenylalkylamine Ca(2+)-channel antagonist (verapamil) or the inorganic Ca2+ agonists Ba2+ and Sr+ increased, complemen

In [44]:
# Hand the disambiguator to model_to_s3.
# NOTE(review): the same call appears again after the dump cell — confirm
# whether both are needed.
model_to_s3(disamb)

In [45]:
# Save the model under the shortform it was built for: 'HIR' is both the
# classifier's shortform and the key of grounding_dict.
disamb.dump('HIR', '../results')

In [47]:
# Hand the disambiguator to model_to_s3.
# NOTE(review): duplicate of the call made before the dump cell — confirm
# whether both are needed.
model_to_s3(disamb)

In [2]:
# NOTE(review): loads a 'GARP' model, while the notebook up to this point
# works on 'HIR'. The execution counter also restarts here, so this and the
# following cells look like leftovers from another session — confirm or remove.
d = load_disambiguator('GARP', '../results')

In [39]:
# Recompute predictions over the mined texts; this repeats the earlier
# prediction cell.
preds = disamb.disambiguate(shortform_texts)

In [40]:
# Keep the texts whose prediction has 'HGNC:2151' as its first element.
# NOTE(review): 'HGNC:2151' is not among the groundings defined for 'HIR' in
# this notebook; this cell likely belongs to a different shortform — confirm.
a = [text for text, pred in zip(shortform_texts, preds) if pred[0] == 'HGNC:2151']

In [41]:
# Number of texts selected by the filter.
len(a)

8

In [44]:
# Spot-check one of the selected texts.
a[2]

"Other support for this stabilizing hypothesis comes from studies on the beta-subunit of the rod cyclic nucleotide gated channel (CNG). It has been shown that PRPH2/RDS can bind to the beta subunit of rod CNG, as well as two other non-membrane bound isoforms also coded for by the  Cngb1  gene called glutamic acid rich proteins (GARP1/2) ( Poetsch et\xa0al., 2001; Ritter et\xa0al., 2011 ). The interactions between GARP and PRPH2/RDS were found to be strongest when PRPH2/RDS was in disulfide linked oligomers suggesting the larger PRPH2/RDS complexes might mediate this interaction ( Poetsch et\xa0al., 2001 ). Given the localization of CNGB1 on the plasma membrane, PRPH2/RDS on the disc rim, and GARP1/2 as free cytosolic forms in between, it has been hypothesized that interactions between PRPH2/RDS and GARP could serve as a mechanism by which PRPH2/RDS could regulate the sizing and orientation of the rod discs. In support of this, elimination of GARP in rods causes disorganization in disc 

In [33]:
# NOTE(review): elsewhere in this notebook `a` is bound to a list, which has
# no disambiguate method; this cell presumably comes from a session where `a`
# was a disambiguator — confirm or remove.
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
# Pull the step named 'logit' out of the loaded model's estimator.
logit = d.classifier.estimator.named_steps['logit']

In [37]:
# Display the class labels held by that step.
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')

In [33]:
# Scratch values for trying out bitwise OR.
i, j = 2, 4

In [34]:
# In-place bitwise OR: sets in i every bit that is set in j.
i |= j

In [29]:
# Bitwise OR written as a plain assignment; repeating it leaves i unchanged
# because the bits of j are already set after the first OR.
i = i | j

4

In [35]:
# Display the current value of i.
i

6

In [2]:
# Load the saved 'SNS' disambiguator from ../results/; this rebinds `d`.
d = load_disambiguator('SNS', path='../results/')



In [3]:
# Display the grounding dict stored in the loaded model.
d.grounding_dict

{'SNS': {'sacral nerve stimulation': 'ungrounded',
  'secondary neurospheres': 'ungrounded',
  'serevent nationwide surveillance': 'ungrounded',
  'short nascent dna strands': 'ungrounded',
  'sini san': 'ungrounded',
  'sinisan': 'ungrounded',
  'social networking sites': 'ungrounded',
  'sympathetic': 'MESH:D013564',
  'sympathetic nerve stimulation': 'MESH:D013564',
  'sympathetic nerve system': 'MESH:D013564',
  'sympathetic nervous system': 'MESH:D013564'}}