In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [2]:
# Shortform(s) to disambiguate and the genes / FamPlex families they may refer to.
shortforms = ['BCR']
genes = ['BCR']
families = {}
# `groundings` is kept positionally parallel to `genes`: the two lists are
# zipped together when the Entrez texts are labelled.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    for member in members:
        genes.append(member)
        # Repeat the family grounding once per member so the lists stay
        # aligned; a single append per family would mispair or drop members
        # as soon as a family has more than one.
        groundings.append(f'FPLX:{family}')
# Mapping from gene name to PMIDs, loaded from a local JSON file.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
# Build (text, grounding) training pairs from the PMIDs listed for each gene,
# and remember which content references were used.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    try:
        pmids = all_pmids[gene]
    except KeyError:
        # No PMIDs are listed for this gene; nothing to add.
        continue
    _, content = get_text_content_from_pmids(pmids)
    labelled = []
    for raw_text in content.values():
        if raw_text:
            labelled.append((universal_extract_text(raw_text), grounding))
    entrez_texts.extend(labelled)
    entrez_refs.update(content.keys())

In [4]:
# For every shortform, fetch the texts of statements whose agent text matches
# it, skip texts already used as Entrez-labelled examples, and feed the rest
# to an AdeftMiner.
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    # NOTE: `shortform_texts` is reused by later cells and holds the texts of
    # the last shortform processed.
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # set.union() returns a new set and leaves `all_texts` unchanged, so the
    # original call was a no-op; update() accumulates in place.
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [8]:
len(shortform_texts)

3732

### top = miners['BCR'].top()

In [7]:
%time miners['BCR'].compute_alignment_scores()

CPU times: user 3.94 s, sys: 176 ms, total: 4.11 s
Wall time: 4.17 s


[('b cell receptor', 1533.7325648414985),
 ('receptor', 1022.0161943319838),
 ('the b cell receptor', 569.7206611570248),
 ('b cell antigen receptor', 392.6581352833638),
 ('biochemical recurrence', 277.2733333333333),
 ('the b cell antigen receptor', 268.8085106382979),
 ('of the b cell receptor', 163.02923976608187),
 ('of b cell receptor', 146.0),
 ('of the b cell antigen receptor', 85.91397849462368),
 ('and b cell receptor', 70.25974025974025),
 ('antigen receptor', 64.01322314049582),
 ('of biochemical recurrence', 54.57575757575758),
 ('through the b cell receptor', 45.24590163934426),
 ('in b cell receptor', 42.666666666666664),
 ('b cell ag receptor', 35.2),
 ('to the b cell receptor', 34.59090909090909),
 ('through the b cell antigen receptor', 32.97560975609756),
 ('by b cell receptor', 32.628571428571426),
 ('by the b cell receptor', 30.12121212121212),
 ('to b cell receptor', 28.952380952380953),
 ('and biochemical recurrence', 28.333333333333332),
 ('signaling through the

In [13]:
top[14]

('b cell ag receptor', 35.2)

In [8]:
longforms0 = miners['BCR'].get_longforms()

In [10]:
longforms0

[('b cell receptor', 1532.3416041546452),
 ('biochemical recurrence', 277.2733333333333),
 ('breakpoint cluster region', 25.515151515151516),
 ('rate', 5.25),
 ('receptor complex', 5.090909090909091),
 ('biochemical relapse', 4.666666666666667),
 ('bulbocavernosus reflex', 4.285714285714286),
 ('b cell receptor for antigen', 4.25),
 ('benefit cost ratio', 4.0),
 ('reductase', 3.6666666666666665),
 ('b cell', 3.6),
 ('blastocoel roof', 3.2),
 ('igm', 3.0),
 ('bromocriptine', 2.8),
 ('b cell receptor signaling', 2.0),
 ('from a rag2 deficient pro b cell lines were retrovirally transduced with μhc b1 8 hc or μhc + λlc',
  2),
 ('protein', 1.3333333333333333)]

In [11]:
# Keep only the hand-picked candidates, selected by their position in the
# list displayed above (indices are ascending, so order is preserved).
longforms0 = [longforms0[pos]
              for pos in (0, 1, 2, 5, 6, 7, 8, 10, 11, 12, 13, 14)
              if pos < len(longforms0)]

In [12]:
longforms0

[('b cell receptor', 1532.3416041546452),
 ('biochemical recurrence', 277.2733333333333),
 ('breakpoint cluster region', 25.515151515151516),
 ('biochemical relapse', 4.666666666666667),
 ('bulbocavernosus reflex', 4.285714285714286),
 ('b cell receptor for antigen', 4.25),
 ('benefit cost ratio', 4.0),
 ('b cell', 3.6),
 ('blastocoel roof', 3.2),
 ('igm', 3.0),
 ('bromocriptine', 2.8),
 ('b cell receptor signaling', 2.0)]

In [None]:
# Add selected entries of `top` (by position) to the candidate longforms.
longforms0.extend([top[pos] for pos in (3, 14) if pos < len(top)])

In [14]:
longforms0

[('b cell receptor', 1532.3416041546452),
 ('biochemical recurrence', 277.2733333333333),
 ('breakpoint cluster region', 25.515151515151516),
 ('biochemical relapse', 4.666666666666667),
 ('bulbocavernosus reflex', 4.285714285714286),
 ('b cell receptor for antigen', 4.25),
 ('benefit cost ratio', 4.0),
 ('b cell', 3.6),
 ('blastocoel roof', 3.2),
 ('igm', 3.0),
 ('bromocriptine', 2.8),
 ('b cell receptor signaling', 2.0)]

In [15]:
# Order candidates by score, highest first (ties keep their current order).
longforms0.sort(key=lambda pair: pair[1], reverse=True)

In [16]:
longforms, scores = zip(*longforms0)

In [17]:
longforms

('b cell receptor',
 'biochemical recurrence',
 'breakpoint cluster region',
 'biochemical relapse',
 'bulbocavernosus reflex',
 'b cell receptor for antigen',
 'benefit cost ratio',
 'b cell',
 'blastocoel roof',
 'igm',
 'bromocriptine',
 'b cell receptor signaling')

In [18]:
# Ground each longform with gilda_ground; longforms for which the first
# element of the result is falsy are left out of both dictionaries.
grounding_map = {}
names = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    namespace = grounding[0]
    if not namespace:
        continue
    label = f'{namespace}:{grounding[1]}'
    grounding_map[longform] = label
    names[label] = grounding[2]

In [22]:
grounding_map

{'b cell': 'MESH:D011947',
 'b cell receptor': 'MESH:D011947',
 'b cell receptor for antigen': 'MESH:D011947',
 'b cell receptor signaling': 'MESH:D011947',
 'benefit cost ratio': 'ungrounded',
 'biochemical recurrence': 'ungrounded',
 'biochemical relapse': 'ungrounded',
 'blastocoel roof': 'ungrounded',
 'breakpoint cluster region': 'HGNC:1014',
 'bromocriptine': 'CHEBI:CHEBI:3181',
 'bulbocavernosus reflex': 'ungrounded',
 'igm': 'MESH:D011947'}

In [23]:
names

{'MESH:D011947': 'Receptors, Antigen, B-Cell',
 'HGNC:1014': 'BCR',
 'CHEBI:CHEBI:3181': 'bromocriptine'}

In [21]:
# Pass the longforms, their scores, and the grounding_map/names built so far
# to adeft's ground_with_gui, and rebind all three results to what it returns.
# NOTE(review): this step presumably requires manual interaction in a GUI, so
# its output is not reproducible from code alone -- confirm.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [None]:
# Make sure every gene in `genes` has a name entry and counts as a positive
# label, keyed by its HGNC grounding.
gene_names_by_grounding = {f'HGNC:{get_hgnc_id(gene)}': gene for gene in genes}
names.update(gene_names_by_grounding)
pos_labels = list(set(pos_labels) | set(gene_names_by_grounding))

In [24]:
result = (grounding_map, names, pos_labels)

In [25]:
result

({'b cell': 'MESH:D011947',
  'b cell receptor': 'MESH:D011947',
  'b cell receptor for antigen': 'MESH:D011947',
  'b cell receptor signaling': 'MESH:D011947',
  'benefit cost ratio': 'ungrounded',
  'biochemical recurrence': 'ungrounded',
  'biochemical relapse': 'ungrounded',
  'blastocoel roof': 'ungrounded',
  'breakpoint cluster region': 'HGNC:1014',
  'bromocriptine': 'CHEBI:CHEBI:3181',
  'bulbocavernosus reflex': 'ungrounded',
  'igm': 'MESH:D011947'},
 {'MESH:D011947': 'Receptors, Antigen, B-Cell',
  'HGNC:1014': 'BCR',
  'CHEBI:CHEBI:3181': 'bromocriptine'},
 ['CHEBI:CHEBI:3181', 'HGNC:1014', 'MESH:D011947'])

In [26]:
# Hard-coded (grounding_map, names, pos_labels) tuple; the values match the
# `result` tuple displayed in the cell above.
# NOTE(review): presumably pasted here so the groundings can be restored
# without re-running the GUI step -- confirm.
grounding_map, names, pos_labels = ({'b cell': 'MESH:D011947',
  'b cell receptor': 'MESH:D011947',
  'b cell receptor for antigen': 'MESH:D011947',
  'b cell receptor signaling': 'MESH:D011947',
  'benefit cost ratio': 'ungrounded',
  'biochemical recurrence': 'ungrounded',
  'biochemical relapse': 'ungrounded',
  'blastocoel roof': 'ungrounded',
  'breakpoint cluster region': 'HGNC:1014',
  'bromocriptine': 'CHEBI:CHEBI:3181',
  'bulbocavernosus reflex': 'ungrounded',
  'igm': 'MESH:D011947'},
 {'MESH:D011947': 'Receptors, Antigen, B-Cell',
  'HGNC:1014': 'BCR',
  'CHEBI:CHEBI:3181': 'bromocriptine'},
 ['CHEBI:CHEBI:3181', 'HGNC:1014', 'MESH:D011947'])

In [27]:
grounding_dict = {'BCR': grounding_map}

In [28]:
classifier = AdeftClassifier('BCR', pos_labels=pos_labels)

In [29]:
# Grid with a single candidate value per hyperparameter, so the "search"
# evaluates exactly one configuration.
param_grid = {'C': [100.0], 'max_features': [10000]}

In [30]:
labeler = AdeftLabeler(grounding_dict)

In [31]:
corpus = labeler.build_from_texts(shortform_texts)

In [32]:
corpus.extend(entrez_texts)

In [33]:
texts, labels = zip(*corpus)

In [34]:
# Run the classifier's grid search over param_grid on the labelled corpus.
# NOTE(review): cv=5 and n_jobs=8 are presumably the number of folds and of
# parallel workers -- confirm against adeft's AdeftClassifier.cv.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-13 10:58:17] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-13 10:59:01] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9751167595458782 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [35]:
classifier.stats

{'label_distribution': {'MESH:D011947': 1442,
  'ungrounded': 211,
  'HGNC:1014': 415,
  'CHEBI:CHEBI:3181': 4},
 'f1': {'mean': 0.9751167595458782, 'std': 0.007950771506375473},
 'precision': {'mean': 0.9737699556189972, 'std': 0.008757750168350295},
 'recall': {'mean': 0.9769032789548782, 'std': 0.0068867432080245445}}

In [36]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [37]:
d = disamb.disambiguate(shortform_texts)

In [38]:
# Collect the texts whose prediction has 'HGNC' as its first element.
a = [
    text
    for prediction, text in zip(d, shortform_texts)
    if prediction[0] == 'HGNC'
]

In [39]:
len(a)

539

In [40]:
a[0]

'The development of drug resistance in ALL cells as occurs  in vivo  can be modeled by coculture of ALL cells in the presence of irradiated stromal cells during drug exposure. We have shown previously that this allows for the outgrowth of drug-insensitive leukemic cells (e.g. Zhang  et al 5 ), in a process that has been named environmental-mediated drug resistance. Microarray analysis on precursor B-lineage (pre-B) ALL cells from a  BCR/ABL  transgenic mouse model for Ph-positive ALL undergoing environmental-mediated drug resistance showed significantly increased expression of mRNAs related to stress and inflammation. 6  Among others, we found that transcript levels of the  Galectin-3  gene ( Lgals3 ) were significantly increased upon development of resistance to the Bcr/Abl-targeted tyrosine kinase inhibitor nilotinib, and this was also seen in pre-B ALL cells isolated from the BM of  BCR/ABL  transgenic mice that had been treated with nilotinib for 8 days. 6\n'

In [None]:
# Save the trained disambiguator under the shortform it was built for ('BCR',
# the same name given to AdeftClassifier and grounding_dict). The previous
# name 'LAK' was a leftover from another model.
disamb.dump('BCR', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
d = load_disambiguator('ARG', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [41]:
model_to_s3(disamb)

In [1]:
import pandas as pd

In [2]:
# Load the two score tables from the results directory.
score_files = (
    '../results/light_kinase_scores_improved.csv',
    '../results/light_kinase_scores_improved2.csv',
)
df1, df2 = [pd.read_csv(path, sep=',') for path in score_files]

In [4]:
df = pd.concat([df1, df2])

In [13]:
# Show the CSNK1A1 rows that have 3 statements and 135 Entrez entries.
df[
    df['gene_name'].eq('CSNK1A1')
    & df['num_stmts'].eq(3)
    & df['num_entrez'].eq(135)
]

Unnamed: 0,gene_name,agent_text,num_stmts,num_entrez,CI_lower,CI_upper,adeft_lower,adeft_upper,theta_lower,theta_upper
9313,CSNK1A1,GSK-3 and CK1,3,135,0.0,1.0,0.0,1.0,0.48036,0.51964
9334,CSNK1A1,CK1α and GSK3β,3,135,0.399942,1.0,0.0,1.0,0.924739,0.939923
9351,CSNK1A1,"GSK3β, CK1α and Dyrk1A",3,135,0.0,1.0,0.0,1.0,0.0,1.0
9429,CSNK1A1,GRK2 and CK1α,3,135,0.0,1.0,0.0,1.0,0.0,1.0
