In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [2]:
# Shortforms to disambiguate, plus the genes and families whose PMIDs (looked
# up in entrez_all_pmids.json) are used by a later cell to collect texts.
shortforms = ['IR']
genes = []
families = {}
# `groundings` must stay index-aligned with `genes`: a later cell pairs the two
# lists with zip(genes, groundings).
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # Repeat the family grounding once per member. Appending it only once
    # would leave `groundings` shorter than `genes` and mislabel or drop
    # members in the zip.
    groundings.extend([f'FPLX:{family}'] * len(members))
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
genes

[]

In [3]:
# Collect (extracted text, grounding) pairs for every gene that has an entry in
# all_pmids, and remember which content references were used.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    if gene not in all_pmids:
        # No PMIDs recorded for this gene; nothing to fetch
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    # filter(None, ...) drops empty/missing text content
    entrez_texts.extend(
        (universal_extract_text(text), grounding)
        for text in filter(None, content.values())
    )
    entrez_refs.update(content.keys())

In [4]:
# Build one AdeftMiner per shortform from text content fetched through the
# indra_db helpers, skipping references already collected in entrez_refs.
miners = {}
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [
        universal_extract_text(text, contains=shortforms)
        for ref, text in content.items()
        if text and ref not in entrez_refs
    ]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # Accumulate the distinct texts seen across all shortforms
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [36]:
# Partition shortform_texts into consecutive batches of at most n texts each
n = 1000
texts = [shortform_texts[start:start + n]
         for start in range(0, len(shortform_texts), n)]

In [38]:
# One independent 'IR' miner per batch in `texts`
miners = [AdeftMiner('IR') for _ in texts]

In [39]:
len(miners)

11

In [40]:
# Feed each batch to its own miner. When `texts` comes from the batching cell
# above, `text` is a whole batch (a list slice of shortform_texts), not a
# single document.
for miner, text in zip(miners, texts):
    miner.process_texts(text)

In [41]:
%time out = compose(*miners)

CPU times: user 6.02 s, sys: 103 ms, total: 6.12 s
Wall time: 6.17 s


In [43]:
miner0 = AdeftMiner('IR')

In [44]:
miner0.process_texts(shortform_texts)

In [50]:
%memit


peak memory: 1000.41 MiB, increment: -0.07 MiB


In [8]:
miner1.process_texts(texts1)
miner2.process_texts(texts2)

In [9]:
from adeft.discover import compose

CPU times: user 98.5 ms, sys: 2.52 ms, total: 101 ms
Wall time: 101 ms


In [10]:
miner1.top()

[('epithelial cells', 126.4090909090909),
 ('thymic epithelial cells', 120.23333333333333),
 ('cells', 80.7910447761194),
 ('tubular epithelial cells', 61.470588235294116),
 ('tumor endothelial cells', 24.999999999999996),
 ('endothelial cells', 21.80952380952381),
 ('the transcription elongation complex', 20.307692307692307),
 ('transcription elongation complex', 19.526315789473685),
 ('renal tubular epithelial cells', 17.176470588235293),
 ('thyroid epithelial cells', 11.692307692307692),
 ('human thymic epithelial cells', 11.333333333333334),
 ('elongation complex', 10.363636363636367),
 ('of thymic epithelial cells', 9.454545454545455),
 ('of the transcription elongation complex', 7.777777777777778),
 ('triethyl citrate', 6.75),
 ('proximal tubular epithelial cells', 5.714285714285714),
 ('to the transcription elongation complex', 5.428571428571429),
 ('a thymic epithelial cells line', 5.0),
 ('in thymic epithelial cells', 4.666666666666667),
 ('of tumor endothelial cells', 4.66666

In [14]:
miner1.top()

[('epithelial cells', 126.4090909090909),
 ('thymic epithelial cells', 120.23333333333333),
 ('cells', 80.7910447761194),
 ('tubular epithelial cells', 61.470588235294116),
 ('tumor endothelial cells', 25.0),
 ('endothelial cells', 21.80952380952381),
 ('the transcription elongation complex', 20.307692307692307),
 ('transcription elongation complex', 19.526315789473685),
 ('renal tubular epithelial cells', 17.176470588235293),
 ('thyroid epithelial cells', 11.692307692307692),
 ('human thymic epithelial cells', 11.333333333333336),
 ('elongation complex', 10.363636363636367),
 ('of thymic epithelial cells', 9.454545454545455),
 ('of the transcription elongation complex', 7.777777777777778),
 ('triethyl citrate', 6.75),
 ('proximal tubular epithelial cells', 5.714285714285714),
 ('to the transcription elongation complex', 5.428571428571429),
 ('a thymic epithelial cells line', 5.0),
 ('in thymic epithelial cells', 4.666666666666667),
 ('of tumor endothelial cells', 4.666666666666667),
 

In [12]:
miners['TEC'].top()

[('epithelial cells', 126.4090909090909),
 ('thymic epithelial cells', 120.23333333333333),
 ('cells', 80.7910447761194),
 ('tubular epithelial cells', 61.470588235294116),
 ('tumor endothelial cells', 25.0),
 ('endothelial cells', 21.80952380952381),
 ('the transcription elongation complex', 20.307692307692307),
 ('transcription elongation complex', 19.526315789473685),
 ('renal tubular epithelial cells', 17.176470588235293),
 ('thyroid epithelial cells', 11.692307692307692),
 ('human thymic epithelial cells', 11.333333333333334),
 ('elongation complex', 10.363636363636367),
 ('of thymic epithelial cells', 9.454545454545455),
 ('of the transcription elongation complex', 7.777777777777778),
 ('triethyl citrate', 6.75),
 ('proximal tubular epithelial cells', 5.714285714285714),
 ('to the transcription elongation complex', 5.428571428571429),
 ('a thymic epithelial cells line', 5.0),
 ('in thymic epithelial cells', 4.666666666666667),
 ('of tumor endothelial cells', 4.666666666666667),
 

In [13]:
miner3.top()

[('epithelial cells', 126.4090909090909),
 ('thymic epithelial cells', 120.23333333333333),
 ('cells', 80.7910447761194),
 ('tubular epithelial cells', 61.470588235294116),
 ('tumor endothelial cells', 25.0),
 ('endothelial cells', 21.80952380952381),
 ('the transcription elongation complex', 20.307692307692303),
 ('transcription elongation complex', 19.526315789473685),
 ('renal tubular epithelial cells', 17.176470588235293),
 ('thyroid epithelial cells', 11.692307692307692),
 ('human thymic epithelial cells', 11.333333333333336),
 ('elongation complex', 10.363636363636367),
 ('of thymic epithelial cells', 9.454545454545455),
 ('of the transcription elongation complex', 7.777777777777778),
 ('triethyl citrate', 6.75),
 ('proximal tubular epithelial cells', 5.714285714285714),
 ('to the transcription elongation complex', 5.428571428571429),
 ('a thymic epithelial cells line', 5.0),
 ('in thymic epithelial cells', 4.666666666666667),
 ('of tumor endothelial cells', 4.666666666666667),
 

In [None]:
print(len(shortform_texts))

In [14]:
top = miners['TEC'].top()

In [15]:
top

[('epithelial cells', 126.4090909090909),
 ('thymic epithelial cells', 120.23333333333333),
 ('cells', 80.7910447761194),
 ('tubular epithelial cells', 61.470588235294116),
 ('tumor endothelial cells', 25.0),
 ('endothelial cells', 21.80952380952381),
 ('the transcription elongation complex', 20.3076923076923),
 ('transcription elongation complex', 19.526315789473685),
 ('renal tubular epithelial cells', 17.176470588235297),
 ('thyroid epithelial cells', 11.692307692307692),
 ('human thymic epithelial cells', 11.333333333333334),
 ('elongation complex', 10.363636363636367),
 ('of thymic epithelial cells', 9.454545454545453),
 ('of the transcription elongation complex', 7.777777777777778),
 ('triethyl citrate', 6.75),
 ('proximal tubular epithelial cells', 5.714285714285714),
 ('to the transcription elongation complex', 5.428571428571429),
 ('a thymic epithelial cells line', 5.0),
 ('in thymic epithelial cells', 4.666666666666667),
 ('of tumor endothelial cells', 4.666666666666667),
 ('

In [13]:
from copy import deepcopy

In [14]:
deepcopy(miner2)

<adeft.discover.AdeftMiner at 0x134267a10>

In [10]:
top[-1]

('pure ppc and pure phb had a t g at 60 21 and 0 ° c respect if the plasticizers content',
 0.0)

In [6]:
top

[('epithelial cells', 126.4090909090909),
 ('thymic epithelial cells', 120.23333333333333),
 ('cells', 80.7910447761194),
 ('tubular epithelial cells', 61.470588235294116),
 ('tumor endothelial cells', 25.0),
 ('endothelial cells', 21.80952380952381),
 ('the transcription elongation complex', 20.307692307692307),
 ('transcription elongation complex', 19.526315789473685),
 ('renal tubular epithelial cells', 17.176470588235297),
 ('thyroid epithelial cells', 11.692307692307692),
 ('human thymic epithelial cells', 11.333333333333334),
 ('elongation complex', 10.363636363636367),
 ('of thymic epithelial cells', 9.454545454545453),
 ('of the transcription elongation complex', 7.777777777777778),
 ('triethyl citrate', 6.75),
 ('proximal tubular epithelial cells', 5.714285714285714),
 ('to the transcription elongation complex', 5.428571428571429),
 ('a thymic epithelial cells line', 5.0),
 ('in thymic epithelial cells', 4.666666666666667),
 ('of tumor endothelial cells', 4.666666666666667),
 

In [8]:
%time miners['TEC'].compute_alignment_scores()

CPU times: user 29.7 ms, sys: 1.9 ms, total: 31.6 ms
Wall time: 30.6 ms


In [9]:
miners['TEC'].get_longforms(scale=True, cutoff=-0.2)

[('thymic epithelial cells', 0.9233072916666667),
 ('tubular epithelial cells', 0.8231209150326797),
 ('tumor endothelial cells', 0.7479091995221027),
 ('thyroid epithelial cells', 0.6490384615384616),
 ('the transcription elongation complex', 0.6206896551724138),
 ('triethyl citrate', 0.5),
 ('a thymic epithelial cells line', 0.4444444444444444),
 ('thromboembolic complications', 0.2857142857142857),
 ('thyrocyte', 0.275),
 ('thermal expansion coefficient', 0.275),
 ('total electron content', 0.2592592592592593),
 ('ternary elongation complex', 0.2592592592592593),
 ('total erythrocyte count', 0.16666666666666666),
 ('tissue engineered construct', 0.16666666666666666)]

In [7]:
miners['TEC'].get_longforms(scale=True, cutoff=0.0)

[('thymic epithelial cells', 0.9315104166666667),
 ('tubular epithelial cells', 0.8398692810457516),
 ('tumor endothelial cells', 0.7741935483870968),
 ('thyroid epithelial cells', 0.6682692307692307),
 ('the transcription elongation complex', 0.6657824933687002),
 ('triethyl citrate', 0.5227272727272727),
 ('a thymic epithelial cells line', 0.4444444444444444),
 ('total electron content', 0.3333333333333333),
 ('ternary elongation complex', 0.3333333333333333),
 ('thyrocyte', 0.325),
 ('thermal expansion coefficient', 0.325),
 ('thromboembolic complications', 0.2857142857142857),
 ('count', 0.2222222222222222),
 ('tissue engineered construct', 0.16666666666666666),
 ('coli', 0.055555555555555546),
 ('cooler', 0.055555555555555546)]

In [8]:
miners['TEC'].get_longforms(scale=True, smoothing_param=10)

[('thymic epithelial cells', 0.8898009950248756),
 ('tubular epithelial cells', 0.77526395173454),
 ('tumor endothelial cells', 0.6486486486486487),
 ('the transcription elongation complex', 0.5516483516483516),
 ('triethyl citrate', 0.3382352941176471),
 ('a thymic epithelial cells line', 0.26666666666666666),
 ('total electron content', 0.2),
 ('ternary elongation complex', 0.2),
 ('thyrocyte', 0.18571428571428572),
 ('thermal expansion coefficient', 0.18571428571428572),
 ('thromboembolic complications', 0.15384615384615385),
 ('count', 0.13333333333333333),
 ('tissue engineered construct', 0.08333333333333333),
 ('coli', 0.027777777777777773),
 ('cooler', 0.027777777777777773)]

In [10]:
%time miners['TEC'].compute_alignment_scores()

CPU times: user 253 ms, sys: 18.6 ms, total: 271 ms
Wall time: 285 ms


In [84]:
grounding_dict['TEC']

{'ternary elongation complex': 'transcription elongation complex',
 'thermal expansion coefficient': 'ungrounded',
 'thromboembolic complications': 'ungrounded',
 'thymic epithelial cells': 'NCIT:C33771',
 'thymic epithelial cells line': 'NCIT:C33771',
 'thyrocyte': 'MESH:D000072637',
 'thyroid epithelial cells': 'MESH:D000072637',
 'tissue engineered construct': 'ungrounded',
 'total electron content': 'ungrounded',
 'tracheal epithelial cells': 'ungrounded',
 'transcription elongation complex': 'transcription elongation complex',
 'triethyl citrate': 'PUBCHEM:6506',
 'tubular epithelial cells': 'NCIT:C61147',
 'tumor associated endothelial cells': 'NCIT:C37088',
 'tumor endothelial cells': 'NCIT:C37088'}

In [12]:
longforms0 = [x for i, x in enumerate(longforms0) if i in [2, 4, 5, 6, 8, 9]]

In [13]:
longforms0

[('triethyl citrate', 6.75),
 ('total electron content', 4.0),
 ('thyrocyte', 3.6),
 ('thermal expansion coefficient', 3.6),
 ('thromboembolic complications', 3.0),
 ('tissue engineered construct', 2.0)]

In [14]:
top

[('tissue engineered chamber', 1),
 ('a ternary elongation complex', 1),
 ('rat thymic epithelial cells', 1),
 ('the traumatic experiments checklist', 1),
 ('a human thymic epithelial cells', 1),
 ('four efficiency translocating escherichia coli', 1),
 ('human endocrine thyroid epithelial cells', 1),
 ('exposure of renal tubular epithelial cells', 1),
 ('here we reported a thermal elastomer composite', 1),
 ('in autoimmune thyroid disease thyroid epithelial cells', 1),
 ('increased evidence indicating that tumor endothelial cells', 1),
 ('it has been proposed elsewhere that thyrocyte', 1),
 ('myeloablative conditioning results in thymic epithelial cells', 1),
 ('phenotype changes in the tubular epithelial cells', 1),
 ('the p63 gene regulation thymic epithelial cells', 1),
 ('despite the essential role of thymic epithelial cells', 1),
 ('in various inflammatory kidney disease tubular epithelial cells', 1),
 ('the susceptibility or resistance of tubular epithelial cells', 1),
 ('to eval

In [24]:
top[32]

('thymic epithelial cells line', 3.25)

In [15]:
longforms0.extend([(longform, score) for i, (longform, score) in enumerate(top)
                   if i in [1, 3, 4, 7, 9, 20, 21, 29, 32]])

In [16]:
longforms0

[('triethyl citrate', 6.75),
 ('total electron content', 4.0),
 ('thyrocyte', 3.6),
 ('thermal expansion coefficient', 3.6),
 ('thromboembolic complications', 3.0),
 ('tissue engineered construct', 2.0),
 ('thymic epithelial cells', 120.23333333333333),
 ('tubular epithelial cells', 61.47058823529413),
 ('tumor endothelial cells', 25.0),
 ('transcription elongation complex', 19.526315789473685),
 ('thyroid epithelial cells', 11.692307692307692),
 ('tumor associated endothelial cells', 4.6),
 ('ternary elongation complex', 4.0),
 ('tracheal epithelial cells', 3.6),
 ('thymic epithelial cells line', 3.25)]

In [17]:
longforms0.sort(key=lambda x: -x[1])

In [18]:
longforms, scores = zip(*longforms0)

In [19]:
longforms

('thymic epithelial cells',
 'tubular epithelial cells',
 'tumor endothelial cells',
 'transcription elongation complex',
 'thyroid epithelial cells',
 'triethyl citrate',
 'tumor associated endothelial cells',
 'total electron content',
 'ternary elongation complex',
 'thyrocyte',
 'thermal expansion coefficient',
 'tracheal epithelial cells',
 'thymic epithelial cells line',
 'thromboembolic complications',
 'tissue engineered construct')

In [48]:
# Seed the grounding GUI with automatic groundings: each longform for which
# gilda_ground returns a truthy first element is mapped to a
# '<first>:<second>' label, and that label is mapped to the third element.
grounding_map = {}
names = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if not grounding[0]:
        # No grounding found for this longform; leave it out of the map
        continue
    label = f'{grounding[0]}:{grounding[1]}'
    grounding_map[longform] = label
    names[label] = grounding[2]

In [49]:
# NOTE(review): pos_labels is read on the right-hand side, but no cell above
# this one in the visible notebook assigns it. On a fresh kernel this raises
# NameError unless pos_labels is defined first - confirm the intended initial
# value.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names, pos_labels=pos_labels)

In [37]:
TEC_pos_labels = ['MESH:D000072637',
 'NCIT:C33771',
 'NCIT:C37088',
 'NCIT:C61147',
 'PUBCHEM:6506']

In [34]:
TEC_names = {'transcription elongation complex': 'transcription elongation complex',
 'NCIT:C33771': 'Thymic Epithelial Cell',
 'MESH:D000072637': 'Thyroid Epithelial Cells',
 'PUBCHEM:6506': 'Triethyl citrate',
 'NCIT:C61147': 'Renal Tubular Epithelial Cell',
 'NCIT:C37088': 'Neoplastic Endothelial Cell'}

In [31]:
TEC_grounding_map = {'ternary elongation complex': 'transcription elongation complex',
 'thermal expansion coefficient': 'ungrounded',
 'thromboembolic complications': 'ungrounded',
 'thymic epithelial cells': 'NCIT:C33771',
 'thymic epithelial cells line': 'NCIT:C33771',
 'thyrocyte': 'MESH:D000072637',
 'thyroid epithelial cells': 'MESH:D000072637',
 'tissue engineered construct': 'ungrounded',
 'total electron content': 'ungrounded',
 'tracheal epithelial cells': 'ungrounded',
 'transcription elongation complex': 'transcription elongation complex',
 'triethyl citrate': 'PUBCHEM:6506',
 'tubular epithelial cells': 'NCIT:C61147',
 'tumor associated endothelial cells': 'NCIT:C37088',
 'tumor endothelial cells': 'NCIT:C37088'}

In [38]:
top = miners['TECs'].top()

In [40]:
longforms0 = miners['TECs'].get_longforms()

In [41]:
list(enumerate(longforms0))

[(0, ('epithelial cells', 54.734693877551024)),
 (1, ('transcription elongation complexes', 4.0)),
 (2, ('tumor ecs', 2.0))]

In [42]:
longforms0 = [x for i, x in enumerate(longforms0) if i not in [0]]

In [43]:
list(enumerate(top))

[(0, ('epithelial cells', 54.734693877551024)),
 (1, ('thymic epithelial cells', 52.111111111111114)),
 (2, ('tubular epithelial cells', 29.35483870967742)),
 (3, ('cells', 26.707964601769916)),
 (4, ('of thymic epithelial cells', 10.666666666666666)),
 (5, ('endothelial cells', 8.166666666666666)),
 (6, ('of tubular epithelial cells', 5.142857142857142)),
 (7, ('transcription elongation complexes', 4.0)),
 (8, ('in tubular epithelial cells', 3.0)),
 (9, ('renal tubular epithelial cells', 3.0)),
 (10, ('tumor endothelial cells', 2.8)),
 (11, ('that tumour endothelial cells', 2.5)),
 (12, ('tumor ecs', 2.0)),
 (13, ('by thymic epithelial cells', 2.0)),
 (14, ('of tumor endothelial cells', 2.0)),
 (15, ('primary tubular epithelial cells', 2.0)),
 (16,
  ('ns changed similarly between doxycycline dox treated and hypoxia exposed tubular epithelial cells',
   2)),
 (17,
  ('hif 2α activation affects expression of various genes in cultured primary tubular epithelial cells',
   2)),
 (18, ('e

In [44]:
longforms0.extend([(longform, score) for i, (longform, score) in enumerate(top)
                   if i in [1, 2, 23, 24, 25, 30]])

In [46]:
longforms0.sort(key=lambda x: -x[1])

In [47]:
longforms, scores = zip(*longforms0)

In [60]:
TECs_grounding_map = {'thymic epithelial cells': 'NCIT:C33771',
 'thymus epithelial cells': 'NCIT:C33771',
 'tissue engineered constructs': 'ungrounded',
 'transcription elongation complexes': 'transcription elongation complex',
 'tubular epithelial cells': 'NCIT:C61147',
 'tubule epithelial cells': 'NCIT:C61147',
 'tumor associated endothelial cells': 'NCIT:C37088',
 'tumor ecs': 'ungrounded'}

In [61]:
TECs_names = {'NCIT:C33771': 'Thymic Epithelial Cell',
 'transcription elongation complex': 'transcription elongation complex',
 'NCIT:C61147': 'Renal Tubular Epithelial Cell',
 'NCIT:C37088': 'Neoplastic Endothelial Cell'}

In [62]:
TECs_pos_labels = ['NCIT:C33771', 'NCIT:C37088', 'NCIT:C61147']

In [63]:
grounding_dict = {'TEC': TEC_grounding_map, 'TECs': TECs_grounding_map}

In [64]:
names = TECs_names.copy()

In [65]:
names.update(TEC_names.copy())

{'NCIT:C33771': 'Thymic Epithelial Cell',
 'transcription elongation complex': 'transcription elongation complex',
 'NCIT:C61147': 'Renal Tubular Epithelial Cell',
 'NCIT:C37088': 'Neoplastic Endothelial Cell',
 'MESH:D000072637': 'Thyroid Epithelial Cells',
 'PUBCHEM:6506': 'Triethyl citrate',
 'HGNC:11719': 'TEC'}

In [67]:
# Union of the positive labels of both shortforms. Sorted so the list has the
# same order on every run; iterating a set of strings directly can yield a
# different order between interpreter sessions.
pos_labels = sorted(set(TEC_pos_labels) | set(TECs_pos_labels))

In [68]:
# Register HGNC:11719 (named 'TEC') as an additional positive label. The guard
# keeps re-runs of this cell from appending a duplicate entry to pos_labels.
names['HGNC:11719'] = 'TEC'
if 'HGNC:11719' not in pos_labels:
    pos_labels.append('HGNC:11719')

In [70]:
classifier = AdeftClassifier(['TEC', 'TECs'], pos_labels=pos_labels)

In [71]:
param_grid = {'C': [100.0], 'max_features': [10000]}

In [72]:
labeler = AdeftLabeler(grounding_dict)

In [85]:
corpus = labeler.build_from_texts(all_texts)

In [86]:
corpus.extend(entrez_texts)

In [87]:
texts, labels = zip(*corpus)

In [88]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-24 10:16:36] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-24 10:17:04] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9450482175776157 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [89]:
classifier.stats

{'label_distribution': {'NCIT:C33771': 190,
  'transcription elongation complex': 46,
  'NCIT:C61147': 107,
  'ungrounded': 20,
  'MESH:D000072637': 14,
  'NCIT:C37088': 18,
  'PUBCHEM:6506': 4,
  'HGNC:11719': 66},
 'f1': {'mean': 0.9450482175776157, 'std': 0.019430760004612618},
 'precision': {'mean': 0.9320960087893695, 'std': 0.021874467817276286},
 'recall': {'mean': 0.9648233992650994, 'std': 0.014966741748880642}}

In [90]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [100]:
d = disamb.disambiguate(all_texts)

In [101]:
# Keep the (text, prediction) pairs whose first prediction element starts
# with 'HGNC'
a = [(text, pred)
     for text, pred in zip(all_texts, d)
     if pred[0].startswith('HGNC')]

In [102]:
len(a)

73

In [107]:
a[4]

("Recent findings indicate that the link between PI3K and STAT3 phosphorylation is mediated by TEC family kinases ( Vogt and Hart, 2011 ). LMP2A increases the activation of the TEC family kinase, Bruton's tyrosine kinase (BTK) ( Mano, 1999; Merchant and Longnecker, 2001 ), suggesting that this kinase may be the link connecting PI3K activation and STAT3 phosphorylation in LMP2A-expressing cell lines. If LMP2A activates BTK to enhance STAT3 phosphorylation, then a BTK inhibitor would block the LMP2A-mediated increase in STAT3 phosphorylation. To test this possibility, LMP2A-negative and –positive B cell lines were exposed to the BTK inhibitor, Ibrutinib, for 24 h and STAT3 phosphorylation was analyzed by Western blot analysis. As shown in  Fig. 3 A, the addition of Ibrutinib blocked the LMP2A-dependent phosphorylation of STAT3, without decreasing STAT3 phosphorylation in non-LMP2A-expressing cells. Furthermore, since Ibrutinib decreased STAT3 phosphorylation in LMP2A-expressing cells, we

In [109]:
disamb.dump('TEC:TEC_S', '../results')

In [79]:
from adeft.disambiguate import load_disambiguator

In [None]:
d = load_disambiguator('TEC:TEC_S', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [108]:
model_to_s3(disamb)

In [80]:
d = load_disambiguator('TEC')

In [82]:
print(d.info())

Disambiguation model for TEC

Produces the disambiguations:
	Neoplastic Endothelial Cell*	NCIT:C37088
	Renal Tubular Epithelial Cell*	NCIT:C61147
	TEK*	HGNC:11724
	Thymic Epithelial Cell*	NCIT:C33771
	Thyroid Epithelial Cells*	MESH:D000072637
	Triethyl citrate*	PUBCHEM:6506
	transcription elongation complex	transcription elongation complex

Training data had class balance:
	Thymic Epithelial Cell*	115
	Ungrounded	66
	Renal Tubular Epithelial Cell*	64
	transcription elongation complex	34
	Neoplastic Endothelial Cell*	16
	Ungrounded	15
	Thyroid Epithelial Cells*	14
	Triethyl citrate*	4

Classification Metrics:
	F1 score:	0.92136
	Precision:	0.91935
	Recall:		0.93926

* Positive labels
See Docstring for explanation



In [110]:
from adeft import available_shortforms

In [111]:
available_shortforms

{'PC': 'PC',
 'DOG1': 'DOG1',
 'TEK': 'TEK',
 'EMT': 'EMT',
 'SP': 'SP',
 'PCS': 'PCS',
 'CKI': 'CKI',
 'PE': 'PE',
 'RET': 'RET',
 'TEC': 'TEC',
 'ROS': 'ROS',
 'NP': 'NP:NP_S',
 'NPs': 'NP:NP_S',
 'MS': 'MS',
 'MT': 'MT',
 'BP': 'BP',
 'GH': 'GH',
 'AD': 'AD',
 'GT': 'GT',
 'SNS': 'SNS',
 'GARP': 'GARP',
 'DA': 'DA',
 'BCR': 'BCR',
 'GR': 'GR',
 'HIR': 'HIR',
 'IR': 'IR',
 'HK2': 'HK2',
 'ARF': 'ARF',
 'CS': 'CS',
 'UFO': 'UFO',
 'LAK': 'LAK',
 'EC': 'EC',
 'ARG': 'ARG',
 'MOS': 'MOS',
 'STD': 'STD',
 'PD1': 'PD1',
 'TGH': 'TGH',
 'PKD': 'PKD',
 'MIP': 'MIP',
 'RA': 'RA',
 'PCP': 'PCP',
 'EAG': 'EAG',
 'PI': 'PI',
 'PS': 'PS',
 'PA': 'PA',
 'MB': 'MB',
 'HA': 'HA',
 'FES': 'FES',
 'AR': 'AR',
 'HR': 'HR',
 'NE': 'NE',
 'UBC': 'UBC',
 'GSC': 'GSC',
 'AA': 'AA',
 'FER': 'FER',
 'NIS': 'NIS',
 'GC': 'GC',
 'DLK': 'DLK',
 'HK1': 'HK1',
 'CM': 'CM',
 'RB': 'RB:R_B',
 'Rb': 'RB:R_B',
 'LH': 'LH',
 'ER': 'ER',
 'TF': 'TF',
 'PGP': 'PGP',
 'GCA': 'GCA',
 'RK': 'RK',
 'RAC': 'RAC',
 'MCT': 'M

In [12]:
def p(s, n, alpha, beta):
    """Return the smoothed ratio (s + alpha) / (n + alpha + beta)."""
    numerator = s + alpha
    denominator = n + alpha + beta
    return numerator / denominator

In [17]:
p(6, 7, 1, 10)

0.3888888888888889

In [51]:
import re

In [43]:
x.defining_pattern_pattern = r'(?<=\()\s?\w+(?=\s?\))' 

In [42]:
re.findall(pattern, 'xxx ( ssdff ) b1a aaa (sdf ooss) xxosdf (2d9dff sssj) ssdfdfff (299 sss s)')

error: unbalanced parenthesis at position 64

In [259]:
_258.groups()

(' ssdff', None)

In [156]:
_104.groups()

('aaaaaaaaaaa)', 'aaaaaaaaaaa)')

In [6]:
from adeft_indra.mine import MiningOperation

In [7]:
x = MiningOperation('a', [1, 2, 3, 4, 5])


In [8]:
len(shortform_texts)

10527

In [8]:
shortforms = x.find_shortforms(shortform_texts)

In [11]:
'TEC' in shortforms

False

In [16]:
x = [text for text in shortform_texts if '(HEL 299)' in text]

In [18]:
x[0]

'Apoptosis is a key mechanism for enhanced cellular radiosensitivity in radiation therapy. Studies suggest that Akt signaling may play a role in apoptosis and radioresistance. This study evaluates the possible modulating role of amiloride, an antihypertensive agent with a modulating effect to alternative splicing for regulating apoptosis, in the antiproliferative effects induced by ionizing radiation (IR) in glioblastoma multiforme (GBM) 8401 cells. Analysis of cell viability showed that amiloride treatment significantly inhibited cell proliferation in irradiated GBM8401 cells (p<0.05) in a time-dependent manner, especially in cells treated with amiloride with IR post-treatment. In comparison with GBM8401 cells treated with amiloride alone, with GBM8401 cells treated with IR alone, and with human embryonic lung fibroblast control cells (HEL 299), GBM8401 cells treated with IR combined with amiloride showed increased overexpression of phosphorylated Akt, regardless of whether IR treatme

In [18]:
# '|' has the lowest precedence, so this pattern reads as '^(Figure)' OR
# '(Table)', not '^(Figure|Table)'. re.match only matches at position 0, so
# both branches are effectively anchored to the start here.
re.match('^(Figure)|(Table)', 'Table5 5')

<re.Match object; span=(0, 5), match='Table'>

In [15]:
import re

In [19]:
't'.startswith?

Object `startswith` not found.


In [None]:
't'.startswith

In [20]:
x = 'assdf'

In [21]:
x.startswith?

In [36]:
pattern2 = r'(^a.*b$)'

In [40]:
re.match(pattern2, 'abbb')

In [10]:
t = 'asddf'

In [11]:
t.split()

['asddf']

In [12]:
shortforms

Counter({'56': 6,
         '38': 22,
         '57': 16,
         '50': 13,
         '58': 6,
         '274': 2,
         '275': 1,
         '276': 1,
         '277': 1,
         '278': 1,
         '1971': 5,
         '1961': 1,
         '1970': 5,
         '1965': 1,
         '1968': 1,
         '25': 17,
         '26': 28,
         '12': 21,
         'αCTDs': 1,
         '16': 33,
         '17': 33,
         '30': 25,
         '24': 16,
         '21': 37,
         '32': 20,
         '11': 23,
         '33': 13,
         '35': 22,
         '42': 12,
         '3betaHSD': 1,
         'Δƒ': 1,
         '27': 25,
         '29': 17,
         '18': 32,
         '13': 20,
         '82': 4,
         '47': 12,
         '91': 3,
         '23': 22,
         '10': 22,
         '15': 28,
         '20': 38,
         '7PN': 2,
         '28': 32,
         '34': 23,
         '22': 21,
         '45': 17,
         '39': 9,
         '31': 19,
         '54': 2,
         '55': 4,
         '63': 9,
         

In [20]:
from adeft_indra.ground import gilda_ground

In [37]:
gilda_ground('RT')

(None, None, None)

In [38]:
t = [x for x in shortforms if gilda_ground(x)[0]]

In [40]:
len(t)

1511

In [48]:
len(shortforms)

15094

In [46]:
shortforms

{'p53K382me2',
 'CLox',
 'BP',
 'FOXO',
 'T5',
 'C2',
 'Ink4a',
 'Fucoidan',
 'unstable',
 'PDX',
 'EBP',
 'Vo',
 'IFN',
 'LAMA3',
 'CFHR1',
 'Sen',
 'blue',
 'IGFRKO',
 'EDP',
 'ischaemia',
 'mRNA',
 'gray',
 'CMs',
 'TEC1',
 'TRPV4',
 'TUNEL',
 'HOMA',
 'CDSs',
 'US7575746',
 'PAG',
 'arrow',
 'tyr',
 'SSC',
 'TBARS',
 'rs2943641C',
 'mHRE',
 'DHPAA',
 'DN',
 'dmso',
 'AVOs',
 'HCT',
 'below',
 'XO',
 'fibrosis',
 'S1',
 'ECLomas',
 'IMRT',
 'RBODD',
 'EVLP',
 'ANP',
 'FAA',
 'tm764',
 'ATP6V1A',
 'Rarβ2',
 'Apc3B',
 'MYH',
 'EWS',
 'HQIndol',
 'LEDs',
 'P2W15O56',
 'STR',
 'IPostC',
 '1JT0',
 'SEM',
 'HTRIdb',
 'TGs',
 'GHSRs',
 'Amersham',
 's5',
 'AQP5',
 'MGO',
 'RISK',
 'NASH',
 'waf1',
 'RA',
 'AGO2',
 'LMECs',
 'PTU',
 'γH2AX',
 'HR',
 'BRCA1',
 'OVXE',
 'rs12979860',
 'MTT',
 'Nlrp3',
 'SFA',
 'MSC',
 'HSP',
 'NP',
 'CTRL',
 'N_3',
 'CEX',
 'LY294002',
 'Italy',
 'MIC',
 'Con',
 'DMT1',
 'induced',
 'TKA',
 'w1118',
 'Pro12Ala',
 'lg12501',
 'S3F',
 'CCCP',
 'major',
 'XRD',


In [49]:
shortforms

{'CYP450',
 'RE',
 'before',
 'TOB1',
 'bR',
 'C2',
 'RAL',
 'pDC',
 'unstable',
 'retinal',
 'CAM',
 'IFN',
 'EXO',
 'SMI',
 'GD1a',
 'FGU',
 'TT4',
 'TC48',
 'BMDSC',
 'VMAT2',
 'Wako',
 'hS527',
 'PTBPs',
 'IGFRKO',
 'BMK1',
 'NAMD',
 'IEM',
 'non',
 'VRK1',
 'ischaemia',
 'gray',
 'conf',
 'rs361072',
 'Wo',
 'GRID',
 'pChk1',
 'PID1',
 'AβPP',
 'CBX8',
 'strong',
 'No',
 'TRF',
 'ROSs',
 'HVGGSSV',
 'twofold',
 'TPS',
 'hPDSCs',
 'YK594',
 'ROR',
 'PAG',
 'SYGYHQDEER',
 'wingless',
 'TBARS',
 'turmeric',
 'maps',
 'DN',
 'pHBSP',
 'partial',
 'sTIE2',
 'below',
 'PolIV',
 'sEPSC',
 'JMJD1C',
 'DHFR',
 'NW',
 'IMRT',
 'AN',
 'OBR',
 'NIST',
 'PGG',
 'ACs',
 '2PLSM',
 'CHN',
 'QoL',
 'SAE',
 'PKCz',
 'N913Q',
 'pERK',
 'HQIndol',
 'ARH',
 'NSRL',
 'eIF4E',
 'dmp',
 'NCPB',
 'SEM',
 'Malvern',
 'FDRs',
 'MRC5',
 'GHSRs',
 'Epb',
 'AQP5',
 'Glu744',
 'PKi',
 'FGFL',
 'CDR',
 'UNT',
 'waf1',
 'BSSE',
 'CYPs',
 'AGO2',
 'DiscoverX',
 'V143A',
 'ILC1s',
 'Hedt',
 'pic',
 '18F',
 'Nlrp3',

Writing large_scale_op_test.py


INFO: [2020-02-13 13:54:43] sample_op - Batch: 0
INFO: [2020-02-13 13:56:33] sample_op - Batch: 1
INFO: [2020-02-13 13:58:13] sample_op - Batch: 2
INFO: [2020-02-13 13:59:51] sample_op - Batch: 3
INFO: [2020-02-13 14:01:32] sample_op - Batch: 4





In [4]:
%load_ext memory_profiler

In [9]:
%time op.mine()

INFO: [2020-02-13 14:20:55] sample_op - Batch: 0
INFO: [2020-02-13 14:20:59] sample_op - Batch: 1
INFO: [2020-02-13 14:21:03] sample_op - Batch: 2
INFO: [2020-02-13 14:21:06] sample_op - Batch: 3
INFO: [2020-02-13 14:21:10] sample_op - Batch: 4


CPU times: user 15.1 s, sys: 1.9 s, total: 17 s
Wall time: 18 s


In [1]:
from indra_db.util.content_scripts import get_text_content_from_trids

In [3]:
get_text_content_from_trids([100172])

({100172: (100172, 'pubmed', 'text', 'abstract')},
 {(100172,
   'pubmed',
   'text',
   'abstract'): 'The aim of this paper is to evaluate whether pulse pressure is an independent risk factor for coronary and stroke mortality in 3282 subjects (1281 males and 2001 females) aged +/- 65 years, taking part in the CArdiovascular STudy in the Elderly (CASTEL). After dividing subjects into tertiles of pulse pressure, adjusted relative risk (RR) and confidence intervals (CI) for 14-year coronary and stroke mortality was evaluated for each tertile. Among females, coronary mortality rate was 2.7% in the first tertile of pulse pressure, 4.7% in the second (RR 1.38, 95% CI [1.15-2.66]) and 6.2% in the third (RR 2, CI [1.20-3.51]). Stroke mortality was 3.6%, 4.1% (RR 1.23, CI [1.02-2.23]) and 8.3% (RR 2.27, CI [1.37-3.74]), respectively. This trend was recognizable in normotensive, borderline and sustained hypertensive women, where mortality increased with rising pulse pressure. No relationship wa

In [7]:
# NOTE(review): `_3` is IPython's cached output of the cell executed as In [3]
# (the get_text_content_from_trids call above). It only exists if that cell ran
# with that execution count, so this line does not survive Restart & Run All.
# Takes the first value of the second element of that output.
text = list(_3[1].values())[0]

In [13]:
op.find_shortforms(text)

{'CASTEL', 'CI', 'RR'}

In [15]:
miner1 = AdeftMiner('CASTEL')

In [16]:
miner1.process_texts([text])

In [17]:
miner2 = AdeftMiner('CI')

In [18]:
miner2.process_texts([text])

In [19]:
miner3 = AdeftMiner('RR')

In [20]:
miner3.process_texts([text])

In [23]:
miner3.top()

[('e elderly castel after dividing subjects into tertiles of pulse pressure adjusted relative risk',
  1),
 ('risk', 0.0),
 ('relative risk', 0.0),
 ('adjusted relative risk', 0.0),
 ('pressure adjusted relative risk', 0.0),
 ('pulse pressure adjusted relative risk', 0.0),
 ('of pulse pressure adjusted relative risk', 0.0),
 ('tertiles of pulse pressure adjusted relative risk', 0.0),
 ('into tertiles of pulse pressure adjusted relative risk', 0.0),
 ('subjects into tertiles of pulse pressure adjusted relative risk', 0.0),
 ('dividing subjects into tertiles of pulse pressure adjusted relative risk',
  0.0),
 ('after dividing subjects into tertiles of pulse pressure adjusted relative risk',
  0.0),
 ('castel after dividing subjects into tertiles of pulse pressure adjusted relative risk',
  0.0),
 ('elderly castel after dividing subjects into tertiles of pulse pressure adjusted relative risk',
  0.0)]

In [23]:
t = {'x': {'a': 1, 'b': 2}}

In [5]:
from collections import defaultdict

In [25]:
# Two-level counter: missing outer keys get a fresh defaultdict(int), and each
# existing inner dict of `t` is copied into a defaultdict(int).
y = defaultdict(lambda: defaultdict(int))
y.update((key, defaultdict(int, value)) for key, value in t.items())

In [26]:
y['x']['c'] += 1

In [18]:
v = {'a': 1, 'b': 2}

In [20]:
s = defaultdict(int, v)

In [21]:
s['c'] += 1

In [22]:
s

defaultdict(int, {'a': 1, 'b': 2, 'c': 1})

In [27]:
y

defaultdict(<function __main__.<lambda>()>,
            {'x': defaultdict(int, {'a': 1, 'b': 2, 'c': 1})})

In [8]:
!du -h /Users/albertsteppi/msc/sample_op/

74M	/Users/albertsteppi/msc/sample_op/


In [9]:
rm -r /Users/albertsteppi/msc/sample_op/

In [13]:
3e7

30000000.0

In [14]:
3e3

3000.0

In [15]:
3000 * 45

135000

In [17]:
3000/60

50.0

SyntaxError: invalid syntax (<ipython-input-18-05b6e7515269>, line 1)

In [3]:
from sqlalchemy import text
from indra_db.util import get_primary_db