In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [2]:
shortforms = ['ARG']
genes = ['ABL2', 'RERE', 'AREG']
families = {}
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    groundings.append(f'FPLX:{family}')
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    try:
        pmids = all_pmids[gene]
    except KeyError:
        continue
    _, content = get_text_content_from_pmids(pmids)
    entrez_texts.extend([(universal_extract_text(text), grounding)
                          for text in content.values() if text])
    entrez_refs.update(content.keys())

In [4]:
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    all_texts.union(shortform_texts)

It's then necessary to check if Acromine produced the correct results. We must fix errors manually

In [5]:
top = miners['ARG'].top()

In [6]:
top

[('l arginine', 13.571428571428571),
 ('arginine', 12.478260869565217),
 ('arctigenin', 6.0),
 ('of l arginine', 4.0),
 ('amphiregulin', 1.3333333333333333),
 ('arginase', 1.0),
 ('and amphiregulin', 1.0),
 ('and arginine', 1.0),
 ('aggressive response generation', 1.0),
 ('autologous retinal graft', 1.0),
 ('herein the interaction of l arginine', 1),
 ('the therapeutic benefits of l arginine', 1),
 ('a salient aspect of maladaptive sip aggressive response generation', 1),
 ('to evaluate the surgical technique using autologous retinal graft', 1),
 ('this study was aim to examine the effect of l arginine', 1),
 ('turally and functionally related polypeptides including vaccinia growth factor vgf amphiregulin',
  1),
 ('results of in vivo study indicate dietary n carbamylglutamate ncg and arginine',
  1),
 ('we examine the related contributory role of extracellular vs intracellular l arginine',
  1),
 ('by various etiological factor the seeds derived from arctium lappa that containing arc

In [7]:
longforms0 = miners['ARG'].get_longforms()

In [8]:
longforms0

[('l arginine', 13.571428571428571),
 ('arctigenin', 6.0),
 ('amphiregulin', 1.3333333333333333)]

In [9]:
longforms0.append(top[1])

In [10]:
longforms0 = longforms0[1:]

In [45]:
longforms0.sort(key=lambda x: -x[1])

In [46]:
longforms = longforms0

In [47]:
longforms, scores = zip(*longforms)

In [48]:
longforms

('arginine', 'arctigenin', 'amphiregulin')

In [49]:
grounding_map = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if grounding[0]:
        grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'

In [50]:
grounding_map

{'arginine': 'CHEBI:CHEBI:29016', 'amphiregulin': 'HGNC:651'}

In [51]:
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map)

In [54]:
names.update({f'HGNC:{get_hgnc_id(gene)}': gene for gene in genes})
pos_labels = list(set(pos_labels) | set(f'HGNC:{get_hgnc_id(gene)}' for gene in genes))

In [55]:
result = (grounding_map, names, pos_labels)

In [56]:
result

({'amphiregulin': 'HGNC:651',
  'arctigenin': 'CHEBI:CHEBI:79',
  'arginine': 'CHEBI:CHEBI:29016'},
 {'HGNC:651': 'AREG',
  'CHEBI:CHEBI:79': '(-)-Arctigenin',
  'CHEBI:CHEBI:29016': 'arginine',
  'HGNC:77': 'ABL2',
  'HGNC:9965': 'RERE'},
 ['HGNC:9965', 'HGNC:651', 'CHEBI:CHEBI:29016', 'HGNC:77', 'CHEBI:CHEBI:79'])

In [11]:
grounding_map, names, pos_labels = ({'amphiregulin': 'HGNC:651',
  'arctigenin': 'CHEBI:CHEBI:79',
  'arginine': 'CHEBI:CHEBI:29016'},
 {'HGNC:651': 'AREG',
  'CHEBI:CHEBI:79': '(-)-Arctigenin',
  'CHEBI:CHEBI:29016': 'arginine',
  'HGNC:77': 'ABL2',
  'HGNC:9965': 'RERE'},
 ['HGNC:9965', 'HGNC:651', 'CHEBI:CHEBI:29016', 'HGNC:77', 'CHEBI:CHEBI:79'])

In [12]:
grounding_dict = {'ARG': grounding_map}

In [13]:
classifier = AdeftClassifier('ARG', pos_labels=pos_labels)

In [14]:
param_grid = {'C': [100.0], 'max_features': [10000]}

In [15]:
labeler = AdeftLabeler(grounding_dict)

In [16]:
corpus = labeler.build_from_texts(shortform_texts)

In [17]:
corpus.extend(entrez_texts)

In [18]:
texts, labels = zip(*corpus)

In [19]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-13 16:32:42] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-13 16:33:03] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.8079007829680392 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [20]:
imp = classifier.feature_importances()

In [22]:
imp['HGNC:651'][0:10]

[('areg', 1.4972089252377179),
 ('ar', 1.2208487384038054),
 ('amphiregulin', 0.8431062870077384),
 ('egfr', 0.5519674764084078),
 ('egf', 0.4501358811472624),
 ('cancer', 0.3576551829899575),
 ('cells', 0.3119225815027824),
 ('expression', 0.20636241454874607),
 ('breast', 0.18044446067764217),
 ('agr2', 0.17452552023646034)]

classifier._std

In [68]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [69]:
d = disamb.disambiguate(shortform_texts)

In [70]:
a = [text for pred, text in zip(d, shortform_texts)if pred[0] == 'HGNC:651']

In [72]:
a[2]

'The selection of patients with non-small cell lung cancer (NSCLC) for epidermal growth factor receptor (EGFR) inhibitor (EGFR-tyrosine kinase inhibitors [TKIs]) therapy is suboptimal as tumor tissue is often unavailable. Ligands of EGFR, transforming growth factor-alpha (TGFa) and amphiregulin (ARG), and the insulin-like growth factor (IGF) family have been associated with resistance to EGFR-TKIs. The aim of our study was to explore whether concentrations of these factors measured in serum were predictive of response to EGFR-TKIs. We assessed serum levels of marker candidates using enzyme-linked immunosorbent (TGFa and ARG) and chemiluminescent (IGF1 and IGF-binding protein-3) assays in 61 patients with advanced NSCLC treated with EGFR-TKIs and 63 matched advanced NSCLC control patients without EGFR-TKIs treatment. We dichotomized marker levels at the 20th, 50th, or 80th percentile and evaluated whether the effect of EGFR-TKIs treatment on disease-specific survival (DSS) differed by m

In [74]:
disamb.dump('MED', '../results')

In [188]:
from adeft.disambiguate import load_disambiguator

In [189]:
d = load_disambiguator('HIR', '../results')

In [190]:
d.disambiguate(texts[0])

('HGNC:6091',
 'INSR',
 {'HGNC:6091': 0.9495047219792215,
  'MESH:D015427': 0.03378744088190508,
  'ungrounded': 0.016707837138873322})

In [32]:
a = load_disambiguator('AR')

In [33]:
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
logit = d.classifier.estimator.named_steps['logit']

In [37]:
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')

In [89]:
model_to_s3(disamb)

In [14]:
import numpy as np

In [15]:
t = np.array([1, 1, 2, 2, 3, 3, 4])

In [16]:
np.square(t)

array([ 1,  1,  4,  4,  9,  9, 16])

In [16]:
classifier._std

matrix([[9.50383849e-06, 3.38021732e-04, 2.66352572e-06, ...,
         2.33005627e-05, 2.86057449e-06, 5.92565616e-05]])

In [19]:
coef = classifier.estimator.named_steps['logit'].coef_

In [20]:
coef.shape

(5, 10000)

In [21]:
logit = classifier.estimator.named_steps['logit']

In [22]:
logit.classes_

array(['CHEBI:CHEBI:29016', 'CHEBI:CHEBI:79', 'HGNC:651', 'HGNC:77',
       'HGNC:9965'], dtype='<U17')

In [25]:
classifier._std.shape

(10000,)

In [37]:
coef[0,:].shape

(10000,)

In [29]:
import numpy as np

In [41]:
np.squeeze(np.asarray(classifier._std))

array([9.50383849e-06, 3.38021732e-04, 2.66352572e-06, ...,
       2.33005627e-05, 2.86057449e-06, 5.92565616e-05])

In [44]:
np.squeeze(classifier._std)

matrix([[9.50383849e-06, 3.38021732e-04, 2.66352572e-06, ...,
         2.33005627e-05, 2.86057449e-06, 5.92565616e-05]])

In [45]:
classifier._std

matrix([[9.50383849e-06, 3.38021732e-04, 2.66352572e-06, ...,
         2.33005627e-05, 2.86057449e-06, 5.92565616e-05]])

In [24]:
std = np.squeeze(np.asarray(classifier._std))

NameError: name 'np' is not defined

In [49]:
sum(std * coef[0,:])

0.04813629584466277

In [50]:
t = [1, 2, 3]
v = [5, 4, 2]

In [53]:
sorted(zip(t, v), key=lambda x: -x[1])

[(1, 5), (2, 4), (3, 2)]

In [23]:
a = std

NameError: name 'std' is not defined

In [30]:
np.sqrt(classifier._std)

array([0.00308283, 0.01838537, 0.00163203, ..., 0.00482707, 0.00169132,
       0.00769783])

In [32]:
std = classifier._std

In [33]:
std.shape = (len(std), 1)

In [34]:
std

array([[9.50383849e-06],
       [3.38021732e-04],
       [2.66352572e-06],
       ...,
       [2.33005627e-05],
       [2.86057449e-06],
       [5.92565616e-05]])

In [35]:
np.sqrt(std)

array([[0.00308283],
       [0.01838537],
       [0.00163203],
       ...,
       [0.00482707],
       [0.00169132],
       [0.00769783]])