In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [2]:
# Shortform(s) to model and the genes whose PMID lists (from `all_pmids`)
# provide additional labeled texts.
shortforms = ['ALK']
genes = ['ALK', 'ACVRL1', 'TGFBR1']
# Families as {family name: [member genes]}; members are labeled FPLX:<family>.
families = {}
# `groundings` is kept parallel to `genes`: groundings[i] is the label used
# for texts retrieved for genes[i] (the two lists are later paired with zip).
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # One grounding per member keeps both lists the same length; appending a
    # single entry per family would mispair or drop members when zipped.
    groundings.extend(f'FPLX:{family}' for _ in members)
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
# Build (extracted text, grounding) pairs from the content fetched for each
# gene's PMIDs, and record every content key that was returned.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    if gene not in all_pmids:
        # No PMID list for this gene; skip it.
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    labeled = []
    for text in content.values():
        if text:
            labeled.append((universal_extract_text(text), grounding))
    entrez_texts += labeled
    entrez_refs.update(content)

In [4]:
# For each shortform: fetch the content behind statements whose agent text
# matches it, extract the text, and feed it to an AdeftMiner.
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    # Skip empty content and anything already collected in `entrez_refs`.
    shortform_texts = [
        universal_extract_text(text, contains=shortforms)
        for ref, text in content.items()
        if text and ref not in entrez_refs
    ]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # set.union() returns a new set and leaves `all_texts` unchanged;
    # update() is the in-place form that actually accumulates the texts.
    all_texts.update(shortform_texts)
# Note: after the loop `shortform_texts` holds the texts of the last shortform
# only; later cells read it directly.

Next, check whether Acromine produced the correct results. Any errors must be fixed manually.

In [5]:
top = miners['ALK'].top()

In [8]:
len(shortform_texts)

1322

In [7]:
%time miners['ALK'].compute_alignment_scores()

CPU times: user 2.01 s, sys: 86 ms, total: 2.09 s
Wall time: 2.12 s


In [198]:
longforms0 = miners['ALK'].get_longforms()

In [188]:
# Show the first three candidate longforms (the original cell was an
# unterminated subscript and could not run).
longforms0[:3]

[('anaplastic lymphoma kinase', 948.8728813559322),
 ('alkaline phosphatase', 6.75),
 ('kinase gene', 6.428571428571429)]

In [199]:
# Keep the entries at positions 0, 1, 3 and 4 and drop the rest.
# Not idempotent: re-running filters the already-filtered list again.
keep_positions = (0, 1, 3, 4)
longforms0 = [entry for pos, entry in enumerate(longforms0) if pos in keep_positions]

In [185]:
top[8]

('activin receptor like kinase', 20.454545454545457)

In [200]:
# Also add the entry at position 8 of `top` (if it exists).
longforms0 += [entry for pos, entry in enumerate(top) if pos == 8]

In [201]:
longforms0

[('anaplastic lymphoma kinase', 948.8728813559322),
 ('alkaline phosphatase', 6.75),
 ('2p23', 3.2),
 ('anaplastic lymphomakinase', 2.0),
 ('activin receptor like kinase', 20.454545454545457)]

In [202]:
# Order the (longform, score) pairs by score, highest first; the sort is
# stable, so pairs with equal scores keep their current relative order.
longforms0.sort(key=lambda pair: pair[1], reverse=True)

In [204]:
longforms, scores = zip(*longforms0)

In [205]:
longforms

('anaplastic lymphoma kinase',
 'activin receptor like kinase',
 'alkaline phosphatase',
 '2p23',
 'anaplastic lymphomakinase')

In [206]:
# Ground every longform with gilda_ground; longforms whose result has a falsy
# first element are left out of both mappings.
grounding_map, names = {}, {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if not grounding[0]:
        continue
    label = f'{grounding[0]}:{grounding[1]}'
    grounding_map[longform] = label
    names[label] = grounding[2]

In [207]:
grounding_map

{'anaplastic lymphoma kinase': 'HGNC:427',
 'activin receptor like kinase': 'MESH:D029404',
 'alkaline phosphatase': 'MESH:D000469'}

In [208]:
names

{'HGNC:427': 'ALK',
 'MESH:D029404': 'Activin Receptors',
 'MESH:D000469': 'Alkaline Phosphatase'}

In [209]:
# Pass the automatic groundings through ground_with_gui and take its returned
# grounding map, names and positive labels as the new values.
# NOTE(review): presumably an interactive step — results depend on manual input.
grounding_map, names, pos_labels = ground_with_gui(
    longforms,
    scores,
    grounding_map=grounding_map,
    names=names,
)

In [231]:
# Give every gene in `genes` a name entry and include it among the positive labels.
gene_names = {f'HGNC:{get_hgnc_id(gene)}': gene for gene in genes}
names.update(gene_names)
pos_labels = list(set(pos_labels) | set(gene_names))

In [232]:
result = (grounding_map, names, pos_labels)

In [233]:
result

({'2p23': 'HGNC:427',
  'activin receptor like kinase': 'HGNC:175',
  'alkaline phosphatase': 'MESH:D000469',
  'anaplastic lymphoma kinase': 'HGNC:427',
  'anaplastic lymphomakinase': 'HGNC:427'},
 {'HGNC:427': 'ALK',
  'HGNC:175': 'ACVRL1',
  'MESH:D000469': 'Alkaline Phosphatase',
  'HGNC:11772': 'TGFBR1'},
 ['HGNC:175', 'HGNC:11772', 'HGNC:427', 'MESH:D000469'])

In [234]:
# Hard-coded grounding results: longform -> grounding, grounding -> name,
# and the list of groundings treated as positive labels.
grounding_map = {
    '2p23': 'HGNC:427',
    'activin receptor like kinase': 'HGNC:175',
    'alkaline phosphatase': 'MESH:D000469',
    'anaplastic lymphoma kinase': 'HGNC:427',
    'anaplastic lymphomakinase': 'HGNC:427',
}
names = {
    'HGNC:427': 'ALK',
    'HGNC:175': 'ACVRL1',
    'MESH:D000469': 'Alkaline Phosphatase',
    'HGNC:11772': 'TGFBR1',
}
pos_labels = ['HGNC:175', 'HGNC:11772', 'HGNC:427', 'MESH:D000469']

In [235]:
grounding_dict = {'ALK': grounding_map}

In [236]:
classifier = AdeftClassifier('ALK', pos_labels=pos_labels)

In [237]:
# Single-point grid passed to classifier.cv: one value each for C and max_features.
param_grid = dict(C=[100.0], max_features=[10000])

In [238]:
labeler = AdeftLabeler(grounding_dict)

In [239]:
corpus = labeler.build_from_texts(shortform_texts)

In [240]:
corpus.extend(entrez_texts)

In [241]:
texts, labels = zip(*corpus)

In [242]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-10 17:03:48] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-10 17:04:31] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.926249207369229 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [243]:
classifier.stats

{'label_distribution': {'HGNC:427': 1181,
  'MESH:D000469': 6,
  'HGNC:175': 155,
  'HGNC:11772': 422},
 'f1': {'mean': 0.926249207369229, 'std': 0.015151303723191564},
 'precision': {'mean': 0.9258002623248398, 'std': 0.014684118845845862},
 'recall': {'mean': 0.9308644669977111, 'std': 0.010179655973384}}

In [244]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [245]:
d = disamb.disambiguate(shortform_texts)

In [246]:
# Collect the texts whose prediction has 'HGNC:175' as its first element.
a = []
for pred, text in zip(d, shortform_texts):
    if pred[0] == 'HGNC:175':
        a.append(text)

In [247]:
len(a)

24

In [248]:
a[0]

'Transforming growth factor beta (TGF-β) is a multi-potential growth factor expressed in many tissues that affects a wide range of physiological phenomena such as cell proliferation, differentiation, migration, adhesion, and induction of extracellular matrix (ECM) synthesis [ 4 ]. TGF-β superfamily molecules transmit their signals through both types I and II serine/threonine kinase receptors [ 5 ]. The type II receptors are constitutively activated and they phosphorylate intracellular domain of type I receptors upon ligand ligation. There are seven type I receptors, named activin receptor-like kinases (ALK) 1–7 in mammals. TGF-β variants and activins bind to ALK-1, -4, and -5, whereas bone morphogenetic proteins (BMPs) bind to ALK-2, -3, and -6. ALK-7 transduces nodal signaling. Type II receptors activate these type I ALK receptors. Once activated, the ALK receptors transduce signals from the cell surface and cytosolic compartments to the nucleus to activate targeted genes via Smad pro

In [163]:
# Save the disambiguator under the name of the shortform it was trained for
# ('ALK'); the previous name 'LAK' did not match the model built above.
disamb.dump('ALK', '../results')

In [74]:
from adeft.disambiguate import load_disambiguator

In [75]:
# NOTE(review): loads a model named 'ARG', not the 'ALK' model built above, and
# rebinds `d`, which earlier held the ALK predictions. Looks like a leftover
# cell from a different session — confirm before relying on it.
d = load_disambiguator('ARG', '../results')



In [76]:
d.disambiguate(texts[0])

('CHEBI:CHEBI:29016',
 'arginine',
 {'CHEBI:CHEBI:29016': 0.9667882794828576,
  'CHEBI:CHEBI:79': 0.004343534773612251,
  'HGNC:651': 0.00798509086185913,
  'HGNC:77': 0.00973784879530277,
  'HGNC:9965': 0.011145246086368127})

In [79]:
print(d.info())

Disambiguation model for ARG

Produces the disambiguations:
	(-)-Arctigenin*	CHEBI:CHEBI:79
	ABL2*	HGNC:77
	AREG*	HGNC:651
	RERE*	HGNC:9965
	arginine*	CHEBI:CHEBI:29016

Training data had class balance:
	AREG*	157
	ABL2*	88
	RERE*	39
	arginine*	15
	(-)-Arctigenin*	2

Classification Metrics:
	F1 score:	0.81188
	Precision:	0.82394
	Recall:		0.8205

* Positive labels
See Docstring for explanation



In [32]:
a = load_disambiguator('AR')

In [33]:
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
logit = d.classifier.estimator.named_steps['logit']

In [37]:
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')

In [164]:
model_to_s3(disamb)