In [4]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator, load_disambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

In [5]:
# Shortform(s) to disambiguate, plus the genes / FamPlex families they may refer to.
shortforms = ['SNS']
genes = ['SCN10A']
families = {}
# `groundings` is kept parallel to `genes`: one grounding per gene entry,
# because the two lists are paired with zip() when Entrez texts are collected.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    # Repeat the family grounding once per member. Appending it only once
    # would leave `groundings` shorter than `genes`, and zip() would then
    # mislabel the first member and silently drop the rest.
    groundings.extend([f'FPLX:{family}'] * len(members))
# JSON object indexed by gene symbol (looked up as all_pmids[gene] later on).
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [6]:
# Build (extracted text, grounding) pairs from the content fetched for each
# gene's PMIDs, and remember which content references were used.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    # Genes without an entry in the PMID table contribute nothing.
    if gene not in all_pmids:
        continue
    pmids = all_pmids[gene]
    _, content = get_text_content_from_pmids(pmids)
    # Skip empty content; every remaining text gets this gene's grounding.
    entrez_texts += [(universal_extract_text(text), grounding)
                     for text in content.values() if text]
    entrez_refs.update(content.keys())

In [7]:
# For each shortform: fetch the text content behind statements whose agent
# text matches it, skip anything already used as Entrez-labeled content,
# and feed the remaining texts to an AdeftMiner.
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    # Drop empty content and references already present in entrez_refs.
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # set.union() returns a new set and leaves all_texts unchanged, so the
    # accumulated texts were being thrown away; update() mutates in place.
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results; any errors must be fixed manually.

In [8]:
# Candidate (phrase, score) pairs for 'SNS' as returned by the miner's top().
top = miners['SNS'].top()

In [10]:
# (longform, score) pairs the miner selects for 'SNS'; this raw list is
# curated by hand in the cells that follow.
longforms0 = miners['SNS'].get_longforms()

In [11]:
top

[('sympathetic nervous system', 410.4762633996937),
 ('the sympathetic nervous system', 338.08743169398906),
 ('of the sympathetic nervous system', 84.88059701492537),
 ('activation of the sympathetic nervous system', 75.97435897435898),
 ('increased sympathetic nervous system', 40.34146341463415),
 ('nerve stimulation', 29.955555555555556),
 ('and sympathetic nervous system', 29.5),
 ('of sympathetic nervous system', 28.8125),
 ('and the sympathetic nervous system', 27.02857142857143),
 ('by the sympathetic nervous system', 21.44),
 ('system', 19.76980568011959),
 ('sympathetic nerve stimulation', 17.05263157894737),
 ('sacral nerve stimulation', 15.833333333333334),
 ('activation the sympathetic nervous system', 13.466666666666667),
 ('sympathetic', 11.571428571428568),
 ('the activation of the sympathetic nervous system', 10.833333333333334),
 ('that the sympathetic nervous system', 10.666666666666666),
 ('from the sympathetic nervous system', 10.153846153846153),
 ('via the sympath

In [12]:
longforms0 

[('sympathetic nervous system', 410.4762633996937),
 ('nerve stimulation', 29.955555555555556),
 ('sympathetic', 11.571428571428568),
 ('social networking sites', 4.666666666666667),
 ('sinisan', 4.285714285714286),
 ('sini san', 3.5),
 ('secondary neurospheres', 2.0),
 ('short nascent dna strands', 2.0),
 ('of a specific nutrient synergy', 2.0)]

In [13]:
# Drop positions 1 ('nerve stimulation') and 8 ('of a specific nutrient
# synergy') from the mined list; everything else is kept in order.
longforms = [entry for position, entry in enumerate(longforms0)
             if position not in (1, 8)]

In [14]:
longforms

[('sympathetic nervous system', 410.4762633996937),
 ('sympathetic', 11.571428571428568),
 ('social networking sites', 4.666666666666667),
 ('sinisan', 4.285714285714286),
 ('sini san', 3.5),
 ('secondary neurospheres', 2.0),
 ('short nascent dna strands', 2.0)]

In [15]:
top[11]

('sympathetic nerve stimulation', 17.05263157894737)

In [16]:
# Add entry 11 of `top` ('sympathetic nerve stimulation') to the curated list.
longforms.append(top[11])

In [17]:
# Manually add a cleaned-up longform with a hand-assigned score of 1.0
# (presumably replacing the mined 'of a specific nutrient synergy' entry
# that was dropped earlier — confirm).
longforms.append(('specific nutrient synergy', 1.0))

In [18]:
# Order the curated longforms by score, highest first (ties keep their order).
longforms.sort(key=lambda pair: pair[1], reverse=True)

In [19]:
longforms

[('sympathetic nervous system', 410.4762633996937),
 ('sympathetic nerve stimulation', 17.05263157894737),
 ('sympathetic', 11.571428571428568),
 ('social networking sites', 4.666666666666667),
 ('sinisan', 4.285714285714286),
 ('sini san', 3.5),
 ('secondary neurospheres', 2.0),
 ('short nascent dna strands', 2.0),
 ('specific nutrient synergy', 1.0)]

In [20]:
# Split the (longform, score) pairs into two parallel tuples.
# NOTE: this rebinds `longforms` from a list of pairs to a tuple of strings,
# so the cell cannot be re-run without first re-running the cells that build
# the list.
longforms, scores = zip(*longforms)

In [21]:
# Try to ground every longform automatically with gilda_ground and collect
# the successful ones as 'first:second' strings keyed by longform.
grounding_map = {}
for longform in longforms:
    try:
        grounding = gilda_ground(longform)
    except Exception:
        # Grounding raised: report the offending longform and leave it unmapped.
        print(longform)
    else:
        # Only record a mapping when the first element of the result is truthy.
        if grounding[0]:
            grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'

In [22]:
grounding_map

{'sympathetic nervous system': 'MESH:D013564'}

In [None]:
# Open the adeft grounding GUI with the curated longforms, their scores and
# the automatic groundings as a starting point; whatever the GUI returns is
# kept in `result` (presumably interactive/blocking — confirm).
result = ground_with_gui(longforms, scores, grounding_map=grounding_map)

In [14]:
result

({'glutamic acid rich protein': 'HGNC:2151',
  'glycoprotein a repetitions predominant': 'HGNC:4161',
  'the golgi associated retrograde protein': 'FPLX:GARP'},
 {'HGNC:2151': 'CNGB1', 'HGNC:4161': 'LRRC32', 'FPLX:GARP': 'GARP'},
 ['FPLX:GARP', 'HGNC:2151', 'HGNC:4161'])

In [15]:
# Hard-coded copy of a GUI result: longform -> grounding map, grounding ->
# name map, and the list of positive labels.
# NOTE(review): these literals describe GARP (CNGB1 / LRRC32 / FPLX:GARP),
# whereas the earlier cells mine and curate longforms for 'SNS'. This looks
# like a stale paste from another shortform — confirm and replace with the
# SNS result before re-running.
grounding_map, names, pos_labels = \
    ({'glutamic acid rich protein': 'HGNC:2151',
      'glycoprotein a repetitions predominant': 'HGNC:4161',
      'the golgi associated retrograde protein': 'FPLX:GARP'},
     {'HGNC:2151': 'CNGB1', 'HGNC:4161': 'LRRC32', 'FPLX:GARP': 'GARP'},
     ['FPLX:GARP', 'HGNC:2151', 'HGNC:4161'])

In [16]:
# Shortform -> {longform: grounding} dictionary.
# NOTE(review): keyed by 'GARP' although `shortforms` is ['SNS'] — confirm.
grounding_dict = {'GARP': grounding_map}

In [17]:
# Classifier for the shortform, told which labels count as positive.
# NOTE(review): shortform is 'GARP' here, not the 'SNS' used earlier — confirm.
classifier = AdeftClassifier('GARP', pos_labels)

In [24]:
# Parameter space searched by classifier.cv: candidate values for C and
# max_features.
param_grid = {'C': [10.0, 100.0], 'max_features': [1000, 10000]}

In [18]:
# Labeler constructed from the shortform -> {longform: grounding} dictionary.
labeler = AdeftLabeler(grounding_dict)

In [20]:
# Build the labeled corpus from the mined texts, then add the
# (text, grounding) pairs collected from the Entrez PMIDs.
# NOTE(review): `shortform_texts` holds only the texts of the last shortform
# processed by the mining loop; with more than one shortform the earlier
# ones would be missing here — confirm this is intended.
corpus = labeler.build_from_texts(shortform_texts)
corpus.extend(entrez_texts)

In [21]:
# Split the corpus entries into two parallel tuples: texts and their labels.
texts, labels = zip(*corpus)

In [25]:
# Grid search over param_grid (see the logged "Beginning grid search" line);
# cv=5 and n_jobs=8 are presumably the fold count and number of parallel
# workers — confirm against AdeftClassifier.cv.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-19 12:22:18] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [10.0, 100.0], 'max_features': [1000, 10000]}
INFO: [2019-12-19 12:23:01] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9256037270620888 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 1000}


In [36]:
classifier.stats

{'label_distribution': {'HGNC:4161': 66, 'FPLX:GARP': 19, 'HGNC:2151': 30},
 'f1': {'mean': 0.9256037270620888, 'std': 0.06414141547348061},
 'precision': {'mean': 0.9353599661208356, 'std': 0.0561687741217047},
 'recall': {'mean': 0.9292490118577075, 'std': 0.060234871783553336}}

In [37]:
# Combine the fitted classifier with the grounding dictionary and the
# grounding -> name mapping into a disambiguator object.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [134]:
disamb.disambiguate(texts[0])

('HGNC:4161',
 'LRRC32',
 {'FPLX:GARP': 0.04563332106023921, 'HGNC:4161': 0.9543666789397608})

In [139]:
texts[2]

'Mesenchymal stem cells (MSCs), which have multipotential differentiation and self-renewal potential, are possible cells for tissue engineering. Transforming growth factor β1 (TGFβ1) can be produced by MSCs in an inactive form, and the activation of TGFβ1 functions as an important regulator of osteogenic differentiation in MSCs. Recently, studies showed that GARP participated in the activation of latent TGFβ1, but the interaction between GARP and TGFβ1 is still undefined. In our study, we successfully isolated the MSCs from bone marrow of rats, and showed that GARP was detected in bone mesenchymal stem cells (BMSCs). During the osteogenic differentiation of BMSCs, GARP expression was increased over time. To elucidate the interaction between GARP and TGFβ1, we downregulated GARP expression in BMSCs to examine the level of active TGFβ1. We then verified that the downregulation of GARP decreased the secretion of active TGFβ1. Furthermore, osteogenic differentiation experiments, alkaline p

In [45]:
# Dump the disambiguator under the name 'GARP' into ../results (presumably
# the files load_disambiguator reads back — confirm).
disamb.dump('GARP', '../results')

In [47]:
# Hand the disambiguator to adeft_indra's model_to_s3 helper; the upload
# destination is not visible in this notebook.
model_to_s3(disamb)

In [2]:
# Load the disambiguator named 'GARP' from ../results (presumably the one
# dumped earlier — confirm).
d = load_disambiguator('GARP', '../results')

In [39]:
# Disambiguate every mined text; each prediction's first element is the
# predicted grounding (it is compared against 'HGNC:2151' in the next cell).
preds = disamb.disambiguate(shortform_texts)

In [40]:
# Keep only the texts whose predicted grounding is HGNC:2151.
a = [document for document, prediction in zip(shortform_texts, preds)
     if prediction[0] == 'HGNC:2151']

In [41]:
len(a)

8

In [44]:
a[2]

"Other support for this stabilizing hypothesis comes from studies on the beta-subunit of the rod cyclic nucleotide gated channel (CNG). It has been shown that PRPH2/RDS can bind to the beta subunit of rod CNG, as well as two other non-membrane bound isoforms also coded for by the  Cngb1  gene called glutamic acid rich proteins (GARP1/2) ( Poetsch et\xa0al., 2001; Ritter et\xa0al., 2011 ). The interactions between GARP and PRPH2/RDS were found to be strongest when PRPH2/RDS was in disulfide linked oligomers suggesting the larger PRPH2/RDS complexes might mediate this interaction ( Poetsch et\xa0al., 2001 ). Given the localization of CNGB1 on the plasma membrane, PRPH2/RDS on the disc rim, and GARP1/2 as free cytosolic forms in between, it has been hypothesized that interactions between PRPH2/RDS and GARP could serve as a mechanism by which PRPH2/RDS could regulate the sizing and orientation of the rod discs. In support of this, elimination of GARP in rods causes disorganization in disc 

In [33]:
# NOTE(review): in notebook order `a` is a plain list of texts, which has no
# disambiguate() method. The recorded output (HGNC:644 / AR) suggests `a` was
# a different disambiguator in an earlier kernel session — remove this cell
# or load the intended model explicitly.
a.disambiguate('Androgen')

('HGNC:644',
 'AR',
 {'FPLX:ADRB': 4.719192246728643e-10,
  'GO:GO:0007340': 1.3624879544267196e-09,
  'HGNC:381': 5.2828214761836554e-12,
  'HGNC:644': 0.9999384718638125,
  'HGNC:651': 1.8656671170325e-12,
  'ungrounded': 6.152629463170691e-05})

In [36]:
# Pull the step named 'logit' out of the loaded disambiguator's estimator
# (presumably a scikit-learn Pipeline, given `named_steps` — confirm).
logit = d.classifier.estimator.named_steps['logit']

In [37]:
logit.classes_

array(['FPLX:AQP', 'FPLX:Macrophage_inflammatory_pro', 'MESH:D000072277',
       'polymer', 'ungrounded'], dtype='<U32')

In [33]:
# Scratch values for the bitwise-OR experiment below.
i, j = 2, 4

In [34]:
# In-place bitwise OR: rebinds i to i | j.
i |= j

In [29]:
# Bitwise OR written as an explicit reassignment of i.
i = i | j

4

In [35]:
i

6

In [2]:
# Load the disambiguator named 'SNS' from ../results/; its grounding_dict is
# inspected in the next cell.
d = load_disambiguator('SNS', path='../results/')



In [3]:
d.grounding_dict

{'SNS': {'sacral nerve stimulation': 'ungrounded',
  'secondary neurospheres': 'ungrounded',
  'serevent nationwide surveillance': 'ungrounded',
  'short nascent dna strands': 'ungrounded',
  'sini san': 'ungrounded',
  'sinisan': 'ungrounded',
  'social networking sites': 'ungrounded',
  'sympathetic': 'MESH:D013564',
  'sympathetic nerve stimulation': 'MESH:D013564',
  'sympathetic nerve system': 'MESH:D013564',
  'sympathetic nervous system': 'MESH:D013564'}}