In [1]:
import re
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

INFO: [2020-01-24 16:30:28] /Users/albertsteppi/adeft/adeft/recognize.py - OneShotRecognizer not available. AdeftLongformScorer has not been built successfully.


In [2]:
shortforms = ['TAK']
genes = ['CDK9', 'MAP3K7']
families = {}
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    genes.extend(members)
    groundings.append(f'FPLX:{family}')
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [3]:
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    try:
        pmids = all_pmids[gene]
    except KeyError:
        continue
    _, content = get_text_content_from_pmids(pmids)
    entrez_texts.extend([(universal_extract_text(text), grounding)
                          for text in content.values() if text])
    entrez_refs.update(content.keys())

In [4]:
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    all_texts.union(shortform_texts)

It's then necessary to check if Acromine produced the correct results. We must fix errors manually

In [13]:
top = miners['TAK'].top()

In [14]:
top

[('takayasu arteritis', 22.3),
 ('arteritis', 12.342857142857142),
 ('activated kinase', 7.166666666666667),
 ('takayasu s arteritis', 6.75),
 ('with takayasu arteritis', 5.555555555555555),
 ('β activated kinase', 3.4285714285714284),
 ('tgf β activated kinase', 3.0),
 ('patients with takayasu arteritis', 2.4),
 ('transforming growth factor β activated kinase', 2.0),
 ('rations of il 12p40 and il 12p70 were significantly higher in patients pt with takayasu arteritis',
  2),
 ('of patients with takayasu arteritis', 1.3333333333333333),
 ('tak 242', 1.0),
 ('beta activated kinase', 1.0),
 ('of takayasu arteritis', 1.0),
 ('activated takayasu s arteritis', 1.0),
 ('in patients with takayasu arteritis', 1.0),
 ('characteristics of patients with takayasu arteritis', 1.0),
 ('with the exception of takayasu arteritis', 1),
 ('the tlr4 signaling pathway inhibitors tak 242', 1),
 ('a previous study revealed the associated between susceptibility to takayasu arteritis',
  1),
 ('ependent pathway

In [15]:
longforms0 = miners['TAK'].get_longforms()

In [16]:
longforms0

[('takayasu arteritis', 22.3), ('activated kinase', 7.166666666666667)]

In [17]:
longforms0 = [x for i, x in enumerate(longforms0) if i in [0]]

In [18]:
list(enumerate(top))

[(0, ('takayasu arteritis', 22.3)),
 (1, ('arteritis', 12.342857142857142)),
 (2, ('activated kinase', 7.166666666666667)),
 (3, ('takayasu s arteritis', 6.75)),
 (4, ('with takayasu arteritis', 5.555555555555555)),
 (5, ('β activated kinase', 3.4285714285714284)),
 (6, ('tgf β activated kinase', 3.0)),
 (7, ('patients with takayasu arteritis', 2.4)),
 (8, ('transforming growth factor β activated kinase', 2.0)),
 (9,
  ('rations of il 12p40 and il 12p70 were significantly higher in patients pt with takayasu arteritis',
   2)),
 (10, ('of patients with takayasu arteritis', 1.3333333333333333)),
 (11, ('tak 242', 1.0)),
 (12, ('beta activated kinase', 1.0)),
 (13, ('of takayasu arteritis', 1.0)),
 (14, ('activated takayasu s arteritis', 1.0)),
 (15, ('in patients with takayasu arteritis', 1.0)),
 (16, ('characteristics of patients with takayasu arteritis', 1.0)),
 (17, ('with the exception of takayasu arteritis', 1)),
 (18, ('the tlr4 signaling pathway inhibitors tak 242', 1)),
 (19,
  (

In [19]:
longforms0.extend([x for i, x in enumerate(top) if i in [6, 8]])

In [20]:
longforms0

[('takayasu arteritis', 22.3),
 ('tgf β activated kinase', 3.0),
 ('transforming growth factor β activated kinase', 2.0)]

In [21]:
longforms0.sort(key=lambda x: -x[1])

In [22]:
longforms, scores = zip(*longforms0)

In [23]:
longforms

('takayasu arteritis',
 'tgf β activated kinase',
 'transforming growth factor β activated kinase')

In [24]:
grounding_map = {}
names = {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if grounding[0]:
        grounding_map[longform] = f'{grounding[0]}:{grounding[1]}'
        names[grounding_map[longform]] = grounding[2]

In [25]:
grounding_map

{'takayasu arteritis': 'MESH:D013625'}

In [26]:
names

{'MESH:D013625': 'Takayasu Arteritis'}

In [27]:
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [28]:
names.update({f'HGNC:{get_hgnc_id(gene)}': gene for gene in genes})
pos_labels = list(set(pos_labels) | set(f'HGNC:{get_hgnc_id(gene)}' for gene in genes))

In [30]:
pos_labels

['HGNC:6859', 'HGNC:1780', 'MESH:D013625']

In [31]:
result = (grounding_map, names, pos_labels)

In [32]:
result

({'takayasu arteritis': 'MESH:D013625',
  'tgf β activated kinase': 'HGNC:6859',
  'transforming growth factor β activated kinase': 'HGNC:6859'},
 {'MESH:D013625': 'Takayasu Arteritis',
  'HGNC:6859': 'MAP3K7',
  'HGNC:1780': 'CDK9'},
 ['HGNC:6859', 'HGNC:1780', 'MESH:D013625'])

In [5]:
grounding_map, names, pos_labels = ({'takayasu arteritis': 'MESH:D013625',
  'tgf β activated kinase': 'HGNC:6859',
  'transforming growth factor β activated kinase': 'HGNC:6859'},
 {'MESH:D013625': 'Takayasu Arteritis',
  'HGNC:6859': 'MAP3K7',
  'HGNC:1780': 'CDK9'},
 ['HGNC:6859', 'HGNC:1780', 'MESH:D013625'])

In [6]:
grounding_dict = {'TAK': grounding_map}

In [7]:
classifier = AdeftClassifier('TAK', pos_labels=pos_labels, random_state=1729)

In [8]:
param_grid = {'C': [100.0], 'max_features': [10000]}

In [9]:
labeler = AdeftLabeler(grounding_dict)

In [10]:
corpus = labeler.build_from_texts(shortform_texts)

In [11]:
corpus.extend(entrez_texts)

In [12]:
takeda_texts = [(text, 'ungrounded') for text in shortform_texts if re.search(r'TAK[\s-]?[1-9]{3}', text)]

In [14]:
len(takeda_texts)

34

In [15]:
corpus.extend(takeda_texts)

In [16]:
texts, labels = zip(*corpus)

In [17]:
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

AttributeError: 'AdeftClassifier' object has no attribute 'random_state'

In [63]:
classifier.stats

{'label_distribution': {'MESH:D013625': 13,
  'HGNC:6859': 324,
  'HGNC:1780': 384,
  'ungrounded': 34},
 'f1': {'mean': 0.9465550518709265, 'std': 0.016695552638681083},
 'precision': {'mean': 0.9433116657968711, 'std': 0.01511936783513322},
 'recall': {'mean': 0.9514626841508823, 'std': 0.0199721773884108}}

In [65]:
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [66]:
d = disamb.disambiguate(shortform_texts)

In [73]:
a = [text for pred, text in zip(d, shortform_texts)if pred[0].startswith('HGNC:1')]

In [77]:
a[2]

'EFS (2–32 Hz) induced frequency-dependent contractions of prostate strips, which were inhibited by SBE 13 (1 μM), cyclapolin 9 (3 μM), TAK 960 (100 nM), and Ro 3280 (100 nM) ( Figure  4 ). Two-way ANOVA was conducted to compare inhibitor and control groups, and indicated that the inhibition by SBE 13 ( p  < 0.02), cyclapolin 9 ( p  < 0.03), TAK 960 ( p  < 0.007), and Ro 3280 ( p  < 0.04) was significant. Multivariate analysis revealed that inhibition was significant for all four inhibitors at least at 32 Hz ( Figure  4 ).\nEffects of SBE 13 (1 μM), cyclapolin 9 (3 μM), TAK 960 (100 nM), and Ro 3280 (100 nM) on EFS-induced prostate contractions. In an organ bath, contractions of human prostate strips were induced by EFS. Effects of inhibitors on contractions were compared with corresponding controls (DMSO) in separate sets of experiments. To eliminate heterogeneities including any individual variations, different degree of BPH, or varying smooth muscle content (compare  Figure  1 ), te

In [88]:
a[16]

'The OsPCS2 exhibits root- and shoot-specific differential ratios of alternatively spliced transcripts in indica rice under Cd stress, and plays role in Cd and As stress tolerance and accumulation. Enzymatic activity of phytochelatin synthase (PCS) in plant produces phytochelatins, which help in sequestration of heavy metal(loid)s inside the cell vacuole to alleviate toxicity. Here we report that among the two PCS genes-OsPCS1 and OsPCS2 in indica rice (Oryza sativa) cultivar, the OsPCS2 produces an alternatively spliced OsPCS2b transcript that bears the unusual premature termination codon besides the canonically spliced OsPCS2a transcript. Root- and shoot-specific differential ratios of alternatively spliced OsPCS2a and OsPCS2b transcript expressions were observed under cadmium stress. Saccharomyces cerevisiae cells transformed with OsPCS2a exhibited increased cadmium (Cd) and arsenic (As) tolerance and accumulation, unlike the OsPCS2b transformed yeast cells. An intron-containing hai

In [78]:
disamb.dump('TAK', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
d = load_disambiguator('ARG', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [79]:
model_to_s3(disamb)

In [80]:
a = frozenset([(1, 1), (1, 2), (1, 3)])

In [81]:
dict(a)

{1: 1}

In [82]:
a = frozenset([(1, 2), (2, 2), (3, 2)])

In [84]:
x = dict(a)

In [86]:
frozenset(x.items())

frozenset({(1, 2), (2, 2), (3, 2)})

In [87]:
x[_86] = 2

In [88]:
x

{1: 2, 3: 2, 2: 2, frozenset({(1, 2), (2, 2), (3, 2)}): 2}

In [90]:
from indra_db.util import get_primary_db

In [91]:
db = get_primary_db()

In [92]:
dir(db)

['Base',
 'Curation',
 'DBInfo',
 'EvidenceCounts',
 'FastRawPaLink',
 'MeshRefAnnotations',
 'NameMeta',
 'OtherMeta',
 'PAAgents',
 'PAMods',
 'PAMuts',
 'PAStatements',
 'PASupportLinks',
 'PaMeta',
 'PaRefLink',
 'PaSourceLookup',
 'PreassemblyUpdates',
 'RawAgents',
 'RawMods',
 'RawMuts',
 'RawStatements',
 'RawStmtSrc',
 'RawUniqueLinks',
 'Reading',
 'ReadingRefLink',
 'ReadingUpdates',
 'RejectedStatements',
 'SourceFile',
 'TextContent',
 'TextMeta',
 'TextRef',
 'Updates',
 '_DatabaseManager__foreign_key_graph',
 '_PrincipalDatabaseManager__PaStmtSrc',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_clear',
 '_conn',
 '_form_pg_args',
 '_get_fo

In [93]:
db.

['text_ref', 'mesh_ref_annotations', 'source_file', 'updates', 'text_content', 'reading', 'reading_updates', 'db_info', 'raw_statements', 'rejected_statements', 'raw_agents', 'raw_mods', 'raw_muts', 'raw_unique_links', 'preassembly_updates', 'pa_statements', 'pa_agents', 'pa_mods', 'pa_muts', 'pa_support_links', 'curation']


In [1]:
from indra_db.util.content_scripts import get_text_content_from_text_refs

In [3]:
%time x = get_text_content_from_text_refs({'PMID': '28684774'}, use_cache=True)

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 13.1 µs


In [95]:
from indra_db.util import get_text_content_from_pmids

ImportError: cannot import name 'get_text_content_from_pmids' from 'indra_db.util' (/Users/albertsteppi/indra_db/indra_db/util/__init__.py)