In [1]:
import json
import pickle

from indra.literature.adeft_tools import universal_extract_text
from indra.databases.hgnc_client import get_hgnc_name, get_hgnc_id

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.s3 import model_to_s3
from adeft_indra.ground import gilda_ground

INFO: [2020-01-13 11:03:42] /Users/albertsteppi/adeft/adeft/recognize.py - OneShotRecognizer not available. AdeftLongformScorer has not been built successfully.


In [62]:
# Shortform(s) to disambiguate, plus optional gene and family senses.
shortforms = ['PCS']
genes = []
families = {}
# One grounding per entry of `genes`: the two lists are zipped together when
# the Entrez texts are collected, so they must stay the same length.
groundings = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
for family, members in families.items():
    members = list(members)
    genes.extend(members)
    # Repeat the family grounding once per member so that every gene added
    # above lines up with a grounding; a single append would leave the lists
    # misaligned and zip() would pair or drop members incorrectly.
    groundings.extend([f'FPLX:{family}'] * len(members))
# Mapping indexed by gene (see the lookup `all_pmids[gene]`); presumably
# gene symbol -> list of PMIDs -- TODO confirm against the data file.
with open('../data/entrez_all_pmids.json', 'r') as f:
    all_pmids = json.load(f)

In [63]:
# Gather labeled training texts for each (gene, grounding) pair and remember
# which content references were used, so they can be excluded later on.
entrez_texts = []
entrez_refs = set()
for gene, grounding in zip(genes, groundings):
    try:
        pmids = all_pmids[gene]
    except KeyError:
        # No PMIDs are recorded for this gene; nothing to fetch.
        continue
    _, content = get_text_content_from_pmids(pmids)
    for text in content.values():
        # Skip references for which no text content came back.
        if text:
            entrez_texts.append((universal_extract_text(text), grounding))
    entrez_refs.update(content)

In [64]:
# For every shortform, fetch the texts behind statements whose agent text
# matches it, skip references already consumed as Entrez texts, and feed the
# remaining texts to an AdeftMiner to discover candidate longforms.
miners = dict()
all_texts = set()
for shortform in shortforms:
    stmts = get_stmts_with_agent_text_like(shortform)[shortform]
    _, content = get_text_content_from_stmt_ids(stmts)
    # NOTE: `shortform_texts` is rebound on every iteration; later cells that
    # read it therefore see only the texts of the last shortform processed.
    shortform_texts = [universal_extract_text(text, contains=shortforms)
                       for ref, text in content.items() if text and ref not in entrez_refs]
    miners[shortform] = AdeftMiner(shortform)
    miners[shortform].process_texts(shortform_texts)
    # set.union() returns a new set and leaves the receiver untouched, so the
    # previous call threw its result away and `all_texts` stayed empty.
    # update() accumulates the texts in place.
    all_texts.update(shortform_texts)

It is then necessary to check whether Acromine produced the correct results; any errors must be fixed manually.

In [5]:
# Candidate longforms for 'PCS' as (longform, score) pairs, as shown in the
# output displayed below; includes fragments that are not real expansions.
top = miners['PCS'].top()

In [7]:
top

[('physical component summary', 82.6315789473684),
 ('score', 35.193548387096776),
 ('the physical component summary', 29.548387096774192),
 ('sulfate', 29.0188679245283),
 ('physical component score', 26.967741935483872),
 ('p cresyl sulfate', 25.82758620689655),
 ('physical', 23.379310344827587),
 ('scale', 18.064516129032256),
 ('phytochelatin synthase', 16.705882352941178),
 ('physical component summary score', 14.666666666666666),
 ('summary', 13.057692307692307),
 ('portacaval shunt', 11.571428571428571),
 ('p cresol sulfate', 10.736842105263158),
 ('pain catastrophizing scale', 10.666666666666668),
 ('the pain catastrophizing scale', 9.555555555555557),
 ('the physical', 9.454545454545453),
 ('syndrome', 8.625),
 ('post concussive syndrome', 8.333333333333334),
 ('the physical component score', 8.0),
 ('posterior circulation stroke', 7.25),
 ('component score', 7.085714285714285),
 ('indoxyl sulfate is and p cresyl sulfate', 6.666666666666667),
 ('post concussive symptoms', 6.25

In [24]:
top[50]

('postconcussion syndrome', 2.5)

In [12]:
# The miner's own selection of longforms for 'PCS' as (longform, score) pairs.
# NOTE(review): the selection criteria are not visible in this notebook; the
# list is curated by hand in the following cells.
longforms0 = miners['PCS'].get_longforms()

In [13]:
longforms0

[('physical component summary', 82.6315789473684),
 ('score', 35.193548387096776),
 ('sulfate', 29.0188679245283),
 ('physical', 23.379310344827587),
 ('scale', 18.064516129032256),
 ('phytochelatin synthase', 16.705882352941178),
 ('portacaval shunt', 11.571428571428571),
 ('syndrome', 8.625),
 ('posterior circulation stroke', 7.25),
 ('post concussive symptoms', 6.25),
 ('porous calcium silicate', 5.428571428571429),
 ('premature chromatid separation', 5.0),
 ('physical health', 4.666666666666667),
 ('psoralea corylifolia seeds', 4.666666666666667),
 ('primary cytoreductive surgery', 3.6),
 ('catastrophizing', 2.4),
 ('sites', 1.3333333333333333),
 ('post error slowing pes and post conflict slowing', 1.3333333333333333)]

In [14]:
# Keep only the mined entries that are genuine expansions of 'PCS', dropping
# fragments such as 'score', 'sulfate', 'physical' and 'scale'.
# Entries are selected by longform text rather than by list position
# (positions 0, 5, 6, 8-14 in the recorded run) so that re-running this cell
# is a no-op and the selection does not depend on the miner's output order.
kept_mined_longforms = {
    'physical component summary',
    'phytochelatin synthase',
    'portacaval shunt',
    'posterior circulation stroke',
    'post concussive symptoms',
    'porous calcium silicate',
    'premature chromatid separation',
    'physical health',
    'psoralea corylifolia seeds',
    'primary cytoreductive surgery',
}
longforms0 = [x for x in longforms0 if x[0] in kept_mined_longforms]

In [15]:
longforms0

[('physical component summary', 82.6315789473684),
 ('phytochelatin synthase', 16.705882352941178),
 ('portacaval shunt', 11.571428571428571),
 ('posterior circulation stroke', 7.25),
 ('post concussive symptoms', 6.25),
 ('porous calcium silicate', 5.428571428571429),
 ('premature chromatid separation', 5.0),
 ('physical health', 4.666666666666667),
 ('psoralea corylifolia seeds', 4.666666666666667),
 ('primary cytoreductive surgery', 3.6)]

In [25]:
# Add valid longforms that appear in `top` but were missing from the miner's
# own selection (positions 4, 5, 9, 12, 13, 27 and 50 of `top` in the recorded
# run). Entries are matched by text, and anything already in `longforms0` is
# skipped, so re-running the cell does not append duplicates.
extra_longforms = {
    'physical component score',
    'p cresyl sulfate',
    'physical component summary score',
    'p cresol sulfate',
    'pain catastrophizing scale',
    'physical component scale',
    'postconcussion syndrome',
}
already_present = {x[0] for x in longforms0}
longforms0.extend([x for x in top
                   if x[0] in extra_longforms and x[0] not in already_present])

In [26]:
longforms0

[('physical component summary', 82.6315789473684),
 ('phytochelatin synthase', 16.705882352941178),
 ('portacaval shunt', 11.571428571428571),
 ('posterior circulation stroke', 7.25),
 ('post concussive symptoms', 6.25),
 ('porous calcium silicate', 5.428571428571429),
 ('premature chromatid separation', 5.0),
 ('physical health', 4.666666666666667),
 ('psoralea corylifolia seeds', 4.666666666666667),
 ('primary cytoreductive surgery', 3.6),
 ('physical component score', 26.967741935483872),
 ('p cresyl sulfate', 25.82758620689655),
 ('physical component summary score', 14.666666666666666),
 ('p cresol sulfate', 10.736842105263158),
 ('pain catastrophizing scale', 10.666666666666668),
 ('physical component scale', 5.25),
 ('postconcussion syndrome', 2.5)]

In [27]:
# Order the curated (longform, score) pairs from highest to lowest score,
# in place; entries with equal scores keep their relative order.
longforms0.sort(key=lambda pair: pair[1], reverse=True)

In [28]:
# Split the (longform, score) pairs into two parallel tuples.
# Unpacking raises ValueError if longforms0 is empty.
longforms, scores = zip(*longforms0)

In [29]:
longforms

('physical component summary',
 'physical component score',
 'p cresyl sulfate',
 'phytochelatin synthase',
 'physical component summary score',
 'portacaval shunt',
 'p cresol sulfate',
 'pain catastrophizing scale',
 'posterior circulation stroke',
 'post concussive symptoms',
 'porous calcium silicate',
 'physical component scale',
 'premature chromatid separation',
 'physical health',
 'psoralea corylifolia seeds',
 'primary cytoreductive surgery',
 'postconcussion syndrome')

In [30]:
# Automatic first pass at grounding: ask gilda for each longform and record
# only the ones for which it returned a namespace. `grounding` is indexed as
# (namespace, identifier, name).
grounding_map, names = {}, {}
for longform in longforms:
    grounding = gilda_ground(longform)
    if not grounding[0]:
        # Leave this longform out of the map; it has no automatic grounding.
        continue
    label = f'{grounding[0]}:{grounding[1]}'
    grounding_map[longform] = label
    names[label] = grounding[2]

In [34]:
grounding_map

{'p cresol sulfate': 'CHEBI:CHEBI:82914',
 'p cresyl sulfate': 'CHEBI:CHEBI:82914',
 'pain catastrophizing scale': 'ungrounded',
 'physical component scale': 'physical component summary',
 'physical component score': 'physical component summary',
 'physical component summary': 'physical component summary',
 'physical component summary score': 'physical component summary',
 'physical health': 'ungrounded',
 'phytochelatin synthase': 'UP:Q9S7Z3',
 'porous calcium silicate': 'ungrounded',
 'portacaval shunt': 'MESH:D011167',
 'post concussive symptoms': 'MESH:D038223',
 'postconcussion syndrome': 'MESH:D038223',
 'posterior circulation stroke': 'ungrounded',
 'premature chromatid separation': 'ungrounded',
 'primary cytoreductive surgery': 'ungrounded',
 'psoralea corylifolia seeds': 'ungrounded'}

In [35]:
names

{'CHEBI:CHEBI:82914': 'p-cresol sulfate',
 'physical component summary': 'physical component summary',
 'UP:Q9S7Z3': 'PCS1',
 'MESH:D011167': 'Portacaval Shunt, Surgical',
 'MESH:D038223': 'Post-Concussion Syndrome'}

In [33]:
# Interactive step: review the automatic groundings in the adeft GUI.
# The three returned values replace the automatic map and names and add the
# list of positive labels. Because the outcome depends on manual input, it is
# pasted back into the notebook as a literal in a later cell.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map, names=names)

In [37]:
# Give every gene in `genes` a standard name under its HGNC label and treat
# those labels as positive in addition to the ones chosen in the GUI.
hgnc_labels = [f'HGNC:{get_hgnc_id(gene)}' for gene in genes]
names.update(dict(zip(hgnc_labels, genes)))
pos_labels = list(set(pos_labels) | set(hgnc_labels))

In [38]:
# Bundle the curated artifacts so they can be displayed together; the same
# three values are hard-coded as a literal in a later cell.
result = (grounding_map, names, pos_labels)

In [39]:
result

({'p cresol sulfate': 'CHEBI:CHEBI:82914',
  'p cresyl sulfate': 'CHEBI:CHEBI:82914',
  'pain catastrophizing scale': 'ungrounded',
  'physical component scale': 'physical component summary',
  'physical component score': 'physical component summary',
  'physical component summary': 'physical component summary',
  'physical component summary score': 'physical component summary',
  'physical health': 'ungrounded',
  'phytochelatin synthase': 'UP:Q9S7Z3',
  'porous calcium silicate': 'ungrounded',
  'portacaval shunt': 'MESH:D011167',
  'post concussive symptoms': 'MESH:D038223',
  'postconcussion syndrome': 'MESH:D038223',
  'posterior circulation stroke': 'ungrounded',
  'premature chromatid separation': 'ungrounded',
  'primary cytoreductive surgery': 'ungrounded',
  'psoralea corylifolia seeds': 'ungrounded'},
 {'CHEBI:CHEBI:82914': 'p-cresol sulfate',
  'physical component summary': 'physical component summary',
  'UP:Q9S7Z3': 'PCS1',
  'MESH:D011167': 'Portacaval Shunt, Surgical',


In [69]:
# Frozen outcome of the manual curation step, recorded as literals so the
# rest of the notebook can run without repeating the interactive grounding.

# Longform -> grounding label. 'ungrounded' marks longforms without a
# database identifier.
grounding_map = {
    'p cresol sulfate': 'CHEBI:CHEBI:82914',
    'p cresyl sulfate': 'CHEBI:CHEBI:82914',
    'pain catastrophizing scale': 'ungrounded',
    'physical component scale': 'physical component summary',
    'physical component score': 'physical component summary',
    'physical component summary': 'physical component summary',
    'physical component summary score': 'physical component summary',
    'physical health': 'ungrounded',
    'phytochelatin synthase': 'UP:Q9S7Z3',
    'porous calcium silicate': 'ungrounded',
    'portacaval shunt': 'MESH:D011167',
    'post concussive symptoms': 'MESH:D038223',
    'postconcussion syndrome': 'MESH:D038223',
    'posterior circulation stroke': 'ungrounded',
    'premature chromatid separation': 'ungrounded',
    'primary cytoreductive surgery': 'ungrounded',
    'psoralea corylifolia seeds': 'ungrounded',
}

# Grounding label -> standard name.
names = {
    'CHEBI:CHEBI:82914': 'p-cresol sulfate',
    'physical component summary': 'physical component summary',
    'UP:Q9S7Z3': 'PCS1',
    'MESH:D011167': 'Portacaval Shunt, Surgical',
    'MESH:D038223': 'Post-Concussion Syndrome',
}

# Labels passed to the classifier as `pos_labels`.
pos_labels = ['CHEBI:CHEBI:82914', 'UP:Q9S7Z3']

In [70]:
# Shortform -> (longform -> grounding) mapping, passed below to both the
# labeler and the disambiguator. Only 'PCS' is handled in this notebook.
grounding_dict = {'PCS': grounding_map}

In [71]:
# Classifier for the shortform 'PCS'.
# NOTE(review): pos_labels presumably selects the labels that count as
# positives for the precision/recall/F1 figures -- confirm against adeft docs.
classifier = AdeftClassifier('PCS', pos_labels=pos_labels)

In [72]:
# Hyper-parameter grid handed to classifier.cv. Each list has one value, so
# only a single configuration is evaluated.
param_grid = dict(C=[100.0], max_features=[10000])

In [73]:
# Labeler configured with the curated shortform -> longform -> grounding map.
labeler = AdeftLabeler(grounding_dict)

In [74]:
# Build the training corpus of (text, label) pairs from the mined texts.
# `shortform_texts` is whatever the mining loop left bound on its final
# iteration, i.e. the texts for the last shortform processed.
corpus = labeler.build_from_texts(shortform_texts)

In [46]:
# Append the Entrez-derived (text, grounding) pairs to the corpus in place.
# With `genes` empty, `entrez_texts` is empty and this is a no-op.
# NOTE(review): this cell's execution count (46) is out of sequence with its
# neighbours (74, 75), and re-running it would append the same pairs again.
corpus.extend(entrez_texts)

In [75]:
# Split the (text, label) pairs into parallel tuples for training.
# Unpacking raises ValueError if the corpus is empty.
texts, labels = zip(*corpus)

In [76]:
# Grid search over param_grid with cv=5 and n_jobs=8; the resulting scores
# are read back from classifier.stats in the next cell.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2020-01-13 13:48:26] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [100.0], 'max_features': [10000]}
INFO: [2020-01-13 13:48:29] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 0.9659340659340658 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 10000}


In [77]:
classifier.stats

{'label_distribution': {'MESH:D038223': 8,
  'physical component summary': 105,
  'ungrounded': 32,
  'CHEBI:CHEBI:82914': 29,
  'MESH:D011167': 13,
  'UP:Q9S7Z3': 12},
 'f1': {'mean': 0.9659340659340658, 'std': 0.04690361407328684},
 'precision': {'mean': 0.9401360544217686, 'std': 0.08099524389734541},
 'recall': {'mean': 1.0, 'std': 0.0}}

In [78]:
# Combine the trained classifier with the grounding map and standard names
# into the disambiguator that is saved and uploaded below.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [79]:
# Sanity check: run the new model over the texts it was built from.
# The next cell reads element [0] of each prediction as the grounding label.
d = disamb.disambiguate(shortform_texts)

In [86]:
# Collect the texts whose predicted grounding label starts with 'UP', for
# manual inspection.
a = []
for pred, text in zip(d, shortform_texts):
    if pred[0].startswith('UP'):
        a.append(text)

In [87]:
len(a)

23

In [88]:
a[16]

'The OsPCS2 exhibits root- and shoot-specific differential ratios of alternatively spliced transcripts in indica rice under Cd stress, and plays role in Cd and As stress tolerance and accumulation. Enzymatic activity of phytochelatin synthase (PCS) in plant produces phytochelatins, which help in sequestration of heavy metal(loid)s inside the cell vacuole to alleviate toxicity. Here we report that among the two PCS genes-OsPCS1 and OsPCS2 in indica rice (Oryza sativa) cultivar, the OsPCS2 produces an alternatively spliced OsPCS2b transcript that bears the unusual premature termination codon besides the canonically spliced OsPCS2a transcript. Root- and shoot-specific differential ratios of alternatively spliced OsPCS2a and OsPCS2b transcript expressions were observed under cadmium stress. Saccharomyces cerevisiae cells transformed with OsPCS2a exhibited increased cadmium (Cd) and arsenic (As) tolerance and accumulation, unlike the OsPCS2b transformed yeast cells. An intron-containing hai

In [89]:
# Save the disambiguator under the model name 'PCS' in ../results.
disamb.dump('PCS', '../results')

In [None]:
from adeft.disambiguate import load_disambiguator

In [None]:
# Reload the model that was just dumped to check that it round-trips from
# disk. The name must match the one passed to disamb.dump ('PCS'); the
# previous 'ARG' was left over from another notebook and would have loaded a
# different model or failed if none existed.
# Note that this rebinds `d`, which earlier held the list of predictions.
d = load_disambiguator('PCS', '../results')

In [None]:
d.disambiguate(texts[0])

In [None]:
print(d.info())

In [None]:
# NOTE(review): 'AR' is unrelated to the 'PCS' model built in this notebook
# and this cell has no execution count; it looks like leftover scratch work --
# confirm whether it can be removed. It also rebinds `a`, which earlier held
# the filtered texts.
a = load_disambiguator('AR')

In [None]:
a.disambiguate('Androgen')

In [None]:
# Pull the step named 'logit' out of the loaded classifier's estimator so its
# fitted attributes can be inspected.
logit = d.classifier.estimator.named_steps['logit']

In [None]:
logit.classes_

In [90]:
# Upload the trained disambiguator using adeft_indra's S3 helper.
# NOTE(review): the destination bucket/key is not visible here; this has an
# external side effect, so run it only once the model has been validated.
model_to_s3(disamb)