In [83]:
import json
import pickle

from indra.databases.hgnc_client import get_hgnc_name
from indra.literature.adeft_tools import universal_extract_text

from indra_db.util.content_scripts import get_text_content_from_pmids
from indra_db.util.content_scripts import get_stmts_with_agent_text_like
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft.discover import AdeftMiner
from adeft.gui import ground_with_gui
from adeft.modeling.label import AdeftLabeler
from adeft.modeling.classify import AdeftClassifier
from adeft.disambiguate import AdeftDisambiguator

from adeft_indra.ground import gilda_ground

In [85]:
# Load the gene -> PMIDs table. JSON is UTF-8 by specification, so the encoding
# is pinned rather than left to the platform's locale default.
with open('../data/entrez_all_pmids.json', 'r', encoding='utf-8') as f:
    all_pmids = json.load(f)

In [86]:
# PMIDs recorded for ANO1, and the text content fetched for them.
ANO1_pmids = all_pmids['ANO1']
ANO1_entrez_refs, ANO1_entrez_content = get_text_content_from_pmids(ANO1_pmids)

# Extract plain text from every non-empty piece of content.
ANO1_entrez_texts = list(
    map(universal_extract_text, filter(None, ANO1_entrez_content.values()))
)

In [87]:
# Ambiguous agent text (shortform) for which this notebook builds a disambiguator.
shortform = 'DOG1'

In [88]:
# Look up statements whose agent text matches the shortform; the lookup result
# is indexed by the shortform itself.
stmts = get_stmts_with_agent_text_like(shortform)[shortform]

# Fetch the text content behind those statements. Both return values are used
# as mappings further down (`refs.values()`, `content.items()`).
refs, content = get_text_content_from_stmt_ids(stmts)

It is then necessary to check whether Acromine produced the correct results. Any errors must be fixed manually.

In [89]:
# References that occur both in the shortform statements and in the ANO1
# Entrez set; these are excluded when the shortform texts are built.
overlap = set(refs.values()).intersection(ANO1_entrez_refs.values())

In [90]:
# Number of references shared by the two sets.
len(overlap)

2

In [91]:
# Extract text for each piece of statement content, skipping empty or very
# short content (fewer than 5 characters) and anything whose reference also
# appears in the ANO1 Entrez set.
shortform_texts = [
    universal_extract_text(raw_content, contains=[shortform])
    for text_ref, raw_content in content.items()
    if raw_content and len(raw_content) >= 5 and text_ref not in overlap
]

In [92]:
# Feed the shortform texts to an AdeftMiner; candidate longforms are read
# back from it in the following cells.
miner = AdeftMiner(shortform)
miner.process_texts(shortform_texts)

In [93]:
# Scored candidate phrases from the miner, as (phrase, score) pairs.
top = miner.top()

In [94]:
# Inspect the raw candidates; many are noisy fragments rather than clean longforms.
top

[('delay of germination 1', 11.142857142857139),
 ('delay of germination1', 7.0),
 ('the delay of germination 1', 3.0),
 ('gene delay of germination 1', 2.0),
 ('1', 1.8666666666666671),
 ('protein delay of germination 1', 1.3333333333333333),
 ('and delay of germination 1', 1.0),
 ('the arabidopsis delay of germination1', 1),
 ('the arabidopsis protein delay of germination 1', 1.0),
 ('ing seed maturation induces several gene associated with dormancy included delay of germination1',
  1),
 ('up dup regulated gene e g at5g43580 pr peptide at5g45540 unknown protein at5g45830',
  1),
 ('and polysaccharides are produced during seed maturation here we report that delay of germination1',
  1),
 ('nse to seasonal temperature patterns temporal sensing involved the gene delay of germination 1',
  1),
 ('temperature drives seed specific temporal sensing via the accumulation of delay of germination 1',
  1),
 ('favourable conditions the plant hormone abscisic acid aba and the protein delay of ge

In [95]:
# Display the longforms the miner selects, with their scores.
miner.get_longforms()

[('delay of germination 1', 11.142857142857139),
 ('delay of germination1', 7.0)]

In [96]:
# Keep the selected (longform, score) pairs for the grounding steps below.
longforms = miner.get_longforms()

In [97]:
# Split the (longform, score) pairs into two parallel tuples. The pairs are
# read straight from the miner rather than from the `longforms` name that is
# rebound here, so re-running this cell gives the same result instead of
# unzipping an already-unzipped tuple of strings.
longforms, scores = zip(*miner.get_longforms())

In [98]:
# Try to ground each longform automatically. Only longforms whose grounding
# result has a truthy first element are recorded, as '<first>:<second>'.
grounding_map = {}
for candidate in longforms:
    match = gilda_ground(candidate)
    if not match[0]:
        continue
    grounding_map[candidate] = f'{match[0]}:{match[1]}'

In [99]:
# Show which longforms were grounded automatically (none in the recorded run).
grounding_map

{}

In [100]:
# Curate the groundings by hand in the adeft GUI, starting from the automatic map.
# NOTE(review): the outcome depends on manual input; the curated result is
# recorded as literals further down so the later cells can run without the GUI.
grounding_map, names, pos_labels = ground_with_gui(longforms, scores, grounding_map=grounding_map)

In [101]:
# Register ANO1 (HGNC:21625) as an extra grounding and positive label. It has
# no mined longform; its training texts are added to the corpus separately.
names['HGNC:21625'] = 'ANO1'
# Guard the append so re-running this cell does not duplicate the label.
if 'HGNC:21625' not in pos_labels:
    pos_labels.append('HGNC:21625')

In [102]:
# Bundle the curated grounding data into a single tuple for display.
result = (grounding_map, names, pos_labels)

In [103]:
# Display the bundle; its repr is what the next cell pastes back in as literals.
result

({'delay of germination 1': 'UP:A0SVK0', 'delay of germination1': 'UP:A0SVK0'},
 {'UP:A0SVK0': 'Protein DELAY OF GERMINATION 1', 'HGNC:21625': 'ANO1'},
 ['UP:A0SVK0', 'HGNC:21625'])

In [104]:
# Curated grounding data written out as literals, so the cells below do not
# depend on the interactive GUI step.

# Longform -> grounding.
grounding_map = {
    'delay of germination 1': 'UP:A0SVK0',
    'delay of germination1': 'UP:A0SVK0',
}

# Grounding -> standard name.
names = {
    'UP:A0SVK0': 'Protein DELAY OF GERMINATION 1',
    'HGNC:21625': 'ANO1',
}

# Groundings passed to the classifier as positive labels.
pos_labels = ['UP:A0SVK0', 'HGNC:21625']

In [105]:
# Map the shortform to its longform groundings. Keyed by the `shortform`
# variable rather than a repeated literal so the two cannot drift apart.
grounding_dict = {shortform: grounding_map}

In [106]:
# Classifier for the shortform, with the curated positive labels. Uses the
# `shortform` variable rather than a repeated literal to keep the cells consistent.
classifier = AdeftClassifier(shortform, pos_labels=pos_labels)

In [107]:
# Hyperparameter grid for the cross-validated search. The recorded search log
# reports these under the names logit__C and tfidf__max_features.
param_grid = dict(
    C=[10.0, 100.0],
    max_features=[100, 1000, 10000],
)

In [108]:
# Build a labeler from the grounding dictionary; the next cell uses it to turn
# the shortform texts into a labeled corpus.
labeler = AdeftLabeler(grounding_dict)

In [109]:
# Labeled corpus built from the shortform texts. It is treated below as a
# list of (text, label) pairs: it is extended with such pairs and then unzipped.
corpus = labeler.build_from_texts(shortform_texts)

In [110]:
# Add every ANO1 Entrez text to the corpus under the ANO1 label.
# NOTE: this mutates `corpus` in place, so re-running this cell without
# rebuilding the corpus first adds the ANO1 texts a second time.
corpus.extend((entrez_text, 'HGNC:21625') for entrez_text in ANO1_entrez_texts)

In [111]:
# Separate the corpus into parallel tuples of texts and labels for training.
texts, labels = zip(*corpus)

In [112]:
# Grid search over param_grid with cv=5, spread across 8 parallel jobs.
# NOTE(review): no random seed is passed here, so scores may not reproduce
# exactly between runs — confirm whether the classifier fixes one internally.
classifier.cv(texts, labels, param_grid, cv=5, n_jobs=8)

INFO: [2019-12-17 15:58:38] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Beginning grid search in parameter space:
{'C': [10.0, 100.0], 'max_features': [100, 1000, 10000]}
INFO: [2019-12-17 15:58:46] /Users/albertsteppi/adeft/adeft/modeling/classify.py - Best f1 score of 1.0 found for parameter values:
{'logit__C': 100.0, 'tfidf__max_features': 1000}


In [113]:
# Cross-validation summary: label distribution plus mean/std of f1, precision and recall.
classifier.stats

{'label_distribution': {'UP:A0SVK0': 16, 'HGNC:21625': 126},
 'f1': {'mean': 0.9714285714285715, 'std': 0.05714285714285716},
 'precision': {'mean': 1.0, 'std': 0.0},
 'recall': {'mean': 0.95, 'std': 0.09999999999999999}}

In [114]:
# Combine the trained classifier with the grounding dictionary and the
# standard names into a single disambiguator object.
disamb = AdeftDisambiguator(classifier, grounding_dict, names)

In [123]:
# Write the disambiguator to ../results under the shortform's name. Uses the
# `shortform` variable rather than a repeated literal to keep the cells consistent.
disamb.dump(shortform, '../results')

In [116]:
# Disambiguate every shortform text. Each result is indexed with [0] below to
# obtain the predicted grounding.
disambs = disamb.disambiguate(shortform_texts)

In [124]:
# Texts whose predicted grounding is anything other than ANO1 (HGNC:21625).
a = [
    document
    for document, prediction in zip(shortform_texts, disambs)
    if prediction[0] != 'HGNC:21625'
]

In [125]:
# Number of shortform texts not assigned to ANO1.
len(a)

25

In [127]:
# Spot-check one of the texts not assigned to ANO1.
a[1]

'The salinization of land is a major factor limiting crop production worldwide. Halophytes adapted to high levels of salinity are likely to possess useful genes for improving crop tolerance to salt stress. In addition, halophytes could provide a food source on marginal lands. However, despite halophytes being salt-tolerant plants, the seeds of several halophytic species will not germinate on saline soils. Yet, little is understood regarding biochemical and gene expression changes underlying salt-mediated inhibition of halophyte seed germination. We have used the halophytic  Arabidopsis  relative model system,  Eutrema  ( Thellungiella )  salsugineum  to explore salt-mediated inhibition of germination. We show that  E. salsugineum  seed germination is inhibited by salt to a far greater extent than in  Arabidopsis , and that this inhibition is in response to the osmotic component of salt exposure.  E. salsugineum  seeds remain viable even when germination is completely inhibited, and ger

In [128]:
from adeft.disambiguate import load_disambiguator

In [130]:
# Reload the dumped disambiguator from ../results. Uses the `shortform`
# variable rather than a repeated literal to keep the cells consistent.
disamb = load_disambiguator(shortform, path='../results')

In [131]:
from adeft_indra.s3 import model_to_s3

In [132]:
# Hand the reloaded disambiguator to model_to_s3.
# NOTE(review): presumably this uploads the model to S3, which is an external
# side effect — confirm before re-running the notebook end to end.
model_to_s3(disamb)