In [1]:
import numpy as np
import deeplift
import evautils
from evautils import sequtils
from evautils import kerasutils
from evautils import dirutils
from evautils import impscoringutils
from __future__ import print_function
from collections import OrderedDict
import os
import h5py

Using TensorFlow backend.


In [2]:
# Restrict CUDA-based libraries in this process to GPU index 1.
# NOTE(review): this cell runs after the TensorFlow-backed imports above;
# presumably it still takes effect because no GPU device has been
# initialised yet -- confirm, or move it ahead of those imports.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [3]:
# --- Experiment identity -------------------------------------------------
REGION_SIZE = 400
CELL_LINE = 'H1ESC'
# Prefix shared by the input and output file names, e.g. "H1ESC_400".
POS_PREFIX = '%s_%s' % (CELL_LINE, str(REGION_SIZE))

# --- Directory layout (all rooted at MASTER_DIR) --------------------------
MASTER_DIR = '/users/eprakash/benchmarking/H1ESC/400bp_hg38'
DL_BASE_DIR = '/'.join([MASTER_DIR, 'deeplift'])
PREPROCESSING_BASE_DIR = '/'.join([MASTER_DIR, 'preprocessing'])
TRAINING_BASE_DIR = '/'.join([MASTER_DIR, 'training'])
MOMMA_DRAGONN = '/'.join([TRAINING_BASE_DIR, 'momma_dragonn'])

# --- Input data -----------------------------------------------------------
IMPLANTED_POS_BED_FILE = '/'.join(
    [PREPROCESSING_BASE_DIR, 'implanted_' + POS_PREFIX + '.bed.gz']
)

# --- Trained model artefacts (architecture JSON + weights) ----------------
MODEL_PREFIX = 'record_1_model_XHjBt_'
MODEL = '{}/examples/fasta_sequential_model/model_files/{}modelJson.json'.format(
    MOMMA_DRAGONN, MODEL_PREFIX
)
WEIGHTS = '{}/examples/fasta_sequential_model/model_files/{}modelWeights.h5'.format(
    MOMMA_DRAGONN, MODEL_PREFIX
)

In [4]:
# Make sure the DeepLIFT output directory is present before anything is
# written to it. With mustcreate=False an already-existing directory is only
# reported (see the recorded output of this cell), not treated as an error.
dirutils.createDir(DL_BASE_DIR, mustcreate=False)

Directory /users/eprakash/benchmarking/H1ESC/400bp_hg38/deeplift already exists


In [5]:
# Load the implanted positive regions. The loader's result is used as a
# mapping below (it is queried with .keys() / .values()).
data_filename_positive = IMPLANTED_POS_BED_FILE
labeled_sequences = sequtils.load_sequences_from_bedfile(data_filename_positive)
print("Got %d positive sequences" % len(labeled_sequences))

# Materialise keys/values as real lists. On Python 2 this is what .keys() /
# .values() already return; on Python 3 they are live views that cannot be
# indexed or edited and that change whenever labeled_sequences changes, which
# would break the next cell (it receives all three objects).
# NOTE(review): positive_labels is not referenced in the visible cells; it is
# kept in case other code relies on it.
positive_labels = list(labeled_sequences.keys())
labels = list(labeled_sequences.keys())
sequences = list(labeled_sequences.values())
print("Sequences length: ", len(sequences))

#Loading /users/eprakash/benchmarking/H1ESC/400bp_hg38/preprocessing/implanted_H1ESC_400.bed.gz ...
#Loaded 96663 sequences from /users/eprakash/benchmarking/H1ESC/400bp_hg38/preprocessing/implanted_H1ESC_400.bed.gz
Got 96663 positive sequences
Sequences length:  96663


In [None]:
# Clean the loaded data of sequences containing unsupported characters.
# NOTE(review): the return value is discarded, so this presumably edits
# `sequences`, `labels` and `labeled_sequences` in place -- confirm against
# sequtils.removeUnsupportedChars. If so, re-running this cell without
# re-running the loading cell operates on already-filtered data.
sequtils.removeUnsupportedChars(sequences, labels, labeled_sequences)

In [None]:
# One-hot encode every loaded sequence and stack the per-sequence encodings
# into a single array that the model can consume.
onehot_data = np.array(
    list(map(sequtils.one_hot_encode_along_channel_axis, sequences))
)
print(onehot_data.shape)

In [None]:
# Rebuild the trained Keras model from its architecture JSON (MODEL) and its
# weights file (WEIGHTS), then print the layer summary for inspection.
keras_model = kerasutils.load_keras_model_using_json(MODEL, WEIGHTS)
keras_model.summary()

In [None]:
# Run the model over every encoded sequence; these predictions are used in the
# next cell to pick the top-scoring positives.
preds = keras_model.predict(onehot_data)
# Bare last expression so the notebook displays the prediction array's shape.
preds.shape

In [None]:
# Select the 5000 positives to score, together with their labels.
# NOTE(review): ranking is presumably by `preds` (highest first) -- confirm
# against kerasutils.getTopNumPos. The 5000 matches the "5K" in the output
# file names used later in the notebook.
top_5k_pos_labels, top_pos_seqs = kerasutils.getTopNumPos(
    labels, labeled_sequences, preds, 5000
)

In [None]:
# Persist the labels of the selected sequences to their own HDF5 file.
# The context manager guarantees the file is closed (and flushed) even if
# create_dataset raises, instead of leaving a dangling open handle.
# Mode 'w' truncates any existing file at this path.
with h5py.File(DL_BASE_DIR+'/'+POS_PREFIX+'_top_5K_pos_labels.h5', 'w') as h5f:
    h5f.create_dataset('labels', data=top_5k_pos_labels)

In [None]:
# Re-encode only the selected top positive sequences. This rebinds
# `onehot_data`, so from here on it holds the restricted set rather than the
# full data set encoded earlier.
onehot_data = np.array(
    list(map(sequtils.one_hot_encode_along_channel_axis, top_pos_seqs))
)
print(onehot_data.shape)

In [None]:
# Build the per-method models; the result is indexed by method name in the
# following cells.
# NOTE(review): the arguments are passed weights-first here, the opposite of
# load_keras_model_using_json(MODEL, WEIGHTS) above -- confirm this matches
# prepareDLModel's signature.
method_to_model = kerasutils.prepareDLModel(WEIGHTS, MODEL)

In [None]:
# Sanity check: make sure the converted model reproduces the predictions of
# the original Keras model on the restricted one-hot data.
# NOTE(review): compile_func is not referenced in this cell; the import is
# left in place in case kerasutils or a later cell relies on it.
from deeplift.util import compile_func
# Any converted model would do for the check; this variant is used.
model_to_test = method_to_model['rescale_conv_revealcancel_fc']
kerasutils.sanityCheck(model_to_test, onehot_data, keras_model)

In [None]:
# Build the scoring functions from the converted models; the result is passed
# to every scoring call in the cells below.
method_to_scoring_func = impscoringutils.compileScoringFunctions(method_to_model)
# Progress marker only: this cell may take a while to finish.
print("Done!")

In [None]:
# Accumulator handed to the scoring helpers below; each scoring cell reads it
# back, stores its contents and then clears it.
method_to_task_to_scores = OrderedDict()

# Groups of method names, one group per scoring call in the cells below.
all_zeroes_methods = ['grad_times_inp', 'rescale_all_layers']
avg_gc_methods = ['rescale_all_layers']
multiref_methods = ['rescale_all_layers', 'rescale_conv_revealcancel_fc']
ig = ['integrated_gradients10']

# Output file for all scores. Opened with 'w' (truncates an existing file) and
# intentionally left open across the following cells; it is closed in the last
# cell of the notebook, so that cell must run for the data to be finalised.
h5f = h5py.File('/'.join([DL_BASE_DIR, POS_PREFIX + '_dl_scores_5K.h5']), 'w')
h5f.create_dataset("labels", data=top_5k_pos_labels)

In [None]:
# Score the integrated-gradients method(s) against a flat reference.
# NOTE(review): the trailing 0 presumably selects the all-zeros reference (the
# same value is used with all_zeroes_methods below) -- confirm in
# impscoringutils.flatRefScore.
impscoringutils.flatRefScore(
    method_to_task_to_scores, method_to_scoring_func, ig, onehot_data, 0
)
print("Done!")

# Store entry 0 of each method's results in the open HDF5 file, then empty the
# accumulator so the next scoring cell starts clean.
# NOTE(review): create_dataset fails on a name that already exists, so the keys
# written here must differ from those of the other scoring cells -- confirm
# the helper makes them unique per reference.
for meth in method_to_task_to_scores:
    print("Storing scores for " + str(meth))
    h5f.create_dataset(meth, data=method_to_task_to_scores[meth][0])
method_to_task_to_scores.clear()

In [None]:
# Score the methods listed in all_zeroes_methods against a flat reference.
# NOTE(review): the trailing 0 presumably selects the all-zeros reference --
# confirm in impscoringutils.flatRefScore.
impscoringutils.flatRefScore(
    method_to_task_to_scores,
    method_to_scoring_func,
    all_zeroes_methods,
    onehot_data,
    0,
)
print("Done!")

# Store entry 0 of each method's results in the open HDF5 file, then empty the
# accumulator so the next scoring cell starts clean.
for meth in method_to_task_to_scores:
    print("Storing scores for " + str(meth))
    h5f.create_dataset(meth, data=method_to_task_to_scores[meth][0])
method_to_task_to_scores.clear()

In [None]:
# Score the integrated-gradients method(s) again, with flat-reference option 1.
# NOTE(review): the trailing 1 presumably selects the average-GC reference (the
# same value is used with avg_gc_methods below) -- confirm in
# impscoringutils.flatRefScore.
impscoringutils.flatRefScore(
    method_to_task_to_scores, method_to_scoring_func, ig, onehot_data, 1
)
print("Done!")

# Store entry 0 of each method's results in the open HDF5 file, then empty the
# accumulator so the next scoring cell starts clean.
# NOTE(review): the same method list was already scored with option 0; the
# keys written here must differ from those earlier ones or create_dataset will
# fail on the duplicate name -- confirm.
for meth in method_to_task_to_scores:
    print("Storing scores for " + str(meth))
    h5f.create_dataset(meth, data=method_to_task_to_scores[meth][0])
method_to_task_to_scores.clear()

In [None]:
# Score the methods listed in avg_gc_methods with flat-reference option 1.
# NOTE(review): the trailing 1 presumably selects the average-GC reference --
# confirm in impscoringutils.flatRefScore.
impscoringutils.flatRefScore(
    method_to_task_to_scores,
    method_to_scoring_func,
    avg_gc_methods,
    onehot_data,
    1,
)
print("Done!")

# Store entry 0 of each method's results in the open HDF5 file, then empty the
# accumulator so the next scoring cell starts clean.
for meth in method_to_task_to_scores:
    print("Storing scores for " + str(meth))
    h5f.create_dataset(meth, data=method_to_task_to_scores[meth][0])
method_to_task_to_scores.clear()

In [None]:
# Multi-reference scoring of the integrated-gradients method(s). Unlike the
# flat-reference calls, this one receives the raw top sequences rather than
# the one-hot array.
impscoringutils.multirefScore(
    method_to_task_to_scores, method_to_scoring_func, ig, top_pos_seqs
)
print("Done!")

# Store entry 0 of each method's results in the open HDF5 file, then empty the
# accumulator so the next scoring cell starts clean.
for meth in method_to_task_to_scores:
    print("Storing scores for " + str(meth))
    h5f.create_dataset(meth, data=method_to_task_to_scores[meth][0])
method_to_task_to_scores.clear()

In [None]:
# Multi-reference scoring of the methods listed in multiref_methods, using the
# raw top sequences. The results stay in method_to_task_to_scores and are
# written to disk by the next cell.
impscoringutils.multirefScore(method_to_task_to_scores, method_to_scoring_func, multiref_methods, top_pos_seqs)
# Progress marker only.
print("Done!")

In [None]:
# Write the last batch of scores, then close the HDF5 file that has been open
# since the scoring setup cell.
# The try/finally guarantees the handle is released (and everything already
# written is flushed) even when create_dataset raises, e.g. on a duplicate
# dataset name; the exception still propagates to the notebook.
try:
    for meth in method_to_task_to_scores.keys():
        print("Storing scores for " + str(meth))
        # Entry 0 of each method's results is what gets stored.
        h5f.create_dataset(meth, data=method_to_task_to_scores[meth][0])
finally:
    h5f.close()
method_to_task_to_scores.clear()