In [1]:
import h5py
import numpy as np
import deeplift
import evautils
from evautils import sequtils
from evautils import kerasutils
from evautils import dirutils
from evautils import windowscoringutils
from evautils import impscoringutils
from __future__ import print_function
from collections import OrderedDict, defaultdict
import os

Using TensorFlow backend.


In [2]:
# --- Experiment configuration ------------------------------------------------
# Region size and cell line; together they form the filename prefix below.
REGION_SIZE = 400
CELL_LINE = 'H1ESC'

# Shared filename prefix for this run ("<cell line>_<region size>").
POS_PREFIX = '{}_{}'.format(CELL_LINE, REGION_SIZE)

# Root directory of the benchmarking tree and the two subdirectories used here.
MASTER_DIR = '/users/eprakash/benchmarking/H1ESC/400bp_hg38'
DL_BASE_DIR = '/'.join([MASTER_DIR, 'deeplift'])
PREPROCESSING_BASE_DIR = '/'.join([MASTER_DIR, 'preprocessing'])

# Files read by the cells below: two HDF5 files under the deeplift directory
# and one text file of motif matches under the preprocessing directory.
SCORES_FILE = '{}/{}_dl_scores.h5'.format(DL_BASE_DIR, POS_PREFIX)
TOP_POS_LABELS = '{}/{}_top_1k_pos_labels.h5'.format(DL_BASE_DIR, POS_PREFIX)
MOTIF_MATCHES = '{}/{}_motif_matches.txt'.format(PREPROCESSING_BASE_DIR, POS_PREFIX)

In [3]:
# Open the scores file read-only. The handle is intentionally left open here:
# the next cell reads the per-method score datasets from it and then closes it.
h5f = h5py.File(SCORES_FILE, 'r')

# Materialise the "labels" dataset as a NumPy array and record its length
# before handing the labels to sequtils.renameAll.
raw_labels = np.array(h5f.get("labels"))
size = len(raw_labels)
labels = sequtils.renameAll(raw_labels)

In [4]:
def _read_scores(dataset_name):
    """Return the named dataset of the open scores file as a NumPy array."""
    return np.array(h5f.get(dataset_name))


# One array per attribution method stored in the scores file.
grad_times_input_scores = _read_scores("grad_times_inp_all_zeros_ref")
rescale_all_multiref_scores = _read_scores("rescale_all_layers_multiref_10")
rescale_all_avg_gc_scores = _read_scores("rescale_all_layers_avg_gc_ref")
rescale_conv_scores = _read_scores("rescale_conv_revealcancel_fc_multiref_10")
ig_multiref_scores = _read_scores("integrated_gradients10_multiref_10")
ig_avg_gc_scores = _read_scores("integrated_gradients10_avg_gc_ref")

# Everything needed has been copied into memory; release the file handle.
h5f.close()

In [5]:
# seq_len is the size of axis 1 of the grad*input score array.
seq_len = grad_times_input_scores.shape[1]

# Load the motif matches file (the meaning of the second positional argument,
# True, is defined by sequtils.load_motif_matches — TODO confirm).
motif_matches = sequtils.load_motif_matches(MOTIF_MATCHES, True)

# The keys of motif_matches, passed through sequtils.renameAll, are the
# sequence ids of interest; print how many there are.
seq_ids_of_interest = sequtils.renameAll(motif_matches.keys())
print(len(seq_ids_of_interest))

#Loading /users/eprakash/benchmarking/H1ESC/400bp_hg38/preprocessing/H1ESC_400_motif_matches.txt ...
#Loaded 3422379 motif matches in 96662 sequences
96662


In [6]:
# Determine which labels are relevant given the motif matches; the helper
# returns both the indices and the corresponding labels. From here on the
# "sequence ids of interest" are exactly those relevant labels.
# (A previous assignment of seq_ids_of_interest_set from the unfiltered ids was
# dead code -- it was overwritten below before ever being read -- and has been
# removed.)
relevant_indices_list, relevant_labels_list = sequtils.get_relevant_labels_in_order_of_scores(labels, motif_matches)
seq_ids_of_interest = relevant_labels_list
seq_ids_of_interest_set = set(relevant_labels_list)

# Read the labels stored in the top-positives file. The context manager closes
# the HDF5 handle as soon as the labels have been copied into a Python list;
# previously the file was opened and never closed.
with h5py.File(TOP_POS_LABELS, 'r') as ig_h5f:
    ig_seq_ids_of_interest = list(ig_h5f.get("labels"))
ig_seq_ids_of_interest = sequtils.renameAll(ig_seq_ids_of_interest)

# Restrict each score array to the relevant indices.
# NOTE(review): these assignments overwrite their own inputs, so re-running
# this cell feeds already-filtered arrays back into get_relevant_scores;
# re-run the score-loading cell first if this cell must be repeated.
# NOTE(review): the two integrated-gradients arrays are not filtered here;
# presumably they are aligned with ig_seq_ids_of_interest instead -- confirm.
grad_times_input_scores = sequtils.get_relevant_scores(relevant_indices_list, grad_times_input_scores, seq_len)
rescale_all_multiref_scores = sequtils.get_relevant_scores(relevant_indices_list, rescale_all_multiref_scores, seq_len)
rescale_all_avg_gc_scores = sequtils.get_relevant_scores(relevant_indices_list, rescale_all_avg_gc_scores, seq_len)
rescale_conv_scores = sequtils.get_relevant_scores(relevant_indices_list, rescale_conv_scores, seq_len)

Motif matches sequences are 96662
Supplied labels are 96660
731
96660
96660


In [7]:
# Map each attribution method name to its score array. Insertion order is kept
# by the OrderedDict and is the order the methods are iterated in later cells.
method_to_saved_scores = OrderedDict()
method_to_saved_scores['grad_times_input'] = grad_times_input_scores
method_to_saved_scores['rescale_all_multiref'] = rescale_all_multiref_scores
method_to_saved_scores['rescale_all_avg_gc_ref'] = rescale_all_avg_gc_scores
method_to_saved_scores['rescale_conv'] = rescale_conv_scores
method_to_saved_scores['integrated_gradients_avg_gc_ref'] = ig_avg_gc_scores
method_to_saved_scores['integrated_gradients_multiref'] = ig_multiref_scores

# Empty containers handed to windowScoring. Its return value is not used, so
# presumably it fills these in place (later cells read them) -- TODO confirm.
method_to_seq_id_to_scores = {}
seq_id_to_covered_positions = {}
motif_id_to_hit_locations = defaultdict(list)
motif_id_to_motif_length = {}
motif_len_to_negatives = defaultdict(list)

windowscoringutils.windowScoring(
    method_to_saved_scores,
    seq_len,
    seq_ids_of_interest,
    motif_matches,
    relevant_labels_list,
    ig_seq_ids_of_interest,
    method_to_seq_id_to_scores,
    seq_id_to_covered_positions,
    motif_id_to_hit_locations,
    motif_id_to_motif_length,
    motif_len_to_negatives,
)

96660
96660
96660
96660
96660
96660
96660
96660
96660
96660
Motif positions: 172514.0, total positions: 400000


In [8]:
# Empty dict handed to motifToPosLocs together with the hit locations and
# motif lengths; the call's return value is not used, so the dict is presumably
# filled in place -- TODO confirm against windowscoringutils.
motif_id_to_pos_locs = dict()
windowscoringutils.motifToPosLocs(
    motif_id_to_pos_locs,
    motif_id_to_hit_locations,
    motif_id_to_motif_length,
)

# Motif ids selected by topEnrichedMotifs from the mapping above.
top_motif_ids = windowscoringutils.topEnrichedMotifs(motif_id_to_pos_locs)

73


In [9]:
# Method names, in the insertion order of method_to_saved_scores.
method_list = method_to_saved_scores.keys()

# One single-element group per method.
methods_to_plot_list = [[method] for method in method_list]

windowscoringutils.displayResults(
    method_list,
    top_motif_ids,
    methods_to_plot_list,
    motif_id_to_motif_length,
    motif_id_to_hit_locations,
    motif_len_to_negatives,
    method_to_seq_id_to_scores,
    motif_id_to_pos_locs,
)

Number of motifs is 73


grad_times_input

CACTAGRGGG: [0.9895176887735505, 0.967080865342425, 1027.0]
CCACTAGGGGGC: [0.9895830643507011, 0.9657063207642193, 1100.0]
CVCCTAGYGG: [0.9820275439229714, 0.9445872641166663, 887.0]
CGCCMCCT: [0.9750375883386176, 0.9055685828010883, 999.0]
AGGGGGCGCTGT: [0.9549368016051302, 0.8592467366886486, 662.0]
AGRGGGCGGK: [0.9102726194271067, 0.6679440362054263, 966.0]
CGCTAGAGGCCH: [0.8209938712618737, 0.5668712703341767, 575.0]
SSCGCTAG: [0.8509608484206062, 0.5502481240270839, 1403.0]
GCGCCTTA: [0.8591011446112511, 0.39778326745967896, 1264.0]
AAGATGGCGGCG: [0.7625698684482198, 0.3511325047660021, 9.0]
GAGCGCGCGCGC: [0.681352000965586, 0.3356908001558413, 182.0]
HGCGSMDNHD: [0.7529353319759485, 0.2771622317047614, 584.0]
CCAATCGG: [0.7235394932324198, 0.266472169231169, 1002.0]
GRCMHCGCCY: [0.7532702323202153, 0.17078084044509623, 151.0]
NAGCGCGCGN: [0.6281952200619992, 0.13755605231434817, 354.0]
CGCCGCTCTA: [0.6906754331097492, 0.12652576938719487