In [1]:
from __future__ import print_function
from __future__ import division

from collections import OrderedDict
import os
import sys
import warnings

import argparse
import logging
import h5py as h5
import numpy as np
import pandas as pd
import scipy.io

import six
import csv
from six.moves import range

from sklearn.metrics import roc_auc_score, confusion_matrix, average_precision_score
from keras.preprocessing import sequence
from keras.optimizers import RMSprop,Adam, SGD
from keras.models import Sequential
from keras.layers.core import  Dropout, Activation, Flatten
from keras.regularizers import l1,l2,l1_l2
from keras.constraints import maxnorm
#from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Conv1D, MaxPooling1D, Dense, LSTM, Bidirectional
#from keras.utils import plot_model
#from keras.utils.layer_utils import print_layer_shapes
# fix random seed for reproducibility
np.random.seed(1369)

Using TensorFlow backend.


In [2]:
h5filename = "histonemodKmer_resample_ncl_K562.h5"
h5file = h5.File(h5filename,'r')
input_features = h5file['input/H3K27ac_kmer_2000']
output_H3K27ac = h5file['output/H3K27ac_2000']
input_features = np.array(input_features,dtype='int8')
output_H3K27ac = np.array(output_H3K27ac, dtype='int8')
print(input_features.shape)
print(output_H3K27ac.shape)

(23634, 2080)
(23634,)


In [3]:
output_H3K27ac_reshape = output_H3K27ac.reshape(len(output_H3K27ac),1)
#combine the label with input dna
input_features_label = np.concatenate((input_features,output_H3K27ac_reshape), axis=1)
H3K27ac_df = pd.DataFrame(output_H3K27ac)
pos_label= H3K27ac_df.loc[H3K27ac_df.iloc[:,0]==1]
pos_label_ix = np.array(pos_label.index)
neg_label = H3K27ac_df.loc[H3K27ac_df.iloc[:,0]==0]
neg_label_ix = np.array(neg_label.index)
pos_sam_H3K27ac = input_features_label[pos_label_ix,:]
neg_sam_H3K27ac = input_features_label[neg_label_ix,:]
print('here')
print(pos_label_ix)
print(input_features_label.shape)
print(pos_label.shape)
print(neg_label.shape)
print(pos_sam_H3K27ac.shape)
print(neg_sam_H3K27ac.shape)
print(input_features)

here
[    0     5     6 ..., 23624 23628 23632]
(23634, 2081)
(8231, 1)
(15403, 1)
(8231, 2081)
(15403, 2081)
[[22  4  5 ...,  5  0  2]
 [19  9  8 ...,  7  1  4]
 [46  5  8 ...,  9  1  6]
 ..., 
 [19  2  5 ...,  5  0  3]
 [50  5  6 ...,  4  0  0]
 [12  3  7 ...,  3  2  1]]


In [4]:
#train
train_neg_sample = int(neg_sam_H3K27ac.shape[0] * 0.7)
train_pos_sample = int(pos_sam_H3K27ac.shape[0] * 0.7)
train_neg_H3K27ac = neg_sam_H3K27ac[0:train_neg_sample,:]
train_pos_H3K27ac = pos_sam_H3K27ac[0:train_pos_sample,:]
train_neg_pos_H3K27ac = np.concatenate((train_neg_H3K27ac, train_pos_H3K27ac),axis = 0)
np.random.shuffle(train_neg_pos_H3K27ac)
X_train_H3K27ac = train_neg_pos_H3K27ac[:,0:2080]
Y_train_H3K27ac = train_neg_pos_H3K27ac[:,2080]
frq = np.bincount(Y_train_H3K27ac)
print(frq)
print(X_train_H3K27ac.shape)
print(Y_train_H3K27ac.shape)

[10782  5761]
(16543, 2080)
(16543,)


In [5]:
#val
val_neg_sample = train_neg_sample + int(neg_sam_H3K27ac.shape[0] * 0.1)
val_pos_sample = train_pos_sample + int(pos_sam_H3K27ac.shape[0] * 0.1)
val_neg_H3K27ac = neg_sam_H3K27ac[train_neg_sample:val_neg_sample,:]
val_pos_H3K27ac = pos_sam_H3K27ac [train_pos_sample:val_pos_sample,:]
val_neg_pos_H3K27ac = np.concatenate((val_neg_H3K27ac, val_pos_H3K27ac),axis = 0)
np.random.shuffle(val_neg_pos_H3K27ac)
X_val_H3K27ac = val_neg_pos_H3K27ac[:,0:2080]
Y_val_H3K27ac = val_neg_pos_H3K27ac[:,2080]
frq = np.bincount(Y_val_H3K27ac)
print(frq)
print(X_val_H3K27ac.shape)
print(Y_val_H3K27ac.shape)

[1540  823]
(2363, 2080)
(2363,)


In [6]:
#test
test_neg_H3K27ac = neg_sam_H3K27ac[val_neg_sample:,:]
test_pos_H3K27ac = pos_sam_H3K27ac [val_pos_sample:,:]
test_neg_pos_H3K27ac = np.concatenate((test_neg_H3K27ac, test_pos_H3K27ac),axis = 0)
np.random.shuffle(test_neg_pos_H3K27ac)
X_test_H3K27ac = test_neg_pos_H3K27ac[:,0:2080]
Y_test_H3K27ac = test_neg_pos_H3K27ac[:,2080]
frq = np.bincount(Y_test_H3K27ac)
print(frq)
print(X_test_H3K27ac.shape)
print(Y_test_H3K27ac.shape)

[3081 1647]
(4728, 2080)
(4728,)


In [7]:
h5filename = "histonemodKmer_resample_nclx.h5"
h5file = h5.File(h5filename,'a')
h5file.create_dataset('/input/H3K27ac_ncl_train',data=X_train_H3K27ac, dtype =np.int8, compression ='gzip')
h5file.create_dataset('/input/H3K27ac_ncl_val', data=X_val_H3K27ac, dtype =np.int8, compression ='gzip')
h5file.create_dataset('/input/H3K27ac_ncl_test', data=X_test_H3K27ac, dtype =np.int8, compression ='gzip')

h5file.create_dataset('/output/H3K27ac_ncl_train',data = Y_train_H3K27ac, dtype =np.int8, compression ='gzip')
h5file.create_dataset('/output/H3K27ac_ncl_val', data = Y_val_H3K27ac, dtype =np.int8, compression ='gzip')
h5file.create_dataset('/output/H3K27ac_ncl_test', data = Y_test_H3K27ac, dtype =np.int8, compression ='gzip')


<HDF5 dataset "H3K27ac_ncl_test": shape (5270,), type "|i1">

In [8]:
h5file.close()

In [9]:
h5filename = "histonemodKmer_resample_nclx.h5"
h5file = h5.File(h5filename,'r')
idx = h5file['output/H3K27ac_idx']
idx = np.array(idx)
print(idx.shape)
print(idx[100:150])

(25743,)
[110 111 112 113 114 115 116 117 118 119 120 121 122 124 126 127 129 130
 131 132 133 134 135 136 137 138 139 140 141 142 144 145 146 148 150 151
 152 153 154 155 157 158 159 160 162 163 165 166 167 169]


In [7]:
 model = Sequential()
 #model.add(Conv1D(activation="relu", input_shape=(2080, 1), padding="valid", strides=1, filters=256, kernel_size=11, kernel_initializer='glorot_uniform',kernel_regularizer=l2(0.001)))
 #model.add(MaxPooling1D(pool_size=4))
 #model.add(Dropout(0.6))
 #model.add(Conv1D(activation="relu", padding="valid", strides=1, filters=640, kernel_size=3, kernel_initializer='glorot_uniform', kernel_regularizer=l2(0.001)))
 #model.add(MaxPooling1D(pool_size=2))
 #model.add(Dropout(0.5))
 #model.add(Flatten())
 #model.summary()
 model.add(Dense(units=512, input_dim=2080, activation="relu", kernel_initializer='glorot_uniform'))
 model.add(Dropout(0.5))
 #model.add(Dense(units=512, input_dim=512,  activation="relu", kernel_initializer='glorot_uniform',kernel_regularizer=l2(0.001)))
 #model.add(Dropout(0.5))
 model.add(Dense(units=180, activation="relu",kernel_initializer='glorot_uniform'))
 model.add(Dropout(0.5))
 model.add(Dense(units=70, activation="relu",kernel_initializer='glorot_uniform'))
 model.add(Dense(units=1, activation="sigmoid"))
 model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               1065472   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 180)               92340     
_________________________________________________________________
dropout_2 (Dropout)          (None, 180)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 70)                12670     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 71        
Total params: 1,170,553
Trainable params: 1,170,553
Non-trainable params: 0
_________________________________________________________________


In [8]:
 adam = Adam(lr=0.0001)
 sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
 model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
 print('running at most 60 epochs')
 checkpointer = ModelCheckpoint(filepath="HistoneMark_H3K27ac.hdf5", verbose=1, save_best_only=True)
 earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
 model.fit(X_train_H3K27ac, Y_train_H3K27ac, batch_size=128, epochs=50, shuffle=True, validation_data=( X_val_H3K27ac, Y_val_H3K27ac), callbacks=[checkpointer,earlystopper])
 #model.fit(X_train_s, Y_train_s, batch_size=12, epochs=50, shuffle=True, validation_data=( X_val_s, Y_val_s), callbacks=[checkpointer,earlystopper])
 y_pred = model.predict(X_test_H3K27ac)
 #y_pred = model.predict(X_test_s)
 #tresults = model.evaluate(X_test_s, Y_test_s)
 np.savetxt('H3K27ac_true.csv', Y_test_H3K27ac, delimiter=",")
 np.savetxt('H3K27ac_pred.csv', y_pred, delimiter=",")
 tresults = model.evaluate(X_test_H3K27ac, Y_test_H3K27ac)
 print(tresults)
 model.summary()		
 #print(roc_auc_score(Y_test_s,y_pred))
 print(roc_auc_score(Y_test_H3K27ac, y_pred))
 print(average_precision_score(Y_test_H3K27ac, y_pred))
 y_pred = (y_pred>0.5)
 cm = confusion_matrix(Y_test_H3K27ac, y_pred)
 print(cm)
 

running at most 60 epochs
Train on 16543 samples, validate on 2363 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.39433, saving model to HistoneMark_H3K27ac.hdf5
Epoch 2/50
Epoch 00002: val_loss improved from 0.39433 to 0.33841, saving model to HistoneMark_H3K27ac.hdf5
Epoch 3/50
Epoch 00003: val_loss improved from 0.33841 to 0.32625, saving model to HistoneMark_H3K27ac.hdf5
Epoch 4/50
Epoch 00004: val_loss improved from 0.32625 to 0.30698, saving model to HistoneMark_H3K27ac.hdf5
Epoch 5/50
Epoch 00005: val_loss improved from 0.30698 to 0.29916, saving model to HistoneMark_H3K27ac.hdf5
Epoch 6/50
Epoch 00006: val_loss improved from 0.29916 to 0.29773, saving model to HistoneMark_H3K27ac.hdf5
Epoch 7/50
Epoch 00007: val_loss improved from 0.29773 to 0.29384, saving model to HistoneMark_H3K27ac.hdf5
Epoch 8/50
Epoch 00008: val_loss improved from 0.29384 to 0.28709, saving model to HistoneMark_H3K27ac.hdf5
Epoch 9/50
Epoch 00009: val_loss did not improve
Epoch 10/50
Epoc

Epoch 28/50
Epoch 00028: val_loss did not improve
Epoch 29/50
Epoch 00029: val_loss did not improve
Epoch 30/50
Epoch 00030: val_loss did not improve
Epoch 31/50
Epoch 00031: val_loss did not improve
Epoch 32/50
Epoch 00032: val_loss did not improve
Epoch 00032: early stopping
[0.33261329770189052, 0.89149746202978786]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               1065472   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 180)               92340     
_________________________________________________________________
dropout_2 (Dropout)          (None, 180)               0         
_________________________________________________________________
dense_3 (Dense)    