In [228]:
from __future__ import print_function
from __future__ import division

from collections import OrderedDict
import os
import sys
import warnings

import argparse
import logging
import h5py as h5
import numpy as np
import pandas as pd
import scipy.io

import six
import csv
from six.moves import range

from sklearn.metrics import roc_auc_score, confusion_matrix, average_precision_score
from keras.preprocessing import sequence
from keras.optimizers import RMSprop,Adam, SGD
from keras.models import Sequential
from keras.layers.core import  Dropout, Activation, Flatten
from keras.regularizers import l1,l2,l1_l2
from keras.constraints import maxnorm
#from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Conv1D, MaxPooling1D, Dense, LSTM, Bidirectional
#from keras.utils import plot_model
#from keras.utils.layer_utils import print_layer_shapes
# fix random seed for reproducibility
np.random.seed(1369)

In [229]:
h5filename = "histonemodKmer_resample_ncl_GM12878.h5"
h5file = h5.File(h5filename,'r')
input_features = h5file['input/H3K4me3_kmer_2000']
output_H3K4me3 = h5file['output/H3K4me3_2000']
input_features = np.array(input_features,dtype='int8')
output_H3K4me3 = np.array(output_H3K4me3, dtype='int8')

In [230]:
output_H3K4me3_reshape = output_H3K4me3.reshape(len(output_H3K4me3),1)
#combine the label with input dna
input_features_label = np.concatenate((input_features,output_H3K4me3_reshape), axis=1)
H3K4me3_df = pd.DataFrame(output_H3K4me3)
pos_label= H3K4me3_df.loc[H3K4me3_df.iloc[:,0]==1]
pos_label_ix = np.array(pos_label.index)
neg_label = H3K4me3_df.loc[H3K4me3_df.iloc[:,0]==0]
neg_label_ix = np.array(neg_label.index)
pos_sam_H3K4me3 = input_features_label[pos_label_ix,:]
neg_sam_H3K4me3 = input_features_label[neg_label_ix,:]
np.random.shuffle(pos_sam_H3K4me3)
np.random.shuffle(neg_sam_H3K4me3)
print('here')
print(pos_label_ix)
print(input_features_label.shape)
print(pos_label.shape)
print(neg_label.shape)
print(pos_sam_H3K4me3.shape)
print(neg_sam_H3K4me3.shape)
print(input_features)

here
[    0     4     6 ..., 23643 23647 23651]
(23652, 2081)
(9150, 1)
(14502, 1)
(9150, 2081)
(14502, 2081)
[[44  4  6 ...,  2  2  1]
 [21  8  9 ..., 12  0  7]
 [13  6 16 ...,  7  2  5]
 ..., 
 [15  5 10 ...,  1  2  5]
 [17  4  5 ...,  3  1  6]
 [22  9 10 ...,  4  1  6]]


In [231]:
#train
train_sample_neg = int(neg_sam_H3K4me3.shape[0] * 0.7)
train_sample_pos = int(pos_sam_H3K4me3.shape[0] * 0.7)
train_neg_H3K4me3 = neg_sam_H3K4me3[0:train_sample_neg,:]
train_pos_H3K4me3 = pos_sam_H3K4me3[0:train_sample_pos,:]
train_neg_pos_H3K4me3 = np.concatenate((train_neg_H3K4me3, train_pos_H3K4me3),axis = 0)
np.random.shuffle(train_neg_pos_H3K4me3)
X_train_H3K4me3 = train_neg_pos_H3K4me3[:,0:2080]
Y_train_H3K4me3 = train_neg_pos_H3K4me3[:,2080]
frq = np.bincount(Y_train_H3K4me3)
print(frq)
print(X_train_H3K4me3.shape)
print(Y_train_H3K4me3.shape)

[10151  6405]
(16556, 2080)
(16556,)


In [232]:
#val
val_sample_neg = train_sample_neg + int(neg_sam_H3K4me3.shape[0]*0.1)
val_sample_pos = train_sample_pos + int(pos_sam_H3K4me3.shape[0]*0.1)
val_neg_H3K4me3 = neg_sam_H3K4me3[train_sample_neg:val_sample_neg,:]
val_pos_H3K4me3 = pos_sam_H3K4me3 [train_sample_pos:val_sample_pos,:]
val_neg_pos_H3K4me3 = np.concatenate((val_neg_H3K4me3, val_pos_H3K4me3),axis = 0)
np.random.shuffle(val_neg_pos_H3K4me3)
X_val_H3K4me3 = val_neg_pos_H3K4me3[:,0:2080]
Y_val_H3K4me3 = val_neg_pos_H3K4me3[:,2080]
frq = np.bincount(Y_val_H3K4me3)
print(frq)
print(X_val_H3K4me3.shape)
print(Y_val_H3K4me3.shape)

[1450  915]
(2365, 2080)
(2365,)


In [233]:
#test
test_neg_H3K4me3 = neg_sam_H3K4me3[val_sample_neg:,:]
test_pos_H3K4me3 = pos_sam_H3K4me3 [val_sample_pos:,:]
test_neg_pos_H3K4me3 = np.concatenate((test_neg_H3K4me3, test_pos_H3K4me3),axis = 0)
np.random.shuffle(test_neg_pos_H3K4me3)
X_test_H3K4me3 = test_neg_pos_H3K4me3[:,0:2080]
Y_test_H3K4me3 = test_neg_pos_H3K4me3[:,2080]
frq = np.bincount(Y_test_H3K4me3)
print(frq)
print(X_test_H3K4me3.shape)
print(Y_test_H3K4me3.shape)

[2901 1830]
(4731, 2080)
(4731,)


In [234]:
 model = Sequential()
 #model.add(Conv1D(activation="relu", input_shape=(2080, 1), padding="valid", strides=1, filters=256, kernel_size=11, kernel_initializer='glorot_uniform',kernel_regularizer=l2(0.001)))
 #model.add(MaxPooling1D(pool_size=4))
 #model.add(Dropout(0.6))
 #model.add(Conv1D(activation="relu", padding="valid", strides=1, filters=640, kernel_size=3, kernel_initializer='glorot_uniform', kernel_regularizer=l2(0.001)))
 #model.add(MaxPooling1D(pool_size=2))
 #model.add(Dropout(0.5))
 #model.add(Flatten())
 #model.summary()
 model.add(Dense(units=512, input_dim=2080, activation="relu", kernel_initializer='glorot_uniform'))
 model.add(Dropout(0.5))
 #model.add(Dense(units=512, input_dim=512,  activation="relu", kernel_initializer='glorot_uniform',kernel_regularizer=l2(0.001)))
 #model.add(Dropout(0.5))
 model.add(Dense(units=180, activation="relu",kernel_initializer='glorot_uniform'))
 model.add(Dropout(0.5))
 model.add(Dense(units=70, activation="relu",kernel_initializer='glorot_uniform'))
 model.add(Dense(units=1, activation="sigmoid"))
 model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_113 (Dense)            (None, 512)               1065472   
_________________________________________________________________
dropout_57 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_114 (Dense)            (None, 180)               92340     
_________________________________________________________________
dropout_58 (Dropout)         (None, 180)               0         
_________________________________________________________________
dense_115 (Dense)            (None, 70)                12670     
_________________________________________________________________
dense_116 (Dense)            (None, 1)                 71        
Total params: 1,170,553
Trainable params: 1,170,553
Non-trainable params: 0
_________________________________________________________________


In [235]:
 adam = Adam(lr=0.0001)
 sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
 model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
 print('running at most 60 epochs')
 checkpointer = ModelCheckpoint(filepath="HistoneMark_H3K9ac_K562.hdf5", verbose=1, save_best_only=True)
 earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
 model.fit(X_train_H3K4me3, Y_train_H3K4me3, batch_size=128, epochs=50, shuffle=True, validation_data=( X_val_H3K4me3, Y_val_H3K4me3), callbacks=[checkpointer,earlystopper])
 #model.fit(X_train_s, Y_train_s, batch_size=12, epochs=50, shuffle=True, validation_data=( X_val_s, Y_val_s), callbacks=[checkpointer,earlystopper])
 y_pred = model.predict(X_test_H3K4me3)
 #np.savetxt('H3K27ac_true.csv', Y_test_H3K4me3, delimiter=",")
 #np.savetxt('H3K27ac_pred.csv', y_pred, delimiter=",")
 #y_pred = model.predict(X_test_s)
 #tresults = model.evaluate(X_test_s, Y_test_s)
 tresults = model.evaluate(X_test_H3K4me3, Y_test_H3K4me3)
 print(tresults)
 model.summary()
 #print(roc_auc_score(Y_test_s,y_pred))
 print(roc_auc_score(Y_test_H3K4me3, y_pred))
 print(average_precision_score(Y_test_H3K4me3, y_pred))
 y_pred = (y_pred>0.5)
 cm = confusion_matrix(Y_test_H3K4me3, y_pred)
 print(cm)

running at most 60 epochs
Train on 16556 samples, validate on 2365 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.36314, saving model to HistoneMark_H3K9ac_K562.hdf5
Epoch 2/50
Epoch 00002: val_loss improved from 0.36314 to 0.32867, saving model to HistoneMark_H3K9ac_K562.hdf5
Epoch 3/50
Epoch 00003: val_loss improved from 0.32867 to 0.30063, saving model to HistoneMark_H3K9ac_K562.hdf5
Epoch 4/50
Epoch 00004: val_loss improved from 0.30063 to 0.29303, saving model to HistoneMark_H3K9ac_K562.hdf5
Epoch 5/50
Epoch 00005: val_loss improved from 0.29303 to 0.28371, saving model to HistoneMark_H3K9ac_K562.hdf5
Epoch 6/50
Epoch 00006: val_loss did not improve
Epoch 7/50
Epoch 00007: val_loss improved from 0.28371 to 0.27194, saving model to HistoneMark_H3K9ac_K562.hdf5
Epoch 8/50
Epoch 00008: val_loss did not improve
Epoch 9/50
Epoch 00009: val_loss improved from 0.27194 to 0.26345, saving model to HistoneMark_H3K9ac_K562.hdf5
Epoch 10/50
Epoch 00010: val_loss did not impro

Epoch 29/50
Epoch 00029: val_loss did not improve
Epoch 30/50
Epoch 00030: val_loss did not improve
Epoch 31/50
Epoch 00031: val_loss did not improve
Epoch 00031: early stopping
[0.28756740965820021, 0.89410272660806756]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_113 (Dense)            (None, 512)               1065472   
_________________________________________________________________
dropout_57 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_114 (Dense)            (None, 180)               92340     
_________________________________________________________________
dropout_58 (Dropout)         (None, 180)               0         
_________________________________________________________________
dense_115 (Dense)            (None, 70)                12670     
_____________________________________________________