In [4]:
from __future__ import print_function
from __future__ import division

from collections import OrderedDict
import os
import sys
import warnings

import argparse
import logging
import h5py as h5
import numpy as np
import pandas as pd
import scipy.io

import six
import csv
from six.moves import range

from sklearn.metrics import roc_auc_score, confusion_matrix, average_precision_score
from keras.preprocessing import sequence
from keras.optimizers import RMSprop,Adam, SGD
from keras.models import Sequential
from keras.layers.core import  Dropout, Activation, Flatten
from keras.regularizers import l1,l2,l1_l2
from keras.constraints import maxnorm
#from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Conv1D, MaxPooling1D, Dense, LSTM, Bidirectional
#from keras.utils import plot_model
#from keras.utils.layer_utils import print_layer_shapes
# Seed NumPy's global random generator so that the np.random.shuffle calls
# used later to mix the train/val/test rows produce the same order on every run.
np.random.seed(1369)

In [5]:
# Load the H3K27me3 feature matrix and its label vector into memory as int8 arrays.
h5filename = "histonemodKmer_resample_ncl.h5"
# np.array() copies the data out of the HDF5 datasets, so the read-only handle
# can be released as soon as both arrays exist; the context manager guarantees
# the file is closed even if one of the reads raises.
with h5.File(h5filename, 'r') as h5file:
    input_features = np.array(h5file['input/H3K27me3_kmer_1000'], dtype='int8')
    output_H3K27me3 = np.array(h5file['output/H3K27me3_1000'], dtype='int8')
print(input_features.shape)
print(output_H3K27me3.shape)

(26262, 2080)
(26262,)


In [6]:
# Turn the label vector into a single column and glue it onto the right-hand
# side of the feature matrix, so every row carries its own label.
output_H3K27me3_reshape = np.reshape(output_H3K27me3, (output_H3K27me3.shape[0], 1))
input_features_label = np.hstack((input_features, output_H3K27me3_reshape))

# Use a one-column DataFrame of labels to find the row positions of each class.
H3K27me3_df = pd.DataFrame(output_H3K27me3)
pos_label = H3K27me3_df[H3K27me3_df[0] == 1]
neg_label = H3K27me3_df[H3K27me3_df[0] == 0]
pos_label_ix = np.array(pos_label.index)
neg_label_ix = np.array(neg_label.index)

# Pull the labelled rows of each class out of the combined matrix.
pos_sam_H3K27me3 = input_features_label[pos_label_ix]
neg_sam_H3K27me3 = input_features_label[neg_label_ix]

# Sanity output: marker line, positive row indices, shapes, and the raw features.
for shown in (
    'here',
    pos_label_ix,
    input_features_label.shape,
    pos_label.shape,
    neg_label.shape,
    pos_sam_H3K27me3.shape,
    neg_sam_H3K27me3.shape,
    input_features,
):
    print(shown)

here
[    0     8    41 ..., 26196 26217 26239]
(26262, 2081)
(2470, 1)
(23792, 1)
(2470, 2081)
(23792, 2081)
[[ 2  1  2 ...,  1  0  0]
 [11  1  3 ...,  2  2  0]
 [ 1  2  3 ...,  0  0  1]
 ..., 
 [ 3  2  6 ...,  3  0  1]
 [ 0  1  2 ...,  2  0  1]
 [24  1  4 ...,  1  0  5]]


In [7]:
# Training split: the first 16655 negative rows plus the first 1729 positive rows.
train_neg_H3K27me3 = neg_sam_H3K27me3[:16655]
train_pos_H3K27me3 = pos_sam_H3K27me3[:1729]
train_neg_pos_H3K27me3 = np.vstack((train_neg_H3K27me3, train_pos_H3K27me3))
# Shuffle rows in place so the two classes are interleaved.
np.random.shuffle(train_neg_pos_H3K27me3)
# Columns 0..2079 hold the features; column 2080 is the appended label.
X_train_H3K27me3 = train_neg_pos_H3K27me3[:, :2080]
Y_train_H3K27me3 = train_neg_pos_H3K27me3[:, 2080]
# Class counts (index 0 = negatives, index 1 = positives) followed by the shapes.
frq = np.bincount(Y_train_H3K27me3)
print(frq)
print(X_train_H3K27me3.shape)
print(Y_train_H3K27me3.shape)

[16655  1729]
(18384, 2080)
(18384,)


In [8]:
# Validation split: negative rows 16655..19280 plus positive rows 1729..1975.
val_neg_H3K27me3 = neg_sam_H3K27me3[16655:19281]
val_pos_H3K27me3 = pos_sam_H3K27me3[1729:1976]
val_neg_pos_H3K27me3 = np.vstack((val_neg_H3K27me3, val_pos_H3K27me3))
# Shuffle rows in place so the two classes are interleaved.
np.random.shuffle(val_neg_pos_H3K27me3)
# Columns 0..2079 hold the features; column 2080 is the appended label.
X_val_H3K27me3 = val_neg_pos_H3K27me3[:, :2080]
Y_val_H3K27me3 = val_neg_pos_H3K27me3[:, 2080]
# Class counts (index 0 = negatives, index 1 = positives) followed by the shapes.
frq = np.bincount(Y_val_H3K27me3)
print(frq)
print(X_val_H3K27me3.shape)
print(Y_val_H3K27me3.shape)

[2626  247]
(2873, 2080)
(2873,)


In [9]:
# Test split: every negative row from 19281 onward plus every positive row from 1976 onward.
test_neg_H3K27me3 = neg_sam_H3K27me3[19281:]
test_pos_H3K27me3 = pos_sam_H3K27me3[1976:]
test_neg_pos_H3K27me3 = np.vstack((test_neg_H3K27me3, test_pos_H3K27me3))
# Shuffle rows in place so the two classes are interleaved.
np.random.shuffle(test_neg_pos_H3K27me3)
# Columns 0..2079 hold the features; column 2080 is the appended label.
X_test_H3K27me3 = test_neg_pos_H3K27me3[:, :2080]
Y_test_H3K27me3 = test_neg_pos_H3K27me3[:, 2080]
# Class counts (index 0 = negatives, index 1 = positives) followed by the shapes.
frq = np.bincount(Y_test_H3K27me3)
print(frq)
print(X_test_H3K27me3.shape)
print(Y_test_H3K27me3.shape)

[4511  494]
(5005, 2080)
(5005,)


In [7]:
# Store the six split arrays as gzip-compressed int8 datasets in the output file.
h5filename = "histonemodKmer_resample_nclx.h5"
# Opened in append mode; the handle is left open for the following cell to close.
# NOTE(review): create_dataset is expected to fail if a dataset with the same
# name is already present in the file, so re-running this cell against an
# existing file may raise - confirm that this is the intended safeguard.
h5file = h5.File(h5filename, 'a')
split_datasets = [
    ('/input/H3K27me3_ncl_train', X_train_H3K27me3),
    ('/input/H3K27me3_ncl_val', X_val_H3K27me3),
    ('/input/H3K27me3_ncl_test', X_test_H3K27me3),
    ('/output/H3K27me3_ncl_train', Y_train_H3K27me3),
    ('/output/H3K27me3_ncl_val', Y_val_H3K27me3),
    ('/output/H3K27me3_ncl_test', Y_test_H3K27me3),
]
for dataset_path, split_values in split_datasets:
    last_written = h5file.create_dataset(dataset_path, data=split_values, dtype=np.int8, compression='gzip')
# Leave the handle of the final dataset as the cell's displayed value.
last_written


<HDF5 dataset "H3K27me3_ncl_test": shape (5263,), type "|i1">

In [8]:
# Close the HDF5 file handle currently bound to `h5file`.
h5file.close()

In [9]:
# Read the stored index vector back from the output file and inspect part of it.
h5filename = "histonemodKmer_resample_nclx.h5"
# np.array() copies the dataset into memory, so the read-only handle is closed
# by the context manager right after the copy instead of staying open.
with h5.File(h5filename, 'r') as h5file:
    idx = np.array(h5file['output/H3K27me3_idx'])
print(idx.shape)
# Show entries 100..149 only, rather than the whole vector.
print(idx[100:150])

(26309,)
[111 112 113 114 116 117 118 119 120 121 122 124 125 126 127 128 129 130
 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
 149 150 152 153 154 155 157 158 159 160 161 162 163 164]


In [10]:
 # Fully connected binary classifier over the 2080 input features:
 # 512 -> 180 -> 70 ReLU units, then a single sigmoid output unit.
 # Dropout of 0.5 follows each of the first two hidden layers.
 model = Sequential([
     Dense(512, input_dim=2080, activation="relu", kernel_initializer='glorot_uniform'),
     Dropout(0.5),
     Dense(180, activation="relu", kernel_initializer='glorot_uniform'),
     Dropout(0.5),
     Dense(70, activation="relu", kernel_initializer='glorot_uniform'),
     Dense(1, activation="sigmoid"),
 ])
 # Print the layer table and parameter counts.
 model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               1065472   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 180)               92340     
_________________________________________________________________
dropout_2 (Dropout)          (None, 180)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 70)                12670     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 71        
Total params: 1,170,553
Trainable params: 1,170,553
Non-trainable params: 0
_________________________________________________________________


In [11]:
 # Train the classifier with Adam on binary cross-entropy, then score it on the test split.
 # Single source of truth for the epoch budget, used by both the status message and fit().
 max_epochs = 50
 adam = Adam(lr=0.0001)
 # Alternative optimizer kept available for experiments; it is not passed to compile().
 sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
 model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
 print('running at most %d epochs' % max_epochs)
 # Write the model to disk whenever the monitored validation loss improves.
 checkpointer = ModelCheckpoint(filepath="HistoneMark_H3K27me3.hdf5", verbose=1, save_best_only=True)
 # Stop once val_loss has failed to improve for 10 consecutive epochs.
 earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
 model.fit(X_train_H3K27me3, Y_train_H3K27me3, batch_size=128, epochs=max_epochs, shuffle=True, validation_data=(X_val_H3K27me3, Y_val_H3K27me3), callbacks=[checkpointer, earlystopper])
 # NOTE(review): the best checkpoint is saved to HistoneMark_H3K27me3.hdf5, but the
 # metrics below are computed with the in-memory weights from the last epoch run -
 # confirm whether the checkpoint should be reloaded before evaluating.
 y_pred = model.predict(X_test_H3K27me3)
 # Persist the true labels and the raw predicted scores before thresholding.
 np.savetxt('H3K27me3_true.csv', Y_test_H3K27me3, delimiter=",")
 np.savetxt('H3K27me3_pred.csv', y_pred, delimiter=",")
 # evaluate() reports the compiled loss followed by the accuracy metric.
 tresults = model.evaluate(X_test_H3K27me3, Y_test_H3K27me3)
 print(tresults)
 model.summary()
 # Ranking metrics are computed on the continuous scores.
 print(roc_auc_score(Y_test_H3K27me3, y_pred))
 print(average_precision_score(Y_test_H3K27me3, y_pred))
 # Threshold the scores at 0.5 to obtain hard class predictions for the confusion matrix.
 y_pred = (y_pred > 0.5)
 cm = confusion_matrix(Y_test_H3K27me3, y_pred)
 print(cm)

running at most 60 epochs
Train on 18384 samples, validate on 2873 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.25377, saving model to HistoneMark_H3K27me3.hdf5
Epoch 2/50
Epoch 00002: val_loss improved from 0.25377 to 0.24958, saving model to HistoneMark_H3K27me3.hdf5
Epoch 3/50
Epoch 00003: val_loss improved from 0.24958 to 0.22831, saving model to HistoneMark_H3K27me3.hdf5
Epoch 4/50
Epoch 00004: val_loss improved from 0.22831 to 0.22221, saving model to HistoneMark_H3K27me3.hdf5
Epoch 5/50
Epoch 00005: val_loss improved from 0.22221 to 0.21388, saving model to HistoneMark_H3K27me3.hdf5
Epoch 6/50
Epoch 00006: val_loss improved from 0.21388 to 0.20614, saving model to HistoneMark_H3K27me3.hdf5
Epoch 7/50
Epoch 00007: val_loss did not improve
Epoch 8/50
Epoch 00008: val_loss improved from 0.20614 to 0.20171, saving model to HistoneMark_H3K27me3.hdf5
Epoch 9/50
Epoch 00009: val_loss improved from 0.20171 to 0.19538, saving model to HistoneMark_H3K27me3.hdf5
Epoch 10

0.897145708601
0.558811141655
[[4395  116]
 [ 284  210]]
