In [1]:
from __future__ import print_function
from __future__ import division

from collections import OrderedDict
import os
import sys
import warnings

import argparse
import logging
import h5py as h5
import numpy as np
import pandas as pd
import scipy.io

import six
import csv
from six.moves import range

from sklearn.metrics import roc_auc_score, confusion_matrix, average_precision_score
from keras.preprocessing import sequence
from keras.optimizers import RMSprop,Adam, SGD
from keras.models import Sequential
from keras.layers.core import  Dropout, Activation, Flatten
from keras.regularizers import l1,l2,l1_l2
from keras.constraints import maxnorm
#from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Conv1D, MaxPooling1D, Dense, LSTM, Bidirectional
#from keras.utils import plot_model
#from keras.utils.layer_utils import print_layer_shapes
# Seed NumPy's global random generator so that the np.random.shuffle calls
# used when building the data splits give the same row order on every run.
RANDOM_SEED = 1369
np.random.seed(RANDOM_SEED)

Using TensorFlow backend.


In [3]:
# Load the k-mer feature matrix and the H3K9ac labels from the HDF5 archive.
h5filename = "histonemodKmer_resample_ncl_K562.h5"
# Open read-only inside a context manager so the file handle is released as
# soon as both datasets have been copied into memory (it was previously left
# open for the lifetime of the kernel). `h5file` stays bound afterwards, but
# refers to a closed file.
with h5.File(h5filename, 'r') as h5file:
    # np.array(...) reads each dataset into an in-memory int8 array, so
    # nothing after this block needs the file to remain open.
    input_features = np.array(h5file['input/H3K9ac_kmer_2000'], dtype='int8')
    output_H3K9ac = np.array(h5file['output/H3K9ac_2000'], dtype='int8')
# Labels as an (n_samples, 1) column so they can be appended to the features.
output_H3K9ac_reshape = output_H3K9ac.reshape(len(output_H3K9ac), 1)

In [4]:
# Append the label as one extra trailing column, so every row carries its
# features followed by its class label.
input_features_label = np.concatenate((input_features, output_H3K9ac_reshape), axis=1)

# Use a DataFrame of the labels to find the row positions of each class.
H3K9ac_df = pd.DataFrame(output_H3K9ac)
label_column = H3K9ac_df.iloc[:, 0]
pos_label = H3K9ac_df.loc[label_column == 1]
neg_label = H3K9ac_df.loc[label_column == 0]
pos_label_ix = np.array(pos_label.index)
neg_label_ix = np.array(neg_label.index)

# Row subsets (features + label) for the positive and negative classes.
pos_sam_H3K9ac = input_features_label[pos_label_ix, :]
neg_sam_H3K9ac = input_features_label[neg_label_ix, :]

# Sanity-check output: marker line, positive indices, then the shapes of the
# combined matrix, the label frames and the per-class samples, then the
# feature matrix itself.
print('here')
for item in (
    pos_label_ix,
    input_features_label.shape,
    pos_label.shape,
    neg_label.shape,
    pos_sam_H3K9ac.shape,
    neg_sam_H3K9ac.shape,
    input_features,
):
    print(item)

here
[    0     4     6 ..., 23946 23950 23953]
(23954, 2081)
(8770, 1)
(15184, 1)
(8770, 2081)
(15184, 2081)
[[27  8 10 ...,  4  1  5]
 [16  6  3 ...,  4  1  1]
 [ 4  3  0 ...,  4  2  2]
 ..., 
 [20  7  8 ...,  9  3  4]
 [37  4  7 ...,  4  0  9]
 [ 0  1  0 ...,  3  1  0]]


In [5]:
#train
# The first 70% of each class (in stored order) forms the training split.
train_fraction = 0.7
train_neg_sample = int(neg_sam_H3K9ac.shape[0] * train_fraction)
train_pos_sample = int(pos_sam_H3K9ac.shape[0] * train_fraction)
train_neg_H3K9ac = neg_sam_H3K9ac[0:train_neg_sample,:]
train_pos_H3K9ac = pos_sam_H3K9ac[0:train_pos_sample,:]
# Stack negatives and positives, then shuffle the rows in place so the two
# classes are interleaved.
train_neg_pos_H3K9ac = np.concatenate((train_neg_H3K9ac, train_pos_H3K9ac),axis = 0)
np.random.shuffle(train_neg_pos_H3K9ac)
# The label was appended as the LAST column of each row. Slice relative to
# the end instead of hard-coding index 2080, so a different number of feature
# columns cannot silently select the wrong column as the label.
X_train_H3K9ac = train_neg_pos_H3K9ac[:, :-1]
Y_train_H3K9ac = train_neg_pos_H3K9ac[:, -1]
# Class balance of the split: [n_negative, n_positive].
frq = np.bincount(Y_train_H3K9ac)
print(frq)
print(X_train_H3K9ac.shape)
print(Y_train_H3K9ac.shape)

[10628  6139]
(16767, 2080)
(16767,)


In [6]:
#val
# The next 10% of each class, starting where the training slice ended, forms
# the validation split. The *_sample values are end offsets into the per-class
# arrays.
val_fraction = 0.1
val_neg_sample = train_neg_sample + int(neg_sam_H3K9ac.shape[0] * val_fraction)
val_pos_sample = train_pos_sample + int(pos_sam_H3K9ac.shape[0] * val_fraction)
val_neg_H3K9ac = neg_sam_H3K9ac[train_neg_sample:val_neg_sample,:]
val_pos_H3K9ac = pos_sam_H3K9ac[train_pos_sample:val_pos_sample,:]
# Stack both classes and shuffle the rows in place.
val_neg_pos_H3K9ac = np.concatenate((val_neg_H3K9ac, val_pos_H3K9ac),axis = 0)
np.random.shuffle(val_neg_pos_H3K9ac)
# The label was appended as the LAST column of each row. Slice relative to
# the end instead of hard-coding index 2080, so a different number of feature
# columns cannot silently select the wrong column as the label.
X_val_H3K9ac = val_neg_pos_H3K9ac[:, :-1]
Y_val_H3K9ac = val_neg_pos_H3K9ac[:, -1]
# Class balance of the split: [n_negative, n_positive].
frq = np.bincount(Y_val_H3K9ac)
print(frq)
print(X_val_H3K9ac.shape)
print(Y_val_H3K9ac.shape)

[1518  877]
(2395, 2080)
(2395,)


In [7]:
#test
# Everything after the validation slice of each class forms the test split.
test_neg_H3K9ac = neg_sam_H3K9ac[val_neg_sample:,:]
test_pos_H3K9ac = pos_sam_H3K9ac[val_pos_sample:,:]
# Stack both classes and shuffle the rows in place.
test_neg_pos_H3K9ac = np.concatenate((test_neg_H3K9ac, test_pos_H3K9ac),axis = 0)
np.random.shuffle(test_neg_pos_H3K9ac)
# The label was appended as the LAST column of each row. Slice relative to
# the end instead of hard-coding index 2080, so a different number of feature
# columns cannot silently select the wrong column as the label.
X_test_H3K9ac = test_neg_pos_H3K9ac[:, :-1]
Y_test_H3K9ac = test_neg_pos_H3K9ac[:, -1]
# Class balance of the split: [n_negative, n_positive].
frq = np.bincount(Y_test_H3K9ac)
print(frq)
print(X_test_H3K9ac.shape)
print(Y_test_H3K9ac.shape)

[3038 1754]
(4792, 2080)
(4792,)


<HDF5 dataset "H3K9ac_ncl_test": shape (5090,), type "|i1">

In [8]:
 # Fully connected binary classifier over the 2080 input features:
 #   2080 -> Dense(512, relu) -> Dropout(0.5)
 #        -> Dense(180, relu) -> Dropout(0.5)
 #        -> Dense(70, relu)  -> Dense(1, sigmoid)
 # Earlier experiments that are disabled here: a Conv1D(256, kernel 11) /
 # MaxPooling1D(4) / Dropout(0.6) / Conv1D(640, kernel 3) / MaxPooling1D(2) /
 # Dropout(0.5) / Flatten front end with l2(0.001) kernel regularisation, and
 # a second l2-regularised Dense(512) + Dropout(0.5) stage.
 model = Sequential([
     Dense(units=512, input_dim=2080, activation="relu", kernel_initializer='glorot_uniform'),
     Dropout(0.5),
     Dense(units=180, activation="relu", kernel_initializer='glorot_uniform'),
     Dropout(0.5),
     Dense(units=70, activation="relu", kernel_initializer='glorot_uniform'),
     # Single sigmoid unit: outputs a score in (0, 1) for the positive class.
     Dense(units=1, activation="sigmoid"),
 ])
 model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               1065472   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 180)               92340     
_________________________________________________________________
dropout_2 (Dropout)          (None, 180)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 70)                12670     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 71        
Total params: 1,170,553
Trainable params: 1,170,553
Non-trainable params: 0
_________________________________________________________________


In [9]:
 # ---- Training configuration ------------------------------------------------
 max_epochs = 50
 checkpoint_path = "HistoneMark_H3K9ac.hdf5"

 adam = Adam(lr=0.0001)
 # SGD alternative kept available for experiments; compile() below uses Adam.
 sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
 model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

 # Report the real epoch budget (the message used to say 60 while fit ran 50).
 print('running at most {} epochs'.format(max_epochs))

 # Save the model whenever the monitored validation loss improves, and stop
 # once val_loss has failed to improve for 10 consecutive epochs.
 checkpointer = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True)
 earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
 model.fit(X_train_H3K9ac, Y_train_H3K9ac, batch_size=128, epochs=max_epochs, shuffle=True, validation_data=( X_val_H3K9ac, Y_val_H3K9ac), callbacks=[checkpointer,earlystopper])

 # After fit() the in-memory weights are those of the LAST epoch run, which
 # with early stopping can be up to `patience` epochs past the best one.
 # Restore the best checkpoint so the test metrics describe the model that
 # was actually saved to disk.
 model.load_weights(checkpoint_path)

 # ---- Test-set evaluation ---------------------------------------------------
 # Predicted scores from the sigmoid output, one per test row.
 y_pred = model.predict(X_test_H3K9ac)
 # Persist the ground truth and the raw scores for offline analysis.
 np.savetxt('H3K9ac_true.csv', Y_test_H3K9ac, delimiter=",")
 np.savetxt('H3K9ac_pred.csv', y_pred, delimiter=",")
 # [loss, accuracy], as configured in compile().
 tresults = model.evaluate(X_test_H3K9ac, Y_test_H3K9ac)
 print(tresults)
 model.summary()
 # Threshold-free ranking metrics computed on the raw scores.
 print(roc_auc_score(Y_test_H3K9ac, y_pred))
 print(average_precision_score(Y_test_H3K9ac, y_pred))
 # Binarise at 0.5 for the confusion matrix; note y_pred is rebound to the
 # boolean predictions from here on.
 y_pred = (y_pred>0.5)
 cm = confusion_matrix(Y_test_H3K9ac, y_pred)
 print(cm)

running at most 60 epochs
Train on 16767 samples, validate on 2395 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.41348, saving model to HistoneMark_H3K9ac.hdf5
Epoch 2/50
Epoch 00002: val_loss improved from 0.41348 to 0.34354, saving model to HistoneMark_H3K9ac.hdf5
Epoch 3/50
Epoch 00003: val_loss improved from 0.34354 to 0.31389, saving model to HistoneMark_H3K9ac.hdf5
Epoch 4/50
Epoch 00004: val_loss improved from 0.31389 to 0.29253, saving model to HistoneMark_H3K9ac.hdf5
Epoch 5/50
Epoch 00005: val_loss improved from 0.29253 to 0.28790, saving model to HistoneMark_H3K9ac.hdf5
Epoch 6/50
Epoch 00006: val_loss improved from 0.28790 to 0.27204, saving model to HistoneMark_H3K9ac.hdf5
Epoch 7/50
Epoch 00007: val_loss improved from 0.27204 to 0.26945, saving model to HistoneMark_H3K9ac.hdf5
Epoch 8/50
Epoch 00008: val_loss improved from 0.26945 to 0.26014, saving model to HistoneMark_H3K9ac.hdf5
Epoch 9/50
Epoch 00009: val_loss did not improve
Epoch 10/50
Epoch 00010: