In [40]:
from __future__ import print_function
from __future__ import division

from collections import OrderedDict
import os
import sys
import warnings

import argparse
import logging
import h5py as h5
import numpy as np
import pandas as pd
import scipy.io

import six
import csv
import math
from six.moves import range

from sklearn.metrics import roc_auc_score, confusion_matrix, average_precision_score
from keras.preprocessing import sequence
from keras.optimizers import RMSprop,Adam, SGD
from keras.models import Sequential
from keras.layers.core import  Dropout, Activation, Flatten
from keras.regularizers import l1,l2,l1_l2
from keras.constraints import maxnorm
#from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Conv1D, MaxPooling1D, Dense, LSTM, Bidirectional
#from keras.utils import plot_model
#from keras.utils.layer_utils import print_layer_shapes
# fix random seed for reproducibility

from sklearn.decomposition import PCA
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN,SMOTETomek
np.random.seed(1369)

In [41]:
h5filename = "histonemodKmer_resample_ncl.h5"
h5file = h5.File(h5filename,'r')
input_features = h5file['input/H3K27ac_kmer']
output_H3K27ac = h5file['output/H3K27ac']
input_features = np.array(input_features,dtype='int8')
output_H3K27ac = np.array(output_H3K27ac, dtype='int8')
print(input_features.shape)
print(output_H3K27ac.shape)

(25740, 2080)
(25740,)


In [42]:
output_H3K27ac_reshape = output_H3K27ac.reshape(len(output_H3K27ac),1)
#combine the label with input dna
input_features_label = np.concatenate((input_features,output_H3K27ac_reshape), axis=1)
H3K27ac_df = pd.DataFrame(output_H3K27ac)
pos_label= H3K27ac_df.loc[H3K27ac_df.iloc[:,0]==1]
pos_label_ix = np.array(pos_label.index)
neg_label = H3K27ac_df.loc[H3K27ac_df.iloc[:,0]==0]
neg_label_ix = np.array(neg_label.index)
pos_sam_H3K27ac = input_features_label[pos_label_ix,:]
neg_sam_H3K27ac = input_features_label[neg_label_ix,:]
print('here')
print(pos_label_ix)
print(input_features_label.shape)
print(pos_label.shape)
print(neg_label.shape)
print(pos_sam_H3K27ac.shape)
print(neg_sam_H3K27ac.shape)
print(input_features)

here
[    0     4     6 ..., 25719 25723 25731]
(25740, 2081)
(4123, 1)
(21617, 1)
(4123, 2081)
(21617, 2081)
[[0 0 0 ..., 1 0 2]
 [3 0 1 ..., 1 0 0]
 [9 2 0 ..., 0 1 0]
 ..., 
 [1 1 0 ..., 0 1 0]
 [4 3 3 ..., 1 0 1]
 [0 0 0 ..., 0 1 0]]


In [43]:
#apply SMOTE
train_neg_H3K27ac_s = neg_sam_H3K27ac[0:17294,:]
train_pos_H3K27ac_s = pos_sam_H3K27ac[0:3176,:]
train_neg_pos_H3K27ac = np.concatenate((train_neg_H3K27ac_s, train_pos_H3K27ac_s),axis = 0)
np.random.shuffle(train_neg_pos_H3K27ac)
X_train_H3K27ac_s = train_neg_pos_H3K27ac[:,0:2080]
Y_train_H3K27ac_s = train_neg_pos_H3K27ac[:,2080]
frq = np.bincount(Y_train_H3K27ac_s)
print(frq)
print(X_train_H3K27ac_s.shape)
print(Y_train_H3K27ac_s.shape)

[17294  3176]
(20470, 2080)
(20470,)


In [44]:
#test
test_neg_H3K27ac = neg_sam_H3K27ac[17294:,:]
test_pos_H3K27ac = pos_sam_H3K27ac [3176:,:]
test_neg_pos_H3K27ac = np.concatenate((test_neg_H3K27ac, test_pos_H3K27ac),axis = 0)
np.random.shuffle(test_neg_pos_H3K27ac)
X_test_H3K27ac = test_neg_pos_H3K27ac[:,0:2080]
Y_test_H3K27ac = test_neg_pos_H3K27ac[:,2080]
frq = np.bincount(Y_test_H3K27ac)
print(frq)
print(X_test_H3K27ac.shape)
print(Y_test_H3K27ac.shape)

[4323  947]
(5270, 2080)
(5270,)


In [45]:
smote = ADASYN(random_state = 42)
X_resampled, y_resampled = smote.fit_sample(X_train_H3K27ac_s, Y_train_H3K27ac_s)


In [52]:
print(X_resampled.shape)
print(y_resampled.shape)
frq = np.bincount(y_resampled)
print(frq)

(34507, 2080)
(34507,)
[17294 17213]


(18024, 2080)
(18024,)
[  811 17213]


In [55]:
y_resampled = np.array(y_resampled, dtype='int8')
X_resampled = np.array(X_resampled, dtype='int8')
print(y_resampled[0:50])
output_H3K27ac_reshape = y_resampled.reshape(len(y_resampled),1)
#combine the label with input dna
input_features_label = np.concatenate((X_resampled,output_H3K27ac_reshape), axis=1)
H3K27ac_df = pd.DataFrame(y_resampled)
pos_label= H3K27ac_df.loc[H3K27ac_df.iloc[:,0]==1]
pos_label_ix = np.array(pos_label.index)
neg_label = H3K27ac_df.loc[H3K27ac_df.iloc[:,0]==0]
neg_label_ix = np.array(neg_label.index)
pos_sam_H3K27ac = input_features_label[pos_label_ix,:]
neg_sam_H3K27ac = input_features_label[neg_label_ix,:]
print('here')
print(pos_label_ix)
print(input_features_label.shape)
print(pos_label.shape)
print(neg_label.shape)
print(pos_sam_H3K27ac.shape)
print(neg_sam_H3K27ac.shape)
print(input_features)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 1]
here
[    0     1     2 ..., 18021 18022 18023]
(18024, 2081)
(17213, 1)
(811, 1)
(17213, 2081)
(811, 2081)
[[0 0 0 ..., 1 0 2]
 [3 0 1 ..., 1 0 0]
 [9 2 0 ..., 0 1 0]
 ..., 
 [1 1 0 ..., 0 1 0]
 [4 3 3 ..., 1 0 1]
 [0 0 0 ..., 0 1 0]]


In [56]:
# data for train
pos_threshhold = int(math.ceil(pos_label.shape[0] * 0.9))
neg_threshhold = int(math.ceil(neg_label.shape[0] * 0.9))
train_neg_H3K27ac = neg_sam_H3K27ac[0:neg_threshhold,:]
train_pos_H3K27ac = pos_sam_H3K27ac[0:pos_threshhold,:]
train_neg_pos_H3K27ac = np.concatenate((train_neg_H3K27ac, train_pos_H3K27ac),axis = 0)
np.random.shuffle(train_neg_pos_H3K27ac)
X_train_H3K27ac = train_neg_pos_H3K27ac[:,0:2080]
Y_train_H3K27ac = train_neg_pos_H3K27ac[:,2080]
frq = np.bincount(Y_train_H3K27ac)
print(pos_threshhold)
print(neg_threshhold)
print(frq)
print(X_train_H3K27ac.shape)
print(Y_train_H3K27ac.shape)
print(Y_train_H3K27ac[0:50])

15492
730
[  730 15492]
(16222, 2080)
(16222,)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [57]:
#val
val_neg_H3K27ac = neg_sam_H3K27ac[neg_threshhold:,:]
val_pos_H3K27ac = pos_sam_H3K27ac [pos_threshhold:,:]
val_neg_pos_H3K27ac = np.concatenate((val_neg_H3K27ac, val_pos_H3K27ac),axis = 0)
np.random.shuffle(val_neg_pos_H3K27ac)
X_val_H3K27ac = val_neg_pos_H3K27ac[:,0:2080]
Y_val_H3K27ac = val_neg_pos_H3K27ac[:,2080]
frq = np.bincount(Y_val_H3K27ac)
print(frq)
print(X_val_H3K27ac.shape)
print(Y_val_H3K27ac.shape)

[  81 1721]
(1802, 2080)
(1802,)


In [58]:
model = Sequential()
model.add(Dense(units=512, input_dim=2080, activation="tanh", kernel_initializer='glorot_uniform'))
model.add(Dropout(0.5))
model.add(Dense(units=180, activation="tanh",kernel_initializer='glorot_uniform'))
model.add(Dropout(0.5))
model.add(Dense(units=70, activation="tanh",kernel_initializer='glorot_uniform'))
model.add(Dense(units=1, activation="sigmoid"))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 512)               1065472   
_________________________________________________________________
dropout_9 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 180)               92340     
_________________________________________________________________
dropout_10 (Dropout)         (None, 180)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 70)                12670     
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 71        
Total params: 1,170,553
Trainable params: 1,170,553
Non-trainable params: 0
_________________________________________________________________


In [59]:
adam = Adam(lr=0.0001)
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
print('running at most 60 epochs')
checkpointer = ModelCheckpoint(filepath="HistoneMark_H3K27ac.hdf5", verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
model.fit(X_train_H3K27ac, Y_train_H3K27ac, batch_size=12, epochs=50, shuffle=True, validation_data=( X_val_H3K27ac, Y_val_H3K27ac), callbacks=[checkpointer,earlystopper])
y_pred = model.predict(X_test_H3K27ac)
 #y_pred = model.predict(X_test_s)
 #tresults = model.evaluate(X_test_s, Y_test_s)
tresults = model.evaluate(X_test_H3K27ac, Y_test_H3K27ac)
print(tresults)
model.summary()		
 #print(roc_auc_score(Y_test_s,y_pred))
print(roc_auc_score(Y_test_H3K27ac, y_pred))
print(average_precision_score(Y_test_H3K27ac, y_pred))
y_pred = (y_pred>0.5)
cm = confusion_matrix(Y_test_H3K27ac, y_pred)
print(cm)

running at most 60 epochs
Train on 16222 samples, validate on 1802 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.03246, saving model to HistoneMark_H3K27ac.hdf5
Epoch 2/50
Epoch 00002: val_loss improved from 0.03246 to 0.02566, saving model to HistoneMark_H3K27ac.hdf5
Epoch 3/50
Epoch 00003: val_loss did not improve
Epoch 4/50
Epoch 00004: val_loss improved from 0.02566 to 0.02040, saving model to HistoneMark_H3K27ac.hdf5
Epoch 5/50
Epoch 00005: val_loss did not improve
Epoch 6/50
Epoch 00006: val_loss improved from 0.02040 to 0.01574, saving model to HistoneMark_H3K27ac.hdf5
Epoch 7/50
Epoch 00007: val_loss did not improve
Epoch 8/50
Epoch 00008: val_loss did not improve
Epoch 9/50
Epoch 00009: val_loss did not improve
Epoch 10/50
Epoch 00010: val_loss did not improve
Epoch 11/50
Epoch 00011: val_loss did not improve
Epoch 12/50
Epoch 00012: val_loss did not improve
Epoch 13/50
Epoch 00013: val_loss did not improve
Epoch 14/50
Epoch 00014: val_loss did not improve
Ep