In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from keras.layers import Conv2D, MaxPooling2D, Conv1D
from keras.layers import Activation, Dropout, Flatten, Dense, Input
from keras.models import Model
from keras.layers import Embedding, Dense, LSTM, RepeatVector
from keras.layers.wrappers import TimeDistributed
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras import regularizers
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint




# Load data

In [2]:
data_df = pd.read_csv('data/x_train.csv')
label_df = pd.read_csv('data/y_train.csv')

In [10]:
X.shape

(15485, 100, 5)

# Prepare data for training

In [3]:
grp = data_df.groupby('GeneId')
X_ser = grp.apply(lambda x: np.array(x[[ 'H3K4me3', 'H3K4me1', 'H3K36me3', 'H3K9me3', 'H3K27me3']]))

X = np.stack(np.array(X_ser))
X_CNN = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
y = label_df.Prediction.values

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# CNN Training

In [12]:
inp = Input(shape=(100, 5))

x = Conv1D(32, (29), activation='relu')(inp)
x = Dropout(0.2)(x)
x = Flatten()(x) 
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)

out = Dense(1,activation='sigmoid')(x)

adam = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
reduce_lr = ReduceLROnPlateau(monitor='acc', factor=0.4,
                              patience=5, min_lr=0.0005, verbose=1)

model = Model(inputs=inp, outputs=out)
model.compile(loss='binary_crossentropy',
              metrics=['accuracy'], optimizer= Adam(lr=0.01))

checkpoint = ModelCheckpoint('trained_models/histone_CNN.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [reduce_lr, checkpoint]

In [13]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=40, callbacks=callbacks_list)

Train on 12388 samples, validate on 3097 samples
Epoch 1/40

Epoch 00001: val_acc improved from -inf to 0.50823, saving model to trained_models/histone_CNN.hdf5
Epoch 2/40

Epoch 00002: val_acc improved from 0.50823 to 0.81272, saving model to trained_models/histone_CNN.hdf5
Epoch 3/40

Epoch 00003: val_acc improved from 0.81272 to 0.81627, saving model to trained_models/histone_CNN.hdf5
Epoch 4/40

Epoch 00004: val_acc did not improve from 0.81627
Epoch 5/40

Epoch 00005: val_acc improved from 0.81627 to 0.84049, saving model to trained_models/histone_CNN.hdf5
Epoch 6/40

Epoch 00006: val_acc did not improve from 0.84049
Epoch 7/40

Epoch 00007: val_acc improved from 0.84049 to 0.84404, saving model to trained_models/histone_CNN.hdf5
Epoch 8/40

Epoch 00008: val_acc did not improve from 0.84404
Epoch 9/40

Epoch 00009: val_acc did not improve from 0.84404
Epoch 10/40

KeyboardInterrupt: 

In [14]:
model_json = model.to_json()
with open("trained_models/histone_CNN.json", "w") as json_file:
    json_file.write(model_json)

# LSTM Training

In [15]:
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X, y, test_size=0.2)

In [21]:
X_train_lstm.shape

(12388, 100, 5)

In [25]:
inp = Input(shape=(100, 5))
x = LSTM(254, dropout=0.2, return_sequences=True)(inp)
x = LSTM(254, dropout=0.2)(inp)
x = Dense(254, activation='relu', W_regularizer=regularizers.l2(0))(x)
x = Dense(100, activation='relu', W_regularizer=regularizers.l2(0))(x)
x = Dense(50, activation='relu', W_regularizer=regularizers.l2(0))(x)
x = Dense(1, activation='sigmoid', W_regularizer=regularizers.l2(0))(x)


model_lstm = Model(input=inp, output=x)
model_lstm.compile(loss='binary_crossentropy', optimizer= Adam(lr=0.001), metrics=['accuracy'])

reduce_lr = ReduceLROnPlateau(monitor='acc', factor=0.4,
                              patience=2, min_lr=0.0005, verbose=1)

checkpoint = ModelCheckpoint('trained_models/histone_LSTM.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [reduce_lr, checkpoint]


  after removing the cwd from sys.path.
  """
  
  import sys
  # Remove the CWD from sys.path while we load stuff.


In [18]:
history = model_lstm.fit(X_train_lstm, y_train_lstm, validation_data=(X_test_lstm, y_test_lstm), \
           nb_epoch=100, batch_size=100, shuffle=True,\
           callbacks=callbacks_list)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 12388 samples, validate on 3097 samples
Epoch 1/100
 2000/12388 [===>..........................] - ETA: 42s - loss: 0.5807 - acc: 0.7080

KeyboardInterrupt: 

In [None]:
model_json = model_lstm.to_json()
with open("trained_models/histone_LSTM.json", "w") as json_file:
    json_file.write(model_json)

In [1]:
from predictor.gene_expression_predictor import GeneExpressionPredictor

Using TensorFlow backend.


In [16]:
predictor = GeneExpressionPredictor('CNN')

In [18]:
res = predictor.model_predictor(X)

Unnamed: 0,result
0,0.205984
1,0.042844
2,0.913897
3,0.861922
4,0.705359
5,0.104271
6,0.064140
7,0.836932
8,0.850994
9,0.740645
