<a href="https://colab.research.google.com/github/dagousket/ML-course-VIB-2020/blob/master/Histone_Marks_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Let's go Deep Learning style!


In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os
import PIL
import tensorflow as tf

# Single seed shared by every source of randomness in the notebook.
random_seed = 666
np.random.seed(random_seed)
# np.random.seed does not affect TensorFlow's own random generator, so seed it too.
tf.random.set_seed(random_seed)

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

### Import and format data

In [2]:
# Training table and Kaggle test table, read straight from the course repository
train = pd.read_csv("https://raw.githubusercontent.com/sdgroeve/ML-course-VIB-2020/master/data/data_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/sdgroeve/ML-course-VIB-2020/master/data/data_test.csv")

# Detach the label and gene-id columns from the frames (pop removes them in place),
# presumably leaving only the per-bin histone-mark columns — verify against the CSV header
train_labels, train_index_col = train.pop("Label"), train.pop("GeneId")
test_index_col = test.pop("GeneId")

In [3]:
# Mark names are the column-name prefixes before the first "_".
# Sorted so the channel order of the model input is the same in every kernel session
# (iteration order of a set of strings is not stable across interpreter runs).
marks = sorted({c_name.split("_")[0] for c_name in train.columns})

# Each gene is turned into an array of shape (100 bins, 5 histone marks)

def pool_marks(mybins, marks, n_bins=100):
  """Arrange one gene's binned signal as a (n_bins, n_marks) matrix scaled by its total.

  Parameters
  ----------
  mybins : pandas.Series
      One row of the feature table, indexed by labels of the form "<mark>_<bin>".
  marks : sequence of str
      Mark names; column j of the result holds the bins of marks[j].
  n_bins : int, default 100
      Number of bins per mark; bins 0 .. n_bins-1 are read.

  Returns
  -------
  numpy.ndarray
      Array of shape (n_bins, len(marks)) where every value is divided by
      0.001 + (sum of all selected values). The 0.001 offset keeps the
      division defined when every bin is zero.

  Raises
  ------
  KeyError
      If an expected "<mark>_<bin>" label is missing from mybins.
  """
  if len(marks) == 0:
    # Nothing to stack; return an empty matrix with the expected number of rows.
    return np.empty((n_bins, 0))
  signal = np.column_stack([
    mybins[[mark + "_" + str(i) for i in range(n_bins)]].to_numpy(dtype=float)
    for mark in marks
  ])
  totalread = 0.001 + signal.sum()
  return signal / totalread

def get_input(X, marks):
  """Apply pool_marks to every row of X and wrap the per-row results in a DataFrame."""
  def _pool_row(row):
    return pool_marks(row, marks)

  pooled = X.apply(_pool_row, axis = 1)
  return pd.DataFrame(pooled)

# Training input: collect the per-gene matrices from the first column into one array
X = np.array([np.array(gene_matrix) for gene_matrix in get_input(train, marks).iloc[:,0]])

# Same transformation for the Kaggle test table
T = np.array([np.array(gene_matrix) for gene_matrix in get_input(test, marks).iloc[:,0]])

In [19]:
# Sanity check of the training tensor: expected layout is (genes, bins, marks)
X.shape

(10436, 100, 5)

In [35]:
# One-hot encode the target column.
# Import from tensorflow.keras so the helper comes from the same Keras
# implementation as the model layers, not the standalone `keras` package.
from tensorflow.keras.utils import to_categorical

# to_categorical returns a NumPy array of shape (n_samples, n_classes),
# so no further conversion is needed.
y_train = to_categorical(train_labels)
print(y_train.shape)
y_train[0]

(10436, 2)


array([0., 1.], dtype=float32)

In [36]:
from sklearn.model_selection import train_test_split

# NOTE(review): test_size=0.85 holds out 85% of the genes for validation and trains on
# only 15% — confirm this is intended (train_size=0.85 may have been meant).
# NOTE: y_train is rebound to the training subset here, so this cell must be run exactly
# once after the one-hot encoding cell; re-running it alone fails on mismatched lengths.
# random_state pins the shuffle so the split does not depend on how much of NumPy's
# global random stream was consumed before this cell ran.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_train, test_size=0.85, random_state=random_seed
)

### Make a DL model

In [40]:
# Take LSTM from tensorflow.keras like every other layer of this model; the standalone
# `keras.layers.recurrent` module is a private path of a different package.
from tensorflow.keras.layers import LSTM
num_classes = 2

# Input: one gene = 100 bins (sequence axis) x 5 histone marks (channels).
model = Sequential([
  layers.InputLayer(input_shape=(100,5)),
  # 30 filters of width 3 along the bin axis; 'valid' padding shortens 100 -> 98 positions
  layers.Conv1D(30, kernel_size=3, padding="valid", activation='relu', kernel_initializer='random_uniform', data_format ="channels_last"),
  # Sliding max over 5 positions with stride 1: 98 -> 94 positions
  layers.MaxPooling1D(pool_size=5, strides=1, padding='valid'),
  layers.Dropout(0.2),
  # The same Dense(128) is applied independently at every position
  layers.TimeDistributed(layers.Dense(128, activation='relu')),
  # Forward and backward LSTM outputs are concatenated (2 x 128 features per position);
  # return_sequences=True keeps the whole sequence for the Flatten below
  layers.Bidirectional(LSTM(128, dropout=0.1, recurrent_dropout=0.1, return_sequences=True)),
  layers.Dropout(0.2),
  layers.Flatten(),
  layers.Dense(20, activation='relu'),
  layers.Dropout(0.4),
  # One probability per class, matching the one-hot encoded targets
  layers.Dense(num_classes, activation='softmax')])


model.compile(optimizer='adam',
              loss='BinaryCrossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_9 (Conv1D)            (None, 98, 30)            480       
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 94, 30)            0         
_________________________________________________________________
dropout_21 (Dropout)         (None, 94, 30)            0         
_________________________________________________________________
time_distributed_7 (TimeDist (None, 94, 128)           3968      
_________________________________________________________________
bidirectional_7 (Bidirection (None, 94, 256)           263168    
_________________________________________________________________
dropout_22 (Dropout)         (None, 94, 256)           0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 24064)            

In [41]:

# Fit on the training split in mini-batches of 100 for 100 epochs;
# the held-out split is passed as validation data.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=100,
    verbose=1,
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [44]:
from sklearn.metrics import log_loss, accuracy_score

predictions_train_prob = model.predict(X_train)
predictions_val_prob = model.predict(X_test)

# The targets are one-hot with two columns; column 1 is the 0/1 indicator of class 1.
# Compare it with the predicted class-1 probability so y_true and y_pred are both 1-D,
# instead of relying on log_loss to reconcile a 2-D y_true with a 1-D y_pred.
train_log_loss = log_loss(y_train[:,1], predictions_train_prob[:,1])
val_log_loss = log_loss(y_test[:,1], predictions_val_prob[:,1])

# Printed as "(train) validation"
print("Log-loss: (%f) %f"%(train_log_loss, val_log_loss))

In [None]:
# Predict Kaggle set
# T is the model input built from the Kaggle test table; X_test is only the validation
# split of the training data and must not be used for the submission.
predictions_test_prob = model.predict(T)

# Gene ids were popped from the test table into test_index_col, in the same row order as T.
# Column 1 of the softmax output is the predicted probability of class 1.
predictions_df = pd.DataFrame({"GeneId":test_index_col,"Label":predictions_test_prob[:,1]})
predictions_df.to_csv('submission_rf.csv', index=False)