<a href="https://colab.research.google.com/github/dagousket/ML-course-VIB-2020/blob/master/Histone_Marks_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Let's go Deep Learning style!


In [2]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os
import PIL
import tensorflow as tf

random_seed = 666
np.random.seed(random_seed)

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from sklearn.metrics import log_loss, accuracy_score

### Import and format data

In [4]:
train = pd.read_csv("https://raw.githubusercontent.com/sdgroeve/ML-course-VIB-2020/master/data/data_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/sdgroeve/ML-course-VIB-2020/master/data/data_test.csv")

# Save labels in external variable
train_labels = train.pop("Label")
train_index_col = train.pop("GeneId")
test_index_col = test.pop("GeneId")

In [5]:
marks = list(set([c_name.split("_")[0] for c_name in train.columns]))

# Shape of array will be 100 bins * 5 histone marks

def pool_marks(mybins, marks) :
  df = pd.DataFrame()
  for mark in marks :
      df.reset_index()
      df[mark] = mybins[[mark + "_" + str(bin) for bin in range(0,100)]].to_numpy()
  df = df.to_numpy() / 200
  return df

def get_input(X, marks):
  res_out = X.apply(lambda y : pool_marks(y, marks), axis = 1)
  return pd.DataFrame(res_out)

X = [A for A in get_input(train, marks).iloc[:,0]]
X = np.array([np.array(dp) for dp in X])

T = [A for A in get_input(test, marks).iloc[:,0]]
T = np.array([np.array(dp) for dp in T])

In [6]:
print(X.shape)
print(T.shape)
X[:5]

(10436, 100, 5)
(5049, 100, 5)


array([[[0.005, 0.005, 0.015, 0.01 , 0.005],
        [0.01 , 0.   , 0.005, 0.025, 0.005],
        [0.015, 0.025, 0.015, 0.04 , 0.005],
        ...,
        [0.04 , 0.03 , 0.015, 0.08 , 0.01 ],
        [0.055, 0.025, 0.01 , 0.075, 0.01 ],
        [0.05 , 0.01 , 0.01 , 0.06 , 0.01 ]],

       [[0.   , 0.005, 0.   , 0.005, 0.   ],
        [0.   , 0.   , 0.01 , 0.01 , 0.01 ],
        [0.   , 0.   , 0.005, 0.005, 0.01 ],
        ...,
        [0.005, 0.   , 0.   , 0.005, 0.   ],
        [0.005, 0.   , 0.   , 0.   , 0.005],
        [0.005, 0.   , 0.   , 0.   , 0.005]],

       [[0.015, 0.005, 0.005, 0.005, 0.035],
        [0.01 , 0.005, 0.01 , 0.005, 0.02 ],
        [0.01 , 0.005, 0.   , 0.02 , 0.01 ],
        ...,
        [0.   , 0.005, 0.005, 0.005, 0.01 ],
        [0.005, 0.01 , 0.   , 0.   , 0.005],
        [0.005, 0.005, 0.   , 0.005, 0.005]],

       [[0.01 , 0.005, 0.015, 0.02 , 0.02 ],
        [0.01 , 0.005, 0.015, 0.005, 0.   ],
        [0.03 , 0.005, 0.005, 0.005, 0.005],
        ..

In [72]:
#one-hot encode target column
from keras.utils import to_categorical

# 2-class array
train_labels_cat = np.array([Y for Y in to_categorical(train_labels)])

# 1-class array
train_labels_cat = np.array([float(Y[1]) for Y in to_categorical(train_labels)])
train_labels_cat = train_labels_cat.reshape((train_labels_cat.shape[0], 1))

print(train_labels_cat.shape)
print(train_labels_cat[:5])
print(train_labels[:5])

(10436, 1)
[[1.]
 [1.]
 [0.]
 [0.]
 [1.]]
0    1
1    1
2    0
3    0
4    1
Name: Label, dtype: int64


In [96]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, train_labels_cat, test_size=0.25)

### Make a DL model

In [97]:
from sklearn.metrics import log_loss
import tensorflow as tf

def custom_loss_function(y_true, y_pred):
  # 2-class
  #custom_logloss = tf.compat.v1.losses.log_loss(y_true[:, 1], y_pred[:, 1])
  # 1-class
  custom_logloss = tf.compat.v1.losses.log_loss(y_true, y_pred)
  return custom_logloss

y_true = y_train[:5]
y_pred = predictions_train_prob[:5]
cl = custom_loss_function(np.array(y_true),np.array(y_pred))
print(cl)

tf.Tensor(1.2853619, shape=(), dtype=float32)


In [98]:
from keras.layers.recurrent import LSTM

num_classes = 1

model = Sequential([
  layers.InputLayer(input_shape=(100,5)),
  layers.Conv1D(60, kernel_size=3, padding="valid", activation='relu', kernel_initializer='random_uniform', data_format ="channels_last"),
  layers.MaxPooling1D(pool_size=3, strides=1, padding='valid'),
  layers.Dropout(0.3),
  layers.TimeDistributed(layers.Dense(128, activation='relu')),
  layers.Bidirectional(LSTM(128, dropout=0.1, recurrent_dropout=0.1, return_sequences=True)),
  layers.Dropout(0.2),
  layers.Dense(20, activation='relu'),
  layers.Dropout(0.4),
  layers.Flatten(),
  layers.Dense(num_classes, activation='sigmoid')])

#model = Sequential()
#model.add(layers.Conv1D(40, kernel_size=3, input_shape=(100,5)))
#model.add(layers.MaxPooling1D(pool_size=3, strides=1))
#model.add(layers.Dense(64, input_dim=20, activation='relu'))
#model.add(layers.Dropout(0.5))
#model.add(layers.Dense(64, activation='relu'))
#model.add(layers.Dropout(0.5))
#model.add(layers.Flatten())
#model.add(layers.Dense(1, activation='sigmoid'))

#model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.compile(optimizer='adam',loss=custom_loss_function,metrics=['accuracy'])

model.summary()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_13 (Conv1D)           (None, 98, 60)            960       
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 96, 60)            0         
_________________________________________________________________
dropout_45 (Dropout)         (None, 96, 60)            0         
_________________________________________________________________
time_distributed_10 (TimeDis (None, 96, 128)           7808      
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 96, 256)           263168    
_________________________________________________________________
dropout_46 (Dropout)         (None, 96, 256)           0         
_________________________________________________________________
dense_64 (Dense)             (None, 96, 20)          

In [99]:

history=model.fit(X_train, y_train, batch_size=200 , epochs=60 , verbose=1, validation_data=(X_test, y_test))


Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [106]:
from sklearn.metrics import log_loss, accuracy_score

predictions_train_prob = model.predict(X_train)
predictions_test_prob = model.predict(X_test)

print(predictions_train_prob[:5])
print(y_train[:5])
print(predictions_test_prob[:5])
print(y_test[:5])

[[0.12308866]
 [0.91856253]
 [0.08872909]
 [0.07651903]
 [0.01962665]]
[[0.]
 [1.]
 [0.]
 [1.]
 [0.]]
[[0.7923171 ]
 [0.07175089]
 [0.10678643]
 [0.1041955 ]
 [0.9215231 ]]
[[1.]
 [0.]
 [0.]
 [0.]
 [1.]]


In [101]:
# Log Loss from Tensor Flow
print(tf.compat.v1.losses.log_loss(y_train, predictions_train_prob))
print(tf.compat.v1.losses.log_loss(y_test, predictions_test_prob))

print(tf.keras.losses.BinaryCrossentropy(y_train, predictions_train_prob))
print(tf.keras.losses.BinaryCrossentropy(y_test, predictions_test_prob))
#print(tf.compat.v1.losses.log_loss([y[1] for y in y_test], [p[1] for p in predictions_test_prob]))

tf.Tensor(0.34256044, shape=(), dtype=float32)
tf.Tensor(0.34440696, shape=(), dtype=float32)
<tensorflow.python.keras.losses.BinaryCrossentropy object at 0x7fd530dcbe80>
<tensorflow.python.keras.losses.BinaryCrossentropy object at 0x7fd530dcb390>


In [102]:
# Log Loss from Keras
from sklearn.metrics import log_loss
print(log_loss( y_train, predictions_train_prob))
print(log_loss( y_test, predictions_test_prob))

#print(log_loss([y[1] for y in y_train], [p[1] for p in predictions_train_prob]))
#print(log_loss([y[1] for y in y_test], [p[1] for p in predictions_test_prob]))

0.34256064634889394
0.3444072429404689


In [103]:
# Predict Kaggle set
predictions_val_prob = model.predict(T)
predictions_val_prob[:5]

array([[0.955695  ],
       [0.1870415 ],
       [0.08750492],
       [0.93770283],
       [0.88538533]], dtype=float32)

In [104]:
H = [X[0] for X in predictions_val_prob]

#H = [X[1] for X in predictions_val_prob]
H[:5]

[0.955695, 0.1870415, 0.087504916, 0.93770283, 0.88538533]

In [105]:
predictions_df = pd.DataFrame({"GeneId":test_index_col,"Label":H})
print(predictions_df.head())
predictions_df.to_csv('submission_dl7.csv', index=False)

   GeneId     Label
0    5222  0.955695
1     891  0.187042
2    7219  0.087505
3    7225  0.937703
4    9432  0.885385
