<a href="https://colab.research.google.com/github/danielelbrecht/mirna/blob/master/mirna_model2_with_schedule.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import files
import numpy as np

from keras.callbacks import LearningRateScheduler
from keras.models import Model
from keras.layers import Input, LSTM, TimeDistributed, Dropout, Dense, Permute, Flatten, Multiply, RepeatVector, Activation, Masking, Bidirectional
from keras import regularizers, optimizers
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers.wrappers import Wrapper
from keras.engine.topology import InputSpec
from keras import backend as K

In [2]:
# Load data sets

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default() 
drive = GoogleDrive(gauth)

pos_file_obj = drive.CreateFile({'id': '1vl-qE0U5W6ll3JH41QqDajyx6oAwC3C0'})                       
pos_file_obj.GetContentFile('input.txt')

neg_file_obj = drive.CreateFile({'id': '1Rnh8RHUsmCGmiCZobu3g7ezeUJQq0CH-'})                       
neg_file_obj.GetContentFile('negatives.txt')

[?25l[K    1% |▎                               | 10kB 19.7MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.8MB/s eta 0:00:01[K    3% |█                               | 30kB 2.6MB/s eta 0:00:01[K    4% |█▎                              | 40kB 1.7MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.1MB/s eta 0:00:01[K    6% |██                              | 61kB 2.5MB/s eta 0:00:01[K    7% |██▎                             | 71kB 2.9MB/s eta 0:00:01[K    8% |██▋                             | 81kB 3.3MB/s eta 0:00:01[K    9% |███                             | 92kB 3.7MB/s eta 0:00:01[K    10% |███▎                            | 102kB 2.8MB/s eta 0:00:01[K    11% |███▋                            | 112kB 2.8MB/s eta 0:00:01[K    12% |████                            | 122kB 4.1MB/s eta 0:00:01[K    13% |████▎                           | 133kB 4.0MB/s eta 0:00:01[K    14% |████▋                           | 143kB 7.6MB/s eta 0:00:01[

In [0]:
pos_file_obj
pos_content = pos_file_obj.GetContentString()
neg_content = neg_file_obj.GetContentString()
pos_file = []
neg_file = []
temp = []

for x in pos_content:
  if x == '\n':
    pos_file.append(temp)
    temp = []
  else: 
    temp.append(x)
    
for x in neg_content:
  if x == '\n':
    neg_file.append(temp)
    temp = []
  else: 
    temp.append(x)
    
    
def read_line(line):

    array = []

    for entry in line:
        if entry == '0':
            array.append(np.int32(0))
        if entry == '1':
            array.append(np.int32(1))
        if len(array) == 16:
          break

    return np.asarray(array)
  
def process_data(pos_file, neg_file):

    data = []
    is_example = 0
    pos_examples = 0
    neg_examples = 0

    for line in pos_file: # Iterate over file

        if (line[0] == '0' or line[0] == '1') and is_example == 0:  # When new sequence is encountered, initialize new example
            example = []
            is_example = 1
            example.append(read_line(line))

        if (line[0] == '0' or line[0] == '1') and is_example == 1:  # During sequence
            example.append(read_line(line))

        if line[0] != '0' and line[0] != '1' and is_example == 1:  # When sequence terminates
            is_example = 0
            data.append(example)
            pos_examples = pos_examples + 1

    for line in neg_file: # Iterate over file

        if (line[0] == '0' or line[0] == '1') and is_example == 0:  # When new sequence is encountered, initialize new example
            example = []
            is_example = 1
            example.append(read_line(line))

        if (line[0] == '0' or line[0] == '1') and is_example == 1:  # During sequence
            example.append(read_line(line))

        if line[0] != '0' and line[0] != '1' and is_example == 1:  # When sequence terminates
            is_example = 0
            data.append(example)
            neg_examples = neg_examples + 1

    return np.asarray(data), pos_examples, neg_examples

In [0]:
# Process the data and generate training labels

full_data, num_pos, num_neg = process_data(pos_file, neg_file)

# Generate labels
pos_labels = np.ones(num_pos)
neg_labels = np.zeros(num_neg)
data_labels = np.concatenate((pos_labels, neg_labels))

binary_labels = np.zeros([len(data_labels), 2])

for i in range(len(data_labels)):
    if data_labels[i] == 1:
        binary_labels[i][1] = 1
    else:
        binary_labels[i][0] = 1


# Get mask length
mask_length = 0
for i in range(len(full_data)):
    if len(full_data[i]) > mask_length:
        mask_length = len(full_data[i])


In [5]:
# Pad data
data_padded = pad_sequences(full_data, maxlen=mask_length, dtype='object', padding='post', truncating='post', value=0)

# Shuffle data and get training and validation sets
indices = np.random.permutation(35267)
shuffled_data = data_padded[indices]
shuffled_labels = binary_labels[indices]

print(data_padded.shape)
print(shuffled_data.shape)


(35267, 142, 16)
(35267, 142, 16)


In [0]:
# Define hyper parameters
LSTM1_units = 32
LSTM2_units = 16
fully_connected_layer1_units = 32
fully_connected_layer2_units = 32
output_size = 2
learning_rate = 0.01



In [0]:
# Learning rate scheduler
def schedule_function(epoch, lr):
  if epoch==15 or epoch==30:
    return lr/10
  else:
    return lr


schedule = LearningRateScheduler(schedule_function, verbose=1)

In [25]:
# Functional API model

# Input layer
inputs = Input(shape=(mask_length, 16), name='inputs')


# LSTM Layers
lstm1 = Bidirectional(LSTM(20, return_sequences=True, dropout=0.1, recurrent_dropout=0.1), merge_mode='concat')(inputs)
lstm2 = Bidirectional(LSTM(10, return_sequences=True, dropout=0.1, recurrent_dropout=0.1), merge_mode='concat')(lstm1)

#Flatten
flatten = Flatten()(lstm2)


# Fully connected layers
do1 = Dropout(0.1)(flatten)
fc1 = Dense(100, activation='sigmoid')(do1)
do2 = Dropout(0.1)(fc1)
fc2 = Dense(100, activation='sigmoid')(do2)

# Output layer
softmax = Dense(output_size, activation='softmax')(fc2)


# Compile model
model2 = Model(inputs=inputs, outputs=softmax)

model2.compile(optimizer='rmsprop',
               loss='binary_crossentropy',
               metrics=['accuracy'])

model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 142, 16)           0         
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 142, 40)           5920      
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 142, 20)           4080      
_________________________________________________________________
flatten_8 (Flatten)          (None, 2840)              0         
_________________________________________________________________
dropout_15 (Dropout)         (None, 2840)              0         
_________________________________________________________________
dense_21 (Dense)             (None, 100)               284100    
_________________________________________________________________
dropout_16 (Dropout)         (None, 100)               0         
__________

In [26]:
# Get train and val data
train_data = shuffled_data[0:30000]
train_labels = shuffled_labels[0:30000]

val_data = shuffled_data[30000:35267]
val_labels = shuffled_labels[30000:35267]


history = model2.fit(train_data, 
                    train_labels, 
                    epochs=40,
                    batch_size = 128,
                    validation_data=(val_data, val_labels),
                    callbacks=[schedule])
    


Train on 30000 samples, validate on 5267 samples
Epoch 1/40

Epoch 00001: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 2/40

Epoch 00002: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 3/40

Epoch 00003: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 4/40

Epoch 00004: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 5/40

Epoch 00005: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 6/40

Epoch 00006: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 7/40

Epoch 00007: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 8/40

Epoch 00008: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 9/40

Epoch 00009: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 10/40

Epoch 00010: LearningRateScheduler setting learning rate to 0.0010000000474974513.


In [0]:
from sklearn import metrics

# Get advanced metrics
preds = model2.predict(shuffled_data[0:5000])
met = perfeval(preds, shuffled_labels[0:5000], 1)

SE: 0.876 SP: 0.901 F-Score: 0.883 PPV: 0.891 gmean: 0.888 AUROC: 0.959 AUPR: 0.958
