In [None]:
import tensorflow as tf
import pandas as pd
import sys
from Bio import SeqIO
import pandas as pd
import numpy as np
sys.path.insert(1, './python-functions/')

In [None]:
# Record the installed TensorFlow version in the notebook output.
print(tf.__version__)

In [None]:
from my_functions import SeqTo3mer, Remover, TokenPad

In [None]:
# Read the training table; the first CSV column becomes the row index.
df_train = pd.read_csv('training-set.csv', index_col=0, sep=',')

# Pass the table through the project helper `Remover`
# (per the original note it removes unallowed characters).
df_train = Remover(df_train)

# One-hot encode the class labels and store one encoded row per sample
# in a new 'b-labels' column.
one_hot_labels = pd.get_dummies(df_train['labels']).to_numpy()
df_train['b-labels'] = [encoded_row for encoded_row in one_hot_labels]

In [None]:
# Collect the per-sample one-hot rows and stack them into one 2-D label array.
array_list = [arr for arr in df_train['b-labels']]
y_b = np.vstack(array_list)

In [None]:
# Keras text utilities used for tokenising the k-mer strings.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Targets: the raw class labels (used for stratification further down).
y = df_train['labels']

# Inputs: upper-case the sequences, then convert them with the project helper SeqTo3mer.
X = SeqTo3mer(df_train['seq'].str.upper())

# Fit the tokenizer on the converted sequences.
# NOTE(review): Keras' Tokenizer keeps only the `num_words - 1` most frequent tokens.
# If all 64 possible 3-mers over A/C/G/T occur, the rarest one is dropped here.
# Raising this value requires raising the Embedding input size in biLSTMAtt
# accordingly. TODO confirm the intended vocabulary size.
MAX_VOCAB_SIZE = 64
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X)

# Tokenise and pad via the project helper TokenPad.
# 498 has to agree with the Input length used when the model is built.
X_pad = TokenPad(X, 498, 'post', tokenizer)

## Neural Network

In [None]:
from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints

In [None]:
# Attention layer (supports masking)
# Author: Christos Baziotis
# https://gist.github.com/cbaziotis/6428df359af27d58078ca5ed9792bd6d

def dot_product(x, kernel):
    """
    Backend-agnostic dot product between an input and a weight vector.

    Args:
        x (): input
        kernel (): weights
    Returns:
        The result of `K.dot`. On the TensorFlow backend the kernel is first
        given an extra axis and that axis is squeezed out of the product again.
    """
    # Every backend other than TensorFlow can take the kernel as-is.
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    # todo: check that this is correct
    kernel_column = K.expand_dims(kernel)
    projected = K.dot(x, kernel_column)
    return K.squeeze(projected, axis=-1)


class Attention(layers.Layer):
    """
    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
        With `return_attention=True` a list `[output, attention]` is returned,
        where `attention` has shape `(samples, steps)`.

    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.

    Note (from the original author): the layer has been tested with Keras 1.x

    Example:

        # 1
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention())
        # next add a Dense layer (for classification/regression) or whatever...

        # 2 - Get the attention scores
        hidden = LSTM(64, return_sequences=True)(words)
        sentence, word_scores = Attention(return_attention=True)(hidden)
    """

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        :param W_regularizer: regularizer (or identifier) for the weight vector.
        :param b_regularizer: regularizer (or identifier) for the bias vector.
        :param W_constraint: constraint (or identifier) for the weight vector.
        :param b_constraint: constraint (or identifier) for the bias vector.
        :param bias: whether a per-step bias is added to the scores.
        :param return_attention: also return the attention weights from `call`.
        :param kwargs: forwarded to the base Keras layer.
        """
        # Initialise the base layer first so that nothing it sets up can
        # overwrite the attributes assigned below (notably `supports_masking`).
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and optional bias (one entry per step)."""
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            # The bias is sized by the steps axis, so the sequence length
            # must be known when the layer is built.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Score every step, normalise the scores over the steps axis and return the weighted sum."""
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        """Return `(samples, features)`, plus `(samples, steps)` when attention is returned."""
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = super(Attention, self).get_config()
        config.update({
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
            'return_attention': self.return_attention,
        })
        return config



In [None]:
def biLSTMAtt(lr, seq_len=498, vocab_size=64, embedding_dim=10, n_classes=13):
  """
  Build and compile the embedding -> bidirectional LSTM -> attention classifier.

  Args:
      lr: learning rate passed to the RMSprop optimizer.
      seq_len: length of the padded token sequences fed to the model.
      vocab_size: input dimension of the Embedding layer (index 0 is the padding/mask value).
      embedding_dim: size of each token embedding.
      n_classes: number of softmax outputs.
  Returns:
      A compiled `tf.keras.Model` mapping `(batch, seq_len)` int32 tokens to
      `(batch, n_classes)` class probabilities.
  """
  sequence_input = Input(shape=(seq_len,), dtype="int32")
  embedded_sequences = Embedding(vocab_size, embedding_dim, mask_zero=True)(sequence_input)

  # Wrap the LSTM layer itself in Bidirectional, then call the wrapper on the embeddings.
  lstm = Bidirectional(LSTM(32, return_sequences=True))(embedded_sequences)

  # The attention weights are returned by the layer but not used by this model.
  context_vector, _attention_weights = Attention(return_attention=True)(lstm)

  dense1 = Dense(128, activation="relu")(context_vector)
  dropout = Dropout(0.4)(dense1)
  output = Dense(n_classes, activation='softmax')(dropout)

  model = tf.keras.Model(inputs=sequence_input, outputs=output)

  # compile model
  opt = tf.keras.optimizers.RMSprop(learning_rate=lr, momentum=0.01, centered=True)

  # NOTE(review): BinaryAccuracy thresholds every output unit independently, so on a
  # one-hot multi-class target the 'accuracy' column is not the usual top-1 accuracy.
  # Kept unchanged so existing history files stay comparable. Consider
  # CategoricalAccuracy if top-1 accuracy is what is meant.
  METRICS = [
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.TrueNegatives(name='tn'),
    tf.keras.metrics.FalseNegatives(name='fn'),
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
  ]

  model.compile(loss='categorical_crossentropy',
                optimizer=opt,
                metrics=METRICS)

  return model

## Cross-validation (repeated stratified shuffle splits)

In [None]:
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

def CreateFolds(X, y, folds, test_size, random_state=None):
  """
  Draw repeated stratified train/test splits and return their row positions.

  Args:
      X: samples to split (only its length is relevant to the splitter).
      y: class labels used for stratification.
      folds: number of independent shuffle splits to draw.
      test_size: size of the held-out part of each split, as accepted by
          StratifiedShuffleSplit.
      random_state: optional seed forwarded to StratifiedShuffleSplit so the
          splits can be reproduced; the default (None) keeps them random.
  Returns:
      (train_list, test_list): two lists with one entry per split, each entry
      being a list of row positions.
  """
  sss = StratifiedShuffleSplit(n_splits=folds, test_size=test_size,
                               random_state=random_state)

  train_list = []
  test_list = []

  for count, (train_index, test_index) in enumerate(sss.split(X, y), start=1):
    print('Fold {}'.format(count))

    train_list.append(list(train_index))
    test_list.append(list(test_index))

  return train_list, test_list

In [None]:
# Draw 10 stratified shuffle splits, each holding out a 0.2 fraction of the samples.
train_list, test_list = CreateFolds(X_pad, y, 10, 0.2)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10


In [None]:
# One column per split ('fold1', 'fold2', ...) holding that split's training row positions.
df_train_idx = pd.DataFrame(
  {'fold{}'.format(fold_no): fold_rows
   for fold_no, fold_rows in enumerate(train_list, start=1)}
)

In [None]:
# One column per split ('fold1', 'fold2', ...) holding that split's held-out row positions.
df_test_idx = pd.DataFrame(
  {'fold{}'.format(fold_no): fold_rows
   for fold_no, fold_rows in enumerate(test_list, start=1)}
)

# Training

In [None]:
# Train one fresh model per split, then save the model, its training history
# and a small parameter file into a per-fold folder under ./saved_models/.
description = 'ncRNA - 3-mer'

# `start` allows resuming an interrupted run from a later fold.
start = 0
model_name = 'biLSTMAtt'

for i in range(start,10):
  fold_key = 'fold{}'.format(i+1)
  print('Fold {}'.format(i+1))

  # Row positions of this split, as stored by the fold index tables.
  train_idx = df_train_idx[fold_key]
  test_idx = df_test_idx[fold_key]

  X_train = X_pad[train_idx]
  X_test = X_pad[test_idx]

  y_train = y_b[train_idx]
  y_test = y_b[test_idx]

  print('Train shape: X: {}, y: {}'.format(X_train.shape, y_train.shape))
  print('Test shape: X: {}, y: {}'.format(X_test.shape, y_test.shape))
  print('Initializing model')

  lr = 0.004
  epochs = 10
  model = biLSTMAtt(lr)

  # NOTE(review): `scheduler` is not defined in any cell visible in this notebook.
  # On a fresh kernel this line raises NameError unless it is defined elsewhere.
  # TODO add the schedule function (signature: scheduler(epoch, lr) -> new lr).
  callback = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

  print('Training model {} with lr: {}'.format(i+1, lr))
  history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), callbacks=[callback])

  print('Saving model...')
  main_folder = './saved_models/'
  current_folder = 'ncypred-SKF-fold{}-lr{}-{}/'.format(i+1, lr, model_name)

  model.save(main_folder+current_folder)

  print('Saving history at {}'.format(current_folder))
  hist_df = pd.DataFrame(history.history)
  hist_csv_file = main_folder+current_folder+'history.csv'

  with open(hist_csv_file, mode='w') as f:
    hist_df.to_csv(f)

  print('Saving parameters...')
  param_file = main_folder+current_folder+'params.dat'
  with open(param_file, mode='w') as p:
    # One field per line; without the line breaks all fields run together.
    p.write('{}\n'.format(description))
    p.write('Learning rate: {}\n'.format(lr))
    p.write('Epochs: {}\n'.format(epochs))

  print(' ')

# Full training

In [None]:
# Fit a single model on the complete training set (no validation split),
# then save the model and its training history under ./saved_models/.
model_name = 'biLSTMAtt'

lr = 0.004
epochs = 15
model = biLSTMAtt(lr)

print(f'Train shape: X: {X_pad.shape}, y: {y_b.shape}')

history = model.fit(X_pad, y_b, epochs=epochs)

print('Saving model...')
main_folder = './saved_models/'
current_folder = f'ncypred-lr{lr}-{model_name}/'

model.save(f'{main_folder}{current_folder}')

print(f'Saving history at {current_folder}')
hist_df = pd.DataFrame(history.history)
hist_csv_file = f'{main_folder}{current_folder}history.csv'

# Write the per-epoch metrics next to the saved model.
with open(hist_csv_file, mode='w') as f:
  hist_df.to_csv(f)