In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

## Training set

In [2]:
df_train = pd.read_csv('./nRC-training-set.csv', index_col=0)

In [3]:
# One-hot encoding das classes
one_hot_labels = pd.get_dummies(df_train['labels']).values
df_train['b-labels'] = list(one_hot_labels)

In [4]:
df_train

Unnamed: 0,ids,seq,labels,b-labels
0,RF00001_AF095839_1_346-228,GCGTACGGCCATACTATGGGGAATACACCTGATCCCGTCCGATTTC...,5S_rRNA,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,RF00001_AY245018_1_1-119,GCTATCGGCCATACTAAGCCAAATGCACCGGATCCCTTCCGAACTC...,5S_rRNA,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,RF00001_X52048_1_2-120,TGCTACGATCATACCACTTAGAAAGCACCCGGTCCCATCAGACCCC...,5S_rRNA,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,RF00001_M28193_1_1-119,AGTTACGGCCATACCTCAGAGAATATACCGTATCCCGTTCGATCTG...,5S_rRNA,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,RF00001_X14816_1_860-978,ACCAACGGCCATACCACGTTGAAAGTACCCAGTCTCGTCAGATCCT...,5S_rRNA,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...
6315,RF02535_AFEY01343643_1_18075-17945,ACTTCCAATGCAATGGCTGCAGTGAAGCTATAATTATAGCCTTGTA...,IRES,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
6316,RF02535_AAPE02009951_1_24083-24245,ATTCCCAGTGCTGCACCGAGAGGACCTGTCTCCTGTGGACTGGAAG...,IRES,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
6317,RF02535_ABQO011108623_1_28-199,AGTGCAACGGCTGCACCGAAGGCACAATCGTAGCCTTGTATTTCAC...,IRES,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
6318,RF02535_AAPE02044716_1_11582-11441,ATTCCCGCTGCTGCACAGAGAGGACCCGTGTCCCGTGGACTGGGAG...,IRES,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [5]:
from sklearn.utils import shuffle

df_train = shuffle(df_train)
df_train = df_train.reset_index(drop=True)

In [6]:
df_train

Unnamed: 0,ids,seq,labels,b-labels
0,RF00001_AAWZ02019646_1_204701-204588,AGGTAGGGTTGCAATAAAGGCACCAGATCCCATTACATCTTGGAAA...,5S_rRNA,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,RF00002_AJ307679_1_897-1055,GACCCTGGGGGATGGATCACTCGGCTCGTATTACGAAGACGAACGC...,5_8S_rRNA,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,RF00001_AAQR03010791_1_33267-33380,ATCAATGGCCATACCCCTCTGAAAGTACCTGATCTTGTCTGATCTT...,5S_rRNA,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,RF00223_ABRT010369175_1_885-1057,CACCAACCAAGACAGCATCTCACAGATCTACTTGTGTGTTTCCAGA...,IRES,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,RF00778_FN596251_1_1049963-1050065,GTTGCTCACTCTCCCTCAAGGGCTTCTGCACTTTGGCCATGGCCAT...,miRNA,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
...,...,...,...,...
6315,RF00557_CP002728_1_2259604-2259443,TTATTGTTATATATTATGCTGCAGACCGTAGGTGCTATAAAGCTTA...,leader,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
6316,RF00449_AAGU03024943_1_27857-28130,CTGGTCTGAGGGAAGGCGAGGATCGCCCTCGCCGCCGGTTCGGCCA...,IRES,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
6317,RF00001_AFYH01166467_1_4673-4555,AAAAAGAGGCACAGTAGGACCCCAGTGCCTGGTCTTGTCTGATCTC...,5S_rRNA,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6318,RF01057_CAFE01000042_1_5903-6040,GTCTTCGAGGAGCGTTGCGACGGGCAAACGGCTGAAAAACACCGCC...,riboswitch,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"


## Gerar tokens

In [7]:
def seq_to_3mer(seq_list):
	print('Processing {} sequences'.format(len(seq_list)))
	
	main_list = []
	
	for _, i in enumerate(seq_list):
		# print('type(i): ', type(i))
		# print('type([i]): ', type([i]))
		# print('type(list(i)): ', type(list(i)))
		seq = list(i)
		seq_kmer = []

		for j, _ in enumerate(seq):
			if j < len(seq) - 2:
				seq_kmer.append(seq[j] + seq[j+1] + seq[j+2])
			else:
				continue

		main_list.append(seq_kmer)

	return main_list 


In [8]:
# Gerar k-mers
X_3mer = seq_to_3mer(df_train['seq'])

Processing 6320 sequences


In [9]:
seq_length = []
for x in X_3mer:
    seq_length.append(len(x))
    
seq_length.sort()
seq_length[-1]

1134

In [10]:
# Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

MAX_VOCAB_SIZE = 600
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_3mer)
idx = tokenizer.word_index
print(len(idx))

583


In [11]:
import pickle

# saving tokenizer
with open('tokenizer-nRC-alltokens.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
def token_pad(sentences, max_len, prefix):
	print('Zero-padding sequences to {} and tokenizing'.format(max_len))

	with open('./tokenizer-nRC-alltokens.pickle', 'rb') as handle:
		tokenizer = pickle.load(handle)

	tokens = tokenizer.texts_to_sequences(sentences)
	all_pad = pad_sequences(tokens, max_len, padding=prefix)

	return all_pad

In [13]:
# Tokenization and zero-padding
X = token_pad(X_3mer, 1134, 'post')

Zero-padding sequences to 1134 and tokenizing


In [14]:
X

array([[12, 16, 53, ...,  0,  0,  0],
       [52, 50, 25, ...,  0,  0,  0],
       [48, 40, 33, ...,  0,  0,  0],
       ...,
       [ 1,  1,  1, ...,  0,  0,  0],
       [55, 30, 26, ...,  0,  0,  0],
       [55, 30, 58, ...,  0,  0,  0]])

In [16]:
array_list = []
for arr in list(df_train['b-labels']):
    array_list.append(arr)

In [17]:
y = np.vstack(array_list)

In [18]:
y

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [39]:
X.shape

(6320, 1134)

In [40]:
y.shape

(6320, 13)

# Neural network

In [19]:
import tensorflow as tf
from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, Attention, Flatten, SimpleRNN
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.layers import AdditiveAttention

In [22]:
def biLSTM(lr, D):


    sequence_input = Input(shape=(498,), dtype="int32")
    embedded_sequences = Embedding(64, D, mask_zero=True)(sequence_input)


    lstm = Bidirectional(LSTM(32, input_shape=(498, D), return_sequences=False,
                            return_state=False), name="bilstm1")(embedded_sequences)

    dense1 = Dense(128, activation="relu")(lstm)
    dropout = Dropout(0.4)(dense1)
    output = Dense(13, activation='softmax')(dropout)

    model = tf.keras.Model(inputs=sequence_input, outputs=output)
  
    # compile model
    opt = tf.keras.optimizers.RMSprop(learning_rate=lr, momentum=0.01, centered=True)
  
    METRICS = [
	  tf.keras.metrics.TruePositives(name='tp'),
	  tf.keras.metrics.FalsePositives(name='fp'),
  	tf.keras.metrics.TrueNegatives(name='tn'),
	  tf.keras.metrics.FalseNegatives(name='fn'),
	  tf.keras.metrics.BinaryAccuracy(name='accuracy'),
	  tf.keras.metrics.Precision(name='precision'),
	  tf.keras.metrics.Recall(name='recall'),
	  tf.keras.metrics.AUC(name='auc'),
    ]

    model.compile(loss='categorical_crossentropy',
                optimizer=opt,
                metrics=METRICS)
  
    return model

In [23]:
model = biLSTM(0.004, 10)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 498)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 498, 10)           640       
_________________________________________________________________
bilstm1 (Bidirectional)      (None, 64)                11008     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 13)                1677      
Total params: 21,645
Trainable params: 21,645
Non-trainable params: 0
_______________________________________________________

In [24]:
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.layers import Layer

# https://gist.github.com/cbaziotis/6428df359af27d58078ca5ed9792bd6d

def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        # todo: check that this is correct
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)


class Attention(Layer):
    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.


        Note: The layer has been tested with Keras 1.x

        Example:
        
            # 1
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...

            # 2 - Get the attention scores
            hidden = LSTM(64, return_sequences=True)(words)
            sentence, word_scores = Attention(return_attention=True)(hidden)

        """
        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]



In [41]:
def biLSTMAtt(lr, D):


    sequence_input = Input(shape=(1134,), dtype="int32")
    embedded_sequences = Embedding(583, D, mask_zero=True)(sequence_input)

    (lstm, forward_h, forward_c, 
        backward_h, backward_c) = Bidirectional(LSTM(32, input_shape=(1134, D), return_sequences=True,
                                                  return_state=True), name="bi_lstm_1")(embedded_sequences)

    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])

    context_vector, weights = Attention(return_attention=True)(lstm)

    dense1 = Dense(128, activation="relu")(context_vector)
    dropout = Dropout(0.4)(dense1)
    output = Dense(13, activation='softmax')(dropout)

    model = tf.keras.Model(inputs=sequence_input, outputs=output)
  
    # compile model
    opt = tf.keras.optimizers.RMSprop(learning_rate=lr, momentum=0.01, centered=True)
  
    METRICS = [
	  tf.keras.metrics.TruePositives(name='tp'),
	  tf.keras.metrics.FalsePositives(name='fp'),
  	tf.keras.metrics.TrueNegatives(name='tn'),
	  tf.keras.metrics.FalseNegatives(name='fn'),
	  tf.keras.metrics.BinaryAccuracy(name='accuracy'),
	  tf.keras.metrics.Precision(name='precision'),
	  tf.keras.metrics.Recall(name='recall'),
	  tf.keras.metrics.AUC(name='auc'),
    ]

    model.compile(loss='categorical_crossentropy',
                optimizer=opt,
                metrics=METRICS)
  
    return model

In [42]:
model = biLSTMAtt(0.004, 10)
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 1134)]            0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 1134, 10)          5830      
_________________________________________________________________
bi_lstm_1 (Bidirectional)    [(None, 1134, 64), (None, 11008     
_________________________________________________________________
attention_2 (Attention)      [(None, 64), (None, 1134) 1198      
_________________________________________________________________
dense_12 (Dense)             (None, 128)               8320      
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 13)                1677

# K-fold cross validation

In [29]:
from sklearn.model_selection import StratifiedShuffleSplit

def CreateFolds(X, y, folds, test_size):

    sss = StratifiedShuffleSplit(n_splits=folds, test_size=test_size)
    sss.get_n_splits(X, y)

    count = 1

    train_list = []
    test_list = []

    for train_index, test_index in sss.split(X, y):
        print('Fold {}'.format(count))

        train_list.append(list(train_index))
        test_list.append(list(test_index))

        count += 1
  
    return train_list, test_list

In [30]:
train_list, test_list = CreateFolds(X, y, 10, 0.2)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10


In [32]:
df_train_idx = pd.DataFrame()
for i,n in enumerate(train_list):
    df_train_idx['fold{}'.format(i+1)] = n
        
df_test_idx = pd.DataFrame()
for i,n in enumerate(test_list):
    df_test_idx['fold{}'.format(i+1)] = n

In [33]:
df_train_idx.to_csv('./df_train_idx.csv')
df_test_idx.to_csv('./df_test_idx.csv')

In [29]:
df_train_idx = pd.read_csv('./df_train_10folds.csv', index_col=0)
df_test_idx = pd.read_csv('./df_test_10folds.csv', index_col=0)

In [34]:
def scheduler(epoch, lr):
    if epoch < 3:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

# 10-fold CV

In [44]:
start = 0 # last trained fold
model_name = 'biLSTMAtt'

d = 10
lr = 0.008

for i in range(start, 10):
    print(model_name)
    print('Fold {}, lr: {}, D: {}'.format(i+1, lr, d))
    X_train = X[df_train_idx['fold{}'.format(i+1)]]
    X_test = X[df_test_idx['fold{}'.format(i+1)]]

    y_train = y[df_train_idx['fold{}'.format(i+1)]]
    y_test = y[df_test_idx['fold{}'.format(i+1)]]

    callback = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

    # MODEL
    model = biLSTMAtt(lr,d)
    history = model.fit(X_train, y_train, epochs=10,
                      validation_data=(X_test, y_test),
                      callbacks=[callback])

    hist_df = pd.DataFrame(history.history) 
    hist_csv_file = './history-{}-lr{}-D{}-{}epochs-fold{}-dropout40.csv'.format(model_name,
                                                                                 lr, d, '10', i)

    with open(hist_csv_file, mode='w') as f:
        hist_df.to_csv(f)


biLSTMAtt
Fold 1, lr: 0.008, D: 10
Epoch 1/10

Epoch 00001: LearningRateScheduler setting learning rate to 0.00800000037997961.
 14/158 [=>............................] - ETA: 1:48 - loss: 2.5715 - tp: 0.0000e+00 - fp: 0.0000e+00 - tn: 5376.0000 - fn: 448.0000 - accuracy: 0.9231 - precision: 0.0000e+00 - recall: 0.0000e+00 - auc: 0.5178

KeyboardInterrupt: 