#Using RNNs to Label Sequences

We use RNN models to label secondary structure annotations. 

In [63]:
#
# Imports for data loading and classification
#

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleDeepRNN
keras.layers.recurrent.SimpleDeepRNN
import keras.preprocessing.sequence
from keras.optimizers import SGD

from sklearn.cross_validation import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import pandas as ps
import sys


#
# Setup matplotlib and ipython
#
%matplotlib inline

# random seed
R_SEED = 42

##Load input data

In [85]:
#
# The character mapping to encode amino acid sequences.
# Non-AA chars should be mapped to X, or 0.
#

AAs = ['X','I','L','V','F','M','C','A','G','P','T','S','Y','W','Q','N','H','E','D','K','R']
AAIndexes = {AAs[i] : i for i in range(len(AAs))}

def encodeAA(x) :
    # Encode an amino acid sequence
    if x not in AAIndexes :
        return 0
    return AAIndexes[x]

def seqToIdxs(seq) :
    return map(encodeAA, seq.strip().upper())

#
#  Functions for loading sequences and background distributions
#

def loadSeqs(seqs_file, num_lines=sys.maxint) :
    seqs = []
    with open(seqs_file) as in_f :
        for line in in_f :
            seqs.append(seqToIdxs(line))
            if len(seqs) >= num_lines : break
    return seqs

def loadFeatureSeqs(task_name, num_lines=sys.maxint) :
    path = '/home/gene245/cprobert/seq_features/%s_seqs.txt' % task_name
    return loadSeqs(path, num_lines)

def loadFeatureBkgrdSeqs(task_name, num_lines=sys.maxint) :
    path = '/home/gene245/cprobert/seq_features/%s_featurebackground_seqs.txt' % task_name
    return loadSeqs(path, num_lines)

def loadGlobalBkgrdSeqs(task_name, num_lines=sys.maxint) :
    path = '/home/gene245/cprobert/seq_features/%s_globalbackground_seqs.txt' % task_name
    return loadSeqs(path, num_lines)

def createLabelVector(length, label) :
    ones = np.ones(length).reshape(length,1)
    zeros = np.zeros(length).reshape(length,1)
    if label == 1 :
        return np.append(zeros,ones,axis=1)
    return np.append(ones,zeros,axis=1)


In [87]:
# Try loading some data
print(loadFeatureSeqs('transmembrane-region',1)[0][:10])
print(loadFeatureBkgrdSeqs('transmembrane-region',1)[0][:10])
print(loadGlobalBkgrdSeqs('transmembrane-region',1)[0][:10])
print(createLabelVector(3,1))
print(createLabelVector(3,-1))

[5, 8, 8, 8, 18, 12, 13, 9, 1, 1]
[15, 3, 15, 3, 11, 10, 17, 2, 18, 11]
[2, 11, 1, 14, 12, 3, 9, 17, 10, 15]
[[ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]]
[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]


In [88]:
# Try loading data and converting to keras types

seqs = loadFeatureSeqs('transmembrane-region',1)
proc_seqs = keras.preprocessing.sequence.pad_sequences(seqs, maxlen=10)

##Declare the model

In [89]:
def getsimpleLSTM(input_dim) :
    model = keras.models.Sequential()
    model.add(Embedding(input_dim, 256))
    model.add(LSTM(256, 128, activation='sigmoid', inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(128, 1, init='uniform'))
    model.add(Activation('softmax'))
    return model

def getsimpleRNN(input_dim) :
    model = keras.models.Sequential()
    model.add(Embedding(input_dim, 256))
    model.add(SimpleDeepRNN(256, 2, truncate_gradient=3, return_sequences=False))
    return model


##Try things out!

In [98]:
seqs_pos = loadFeatureSeqs('transmembrane-region',1000)
proc_seqs_pos = keras.preprocessing.sequence.pad_sequences(seqs, maxlen=100)
labels_pos = createLabelVector(proc_seqs_pos.shape[1], 1)

seqs_neg = loadGlobalBkgrdSeqs('transmembrane-region',1000)
proc_seqs_neg = keras.preprocessing.sequence.pad_sequences(seqs, maxlen=100)
labels_neg = createLabelVector(proc_seqs_neg.shape[1], -1)
print(len(seqs_neg),len(seqs_neg[0]))
print(proc_seqs_neg.shape)
print labels_neg.shape
proc_seqs = np.append(proc_seqs_pos, proc_seqs_neg, axis=0)
labels = np.append(labels_pos, labels_neg)

input_dim = len(AAs)

#model = getsimpleRNN(input_dim)
#sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
#model.compile(loss='categorical_crossentropy', optimizer=sgd)

(1000, 160)
(1, 100)
(100, 2)


In [94]:
# split the training and testing data
print proc_seqs.shape
print labels.shape
X_train, X_test, y_train, y_test = train_test_split(proc_seqs, labels, test_size=0.1, random_state=R_SEED)
print X_train.shape
model.fit(X_train, y_train, nb_epoch=10, batch_size=5)

(2, 100)
(400,)


ValueError: Found array with dim 400. Expected 2