#Using RNNs to Label Sequences

We use RNN models to label secondary structure annotations. 

In [1]:
#
# Imports for data loading and classification
#

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleDeepRNN
import keras.preprocessing.sequence
from keras.optimizers import SGD

from sklearn.cross_validation import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import pandas as ps
import sys


#
# Setup matplotlib and ipython
#
%matplotlib inline

# random seed
R_SEED = 42

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 28 days


##Load input data

In [23]:
#
# The character mapping to encode amino acid sequences.
# Non-AA chars should be mapped to X, or 0.
#

AAs = ['X','I','L','V','F','M','C','A','G','P','T','S','Y','W','Q','N','H','E','D','K','R']
AAIndexes = {AAs[i] : i for i in range(len(AAs))}

def encodeAA(x) :
    # Encode an amino acid sequence
    if x not in AAIndexes :
        return 0
    return AAIndexes[x]

def seqToIdxs(seq) :
    return map(encodeAA, seq.strip().upper())

#
#  Functions for loading sequences and background distributions
#

def loadSeqs(seqs_file, num_lines=sys.maxint, min_len=0) :
    seqs = []
    with open(seqs_file) as in_f :
        for line in in_f :
            if len(line.strip()) >= min_len :
                seqs.append(seqToIdxs(line))
            if len(seqs) >= num_lines : break
    return seqs

def loadFeatureSeqs(task_name, num_lines=sys.maxint, min_len=0) :
    path = '/home/gene245/cprobert/seq_features/%s_seqs.txt' % task_name
    return loadSeqs(path, num_lines, min_len)

def loadFeatureBkgrdSeqs(task_name, num_lines=sys.maxint, min_len=0) :
    path = '/home/gene245/cprobert/seq_features/%s_featurebackground_seqs.txt' % task_name
    return loadSeqs(path, num_lines, min_len)

def loadGlobalBkgrdSeqs(task_name, num_lines=sys.maxint, min_len=0) :
    path = '/home/gene245/cprobert/seq_features/%s_globalbackground_seqs.txt' % task_name
    return loadSeqs(path, num_lines, min_len)

def createBinaryLabelVector(length, label) :
    if label == 1 :
        return np.ones(length, dtype=int)
    return np.zeros(length, dtype=int)

def createOneHotLabels(length, label) :
    ar = [1,0] if label == 0 else [0,1]
    ar = [ar for i in xrange(length)]
    return np.array(ar)

def loadShuffledData(task_name, num_exs=sys.maxint, bkgrd='global', max_len=100, min_len=10) :
    """
    Loads a shuffled set of (seqs, labels) for the given task.
    """
    assert(bkgrd in ['global', 'feature'])
    seqs_pos = loadFeatureSeqs(task_name,num_exs/2)
    if bkgrd == 'global' :
        seqs_neg = loadGlobalBkgrdSeqs(task_name,num_exs/2)
    else :
        seqs_neg = loadFeatureBkgrdSeqs(task_name,num_exs/2)
    seqs = keras.preprocessing.sequence.pad_sequences(seqs_pos + seqs_neg, maxlen=max_len)
    labels = np.append(createOneHotLabels(len(seqs_pos), 1),
                       createOneHotLabels(len(seqs_neg), 0), axis=0)
    np.random.seed(R_SEED)
    idxs = np.arange(labels.shape[0])
    np.random.shuffle(idxs)
    seqs, labels = seqs[idxs], labels[idxs]
    return seqs, labels

In [3]:
# Try loading some data
print(loadFeatureSeqs('transmembrane-region',1)[0][:10])
print(loadFeatureBkgrdSeqs('transmembrane-region',1)[0][:10])
print(loadGlobalBkgrdSeqs('transmembrane-region',1)[0][:10])
print(createBinaryLabelVector(3,1))
print(createBinaryLabelVector(3,0))
print(loadShuffledData('transmembrane-region',num_exs=4,max_len=4))

[5, 8, 8, 8, 18, 12, 13, 9, 1, 1]
[15, 3, 15, 3, 11, 10, 17, 2, 18, 11]
[2, 11, 1, 14, 12, 3, 9, 17, 10, 15]
[1 1 1]
[0 0 0]
(array([[ 5,  8,  8,  8],
       [19, 14, 15, 15],
       [ 5,  8,  8,  8],
       [ 2, 11,  1, 14]], dtype=int32), array([1, 0, 1, 0]))


In [4]:
# Try loading data and converting to keras types

seqs = loadFeatureSeqs('transmembrane-region',1)
proc_seqs = keras.preprocessing.sequence.pad_sequences(seqs, maxlen=10)

##Declare the model

In [29]:
def getsimpleLSTM(input_dim=len(AAs)+1) :
    model = keras.models.Sequential()
    model.add(Embedding(input_dim, 256))
    model.add(LSTM(256, 128, activation='sigmoid', inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(128, 2, init='uniform'))
    model.add(Activation('sigmoid'))
    return model

def getsimpleRNN(input_dim=len(AAs)+1) :
    model = keras.models.Sequential()
    model.add(Embedding(input_dim, 256))
    model.add(SimpleDeepRNN(256, 128, truncate_gradient=5))
    #model.add(Dropout(0.5))
    model.add(Dense(128, 2, init='uniform'))
    model.add(Activation('softmax'))
    return model


##Try things out!

In [30]:
# Load the input data, and split the test data
seqs, labels = loadShuffledData('transmembrane-region', num_exs=1000, bkgrd='global', max_len=100, min_len=10)

X_train, X_test, y_train, y_test = train_test_split(seqs, labels, test_size=0.1, random_state=R_SEED)




In [31]:
print 'X_train', X_train[:5]
print 'y_train', y_train[:5]

X_train [[ 5  7  7  5  2  8 20 18 17 18  9  3  8  7  2 11  8 20  3 11  2  7 11 10
  11 16 20 11  2  3  8  7 11 19 11  4 20 18  3  4  5  9 14 10 18 17  3  4
   8 20 11 17 20 20 17 17 18 18  5 17  2 20 13  7  7  1 17 20  2  9 10  4
  18 20  2 20 19  8  5  2  9 14 10 11  7 15  8 19  1 17  2 17 18  1 18  2
  10 20  2 17]
 [ 5  7 20  2  4 11  9 20  9  9  9 11 17 18  2  4 12 17 10 12 12 11  2 11
  14 14 12  9  2  2  1  2  2  2  3  1  3  2  6  7  2  3  7  2  9  7  3  7
  13  7 11  8 20 17  2 10 11 18  9 11  4  2 10 10  3  2  6  7  2  8  8  4
  11  2  2  2  8  2  7 11 20 17 14 14  2 14 20 13 10 20  9  2 11  8  2  1
  13  3  7  2]
 [ 7  3  9  1 10 11  4 14 16 11 11 10 10  8  7 19  6  2 16  3 17  9 17  3
  11 18 20 19 17  3 16 10  7  2 11 11 14  4  6 16  8  8  9  9  3 11 16  9
   1 19  2  4 15 18  3 18  1  2  2 18 12  2 10  3  2 17 12  2 10  1  3 17
  12 10  9  7 18  2 10 10  8 10  2 15 12 20  7  2  4  8 17 15 16 14 15 19
   2  2  8  1]
 [15  8 20 20 11  4  3 14  1  8 18 12  4 18 12  1  9 17  3 

In [32]:
# Declare and compile the model
model = getsimpleRNN()
#sgd = SGD(lr=0.1, decay=1e-7, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer='sgd')

In [33]:
# fit the model
model.fit(X_train, y_train, nb_epoch=30, batch_size=16,
          validation_split=0.1, shuffle=True, show_accuracy=True)

Train on 810 samples, validate on 90 samples
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


{'acc': [0.5,
  0.4802469135802469,
  0.48518518518518516,
  0.44814814814814813,
  0.49259259259259258,
  0.49259259259259258,
  0.52222222222222225,
  0.47530864197530864,
  0.46296296296296297,
  0.4580246913580247,
  0.49012345679012348,
  0.4777777777777778,
  0.48518518518518516,
  0.51728395061728394,
  0.49753086419753084,
  0.48765432098765432,
  0.48271604938271606,
  0.5024691358024691,
  0.5,
  0.46049382716049381,
  0.49629629629629629,
  0.51975308641975304,
  0.50740740740740742,
  0.47901234567901235,
  0.50617283950617287,
  0.5024691358024691,
  0.49382716049382713,
  0.52469135802469136,
  0.46419753086419752,
  0.50740740740740742],
 'epoch': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29],
 'loss': [0.69936694873952399,
  0.70697405211627307,
  0.70307847922573019,
  0.70643853076397722,
  0.70060782229851171,
  0.70300915440728184,
  0.698834597

In [34]:
score = model.test(X_test, y_test, accuracy=True)
print score

score = model.test(X_train, y_train, accuracy=True)
print score

print model.predict_classes(X_test)
print y_test[:20]
X_test.shape

[array(0.7185029689021053), array(0.52)]
[array(0.7307127183985944), array(0.49777777777777776)]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[[1 0]
 [0 1]
 [1 0]
 [0 1]
 [1 0]
 [1 0]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [1 0]
 [0 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]
 [0 1]]


(100, 100)

In [2]:
# Try Keras examples
from keras.datasets import cifar10
(X_train, y_train), (X_test, y_test) = cifar10.load_data(test_split=0.1, seed=113)
print X_train.shape
print y_train.shape

Downloading data from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Untaring file...
(45000, 3, 32, 32)
(45000, 1)


In [3]:
from keras.datasets import imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data(path="imdb.pkl", nb_words=None, skip_top=0, maxlen=None, test_split=0.1, seed=113)





Downloading data from https://s3.amazonaws.com/text-datasets/imdb.pkl


In [15]:
print X_train[0]
print np.unique(y_train)
print len(X_train), len(y_train)
X_train = X_train[:1000]
y_train = y_train[:1000]
print len(X_train[0])
print len(X_train[1])
print max(map(max, X_train))

[17, 10, 2, 257, 7, 25, 18, 69, 4195, 1513, 16, 121, 41, 2, 73, 3, 26, 14, 20, 33, 1758, 303, 4, 16, 75, 121, 14, 299, 15, 6, 153, 8, 112, 263, 18, 14, 20, 22, 96, 22, 16, 101, 219, 14, 21, 4, 12, 13, 9, 11, 12, 13, 9, 11, 61, 257, 7, 10886, 17974, 6, 15138, 13325, 4, 29, 18, 3, 42, 238, 3, 10, 45, 4874, 146, 272, 15138, 17974, 6, 62, 191, 7, 2, 17832, 4, 28, 2, 4570, 281, 206, 15, 2, 40383, 6009, 5, 8198, 98697, 8, 2, 19456, 3, 6, 257, 7, 66280, 8062, 4, 12, 13, 9, 11, 12, 13, 9, 11, 14, 56, 39, 7, 163, 10355, 5908, 3, 2, 407, 357, 1437, 53, 1201, 768, 2, 9732, 4, 12, 13, 9, 11, 12, 13, 9, 11, 19, 81, 196, 31, 275, 218, 40, 19, 3, 561, 1789, 695, 10872, 4, 34, 10, 48, 222, 4, 558, 23, 727, 3982, 4, 34, 20, 15, 41, 149, 1699, 116, 4, 12, 13, 9, 11, 12, 13, 9, 11, 121, 174, 25, 318, 4, 470, 3, 33, 19, 4562, 19, 2, 216, 4, 18, 20, 70, 62, 357, 4, 318, 3, 121, 6, 553, 526, 55, 1573, 16582, 49, 19, 2839, 7, 2, 7106, 4, 19]
[0 1]
1000 1000
216
198
102063


In [17]:
max_features = max(max(map(max, X_train)), max(map(max, X_test)))
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=200, dtype='int32')
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=200, dtype='int32')

# model = Sequential()
# model.add(Embedding(max_features, 256))
# model.add(LSTM(256, 128, activation='sigmoid', inner_activation='hard_sigmoid'))
# model.add(Dropout(0.5))
# model.add(Dense(128, 1))
# model.add(Activation('sigmoid'))

# model.compile(loss='binary_crossentropy', optimizer='rmsprop')

model.fit(X_train, y_train, batch_size=16, nb_epoch=10)
score = model.evaluate(X_test, y_test, batch_size=16)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9

IndexError: index 102094 is out of bounds for size 102094
Apply node that caused the error: AdvancedSubtensor1(<TensorType(float64, matrix)>, Flatten{1}.0)
Inputs shapes: [(102094, 256), (3200,)]
Inputs strides: [(2048, 8), (4,)]
Inputs types: [TensorType(float64, matrix), TensorType(int32, vector)]
Use the Theano flag 'exception_verbosity=high' for a debugprint of this apply node.