# Using RNNs to Label Sequences

We use recurrent neural network (RNN) models to label amino acid sequences with secondary structure annotations.

In [3]:
#
# Imports for data loading and classification
#

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
import keras.preprocessing.sequence

import matplotlib.pyplot as plt
import numpy as np
import pandas as ps
import sys


#
# Setup matplotlib and ipython
#
%matplotlib inline

# Random seed, presumably meant to make runs reproducible.
# NOTE(review): nothing in the code visible here passes R_SEED to a random
# number generator -- confirm it is actually applied before training.
R_SEED = 42

## Load input data

In [20]:
#
# Alphabet used to turn amino acid sequences into integer codes.
# A symbol's code is its position in AAs; 'X' sits at position 0 and is
# the code used for any character that is not one of the listed residues.
#

AAs = [
    'X',
    'I', 'L', 'V', 'F', 'M',
    'C', 'A', 'G', 'P', 'T',
    'S', 'Y', 'W', 'Q', 'N',
    'H', 'E', 'D', 'K', 'R',
]
AAIndexes = {symbol: code for code, symbol in enumerate(AAs)}

def encodeAA(x):
    """Return the integer code of a single amino acid character.

    The code is looked up in ``AAIndexes``; any character that is not a
    key of that table is encoded as 0.
    """
    return AAIndexes.get(x, 0)

def seqToIdxs(seq):
    """Encode an amino acid sequence string as a list of integer codes.

    Surrounding whitespace (including the trailing newline of a line read
    from a file) is stripped and the sequence is upper-cased before every
    character is passed through ``encodeAA``.

    A concrete list is returned rather than the result of ``map``: on
    Python 3 ``map`` yields a lazy iterator, which cannot be sliced,
    indexed or measured with ``len`` the way callers of this function do.
    """
    return [encodeAA(residue) for residue in seq.strip().upper()]

#
#  Functions for loading sequences and background distributions
#

def loadSeqs(seqs_file, num_lines=sys.maxsize):
    """Load and encode sequences from a text file with one sequence per line.

    Parameters:
        seqs_file: path of the file to read.
        num_lines: maximum number of sequences to return. The default,
            ``sys.maxsize``, is effectively "no limit"; it replaces
            ``sys.maxint``, which does not exist on Python 3.

    Returns:
        A list holding one ``seqToIdxs`` encoding per line read, in file
        order, with at most ``num_lines`` entries.
    """
    seqs = []
    with open(seqs_file) as in_f:
        for line in in_f:
            # Check the limit before appending so that num_lines <= 0
            # yields an empty list instead of one sequence.
            if len(seqs) >= num_lines:
                break
            seqs.append(seqToIdxs(line))
    return seqs

def loadFeatureSeqs(task_name, num_lines=sys.maxsize):
    """Load the encoded feature sequences for ``task_name``.

    Reads ``<task_name>_seqs.txt`` from the seq_features directory via
    ``loadSeqs`` and returns at most ``num_lines`` encoded sequences.
    The default limit is ``sys.maxsize`` because ``sys.maxint`` does not
    exist on Python 3.
    """
    path = '/home/gene245/cprobert/seq_features/%s_seqs.txt' % task_name
    return loadSeqs(path, num_lines)

def loadFeatureBkgrdSeqs(task_name, num_lines=sys.maxsize):
    """Load the encoded feature-background sequences for ``task_name``.

    Reads ``<task_name>_featurebackground_seqs.txt`` from the seq_features
    directory via ``loadSeqs`` and returns at most ``num_lines`` encoded
    sequences. The default limit is ``sys.maxsize`` because ``sys.maxint``
    does not exist on Python 3.
    """
    path = '/home/gene245/cprobert/seq_features/%s_featurebackground_seqs.txt' % task_name
    return loadSeqs(path, num_lines)

def loadGlobalBkgrdSeqs(task_name, num_lines=sys.maxsize):
    """Load the encoded global-background sequences for ``task_name``.

    Reads ``<task_name>_globalbackground_seqs.txt`` from the seq_features
    directory via ``loadSeqs`` and returns at most ``num_lines`` encoded
    sequences. The default limit is ``sys.maxsize`` because ``sys.maxint``
    does not exist on Python 3.
    """
    path = '/home/gene245/cprobert/seq_features/%s_globalbackground_seqs.txt' % task_name
    return loadSeqs(path, num_lines)

In [23]:
# Smoke test: print the first ten residue codes of the first sequence in each
# of the three 'transmembrane-region' files (feature, feature background,
# global background).
print(loadFeatureSeqs('transmembrane-region', num_lines=10)[0][:10])
print(loadFeatureBkgrdSeqs('transmembrane-region', num_lines=10)[0][:10])
print(loadGlobalBkgrdSeqs('transmembrane-region', num_lines=1)[0][:10])

[5, 8, 8, 8, 18, 12, 13, 9, 1, 1]
[15, 3, 15, 3, 11, 10, 17, 2, 18, 11]
[2, 11, 1, 14, 12, 3, 9, 17, 10, 15]


## Declare the model

In [None]:
# Sequence model: embedding -> LSTM -> dropout -> dense -> sigmoid.
model = keras.models.Sequential()
# The embedding's vocabulary size was previously `max_features`, a name that
# is not defined anywhere in this notebook (NameError when the cell runs).
# The encoder emits codes 0 .. len(AAs) - 1, so len(AAs) is the number of
# distinct input symbols the embedding has to cover.
model.add(Embedding(len(AAs), 256))
# NOTE(review): the positional arguments of LSTM and Dense below look like
# (input size, output size) as in early Keras releases -- confirm against the
# installed Keras version.
model.add(LSTM(256, 128, activation='sigmoid', inner_activation='hard_sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(128, 1))
model.add(Activation('sigmoid'))