In [None]:
# Import
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Print arrays in full, without "..." summarisation. NumPy rejects a NaN
# threshold with a ValueError; sys.maxsize is the documented way to disable
# summarisation. `sys` is imported here because the notebook's own
# `import sys` only appears further down in this cell.
import sys
np.set_printoptions(threshold=sys.maxsize)
import tensorflow as tf
import os
import random
import sys
import utils
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix

In [None]:
# Load old model
# When True, the session-setup cell tries to restore previously saved weights
# from a checkpoint (falling back to fresh initialisation if that fails);
# when False, all variables are initialised from scratch.
load_model = False

In [None]:
def read_sequences(filename):
    """Parse a secondary-structure data file into per-sequence token lists.

    The file is scanned line by line:
      * a line starting with "<>" opens a sequence record;
      * inside a record, the first two whitespace-separated tokens of each
        line are taken as the amino acid and its secondary-structure label;
      * a line starting with "end" or "<end>" closes the record and stores it.

    A record that is never closed by an end marker is discarded, matching the
    behaviour of the original inline parser.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple (sequences_aa, sequences_ss) of two equally long lists; entry
        k of each holds the amino-acid tokens / label tokens of sequence k.

    Raises:
        SystemExit: if the file cannot be opened (after reporting on stderr).
        ValueError: if a line inside a record has fewer than two tokens.
    """
    try:
        infile = open(filename, 'r')
    except IOError as error:
        sys.stderr.write("File I/O error, reason" + str(error) + "\n")
        sys.exit(1)

    sequences_aa, sequences_ss = [], []
    seq_aa, seq_ss = [], []
    in_record = False

    # The context manager guarantees the handle is closed even if parsing fails.
    with infile:
        for lineno, line in enumerate(infile, start=1):
            if line.startswith("end") or line.startswith("<end>"):
                in_record = False
                sequences_aa.append(seq_aa)
                sequences_ss.append(seq_ss)
                seq_aa, seq_ss = [], []
            elif in_record:
                fields = line.split()
                if not fields:
                    # A blank line carries no residue; skip it.
                    continue
                if len(fields) < 2:
                    raise ValueError("%s, line %d: expected '<aa> <ss>', got %r"
                                     % (filename, lineno, line))
                seq_aa.append(fields[0])
                seq_ss.append(fields[1])
            elif line.startswith("<>"):
                in_record = True

    return sequences_aa, sequences_ss


# Read training data
X_train, Y_train = read_sequences("data_train.txt")

# Read validation data
X_valid, Y_valid = read_sequences("data_valid.txt")

In [None]:
# Define encoding dictionaries
# Each symbol maps to a one-hot list whose single 1 sits at the symbol's
# position in the alphabet string below.

# 20 amino acids -> length-20 one-hot vectors
aadict = {
    residue: [int(slot == position) for slot in range(20)]
    for position, residue in enumerate("ARNDCQEGHILKMFPSTWYV")
}

# 3 secondary-structure classes ('_', 'e', 'h') -> length-3 one-hot vectors
ssdict = {
    label: [int(slot == position) for slot in range(3)]
    for position, label in enumerate("_eh")
}

def _one_hot_encode_in_place(seqs_aa, seqs_ss):
    """Replace every amino-acid / label token by its one-hot vector, in place.

    Looks tokens up in `aadict` and `ssdict`. On the first unknown token a
    message is printed and the process exits with status 1.
    """
    for seq_index, residues in enumerate(seqs_aa):
        labels = seqs_ss[seq_index]

        for pos, ss in enumerate(labels):
            aa = residues[pos]

            if aa not in aadict:
                print("Unknown aa: " + aa)
                sys.exit(1)
            residues[pos] = aadict[aa]

            if ss not in ssdict:
                print("Unknown ss: " + ss)
                sys.exit(1)
            labels[pos] = ssdict[ss]


# One-hot-encode training data
_one_hot_encode_in_place(X_train, Y_train)

# One-hot-encode validation data
_one_hot_encode_in_place(X_valid, Y_valid)

# Get sequence lengths
# Recorded before padding, so each entry is the true (unpadded) length.
X_train_lengths = [len(sequence) for sequence in X_train]
X_valid_lengths = [len(sequence) for sequence in X_valid]
            
# Do padding to same sequence length
numseqs_train, numseqs_valid = len(X_train), len(X_valid)

# Longest sequence across both data sets determines the padded length
longest_train = max(map(len, X_train))
longest_valid = max(map(len, X_valid))
maxseqlen = max(longest_train, longest_valid)

# Zero rows are appended after the real positions ("post" padding)
pad_options = dict(padding="post", maxlen=maxseqlen)

X_train = pad_sequences(X_train, **pad_options)
X_valid = pad_sequences(X_valid, **pad_options)

# Reshape Ys to correct dimensions: one row of 3 class indicators per
# (sequence, position) pair
Y_train = pad_sequences(Y_train, **pad_options).reshape(-1, 3)
Y_valid = pad_sequences(Y_valid, **pad_options).reshape(-1, 3)

In [None]:
## Build the network
# Architecture: bidirectional LSTM -> batch norm -> fully connected layer with
# leaky ReLU, dropout and batch norm -> linear layer -> softmax over 3 classes.
# The default graph is cleared first, before the model is (re)built.
tf.reset_default_graph()

## Define placeholders
num_classes = 3

# One-hot encoded input sequences: [batch, maxseqlen, 20]
# (20 is the length of the one-hot vectors in aadict).
X_ph = tf.placeholder(tf.float32, [None,maxseqlen,20], name='XPlaceholder')
# One length per sequence; passed to the biRNN as sequence_length.
X_len_ph = tf.placeholder(tf.int32, [None], name='XlenPlaceholder')
# One-hot targets with one row per (sequence, position) pair, matching the
# flattened network output below.
Y_ph = tf.placeholder(tf.float32, [None,num_classes], name='YPlaceholder')
# Forwarded to batch_norm as is_training.
phase_ph = tf.placeholder(tf.bool,name='phasePlaceholder')
# Forwarded to tf.nn.dropout as keep_prob.
keep_prob_ph = tf.placeholder(tf.float32,name='keepprobPlaceholder')

## Define the model

# Initialize weights
weight_initializer = tf.truncated_normal_initializer(stddev=0.1)

# Create biRNN layer
RNN_units = 100

with tf.variable_scope('layer1'):    
    cell_fw = tf.nn.rnn_cell.LSTMCell(RNN_units)
    cell_bw = tf.nn.rnn_cell.LSTMCell(RNN_units)    
    
    with tf.variable_scope('layer1_output'):
        l_1, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw, cell_bw=cell_bw, inputs=X_ph, 
                                             sequence_length=X_len_ph, dtype=tf.float32)
        l_1 = tf.concat(l_1,2) # Merge forward/backward LSTMs
        # Flatten batch and time into one axis so every position becomes one
        # row of 2*RNN_units features for the dense layers that follow.
        l_1_reshaped = tf.reshape(l_1,[-1,2*RNN_units])
        l_1_reshaped = tf.contrib.layers.batch_norm(l_1_reshaped, center=True, scale=True, is_training=phase_ph)

# Create fully-connected/drop-out layer
FFNN_units = 50

with tf.variable_scope('layer2'): 
    W_2 = tf.get_variable('W_2', [2*RNN_units,FFNN_units], 
                          initializer=weight_initializer)
    b_2 = tf.get_variable('b_2', [FFNN_units],
                          initializer=tf.constant_initializer(0.0))

    # NOTE(review): this scope is named 'layer3_output' although it sits inside
    # 'layer2' - looks like a copy-paste slip. Renaming it would change the
    # names of the variables created under it, so confirm that no saved
    # checkpoint depends on them before touching it.
    with tf.variable_scope('layer3_output'): 
            l_2 = tf.matmul(l_1_reshaped, W_2) + b_2
            l_2 = tf.maximum(l_2, 0.01*l_2) # Leaky relu
            l_2 = tf.nn.dropout(l_2, keep_prob = keep_prob_ph)
            l_2 = tf.contrib.layers.batch_norm(l_2, center=True, scale=True, is_training=phase_ph)

# Create softmax layer
with tf.variable_scope('layer3'): 
    W_3 = tf.get_variable('W_3', [FFNN_units, num_classes], 
                          initializer=weight_initializer)
    b_3 = tf.get_variable('b_3', [num_classes],
                          initializer=tf.constant_initializer(0.0))

    with tf.variable_scope('layer3_output'):
        # Unnormalised class scores (logits), one row per position.
        l_3 = tf.matmul(l_2, W_3) + b_3
              
# Class probabilities per position.
Y = tf.nn.softmax(l_3)

# Print number of trainable parameters
# (typo "consits" fixed; print() already separates its arguments with a space,
# so the literal no longer ends in one)
print('Model consists of', utils.num_params(), 'trainable parameters.')

In [None]:
# Define prediction function
def pred(X_in, sess, X_len=None):
    """Run the network in inference mode and return class probabilities.

    The output tensor Y depends on every placeholder of the graph, so all of
    them are fed here: batch norm is switched to inference mode and dropout
    is disabled (keep probability 1.0).

    Args:
        X_in: One-hot encoded, post-padded input batch of shape
            [batch, maxseqlen, 20].
        sess: Session holding the model variables.
        X_len: Optional true length of every sequence in the batch. When
            omitted, it is derived from X_in by counting the positions that
            contain any non-zero entry (padding rows are all zero).

    Returns:
        The evaluated softmax output Y: one row of class probabilities per
        (sequence, position) pair.
    """
    if X_len is None:
        X_len = np.asarray(X_in).any(axis=2).sum(axis=1)

    feed_dict = {X_ph: X_in,
                 X_len_ph: X_len,
                 phase_ph: False,
                 keep_prob_ph: 1.0}
    fetches = [Y]
    res = sess.run(fetches, feed_dict)
    return res[0]

In [None]:
### Implement training ops

# Define the cross entropy loss
with tf.variable_scope('loss'):

    # Mask padding: padded positions have an all-zero label row, whereas real
    # positions carry a one-hot row that sums to 1.
    boolmask = tf.equal(tf.reduce_sum(Y_ph,axis=1), 1)
    Y_ph_masked = tf.boolean_mask(Y_ph,boolmask)
    Y_masked = tf.boolean_mask(Y,boolmask)

    # Compute loss. A small epsilon keeps log() finite when the softmax
    # saturates to an exact 0, which would otherwise turn the loss into inf/NaN.
    log_eps = 1e-10
    cross_entropy = -tf.reduce_sum(Y_ph_masked * tf.log(Y_masked + log_eps), axis=1)
    cross_entropy = tf.reduce_mean(cross_entropy)

    # L2 regularization over all trainable variables
    reg_scale = 0.0001
    regularize = tf.contrib.layers.l2_regularizer(reg_scale)
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    reg_term = sum([regularize(param) for param in params])
    cross_entropy += reg_term

# Define the training op
with tf.variable_scope('trainOP'):

    # Apply Adam optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

    # Clip gradients element-wise to [-1, 1]; variables that receive no
    # gradient (None) are skipped because clip_by_value cannot handle them.
    gvs = optimizer.compute_gradients(cross_entropy)
    capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
                  for grad, var in gvs if grad is not None]

    # The batch-norm layers register their moving mean/variance updates in
    # tf.GraphKeys.UPDATE_OPS. They only run if the train op depends on them;
    # without this, inference (is_training=False) would normalise with the
    # initial statistics forever.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.apply_gradients(capped_gvs)

# Define the accuracy op
with tf.variable_scope('performance'):

    # Mask padding (same rule as in the loss)
    boolmask = tf.equal(tf.reduce_sum(Y_ph,axis=1), 1)
    Y_ph_masked = tf.boolean_mask(Y_ph,boolmask)
    Y_masked = tf.boolean_mask(Y,boolmask)

    # Compute accuracy: fraction of unpadded positions whose most probable
    # class equals the labelled class
    correct_prediction = tf.equal(tf.argmax(Y_masked, axis=1), tf.argmax(Y_ph_masked, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [None]:
## Start the session
# Cap this process at 60% of the GPU memory.
gpu_opts = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts))

if load_model:
    try:
        # Restore from the same path the "Save model" cell writes to
        # (the previous "/save/model.ckpt" never matched it).
        tf.train.Saver().restore(sess, "/tmp/model.ckpt")
        print("Using saved model")
    except Exception as error:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still propagate; report the cause
        # instead of hiding it.
        sess.run(tf.global_variables_initializer())
        print('Model not found, new parameters initialized')
        print('Restore failed with: ' + str(error))
else:
    sess.run(tf.global_variables_initializer())

In [None]:
# Define parameters
max_epochs = 20000
dropout_prob_train = 0.5
dropout_prob_val = 1.0

# Number of sequences drawn per step
batchsize_train = 1
batchsize_valid = 1

# Per-step histories of loss and accuracy
train_cost, val_cost, train_acc, val_acc = [], [], [], []

try:
    for e in range(max_epochs):

        # Draw random sequence indices (without replacement)
        idx_train = np.random.choice(range(len(X_train)), batchsize_train, replace=False)
        idx_valid = np.random.choice(range(len(X_valid)), batchsize_valid, replace=False)

        # Input batches and the unpadded lengths of their sequences
        X_train_batch = X_train[idx_train, :]
        X_valid_batch = X_valid[idx_valid, :]
        X_train_lengths_batch = [X_train_lengths[k] for k in idx_train]
        X_valid_lengths_batch = [X_valid_lengths[k] for k in idx_valid]

        # Labels are stored flattened: sequence k occupies the maxseqlen rows
        # starting at row maxseqlen*k
        idx_train_rollout = [maxseqlen * k for k in idx_train]
        idx_valid_rollout = [maxseqlen * k for k in idx_valid]
        Y_train_batch = np.vstack(
            [Y_train[start:start + maxseqlen, :] for start in idx_train_rollout])
        Y_valid_batch = np.vstack(
            [Y_valid[start:start + maxseqlen, :] for start in idx_valid_rollout])

        # 1) Run the train op and record training loss / accuracy
        feed_dict_train = {
            X_ph: X_train_batch,
            X_len_ph: X_train_lengths_batch,
            Y_ph: Y_train_batch,
            phase_ph: 1,
            keep_prob_ph: dropout_prob_train,
        }
        fetches_train = [train_op, cross_entropy, accuracy]
        res = sess.run(fetches=fetches_train, feed_dict=feed_dict_train)
        train_cost.append(res[1])
        train_acc.append(res[2])

        # 2) Evaluate on the validation batch (no dropout, inference phase)
        feed_dict_valid = {
            X_ph: X_valid_batch,
            X_len_ph: X_valid_lengths_batch,
            Y_ph: Y_valid_batch,
            phase_ph: 0,
            keep_prob_ph: dropout_prob_val,
        }
        fetches_valid = [cross_entropy, accuracy]
        res = sess.run(fetches=fetches_valid, feed_dict=feed_dict_valid)
        val_cost.append(res[0])
        val_acc.append(res[1])

        # 3) Print training summaries every 100 steps
        if e % 100 == 0:
            summary = (e, train_cost[-1], val_cost[-1], val_acc[-1])
            print("Epoch %i, Train Cost: %0.3f\tVal Cost: %0.3f\t Val acc: %0.3f" % summary)

except KeyboardInterrupt:
    # Allow stopping training by hand while keeping the histories so far
    print('KeyboardInterrupt')

print('Done')

In [None]:
# Define plot size
fig = plt.figure(figsize=(12,6))
epoch = np.arange(len(train_cost))

# 1) Left panel: train and validation loss as a function of epochs
ax_loss = fig.add_subplot(121)
ax_loss.set_title('Loss')
ax_loss.plot(epoch, train_cost, 'r', label='Train Loss')
ax_loss.plot(epoch, val_cost, 'b', label='Val Loss')
ax_loss.legend(loc=2)
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
plt.tight_layout()

# 2) Right panel: train and validation accuracy as a function of epochs
ax_acc = fig.add_subplot(122)
ax_acc.set_title('Accuracy')
ax_acc.plot(epoch, train_acc, 'r', label='Train Accuracy')
ax_acc.plot(epoch, val_acc, 'b', label='Val Accuracy')
ax_acc.legend(loc=4)
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
plt.tight_layout()
plt.show()

In [None]:
# Save model
# Write the current session's variables to a checkpoint and report its path.
saver = tf.train.Saver()
save_path = saver.save(sess, "/tmp/model.ckpt")
print("Model saved in file: %s" % save_path)

In [None]:
# Create confusion matrix from validation data results
# Run the whole validation set through the network in inference mode.
feed_dict_valid = {X_ph: X_valid, X_len_ph: X_valid_lengths, Y_ph: Y_valid, phase_ph: 0, keep_prob_ph: dropout_prob_val}
fetches_preds = [Y]
preds = sess.run(fetches=fetches_preds, feed_dict=feed_dict_valid)
preds = np.vstack(preds)
tmp = np.argmax(preds,axis=1).reshape(-1)
preds = np.eye(3)[tmp]  # one-hot row of the most probable class per position

# Mask padding: padded positions have an all-zero label row
boolmask = np.equal(np.sum(Y_valid,axis=1), 1)
Y_valid_masked = Y_valid[boolmask,:]
preds_masked = preds[boolmask,:]

# Compute metrics
preds_masked_dense = np.argmax(preds_masked, axis=1)
Y_valid_masked_dense = np.argmax(Y_valid_masked, axis=1)

# confusion_matrix(y_true, y_pred): rows are TRUE classes, columns are
# PREDICTED classes. Passing labels explicitly keeps the matrix 3x3 even if
# a class never occurs, so the indexing below cannot go out of range.
confusionmat = confusion_matrix(Y_valid_masked_dense,preds_masked_dense,labels=[0,1,2])
print(confusionmat)
print("The total validation accuracy is: ",np.trace(confusionmat)/np.sum(confusionmat))

# Class order follows ssdict: index 0 = '_', 1 = 'e', 2 = 'h'.
# precision = TP / everything predicted as the class  (column sum)
# recall    = TP / everything truly of the class      (row sum)
# (the previous version had the two sums swapped)
for k, class_name in enumerate(["random coil", "beta sheet", "alpha helix"]):
    true_positives = confusionmat[k,k]
    print("The " + class_name + " precision is: ",true_positives/np.sum(confusionmat[:,k]))
    print("The " + class_name + " recall is: ",true_positives/np.sum(confusionmat[k,:]))