## Load and process the dataset

In [1]:
import gzip, json
import numpy as np
import utils 
from sklearn.model_selection import train_test_split

# 20 standard amino acids
aa2idx = {'A':0, 'R':1, 'N':2, 'D':3, 'C':4, 'Q':5, 'E':6, 'G':7, 'H':8, 'I':9,
          'L':10, 'K':11, 'M':12, 'F':13, 'P':14, 'S':15, 'T':16, 'W':17, 'Y':18, 'V':19}

# load
dataset = utils.load_phipsi()

# 90% train, 10% test
train,test = train_test_split(dataset, test_size=0.1, random_state=42)

## Clustering

In [2]:
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss


In [3]:
NCLUST = 20

KM = KMeans(n_clusters=NCLUST, max_iter=5, random_state=42)
KM.fit(np.vstack([item['avec'] for item in train]))


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=5,
    n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

## Test tensorflow on GPU

In [4]:
import tensorflow as tf

print("Built with GPU:", tf.test.is_built_with_cuda())
print("GPU available:", tf.test.is_gpu_available())
print("GPU device:", tf.test.gpu_device_name())


Built with GPU: True
GPU available: True
GPU device: /device:GPU:0


In [5]:
# convert sequences & dihedral clusters
# to one-hot representation
for item in train:
    item['X'] = np.eye(20)[item['sequence']]
    item['Y'] = np.eye(NCLUST)[np.array(KM.predict(item['avec']), dtype=np.int8)]
    item['X'] = item['X'][np.newaxis]
    item['Y'] = item['Y'][np.newaxis]

for item in test:
    item['X'] = np.eye(20)[item['sequence']]
    item['Y'] = np.eye(NCLUST)[np.array(KM.predict(item['avec']), dtype=np.int8)]
    item['X'] = item['X'][np.newaxis]
    item['Y'] = item['Y'][np.newaxis]


In [6]:
phi_ref = np.hstack([item['phi'] for item in test])
psi_ref = np.hstack([item['psi'] for item in test])


In [12]:
import utils
from random import shuffle

lr           = 0.0001   # learning rate
l2_coef      = 0.001  # L2 penalty weight
nb_epochs    = 200
n_layers     = 15
n_filters    = 60
kernel_size  = 5


In [13]:
%%time

relu = tf.nn.relu
conv1d = tf.layers.conv1d

with tf.Graph().as_default():
    with tf.name_scope('input'):
        features = tf.placeholder(dtype=tf.int8, shape=(1, None, 20))
        labels = tf.placeholder(dtype=tf.int8, shape=(1, None, NCLUST))

    layers = []

    # first convolution-activation pair
    layers.append(relu(conv1d(tf.to_float(features), n_filters, kernel_size, padding='SAME')))

    # stack of residual layers
    for _ in range(n_layers):
        layers.append(relu(conv1d(layers[-1], n_filters, kernel_size, padding='SAME')))
        layers.append(relu(conv1d(layers[-1], n_filters, kernel_size, padding='SAME') + layers[-2]))
    
    # last layer - output
    layers.append(relu(conv1d(layers[-1], NCLUST, kernel_size, padding='SAME')))

    # loss
    out = tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf.to_float(labels), logits=layers[-1])
    loss = tf.reduce_mean(out)

    # predicted probabilities for different
    # dihedral clusters
    prob = tf.nn.softmax(layers[-1])
    
    
    vars = tf.trainable_variables()
    lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in vars if v.name not
                       in ['bias', 'gamma', 'b', 'g', 'beta']]) * l2_coef
    # optimizer
    opt = tf.train.AdamOptimizer(learning_rate=lr)

    # training op
    train_op = opt.minimize(loss+lossL2)

    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    total_parameters=np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])
    print("tot. params: " + str(total_parameters))
    print("tot. conv1d: " + str(len(layers)))
    
    with tf.Session() as sess:
        sess.run(init_op)
        
        for epoch in range(nb_epochs):
            
            train_loss = 0
            step = 0
            rmse_phi = 0
            rmse_psi = 0
            shuffle(train)
            for item in train:
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={
                                             features: item['X'],
                                             labels: item['Y'] })
                step += 1
                train_loss += loss_value
            train_loss /= step

            val_loss = 0
            step = 0
            for item in test:
                loss_value,pred  = sess.run([loss, prob],
                                      feed_dict={
                                          features: item['X'],
                                          labels: item['Y'] })
                step += 1
                val_loss += loss_value

                # RMSE
                avec = np.matmul(pred.reshape((len(item['sequence']),NCLUST)), KM.cluster_centers_)
                norm_phi = np.sqrt(np.square(avec[:,0])+np.square(avec[:,1]))
                norm_psi = np.sqrt(np.square(avec[:,2])+np.square(avec[:,3]))
                phi_pred = np.arctan2(avec[:,0] / norm_phi, avec[:,1] / norm_phi)
                psi_pred = np.arctan2(avec[:,2] / norm_psi, avec[:,3] / norm_psi)
                
                rmse_phi += utils.ang_mae(item['phi'], phi_pred)
                rmse_psi += utils.ang_mae(item['psi'], psi_pred)

            val_loss /= step
            rmse_phi /= step
            rmse_psi /= step
        
            print("epoch {:5d} | train_loss {:8.5f} | val_loss {:8.5f} | rmse(phi) {:9.5f} | rmse(psi) {:9.5f}".
                  format(epoch, train_loss, val_loss, rmse_phi*180/np.pi, rmse_psi*180/np.pi))


tot. params: 553880
tot. conv1d: 32
epoch     0 | train_loss  2.37296 | val_loss  2.29541 | rmse(phi)  27.34407 | rmse(psi)  56.65149
epoch     1 | train_loss  2.27942 | val_loss  2.28667 | rmse(phi)  25.71244 | rmse(psi)  50.42965
epoch     2 | train_loss  2.25689 | val_loss  2.24935 | rmse(phi)  26.58238 | rmse(psi)  52.19232
epoch     3 | train_loss  2.24584 | val_loss  2.24225 | rmse(phi)  26.26930 | rmse(psi)  51.24101
epoch     4 | train_loss  2.11240 | val_loss  2.05197 | rmse(phi)  25.39474 | rmse(psi)  51.05747
epoch     5 | train_loss  2.01673 | val_loss  2.00963 | rmse(phi)  24.82359 | rmse(psi)  48.45734
epoch     6 | train_loss  1.99854 | val_loss  2.00106 | rmse(phi)  24.59328 | rmse(psi)  48.22736
epoch     7 | train_loss  1.96958 | val_loss  1.96413 | rmse(phi)  24.39349 | rmse(psi)  47.84904
epoch     8 | train_loss  1.95132 | val_loss  1.96215 | rmse(phi)  24.90268 | rmse(psi)  48.73439
epoch     9 | train_loss  1.94496 | val_loss  1.96522 | rmse(phi)  25.01083 | rmse

epoch    84 | train_loss  1.86721 | val_loss  1.93511 | rmse(phi)  24.36704 | rmse(psi)  47.21692
epoch    85 | train_loss  1.86673 | val_loss  1.92828 | rmse(phi)  24.25242 | rmse(psi)  46.99362
epoch    86 | train_loss  1.86651 | val_loss  1.93463 | rmse(phi)  24.12133 | rmse(psi)  46.70302
epoch    87 | train_loss  1.86596 | val_loss  1.93812 | rmse(phi)  24.49997 | rmse(psi)  47.27162
epoch    88 | train_loss  1.86562 | val_loss  1.92962 | rmse(phi)  24.14296 | rmse(psi)  46.28833
epoch    89 | train_loss  1.86504 | val_loss  1.92854 | rmse(phi)  24.58196 | rmse(psi)  46.95330
epoch    90 | train_loss  1.86516 | val_loss  1.93348 | rmse(phi)  23.99821 | rmse(psi)  46.03128
epoch    91 | train_loss  1.86532 | val_loss  1.93103 | rmse(phi)  24.16485 | rmse(psi)  46.53446
epoch    92 | train_loss  1.86414 | val_loss  1.92780 | rmse(phi)  23.99457 | rmse(psi)  46.34929
epoch    93 | train_loss  1.86357 | val_loss  1.95206 | rmse(phi)  24.84466 | rmse(psi)  48.87040
epoch    94 | train_

epoch   168 | train_loss  1.84487 | val_loss  1.93298 | rmse(phi)  24.12432 | rmse(psi)  46.76008
epoch   169 | train_loss  1.84349 | val_loss  1.94331 | rmse(phi)  24.21465 | rmse(psi)  47.03078
epoch   170 | train_loss  1.84363 | val_loss  1.94712 | rmse(phi)  24.72603 | rmse(psi)  48.30874
epoch   171 | train_loss  1.84377 | val_loss  1.93484 | rmse(phi)  24.12907 | rmse(psi)  46.81352
epoch   172 | train_loss  1.84360 | val_loss  1.94445 | rmse(phi)  24.08187 | rmse(psi)  46.65103
epoch   173 | train_loss  1.84263 | val_loss  1.96497 | rmse(phi)  24.89380 | rmse(psi)  49.63749
epoch   174 | train_loss  1.84289 | val_loss  1.94046 | rmse(phi)  24.10796 | rmse(psi)  46.97586
epoch   175 | train_loss  1.84327 | val_loss  1.95038 | rmse(phi)  23.88133 | rmse(psi)  46.69677
epoch   176 | train_loss  1.84253 | val_loss  1.93757 | rmse(phi)  24.35848 | rmse(psi)  47.33553
epoch   177 | train_loss  1.84243 | val_loss  1.93334 | rmse(phi)  24.37053 | rmse(psi)  47.05393
epoch   178 | train_