## Load and process the dataset

In [1]:
import gzip, json
import numpy as np
import utils 
from sklearn.model_selection import train_test_split

# Lookup table: one-letter code of each of the 20 standard amino acids -> integer
# index (0..19). The index is simply the position of the letter in this string.
aa2idx = {letter: position for position, letter in enumerate('ARNDCQEGHILKMFPSTWYV')}

# Read the phi/psi dataset through the project helper.
dataset = utils.load_phipsi()

# Hold out 10% of the items for testing; the remaining 90% are used for training.
# The fixed random_state makes the split reproducible.
train, test = train_test_split(
    dataset,
    random_state=42,
    test_size=0.1,
)

## Clustering

In [2]:
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss


In [3]:
# Number of k-means clusters.
NCLUST = 20

# Fit k-means on the stacked 'avec' arrays of every training item.
# `fit` returns the estimator itself, so KM is the fitted model.
KM = KMeans(n_clusters=NCLUST, max_iter=5, random_state=42).fit(
    np.vstack([entry['avec'] for entry in train])
)
KM


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=5,
    n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

## Check that TensorFlow can use the GPU

In [4]:
import tensorflow as tf

# Report, in order: whether this TensorFlow build has CUDA support, whether a GPU
# is usable right now, and the name of the GPU device TensorFlow picked.
gpu_probes = (
    ("Built with GPU:", tf.test.is_built_with_cuda),
    ("GPU available:", tf.test.is_gpu_available),
    ("GPU device:", tf.test.gpu_device_name),
)
for label, probe in gpu_probes:
    print(label, probe())


Built with GPU: True
GPU available: True
GPU device: /device:GPU:0


In [5]:
# Convert sequences & dihedral clusters to one-hot representation.
#
# For every item of both splits this stores:
#   item['X'] : one-hot rows selected by item['sequence'], shape (1, len, 20)
#   item['Y'] : one-hot rows selected by the k-means cluster assigned to each
#               row of item['avec'], shape (1, len, NCLUST)
# The leading axis of size 1 is the batch dimension.
# NOTE(review): item['sequence'] is presumably an integer array with values in
# 0..19 (see aa2idx) — confirm against utils.load_phipsi.

# Identity matrices are built once; fancy indexing below returns fresh copies,
# so sharing them between items is safe.
aa_eye = np.eye(20)
cluster_eye = np.eye(NCLUST)

for split in (train, test):
    for item in split:
        # KM.predict already returns integer labels. They are used directly:
        # narrowing them to int8 would wrap to negative indices (and silently
        # select wrong rows) as soon as NCLUST exceeds 127.
        cluster_ids = KM.predict(item['avec'])
        item['X'] = aa_eye[item['sequence']][np.newaxis]
        item['Y'] = cluster_eye[cluster_ids][np.newaxis]


In [6]:
# Concatenate the reference 'phi' and 'psi' arrays of all test items,
# in test-set order, into one array per angle.
phi_ref, psi_ref = (
    np.hstack([entry[angle] for entry in test])
    for angle in ('phi', 'psi')
)


In [7]:
import utils
from random import shuffle

# --- optimisation ---
lr = 1e-3          # Adam learning rate
l2_coef = 1e-3     # weight of the L2 penalty added to the loss
nb_epochs = 10     # passes over the training set

# --- network shape ---
n_layers = 10      # hidden conv layers stacked after the first one
n_filters = 60     # filters per hidden conv layer
kernel_size = 3    # width of every 1D convolution kernel


In [8]:
%%time

relu = tf.nn.relu
conv1d = tf.layers.conv1d

with tf.Graph().as_default():
    with tf.name_scope('input'):
        features = tf.placeholder(dtype=tf.int8, shape=(1, None, 20))
        labels = tf.placeholder(dtype=tf.int8, shape=(1, None, NCLUST))

    layers = []

    # first convolution-activation pair
    layers.append(relu(conv1d(tf.to_float(features), n_filters, kernel_size, padding='SAME')))

    # stack of convolutional layers
    for _ in range(n_layers):
        layers.append(relu(conv1d(layers[-1], n_filters, kernel_size, padding='SAME')))
    
    # last layer - reshape to the number of clusters
    layers.append(relu(conv1d(layers[-1], NCLUST, kernel_size, padding='SAME')))

    # loss
    out = tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf.to_float(labels), logits=layers[-1])
    loss = tf.reduce_mean(out)

    # predicted probabilities for different
    # dihedral clusters
    prob = tf.nn.softmax(layers[-1])
    
    
    vars = tf.trainable_variables()
    lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in vars if v.name not
                       in ['bias', 'gamma', 'b', 'g', 'beta']]) * l2_coef
    # optimizer
    opt = tf.train.AdamOptimizer(learning_rate=lr)

    # training op
    train_op = opt.minimize(loss+lossL2)

    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    total_parameters=np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])
    print("tot. params: " + str(total_parameters))
    print("tot. conv1d: " + str(len(layers)))
    
    with tf.Session() as sess:
        sess.run(init_op)
        
        for epoch in range(nb_epochs):
            
            train_loss = 0
            step = 0
            rmse_phi = 0
            rmse_psi = 0
            shuffle(train)
            for item in train:
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={
                                             features: item['X'],
                                             labels: item['Y'] })
                step += 1
                train_loss += loss_value
            train_loss /= step

            val_loss = 0
            step = 0
            for item in test:
                loss_value,pred  = sess.run([loss, prob],
                                      feed_dict={
                                          features: item['X'],
                                          labels: item['Y'] })
                step += 1
                val_loss += loss_value

                # RMSE
                avec = np.matmul(pred.reshape((len(item['sequence']),NCLUST)), KM.cluster_centers_)
                norm_phi = np.sqrt(np.square(avec[:,0])+np.square(avec[:,1]))
                norm_psi = np.sqrt(np.square(avec[:,2])+np.square(avec[:,3]))
                phi_pred = np.arctan2(avec[:,0] / norm_phi, avec[:,1] / norm_phi)
                psi_pred = np.arctan2(avec[:,2] / norm_psi, avec[:,3] / norm_psi)
                
                rmse_phi += utils.ang_rmse(item['phi'], phi_pred)
                rmse_psi += utils.ang_rmse(item['psi'], psi_pred)

            val_loss /= step
            rmse_phi /= step
            rmse_psi /= step
        
            print("epoch {:5d} | train_loss {:8.5f} | val_loss {:8.5f} | rmse(phi) {:9.5f} | rmse(psi) {:9.5f}".
                  format(epoch, train_loss, val_loss, rmse_phi*180/np.pi, rmse_psi*180/np.pi))


tot. params: 115880
tot. conv1d: 12
epoch     0 | train_loss  2.38856 | val_loss  2.22178 | rmse(phi)  44.63289 | rmse(psi)  82.28884
epoch     1 | train_loss  2.16125 | val_loss  2.10340 | rmse(phi)  41.85695 | rmse(psi)  81.08831
epoch     2 | train_loss  2.10102 | val_loss  2.07969 | rmse(phi)  41.18176 | rmse(psi)  80.38404
epoch     3 | train_loss  2.08946 | val_loss  2.07757 | rmse(phi)  41.59289 | rmse(psi)  79.23122
epoch     4 | train_loss  2.08395 | val_loss  2.08160 | rmse(phi)  41.88818 | rmse(psi)  81.00105
epoch     5 | train_loss  2.07885 | val_loss  2.06695 | rmse(phi)  41.84675 | rmse(psi)  79.32459
epoch     6 | train_loss  2.07822 | val_loss  2.06803 | rmse(phi)  42.01974 | rmse(psi)  80.49949
epoch     7 | train_loss  2.07566 | val_loss  2.06913 | rmse(phi)  42.31166 | rmse(psi)  81.14040
epoch     8 | train_loss  2.07328 | val_loss  2.06654 | rmse(phi)  41.03646 | rmse(psi)  79.68261
epoch     9 | train_loss  2.07043 | val_loss  2.06704 | rmse(phi)  43.30840 | rmse