## Load and process the dataset

In [1]:
import gzip, json
import numpy as np
import utils 
from sklearn.model_selection import train_test_split

# 20 standard amino acids
aa2idx = {'A':0, 'R':1, 'N':2, 'D':3, 'C':4, 'Q':5, 'E':6, 'G':7, 'H':8, 'I':9,
          'L':10, 'K':11, 'M':12, 'F':13, 'P':14, 'S':15, 'T':16, 'W':17, 'Y':18, 'V':19}

# load
dataset = utils.load_phipsi()

# 90% train, 10% test
train,test = train_test_split(dataset, test_size=0.1, random_state=42)

## Clustering

In [2]:
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss


In [4]:
NCLUST = 20

KM = KMeans(n_clusters=NCLUST, max_iter=5, random_state=42)
KM.fit(np.vstack([item['avec'] for item in train]))


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=5,
    n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

In [6]:
import pickle
NCLUST = 20
with open('phipsi_km20.pkl', 'rb') as f:
    KM = pickle.load(f)
KM

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=5,
    n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

## Test tensorflow on GPU

In [4]:
import tensorflow as tf

print("Built with GPU:", tf.test.is_built_with_cuda())
print("GPU available:", tf.test.is_gpu_available())
print("GPU device:", tf.test.gpu_device_name())


Built with GPU: True
GPU available: True
GPU device: /device:GPU:0


In [7]:
# convert sequences & dihedral clusters
# to one-hot representation
for item in train:
    item['X'] = np.eye(20)[item['sequence']]
    item['Y'] = np.eye(NCLUST)[np.array(KM.predict(item['avec']), dtype=np.int8)]
    item['X'] = item['X'][np.newaxis]
    item['Y'] = item['Y'][np.newaxis]

for item in test:
    item['X'] = np.eye(20)[item['sequence']]
    item['Y'] = np.eye(NCLUST)[np.array(KM.predict(item['avec']), dtype=np.int8)]
    item['X'] = item['X'][np.newaxis]
    item['Y'] = item['Y'][np.newaxis]


In [8]:
phi_ref = np.hstack([item['phi'] for item in test])
psi_ref = np.hstack([item['psi'] for item in test])


In [14]:
# entropy of the background distribution
X_pred = KM.predict(np.vstack([item['avec'] for item in test]))
a = np.sum(np.eye(NCLUST)[X_pred], axis=0)/X_pred.shape[0]
s0=-np.sum(np.log(a)*a)
print(s0)

2.6522007457633086


In [17]:
import utils
from random import shuffle

lr           = 0.0001   # learning rate
l2_coef      = 0.001  # L2 penalty weight
nb_epochs    = 10
n_layers     = 5
n_filters    = 60
kernel_size  = 5


In [18]:
%%time

relu = tf.nn.elu
conv1d = tf.layers.conv1d

with tf.Graph().as_default():
    with tf.name_scope('input'):
        features = tf.placeholder(dtype=tf.int8, shape=(1, None, 20))
        labels = tf.placeholder(dtype=tf.int8, shape=(1, None, NCLUST))

    layers = []

    # first convolution-activation pair
    layers.append(relu(conv1d(tf.to_float(features), n_filters, kernel_size, padding='SAME')))

    # stack of residual layers
    for _ in range(n_layers):
        layers.append(relu(conv1d(layers[-1], n_filters, kernel_size, padding='SAME')))
        layers.append(relu(conv1d(layers[-1], n_filters, kernel_size, padding='SAME') + layers[-2]))
    
    # last layer - output
    layers.append(conv1d(layers[-1], NCLUST, kernel_size, padding='SAME'))

    # loss
    out = tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf.to_float(labels), logits=layers[-1])
    loss = tf.reduce_mean(out)

    # predicted probabilities for different
    # dihedral clusters
    prob = tf.nn.softmax(layers[-1])
    
    
    vars = tf.trainable_variables()
    lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in vars if v.name not
                       in ['bias', 'gamma', 'b', 'g', 'beta']]) * l2_coef
    # optimizer
    opt = tf.train.AdamOptimizer(learning_rate=lr)

    # training op
    train_op = opt.minimize(loss+lossL2)

    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    total_parameters=np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])
    print("tot. params: " + str(total_parameters))
    print("tot. layers: " + str(len(layers)))
    
    with tf.Session() as sess:
        sess.run(init_op)
        
        for epoch in range(nb_epochs):
            
            train_loss = 0
            step = 0
            rmse_phi = 0
            rmse_psi = 0
            shuffle(train)
            for item in train:
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={
                                             features: item['X'],
                                             labels: item['Y'] })
                step += 1
                train_loss += loss_value
            train_loss /= step

            val_loss = 0
            step = 0
            s = 0
            for item in test:
                loss_value,pred  = sess.run([loss, prob],
                                      feed_dict={
                                          features: item['X'],
                                          labels: item['Y'] })
                step += 1
                val_loss += loss_value

                # RMSE
                avec = np.matmul(pred.reshape((len(item['sequence']),NCLUST)), KM.cluster_centers_)
                norm_phi = np.sqrt(np.square(avec[:,0])+np.square(avec[:,1]))
                norm_psi = np.sqrt(np.square(avec[:,2])+np.square(avec[:,3]))
                phi_pred = np.arctan2(avec[:,0] / norm_phi, avec[:,1] / norm_phi)
                psi_pred = np.arctan2(avec[:,2] / norm_psi, avec[:,3] / norm_psi)
                
                rmse_phi += utils.ang_mae(item['phi'], phi_pred)
                rmse_psi += utils.ang_mae(item['psi'], psi_pred)
                
                s += np.average(np.sum(-np.log(pred)*pred, axis=-1))

            val_loss /= step
            rmse_phi /= step
            rmse_psi /= step
        
            print("epoch {:5d} | train_loss {:8.5f} | val_loss {:8.5f} | mae(phi) {:9.5f} | mae(psi) {:9.5f} | s_loss {:9.5f}".
                  format(epoch, train_loss, val_loss, rmse_phi*180/np.pi, rmse_psi*180/np.pi, s0 - s/len(test)))


tot. params: 192680
tot. layers: 12
epoch     0 | train_loss  2.18220 | val_loss  2.12933 | mae(phi)  25.47346 | mae(psi)  51.59488 | s_loss   0.54863
epoch     1 | train_loss  2.10700 | val_loss  2.10240 | mae(phi)  25.85320 | mae(psi)  50.91024 | s_loss   0.49199
epoch     2 | train_loss  2.08838 | val_loss  2.08943 | mae(phi)  25.07425 | mae(psi)  49.85411 | s_loss   0.61191
epoch     3 | train_loss  2.07980 | val_loss  2.08696 | mae(phi)  25.31871 | mae(psi)  49.97214 | s_loss   0.57210
epoch     4 | train_loss  2.07235 | val_loss  2.07344 | mae(phi)  25.10412 | mae(psi)  49.35855 | s_loss   0.56064
epoch     5 | train_loss  2.06734 | val_loss  2.07283 | mae(phi)  25.10174 | mae(psi)  49.13631 | s_loss   0.58730
epoch     6 | train_loss  2.06392 | val_loss  2.07042 | mae(phi)  24.97360 | mae(psi)  49.14294 | s_loss   0.57989
epoch     7 | train_loss  2.06083 | val_loss  2.07251 | mae(phi)  24.69616 | mae(psi)  48.96504 | s_loss   0.63968
epoch     8 | train_loss  2.05772 | val_loss