# Global Parameters

## Input

In [1]:
import tensorflow as tf
import numpy as np
print(tf.__version__)

1.9.0


In [2]:
train_data = np.load("../data/cgan/full/train_features_smiles.npy")
val_data = np.load("../data/cgan/full/val_features_smiles.npy")

In [3]:
train_data = np.load("../data/classification/train_features_sample.npy")
val_data = np.load("../data/classification/val_features_sample.npy")

In [4]:
train_data = np.asarray([ np.asarray(element) for element in train_data[:,0]])
val_data = np.asarray([ np.asarray(element) for element in val_data[:,0]])
train_data.shape, val_data.shape

((42268, 250), (10568, 250))

# Model

In [11]:
# Training Parameters
learning_rate = 0.001
batch_size = 128
batches_per_epoch = int(train_data.shape[0]/batch_size)+1
num_epochs = 3
print("Number of epochs: {} with batches per epoch: {}".format(num_epochs, batches_per_epoch))

# Network Parameters
sequence_length=train_data.shape[1]
num_smiles_characters = 21

Number of epochs: 3 with batches per epoch: 331


In [12]:
sequence_length

250

In [25]:
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    tf.set_random_seed(10)
    
    with tf.variable_scope('input'):
        sequences = tf.placeholder(tf.int32, [None, sequence_length], name='sequences')
        is_training = tf.placeholder(tf.bool, name='is_train')

        dataset = (tf.data.Dataset.from_tensor_slices(sequences)
                   .shuffle(buffer_size=10000, reshuffle_each_iteration=True)
                   .apply(tf.contrib.data.batch_and_drop_remainder(batch_size)))
    
        iterator = dataset.make_initializable_iterator()
    
    batch_sequences = iterator.get_next()
    
    with tf.variable_scope('embedding'):
        embedded_sequences = tf.one_hot(batch_sequences, num_smiles_characters)

    # Define weights
    with tf.variable_scope('tcn-encoder'):
        flat = tf.layers.flatten(embedded_sequences, name="dflat")
        encoded = tf.layers.dense(inputs=flat,
                                 activation=None,
                                 units=1000,
                                 name="encoded")
   
    print(encoded.shape)

    with tf.variable_scope('decoder'):   
        decoded = tf.layers.dense(inputs=encoded,
                                 activation=None,
                                 units=sequence_length*num_smiles_characters,
                                 name="decoded")
        decoded = tf.reshape(decoded, 
                                             shape=[-1, sequence_length, num_smiles_characters], 
                                             name='embedded_decoded_sequences')
    
    # Define loss and optimizer
    with tf.name_scope("loss_op"):
#         loss_op = tf.losses.absolute_difference(batch_sequences, decoded)
        loss_op = tf.losses.sparse_softmax_cross_entropy(batch_sequences, decoded)
        acc, acc_op  = tf.metrics.accuracy(labels=batch_sequences, 
                                  predictions=tf.argmax(decoded,2))
        correct_prediction = tf.equal(tf.argmax(decoded,2, output_type=tf.int32), batch_sequences)
        acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        tf.summary.scalar("loss_op", loss_op)
    
    with tf.name_scope("optimizer"):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss_op)
    
    summ = tf.summary.merge_all()
    
     # Initialize the variables (i.e. assign their default value)
    #init = tf.global_variables_initializer()
    init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    saver = tf.train.Saver()
    print("All parameters:", np.sum([np.product([xi.value for xi in x.get_shape()]) for x in tf.global_variables()]))
    print("Trainable parameters:", np.sum([np.product([xi.value for xi in x.get_shape()]) for x in tf.trainable_variables()]))
    [ print("{}{}".format(x.name, x.shape)) for x in tf.trainable_variables() if "LayerNorm" not in x.name]

(128, 1000)
All parameters: 31518752.0
Trainable parameters: 10506250
tcn-encoder/encoded/kernel:0(5250, 1000)
tcn-encoder/encoded/bias:0(1000,)
decoder/decoded/kernel:0(1000, 5250)
decoder/decoded/bias:0(5250,)


In [26]:
def print_progress(step, loss, acc):
    print("Step {}, Loss={:.4f}, Accuracy={:.3f}".format(str(step), loss, acc))

In [27]:
def validation(epoch):    
    # Calculate batch loss and accuracy
    losses = []
    accuracies = []
    sess.run(iterator.initializer, feed_dict={sequences: val_data})
    while True:
        try:
            # Run optimization
            loss, a = sess.run([loss_op, acc], feed_dict={is_training: False})
            losses.append(loss)
            accuracies.append(a)
        except tf.errors.OutOfRangeError:
            break
    loss_avg = sum(losses)/len(losses)
    acc_avg = sum(accuracies)/len(accuracies)
    print_progress("VALIDATION for epoch {}".format(epoch), loss_avg, acc_avg)
    return acc_avg

## Start training

In [28]:
from pathlib import Path
import random 
from datetime import datetime
path = "../logs/auto_encoder/"
log_dir = "{}{}".format(path, datetime.now().strftime("%Y%m%d_%H%M"))
Path(log_dir).mkdir(exist_ok=True, parents=True)
tb_writer = tf.summary.FileWriter(log_dir, graph)
config = tf.ConfigProto()
config.gpu_options.allow_growth = False
best_val_acc = 0.8
sess = tf.Session(graph=graph)
# Run the initializer
epoch, step = 0, 0
sess.run([init, iterator.initializer], feed_dict={sequences: train_data})
while epoch < num_epochs:
    try: 
        sess.run(train_op, feed_dict={is_training: True})
        step = step +1 
        if step % int(batches_per_epoch/4) == 0 or step == 1:
            loss, a = sess.run([loss_op, acc], feed_dict={is_training: True})
            print_progress(step, loss, a)
            [s] = sess.run([summ], feed_dict={is_training: True})
            #tb_writer.add_summary(np.mean(loss), step)
    except tf.errors.OutOfRangeError:
        path
        epoch = epoch + 1
        val_acc = validation(epoch)           

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            save_path = saver.save(sess, "{}{}".format(path, "v2"))
            print("Model saved in path: %s" % save_path)
        sess.run(iterator.initializer, feed_dict={sequences: train_data})
print("Optimization Finished!")
    

Step 1, Loss=2.9416, Accuracy=0.201
Step 82, Loss=1.4832, Accuracy=0.609
Step 164, Loss=0.9955, Accuracy=0.749
Step 246, Loss=0.7586, Accuracy=0.809
Step VALIDATION for epoch 1, Loss=0.5568, Accuracy=0.862
Model saved in path: ../logs/auto_encoder/v2
Step 328, Loss=0.5076, Accuracy=0.877
Step 410, Loss=0.3649, Accuracy=0.912
Step 492, Loss=0.2402, Accuracy=0.943
Step 574, Loss=0.2391, Accuracy=0.943
Step VALIDATION for epoch 2, Loss=0.2865, Accuracy=0.927
Model saved in path: ../logs/auto_encoder/v2
Step 656, Loss=0.1828, Accuracy=0.959
Step 738, Loss=0.1026, Accuracy=0.981
Step 820, Loss=0.1209, Accuracy=0.975
Step 902, Loss=0.0945, Accuracy=0.982
Step VALIDATION for epoch 3, Loss=0.2203, Accuracy=0.941
Model saved in path: ../logs/auto_encoder/v2
Optimization Finished!


# Validation

In [17]:
val_seq = val_data[54:55,:]
val_seq = np.repeat(val_seq, [batch_size], axis=0)

In [18]:
val_seq.shape

(128, 250)

In [19]:
sess.run([iterator.initializer], feed_dict={sequences: val_seq})

[None]

In [20]:
decoded_to_index = tf.argmax(decoded, axis=2) 

In [21]:
decoded_smiles = sess.run(decoded_to_index, feed_dict={is_training: False})

In [22]:
from common.bio.constants import *
print("".join([ ID_TO_AMINO_ACID[acid_index] for acid_index in val_seq[0]]))
print("".join([ ID_TO_AMINO_ACID[acid_index] for acid_index in decoded_smiles[0]]))

MTSPVRILGIDPGLRRTGWGLITAQGTKLTYGDCGVVTSDGELPLALRLRELFEGIGRIVEAVRPDEVAVEETFVNKDAQATLKLGHARAMALLVPALAGLPVFEYAPNLIKKTVAGSGHAEKVQIQAMVRFLLPKAEFRVADAADALAIAITHASHRDAHALRQAHLPGGKRRSLTGQAAAGQGLAGKGFSAAAAARIEAALAKQG0000000000000000000000000000000000000000000
MTTAVRILGIDPGLRRTGWGLIGAIGTLLRYAASGTVTSDGELDLALRLRELHEGIGRVVTAYAPDEAAVEHTFVNKDAQATLKLGAARGVALLVPALAGLPVSEYAPKLVKKTVAGTGHAEKVQIHAMVRFLLPKAEFKVADAADALAIAITHASHRPAEALAKAMARGGARRGANATAAAAVVLAGKHPSASEMAAIEAAAGKA00R00000000000000000000000000000000000000000


In [23]:
print(decoded_smiles[0])

[11 17 17  1 18 15  8 10  6  8  3 13  6 10 15 15 17  6 19  6 10  8  6  1
  8  6 17 10 10 15 20  1  1 16  6 17 18 17 16  3  6  4 10  3 10  1 10 15
 10 15  4 10  7  4  6  8  6 15 18 18 17  1 20  1 13  3  4  1  1 18  4  7
 17  5 18 12  9  3  1 14  1 17 10  9 10  6  1  1 15  6 18  1 10 10 18 13
  1 10  1  6 10 13 18 16  4 20  1 13  9 10 18  9  9 17 18  1  6 17  6  7
  1  4  9 18 14  8  7  1 11 18 15  5 10 10 13  9  1  4  5  9 18  1  3  1
  1  3  1 10  1  8  1  8 17  7  1 16  7 15 13  1  4  1 10  1  9  1 11  1
 15  6  6  1 15 15  6  1 12  1 17  1  1  1  1 18 18 10  1  6  9  7 13 16
  1 16  4 11  1  1  8  4  1  1  1  6  9  1  0  0 15  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0]


In [24]:
val_seq[0]

array([11, 17, 16, 13, 18, 15,  8, 10,  6,  8,  3, 13,  6, 10, 15, 15, 17,
        6, 19,  6, 10,  8, 17,  1, 14,  6, 17,  9, 10, 17, 20,  6,  3,  2,
        6, 18, 18, 17, 16,  3,  6,  4, 10, 13, 10,  1, 10, 15, 10, 15,  4,
       10,  5,  4,  6,  8,  6, 15,  8, 18,  4,  1, 18, 15, 13,  3,  4, 18,
        1, 18,  4,  4, 17,  5, 18, 12,  9,  3,  1, 14,  1, 17, 10,  9, 10,
        6,  7,  1, 15,  1, 11,  1, 10, 10, 18, 13,  1, 10,  1,  6, 10, 13,
       18,  5,  4, 20,  1, 13, 12, 10,  8,  9,  9, 17, 18,  1,  6, 16,  6,
        7,  1,  4,  9, 18, 14,  8, 14,  1, 11, 18, 15,  5, 10, 10, 13,  9,
        1,  4,  5, 15, 18,  1,  3,  1,  1,  3,  1, 10,  1,  8,  1,  8, 17,
        7,  1, 16,  7, 15,  3,  1,  7,  1, 10, 15, 14,  1,  7, 10, 13,  6,
        6,  9, 15, 15, 16, 10, 17,  6, 14,  1,  1,  1,  6, 14,  6, 10,  1,
        6,  9,  6,  5, 16,  1,  1,  1,  1,  1, 15,  8,  4,  1,  1, 10,  1,
        9, 14,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0