# Global Parameters

## Input

In [1]:
import tensorflow as tf
import numpy as np
print(tf.__version__)

1.9.0


In [2]:
train_data = np.load("../data/classification/train_features_all.npy")
val_data = np.load("../data/classification/val_features_all.npy")

KeyboardInterrupt: 

In [None]:
train_data = np.asarray([ np.asarray(element) for element in train_data[:,0]])
val_data = np.asarray([ np.asarray(element) for element in val_data[:,0]])
train_data = train_data[:,:128]
val_data = val_data[:,:128]
train_data.shape, val_data.shape

In [None]:
train_data = np.load("../data/cgan/full/train_features_smiles.npy")
val_data = np.load("../data/cgan/full/val_features_smiles.npy")

In [None]:
train_data.shape, val_data.shape

# Model

In [None]:
# Training Parameters
learning_rate = 0.001#0.5
BATCH_SIZE = 128
batches_per_epoch = int(train_data.shape[0]/BATCH_SIZE)+1
num_epochs = 3
print("Number of epochs: {} with batches per epoch: {}".format(num_epochs, batches_per_epoch))

# Network Parameters
SEQUENCE_LENGTH=train_data.shape[1]
ONE_HOT_LENGTH = train_data.max()+1
# embedding_size = 128

DROPOUT_RATE = 0.4

In [None]:
SEQUENCE_LENGTH

In [None]:
def encoder(x, layers, is_training):    
    with tf.variable_scope('encoder'):
        encoded = tf.layers.flatten(x, name="flat")
        i = 0
        for layer in layers:
            encoded = tf.layers.dense(inputs=encoded,
                                      activation=tf.nn.selu,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                      units=layer,
                                      name="dense{}".format(i))
            i = i+1
        encoded = tf.layers.batch_normalization(encoded, name = "batch_normalization_encoder")
    print(encoded.shape)
    return encoded

In [None]:
def decoder(x, layers, is_training):
    with tf.variable_scope('decoder'):
        i = 0
        decoded = x
        for layer in reversed(layers):            
            decoded = tf.layers.dense(inputs=decoded,
                                      activation=tf.nn.selu,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                      units=layer,
                                      name="dense{}".format(i))
            i = i + 1
        decoded = tf.layers.batch_normalization(decoded, name = "batch_normalization_decoder")
        decoded = tf.layers.dense(inputs=decoded,
                                  activation=None,
                                  units=SEQUENCE_LENGTH*ONE_HOT_LENGTH,
                                  name="final_dense")
        decoded = tf.reshape(decoded, shape=[-1, SEQUENCE_LENGTH, ONE_HOT_LENGTH], name='decoded')
        print(decoded.shape)
    return decoded

In [12]:
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    tf.set_random_seed(10)
    
    with tf.variable_scope('input'):
        sequences = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='sequences')
        is_training = tf.placeholder(tf.bool, name='is_train')

        dataset = (tf.data.Dataset.from_tensor_slices(sequences)
                   .shuffle(buffer_size=10000, reshuffle_each_iteration=True)
                   .apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE)))    
        iterator = dataset.make_initializable_iterator()
    
    batch_sequences = iterator.get_next()    
        
    with tf.variable_scope('one_hot'):
        one_hot_seq = tf.one_hot(batch_sequences, ONE_HOT_LENGTH)
    
    encoded = encoder(one_hot_seq, [1024], is_training)
    encoded = tf.layers.dropout(encoded, DROPOUT_RATE, name="dropout",training=is_training)
    decoded = decoder(encoded, [], is_training)

    
    # Define loss and optimizer
    with tf.name_scope("loss_op"):
        loss_op = tf.losses.sparse_softmax_cross_entropy(batch_sequences, decoded)
        acc, acc_op  = tf.metrics.accuracy(labels=batch_sequences, predictions=tf.argmax(decoded,2))
        correct_prediction = tf.equal(tf.argmax(decoded,2, output_type=tf.int32), batch_sequences)
        acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        tf.summary.scalar("loss_op", loss_op)
    
    with tf.name_scope("optimizer"):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss_op)
    
    summ = tf.summary.merge_all()
    
     # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()
    #init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    saver = tf.train.Saver()
    print("All parameters:", np.sum([np.product([xi.value for xi in x.get_shape()]) for x in tf.global_variables()]))
    print("Trainable parameters:", np.sum([np.product([xi.value for xi in x.get_shape()]) for x in tf.trainable_variables()]))
    [ print("{}{}".format(x.name, x.shape)) for x in tf.trainable_variables() if "LayerNorm" not in x.name]

(128, 1024)
(128, 128, 21)
All parameters: 16542594.0
Trainable parameters: 5512832
encoder/dense0/kernel:0(2688, 1024)
encoder/dense0/bias:0(1024,)
encoder/batch_normalization_encoder/gamma:0(1024,)
encoder/batch_normalization_encoder/beta:0(1024,)
decoder/batch_normalization_decoder/gamma:0(1024,)
decoder/batch_normalization_decoder/beta:0(1024,)
decoder/final_dense/kernel:0(1024, 2688)
decoder/final_dense/bias:0(2688,)


In [13]:
def print_progress(step, loss, acc):
    print("Step {}, Loss={:.4f}, Accuracy={:.3f}".format(str(step), loss, acc))

In [14]:
def validation(epoch):    
    # Calculate batch loss and accuracy
    losses = []
    accuracies = []
    sess.run(iterator.initializer, feed_dict={sequences: val_data})
    while True:
        try:
            # Run optimization
            loss, a = sess.run([loss_op, acc], feed_dict={is_training: False})
            losses.append(loss)
            accuracies.append(a)
        except tf.errors.OutOfRangeError:
            break
    loss_avg = sum(losses)/len(losses)
    acc_avg = sum(accuracies)/len(accuracies)
    print_progress("VALIDATION for epoch {}".format(epoch), loss_avg, acc_avg)
    return acc_avg

## Start training

In [15]:
from pathlib import Path
import random 
from datetime import datetime
path = "../logs/auto_encoder/"
log_dir = "{}{}".format(path, datetime.now().strftime("%Y%m%d_%H%M"))
Path(log_dir).mkdir(exist_ok=True, parents=True)
tb_writer = tf.summary.FileWriter(log_dir, graph)
config = tf.ConfigProto()
config.gpu_options.allow_growth = False
best_val_acc = 0.8
sess = tf.Session(graph=graph)
# Run the initializer
epoch, step = 0, 0
sess.run([init, iterator.initializer], feed_dict={sequences: train_data})
while epoch < num_epochs:
    try: 
        sess.run(train_op, feed_dict={is_training: True})
        step = step +1 
        if step % int(batches_per_epoch/4) == 0 or step == 1:
            loss, a = sess.run([loss_op, acc], feed_dict={is_training: False})
            print_progress(step, loss, a)
            [s] = sess.run([summ], feed_dict={is_training: False})
            #tb_writer.add_summary(np.mean(loss), step)
    except tf.errors.OutOfRangeError:
        epoch = epoch + 1
        val_acc = validation(epoch)           

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            save_path = saver.save(sess, "{}{}".format(path, "v_128"))
            print("Model saved in path: %s" % save_path)
        sess.run(iterator.initializer, feed_dict={sequences: train_data})
print("Optimization Finished!")
    

Step 1, Loss=3.0380, Accuracy=0.064
Step 361, Loss=0.2981, Accuracy=0.945
Step 722, Loss=0.1358, Accuracy=0.973
Step 1083, Loss=0.0985, Accuracy=0.981
Step VALIDATION for epoch 1, Loss=0.0934, Accuracy=0.982
Model saved in path: ../logs/auto_encoder/v_128
Step 1444, Loss=0.0664, Accuracy=0.990
Step 1805, Loss=0.0657, Accuracy=0.988
Step 2166, Loss=0.0746, Accuracy=0.987
Step 2527, Loss=0.0633, Accuracy=0.989
Step VALIDATION for epoch 2, Loss=0.0752, Accuracy=0.985
Model saved in path: ../logs/auto_encoder/v_128
Step 2888, Loss=0.0487, Accuracy=0.992
Step 3249, Loss=0.0586, Accuracy=0.990
Step 3610, Loss=0.0500, Accuracy=0.992
Step 3971, Loss=0.0503, Accuracy=0.992
Step VALIDATION for epoch 3, Loss=0.0683, Accuracy=0.987
Model saved in path: ../logs/auto_encoder/v_128
Optimization Finished!


# Validation

In [16]:
val_seq = val_data[:1]
val_seq = np.repeat(val_seq, [BATCH_SIZE], axis=0)

In [17]:
val_seq.shape

(128, 128)

In [18]:
sess.run([iterator.initializer], feed_dict={sequences: val_seq})

[None]

In [19]:
decoded_to_index = tf.argmax(decoded, axis=2) 

In [20]:
decoded_smiles = sess.run(decoded_to_index, feed_dict={is_training: False})

In [21]:
print(decoded_smiles[0])

[11  1  8  4 14  7  8  3  6 10 10 15  7 18  6 10 15 18  3  7  6 10 17 13
 18 16  1 16 10 18 10 10  1  6  1  6  1 10 16 18  6 17  5  1 10 15 10 18
 15 10  5  1  3 18 20  8 10 13  6 12 16 18 16 20 18  6  1 12  9  9  3 10
 17 15  1 16 15  1 18 18 17  6  1 17  3  6  8  6 10  4  5  1 10 14 10  1
 15  9  6  5 12  8 18 10 18 16 15 16 13  4  9 10  6 16 18  1  1  4  8  4
  1  1 17 13  6 18 15 17]


In [22]:
val_data[0]

array([11,  1,  8,  4, 14,  7,  8,  3,  6, 10, 10, 15,  7, 18,  6, 10, 15,
       18,  3,  7,  6, 10, 17, 13, 18, 16,  1, 16, 10, 18, 10, 10,  1,  6,
        8,  6,  1, 10, 16, 18,  6, 17,  5,  1, 10, 15, 10, 18, 15, 10,  5,
        1,  3, 18, 20,  8, 10, 13,  6, 12, 16, 18, 16,  9, 20,  6,  1, 12,
        9,  9,  3, 10, 17, 15,  1, 16, 19,  1, 18, 18, 17,  6,  1, 17,  3,
        6,  8,  6, 15,  4,  5,  1, 10, 14, 10,  1, 15,  9,  6,  5, 12,  8,
       18, 10, 18, 16, 15, 16, 13,  4,  9, 10,  6, 16, 18,  1,  1,  4,  8,
        4,  1,  1, 17, 13,  6, 18, 15, 17])

In [23]:
from common.bio.constants import *
print("".join([ ID_TO_AMINO_ACID[acid_index] for acid_index in val_data[0]]))
print("".join([ ID_TO_AMINO_ACID[acid_index] for acid_index in decoded_smiles[0]]))

MAIEQHIDGLLRHVGLRVDHGLTPVSASLVLLAGIGALSVGTFALRLVRLFADVYILPGNSVSKYGANKKDLTRASWAVVTGATDGIGREFALQLARKGFNIVLVSRSPEKLGSVAAEIEAATPGVRT
MAIEQHIDGLLRHVGLRVDHGLTPVSASLVLLAGAGALSVGTFALRLVRLFADVYILPGNSVSYVGANKKDLTRASRAVVTGATDGIGLEFALQLARKGFNIVLVSRSPEKLGSVAAEIEAATPGVRT
