## Loading preprosed data

In [1]:
PATH = "../../weights/cgan_{}/version1".format("v1")

In [2]:
BATCH_SIZE=256
LAMBDA = 10
NUM_EPOCH = 3000
DATA_TYPE = "mini_sample"

In [3]:
from common.bio.blast import *
import pandas as pd
import numpy as np
import tensorflow as tf
tf.__version__

'1.8.0'

In [4]:
def load_data(path):
    loaded = np.load(path)
    return np.asarray([ np.asarray(element) for element in loaded])

In [5]:
path = "../../data/cgan/{}/".format(DATA_TYPE)
train_seq = load_data(path+"train_seq.npy")
val_seq = load_data(path+"val_seq.npy")
train_substrate_1 = load_data(path+"train_smiles_substrate_1.npy")
train_substrate_2 = load_data(path+"train_smiles_substrate_2.npy")
train_product_1 = load_data(path+"train_smiles_product_1.npy")
train_product_2 = load_data(path+"train_smiles_product_2.npy")
val_substrate_1 = load_data(path+"/val_smiles_substrate_1.npy")
val_substrate_2 = load_data(path+"val_smiles_substrate_2.npy")
val_product_1 = load_data(path+"val_smiles_product_1.npy")
val_product_2 = load_data(path+"val_smiles_product_2.npy")

In [6]:
NUM_OF_ACIDS = 21
EMBEDDING_SIZE = 32
NUM_SMILES_CHARACTERS = 44

In [7]:
SEQUENCE_LENGTH=train_seq.shape[1]
SEQUENCE_LENGTH

256

In [8]:
SMILES_LENGTH=train_substrate_1.shape[1]
SMILES_LENGTH

100

In [9]:
STEPS_PER_EPOCH = int(train_seq.shape[0]/BATCH_SIZE)+1
STEPS_PER_EPOCH

85

# Model

## Discriminator

In [10]:
NUM_OF_LAYERS=6

In [11]:
def discriminator_layer(x, level, filters):
    conv = tf.layers.conv2d(
            inputs=x,
            filters=filters,
            kernel_size=[3,EMBEDDING_SIZE],
            strides=(2,1),
            padding="same",
            activation=tf.nn.leaky_relu,
            name = "dconv{}".format(level))
    bn = tf.layers.batch_normalization(conv, name = "dbn{}".format(level))
    print(bn.shape)
    return bn 

In [12]:
def discriminator(x, s1, s2, p1, p2, is_training):

    with tf.variable_scope('discriminator', reuse=tf.AUTO_REUSE) as scope:
        print('discriminator')
        layer = x
        for layer_id in range(NUM_OF_LAYERS):            
            layer = discriminator_layer(layer, layer_id, 2*(2**(layer_id+1)))
        flat = tf.layers.flatten(layer, name="dflat")
        print(flat.shape)
        
        z = tf.concat([s1, s2, p1, p2], axis = 1)        
        final = tf.concat([flat, z], axis = 1)
        
        output = tf.layers.dense(inputs=final,
                                 activation=None,
                                 units=1,
                                 name="doutput")
        print(output.shape)
        output = tf.reshape(output, [-1])
        print(output.shape)
        return output

# Generator

In [13]:
def generator_layer(x, level, filters):
        bn = tf.layers.batch_normalization(x, name = "gbn{}".format(level))
        up = tf.keras.layers.UpSampling2D(size=(2, 1))(bn)
        print(up.shape)
        return tf.layers.conv2d(inputs=up, 
                                 filters=filters,
                                 kernel_size=[3,EMBEDDING_SIZE],
                                 padding="same",
                                 activation=tf.nn.relu,
                                 name = "conv{}".format(level))

In [14]:
import math
def generator(s1, s2, p1, p2, input_batch=None, is_training=True):
    with tf.variable_scope('generator') as scope:
        print('generator')
        if input_batch is None:
            input_batch = tf.cast(tf.random_normal([BATCH_SIZE, 128]), tf.float32)
        dim = math.floor(SEQUENCE_LENGTH/(2**NUM_OF_LAYERS))  
        z = tf.concat([s1, s2, p1, p2, input_batch], axis = 1)
        print(z.shape)
        dense1 = tf.layers.dense(inputs=z,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                                 bias_initializer=tf.zeros_initializer (),
                                 units=dim*EMBEDDING_SIZE*128,
                                 activation=tf.nn.relu,
                                 name="dense1")
        layer = tf.reshape(dense1, shape=[-1, dim, EMBEDDING_SIZE, 128], name='reshape1')
        for layer_id in range(NUM_OF_LAYERS):
            layer = generator_layer(layer, layer_id, 128/(2**(layer_id+1)))
        final_conv = tf.layers.conv2d(inputs=layer, 
                         filters=1,
                         kernel_size=[3,EMBEDDING_SIZE],
                         padding="same",
                         activation=tf.nn.sigmoid,
                         name = "final_conv")    
        print(final_conv.shape)
        return final_conv

## Encoder

In [15]:
def encoder(smiles_input):
    with tf.variable_scope('auto_encoder', reuse=tf.AUTO_REUSE):
        flat = tf.layers.flatten(smiles_input, name="dflat")
        encoded = tf.layers.dense(inputs=flat,
                                 activation=None,
                                 units=10,
                                 name="encoded")
        return encoded

## Decoder

In [16]:
def decoder(encoded):
    with tf.variable_scope('auto_encoder', reuse=tf.AUTO_REUSE):   
        decoded = tf.layers.dense(inputs=encoded,
                                 activation=None,
                                 units=SMILES_LENGTH*NUM_SMILES_CHARACTERS,
                                 name="decoded")
        decoded = tf.reshape(decoded, shape=[-1, SMILES_LENGTH, NUM_SMILES_CHARACTERS], name='decoded_smiles')
        return decoded

## Auto Encoder loss

In [17]:
def get_auto_encoder_loss(encoded, real):
    with tf.variable_scope("auto_encoder", reuse=tf.AUTO_REUSE):
        decoded = decoder(encoded)
        loss_op = tf.losses.sparse_softmax_cross_entropy(real, decoded)
        acc, acc_op  = tf.metrics.accuracy(labels=real, predictions=tf.argmax(decoded,2))
        correct_prediction = tf.equal(tf.argmax(decoded,2, output_type=tf.int32), real)
        acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        tf.summary.scalar("loss_op", loss_op)
        return loss_op, acc
     

## Graph

In [18]:
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    with tf.variable_scope('input'):
        real_sequences = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='real_sequence')
        substrate1 = tf.placeholder(tf.int32, [None, SMILES_LENGTH], name='substrate1')
        substrate2 = tf.placeholder(tf.int32, [None, SMILES_LENGTH], name='substrate2')
        product1 = tf.placeholder(tf.int32, [None, SMILES_LENGTH], name='product1')
        product2 = tf.placeholder(tf.int32, [None, SMILES_LENGTH], name='product2')
        is_training = tf.placeholder(tf.bool, name='is_train')

    dataset = tf.data.Dataset.from_tensor_slices((real_sequences, substrate1, substrate2, product1, product2))
    dataset = dataset.shuffle(buffer_size=10000, reshuffle_each_iteration=True)
    dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE)).repeat(NUM_EPOCH)
    iterator = dataset.make_initializable_iterator()

#     weights_initializer = tf.constant_initializer(embedding_weights_array)
    embedding_weights = tf.get_variable(
                                name='embedding_weights', 
                                shape=(NUM_OF_ACIDS, EMBEDDING_SIZE), 
                                initializer=tf.zeros_initializer,
                                trainable=False)

    batch_sequences, batch_substrate1, batch_substrate2, batch_product1, batch_product2 = iterator.get_next()

    embedded_sequences = tf.nn.embedding_lookup(embedding_weights, batch_sequences)
    embedded_sequences = tf.reshape(embedded_sequences, shape=[-1, SEQUENCE_LENGTH, EMBEDDING_SIZE, 1], name='embedded_sequences')
    
    batch_substrate1_encoded = encoder(tf.one_hot(batch_substrate1, NUM_SMILES_CHARACTERS))
    batch_substrate2_encoded = encoder(tf.one_hot(batch_substrate2, NUM_SMILES_CHARACTERS))
    batch_product1_encoded = encoder(tf.one_hot(batch_product1, NUM_SMILES_CHARACTERS))
    batch_product2_encoded = encoder(tf.one_hot(batch_product2, NUM_SMILES_CHARACTERS))

In [19]:
with graph.as_default():
    fake = generator(batch_substrate1_encoded, batch_substrate2_encoded, batch_product2_encoded, batch_product2_encoded,
                     is_training=is_training)
    logits_real = discriminator(embedded_sequences, batch_substrate1_encoded, batch_substrate2_encoded, batch_product2_encoded,
                                batch_product2_encoded, is_training)
    logits_fake = discriminator(fake, batch_substrate1_encoded, batch_substrate2_encoded, batch_product2_encoded,
                                batch_product2_encoded, is_training)
    d_loss = tf.reduce_mean(logits_fake) - tf.reduce_mean(logits_real) # This optimizes the discriminator.
    g_loss = -tf.reduce_mean(logits_fake)  # This optimizes the generator.

    # # wgan-gp gradient panelty 
    with tf.name_scope("Gradient_penalty"):
        eps = tf.random_uniform([BATCH_SIZE,1, 1, 1], minval=0.0,maxval=1.0)
        interpolates = embedded_sequences + eps*(fake - embedded_sequences)

        gradients = tf.gradients(discriminator(interpolates, batch_substrate1_encoded, batch_substrate2_encoded, batch_product2_encoded, 
                                               batch_product2_encoded, is_training), [interpolates])[0]
        slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), reduction_indices=[1]))
        gradient_penalty = tf.reduce_mean(tf.square(slopes - 1.))
        d_loss += 10 * gradient_penalty
        tf.summary.scalar("d_loss", d_loss)
        tf.summary.scalar("g_loss", g_loss)
    
    with tf.name_scope("auto_encoder_loss"):
        auto_loss1, acc1 = get_auto_encoder_loss(batch_substrate1_encoded, batch_substrate1)
        auto_loss2, acc2 = get_auto_encoder_loss(batch_substrate2_encoded, batch_substrate2)
        auto_loss3, acc3 = get_auto_encoder_loss(batch_product1_encoded, batch_product1)
        auto_loss4, acc4 = get_auto_encoder_loss(batch_product2_encoded, batch_product2)
        
        a_loss = auto_loss1 + auto_loss2 + auto_loss3 + auto_loss4
        acc = (acc1 + acc2 + acc3 + acc4)/4
    
        
    D_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,'discriminator')
    G_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'generator')
    A_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'auto_encoder')

    trainer_d = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.5, beta2=0.999).minimize(d_loss, var_list=D_vars)
    trainer_g = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.5, beta2=0.999).minimize(g_loss, var_list=G_vars)
    trainer_a = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.5, beta2=0.999).minimize(a_loss, var_list=A_vars)
    
    
    summ = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=3)

generator
(256, 168)
(256, 8, 32, 128)
(256, 16, 32, 64)
(256, 32, 32, 32)
(256, 64, 32, 16)
(256, 128, 32, 8)
(256, 256, 32, 4)
(256, 256, 32, 1)
discriminator
(256, 128, 32, 4)
(256, 64, 32, 8)
(256, 32, 32, 16)
(256, 16, 32, 32)
(256, 8, 32, 64)
(256, 4, 32, 128)
(256, 16384)
(256, 1)
(256,)
discriminator
(256, 128, 32, 4)
(256, 64, 32, 8)
(256, 32, 32, 16)
(256, 16, 32, 32)
(256, 8, 32, 64)
(256, 4, 32, 128)
(256, 16384)
(256, 1)
(256,)
discriminator
(256, 128, 32, 4)
(256, 64, 32, 8)
(256, 32, 32, 16)
(256, 16, 32, 32)
(256, 8, 32, 64)
(256, 4, 32, 128)
(256, 16384)
(256, 1)
(256,)


In [20]:
with graph.as_default():
    print("All parameters:", np.sum([np.product([xi.value for xi in x.get_shape()]) for x in tf.global_variables()]))
    print("Trainable parameters:", np.sum([np.product([xi.value for xi in x.get_shape()]) for x in tf.trainable_variables()]))
    [ print("{}{}".format(x.name, x.shape)) for x in tf.trainable_variables() if "LayerNorm" not in x.name]

All parameters: 14928384.0
Trainable parameters: 4975566
auto_encoder/encoded/kernel:0(4400, 10)
auto_encoder/encoded/bias:0(10,)
generator/dense1/kernel:0(168, 16384)
generator/dense1/bias:0(16384,)
generator/gbn0/gamma:0(128,)
generator/gbn0/beta:0(128,)
generator/conv0/kernel:0(3, 32, 128, 64)
generator/conv0/bias:0(64,)
generator/gbn1/gamma:0(64,)
generator/gbn1/beta:0(64,)
generator/conv1/kernel:0(3, 32, 64, 32)
generator/conv1/bias:0(32,)
generator/gbn2/gamma:0(32,)
generator/gbn2/beta:0(32,)
generator/conv2/kernel:0(3, 32, 32, 16)
generator/conv2/bias:0(16,)
generator/gbn3/gamma:0(16,)
generator/gbn3/beta:0(16,)
generator/conv3/kernel:0(3, 32, 16, 8)
generator/conv3/bias:0(8,)
generator/gbn4/gamma:0(8,)
generator/gbn4/beta:0(8,)
generator/conv4/kernel:0(3, 32, 8, 4)
generator/conv4/bias:0(4,)
generator/gbn5/gamma:0(4,)
generator/gbn5/beta:0(4,)
generator/conv5/kernel:0(3, 32, 4, 2)
generator/conv5/bias:0(2,)
generator/final_conv/kernel:0(3, 32, 2, 1)
generator/final_conv/bias:0(

## Helpers for training model

## Review generated examples

In [21]:
 def save_weights(saver, sess, path):
    save_path = saver.save(sess, path)
    print("Model saved in path: %s" % save_path)

In [22]:
def mean(l):
    if len(l) == 0:
        return 0
    else:
        return sum(l) / float(len(l))

In [23]:
def print_summary(steps, dLosses, gLosses):    
    if steps % int(STEPS_PER_EPOCH) == 0:
        print('steps:{} \td_loss:{:.4f} \tg_loss:{:.4f}'.format(steps, mean(dLosses), mean(gLosses)))
        dLosses, gLosses = [], [] 
    return dLosses, gLosses

In [24]:
def reverse_embedding_lookup(acid_embeddings, embedded_sequence):
    acid_embeddings_expanded = tf.tile(tf.expand_dims(acid_embeddings, axis = 0), [BATCH_SIZE, 1,1])
    emb_distances = tf.matmul(
        tf.nn.l2_normalize(acid_embeddings_expanded, axis=1),
        tf.nn.l2_normalize(embedded_sequence, axis=1),
        transpose_b=True)
    return tf.argmax(emb_distances, axis=1)

In [25]:
from common.bio.constants import ID_TO_AMINO_ACID
def display_sequence():
    sequences = reverse_embedding_lookup(embedding_weights, tf.squeeze(fake))
    generated_sequences, logits = sess.run([sequences, logits_fake], feed_dict={is_training: False})
    #indexToLetter = pretrained_emb["words"].to_dict()
    best_sequence = "".join([ ID_TO_AMINO_ACID[acid_index] for acid_index in generated_sequences[np.argmax(logits)]]) 
    worst_sequence = "".join([ ID_TO_AMINO_ACID[acid_index] for acid_index in generated_sequences[np.argmin(logits)]]) 
    print("{} | Discriminator value {}".format(best_sequence, logits[np.argmax(logits)]))
    print("{} | Discriminator value {}".format(worst_sequence, logits[np.argmin(logits)]))

In [26]:
import datetime
def save_model(saver, sess):
    # Epoch ended
    if steps % (STEPS_PER_EPOCH*10) == 0:
        display_sequence()
        print("Epoch {}. Fineshed at {}".format((steps/STEPS_PER_EPOCH), str(datetime.datetime.now()).split('.')[0]))
        save_weights(saver, sess, PATH)

## Running model

In [45]:
sess = tf.Session(graph=graph)

tb_writer = tf.summary.FileWriter("../../logs/wgan/", graph)
saver_restore = tf.train.Saver({"embedding/acid_embeddings": embedding_weights})
saver_restore.restore(sess, "../../logs/tcn_sequence/v1")
embeddings = sess.run(embedding_weights)
print("embeddings : %s" % embeddings[0])

INFO:tensorflow:Restoring parameters from ../../logs/tcn_sequence/v1
embeddings : [-0.19861157 -0.28935704 -0.12733226 -0.35233593  0.3301557   0.13117889
  0.16400765 -0.12967461  0.2070625  -0.1015301   0.10101962 -0.14296602
  0.3794662  -0.11086404 -0.20387682  0.24278641  0.2963467   0.24624394
  0.31158954 -0.25161746 -0.3626929   0.09720133 -0.04957089 -0.11402101
  0.27455717  0.13546558 -0.04326187 -0.16840957  0.11347007 -0.08396413
 -0.22988892 -0.3998772 ]


# CAUTION: Training the model

In [30]:
sess.run(init)
steps, gen_iterations = 0, 0

In [65]:
steps, gen_iterations

(3359, 75)

In [None]:
print ("Start training with batch size: {}, epoch num: {}".format(BATCH_SIZE, NUM_EPOCH))
dLosses, gLosses = [], [] 
sess.run(iterator.initializer, feed_dict={real_sequences: train_seq, substrate1: train_substrate_1,
                                                  substrate2: train_substrate_2, product1: train_product_1,
                                                  product2: train_product_2})
while True:
    try:
        d_iters = (100 if gen_iterations < 25 or gen_iterations % 500 == 0 else 5)
        for k in range(d_iters): # Discriminator
            _, _, dLoss = sess.run([trainer_d, trainer_a, d_loss], feed_dict={is_training: True})
            steps = steps + 1
            dLosses.append(dLoss)
            dLosses, gLosses = print_summary(steps, dLosses, gLosses)
            save_model(saver, sess)

        # Generator
        _, _, gLoss = sess.run([trainer_g, trainer_a, g_loss], feed_dict={is_training: True})
        gLosses.append(gLoss)
        steps = steps + 1
        gen_iterations = gen_iterations + 1
        dLosses, gLosses = print_summary(steps, dLosses, gLosses)
        save_model(saver, sess)
    except tf.errors.OutOfRangeError:
        print ("Training is finished")
        break;            

Start training with batch size: 256, epoch num: 3000
steps:85 	d_loss:0.0238 	g_loss:0.0000
steps:170 	d_loss:0.0293 	g_loss:-0.0089
steps:255 	d_loss:0.0266 	g_loss:-0.2557
steps:340 	d_loss:0.0239 	g_loss:-0.2582
steps:425 	d_loss:0.0224 	g_loss:-0.3047
steps:510 	d_loss:0.0191 	g_loss:-0.4632
steps:595 	d_loss:0.0184 	g_loss:0.0000
steps:680 	d_loss:0.0172 	g_loss:-0.3244
steps:765 	d_loss:0.0141 	g_loss:-0.4375
steps:850 	d_loss:0.0133 	g_loss:-0.5639
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 | Discriminator value 2.3118205070495605
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

steps:6800 	d_loss:0.0007 	g_loss:-1.2480
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 | Discriminator value 3.2444262504577637
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 | Discriminator value -0.9714657068252563
Epoch 80.0. Fineshed at 2018-07-26 15:30:24
Model saved in path: ../../weights/cgan_v1/version1
steps:6885 	d_loss:0.0076 	g_loss:-1.2480
steps:6970 	d_loss:0.0005 	g_loss:-1.1502
steps:7055 	d_loss:0.0011 	g_loss:-1.0945
steps:7140 	d_loss:0.0029 	g_loss:-1.0399
steps:7225 	d_loss:0.0049 	g_loss:-1.1422
steps:7310 	d_loss:0.0015 	g_loss:-1.0628
steps:7395 	d

Model saved in path: ../../weights/cgan_v1/version1
steps:12835 	d_loss:0.0049 	g_loss:-0.4759
steps:12920 	d_loss:0.0010 	g_loss:-0.4834
steps:13005 	d_loss:0.0000 	g_loss:-0.2351
steps:13090 	d_loss:-0.0001 	g_loss:-0.2024
steps:13175 	d_loss:-0.0002 	g_loss:-0.2058
steps:13260 	d_loss:0.0001 	g_loss:-0.2835
steps:13345 	d_loss:0.0001 	g_loss:-0.2605
steps:13430 	d_loss:0.0067 	g_loss:-0.5056
steps:13515 	d_loss:0.0012 	g_loss:-0.2738
steps:13600 	d_loss:0.0030 	g_loss:-0.3038
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 | Discriminator value 2.49950909614563
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

### Checks

In [None]:
embeddings = sess.run(embedding_weights)
print("embeddings : %s" % embeddings[0])

In [None]:
[ print("{}{}".format(x.name, x.shape)) for x in tf.trainable_variables()]

In [26]:
[ print("{}{}".format(x.name, x.shape)) for x in tf.global_variables()] 

[]

In [25]:
graph.get_collection("trainable_variables")

[<tf.Variable 'auto_encoder/encoded/kernel:0' shape=(12240, 20) dtype=float32_ref>,
 <tf.Variable 'auto_encoder/encoded/bias:0' shape=(20,) dtype=float32_ref>,
 <tf.Variable 'generator/dense1/kernel:0' shape=(208, 1228800) dtype=float32_ref>,
 <tf.Variable 'generator/dense1/bias:0' shape=(1228800,) dtype=float32_ref>,
 <tf.Variable 'generator/gbn1/gamma:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'generator/gbn1/beta:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'generator/conv2/kernel:0' shape=(3, 32, 256, 128) dtype=float32_ref>,
 <tf.Variable 'generator/conv2/bias:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'generator/gbn2/gamma:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'generator/gbn2/beta:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'generator/conv3/kernel:0' shape=(3, 32, 128, 64) dtype=float32_ref>,
 <tf.Variable 'generator/conv3/bias:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'generator/gbn3/gamma:0' shape=(64,) dtype=float32_ref>,
 <tf.Variab

In [43]:
generated= sess.run([fake], feed_dict={is_training: True})

## Validation of discriminator

In [41]:
with graph.as_default():
    val_real = discriminator(embedded_real_sequences, is_training=False)
    val_fake = discriminator(fake, is_training=False)
    val_loss = tf.reduce_mean(val_real-val_fake)
    real_predictions = tf.rint(val_real)
    fake_predictions = tf.rint(val_fake)
    correct_real_predictions = tf.equal(real_predictions, tf.zeros([BATCH_SIZE], dtype=tf.float32))
    correct_fake_predictions = tf.equal(fake_predictions, tf.ones([BATCH_SIZE], dtype=tf.float32))
    casted_real = tf.cast(correct_real_predictions, tf.float32)
    casted_fake = tf.cast(correct_fake_predictions, tf.float32)
    accuracy = (tf.reduce_mean(casted_real) + tf.reduce_mean(casted_fake))/2

discriminator
(16, 60, 32, 64)
(16, 30, 32, 128)
(16, 15, 32, 256)
(16, 1)
(16,)
discriminator
(16, 60, 32, 64)
(16, 30, 32, 128)
(16, 15, 32, 256)
(16, 1)
(16,)


In [None]:
#Validate discriminator by giving from validate data set and randomly generated
print ('validating discriminator...')
sess.run(iterator.initializer, 
         feed_dict={real_sequences: val_data, random_sequences: get_random_sequence(val_data.shape[0])})
losses = []
accuracies = []
while True:
    try:
        v_loss, v_accuracy = sess.run([val_loss, accuracy], feed_dict={is_training: False})
        losses.append(v_loss)
        accuracies.append(v_accuracy)
    except tf.errors.OutOfRangeError:
        print ('Validation g_loss:{:.4f} ,accuracy :{:.4f}'.format(mean(losses), mean(accuracies)))
        break

In [28]:
def restore_weights(saver, sess, path):
    saver.restore(sess, path)
    print("Model restored.")

In [31]:
restore_weights(saver, sess, PATH)

INFO:tensorflow:Restoring parameters from ../../weights/cgan_v1/version1
Model restored.


## Review generated examples

In [115]:
def discriminator_score(session, sequence):
    with graph.as_default():
        test_seq = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='real_sequence')
        print (test_seq.shape)
        embedded_test_seq = tf.nn.embedding_lookup(embedding_weights, test_seq)
        embedded_test_seq = tf.reshape(embedded_test_seq, shape=[-1, SEQUENCE_LENGTH, EMBEDDING_SIZE, 1])
        print (embedded_test_seq.shape)
        test_score = discriminator(embedded_test_seq, False)
        
    score = session.run(test_score, feed_dict={test_seq:sequence})
    return score    

In [78]:
def repeating_amino_acids_in_row(input_sequence):
    repeating2_count = 0
    repeating3_count = 0
    seq = input_sequence.replace("0", "")
    for index in range(len(seq)):
        if index == 0:
            continue
        if seq[index-1] == seq[index]:
            repeating2_count +=1
        if index == len(seq)-1:
            continue   
        if seq[index-1] == seq[index] and seq[index+1] == seq[index]:
            repeating3_count +=1
    return repeating2_count, repeating3_count

In [116]:
def print_seq_stats(input_sequence, score):
    sequence = "".join([ indexToLetter[acid_index] for acid_index in input_sequence ])
    print(sequence)
    repeating2_count, repeating3_count = repeating_amino_acids_in_row(sequence)
    print("Score: {} | Length: {} | Different characters: {} | Repeating 2s: {} | Repeating 3s: {}".format(
        score, len(sequence.replace("0", "")), (len(set(sequence))-1), repeating2_count, repeating3_count))

In [87]:
sequences = reverse_embedding_lookup(embedding_weights, tf.squeeze(fake))
print (sequences.shape)

(16, 120)


In [88]:
print ('Generating sequences...')
generated_sequences = sess.run([sequences], feed_dict={is_training: False})
generated_sequences[0]

Generating sequences...


array([[11, 16,  6, ...,  0,  0,  0],
       [11, 16, 15, ...,  0,  0,  0],
       [11, 17,  9, ...,  0,  0,  0],
       ...,
       [11, 15,  7, ...,  0,  0,  0],
       [11, 15,  7, ...,  0,  0,  0],
       [11,  1, 14, ...,  0,  0,  0]])

In [119]:
scores = discriminator_score(sess, generated_sequences[0])
for index, s in enumerate(generated_sequences[0]):
    print_seq_stats(s, scores[index])

(?, 120)
(?, 120, 32, 1)
discriminator
(?, 60, 32, 64)
(?, 30, 32, 128)
(?, 15, 32, 256)
(?, 1)
(?,)
MSGDDLDERIETYYVRVRGVGYVYYTRKKVQWWYKLRNLGDGSVEVDAPGEEQQVEKMVDWMRRGPSKANVSQVEERQVKLEYDYFRIEYGG00Y0Y00000000000000000000000
Score: 285.4346008300781 | Length: 94 | Different characters: 18 | Repeating 2s: 11 | Repeating 3s: 0
MSRVCYRRYVYGVVQGVGYRYYTQRQAQRLGVTGWVRNCDDGSVEAVYEGDSERVEEPPYWQRRGPRRWRRGPGSVEETSTRYEARRGYSRYERY0Q00000000000000000000000
Score: 277.64739990234375 | Length: 96 | Different characters: 16 | Repeating 2s: 11 | Repeating 3s: 0
MTKKRRYSWVRGRRRQVGYYQYGYGRRYYRVREYAQWQPWVEYYQQGDYEAVERYYLDKLEKCQGGPPRARVEEVEWWEPRRGGEYYEYR000000000000000000000000000000
Score: 254.7123260498047 | Length: 90 | Different characters: 16 | Repeating 2s: 17 | Repeating 3s: 1
MSKVRRQYYVRGRVQGVGYRAWTTYQAQQLGLTGWVRNLDDGSVEVYYYGEPERVEAEYVTEWLRGPPTRRPRVDVVDWEWWTEEEQPGGFERRF0Q00000000000000000000000
Score: 289.6255187988281 | Length: 96 | Different characters: 17 | Repeating 2s: 15 | Repeating 3s: 2
MKTLKY

In [117]:
scores = discriminator_score(sess, train_data)
for index, example in enumerate(train_data):
    print_seq_stats(example, scores[index])

(?, 120)
(?, 120, 32, 1)
discriminator
(?, 60, 32, 64)
(?, 30, 32, 128)
(?, 15, 32, 256)
(?, 1)
(?,)
MPGPTVVRFTARVVGRVQGVGFRDYVRTRGRRLGLVGTATNMPDGAVVVIAEGGAPACQNLARLLVTGHTPGWTDRVEVVWQRAQGDLADFRRK00000000000000000000000000
Score: 363.3921203613281 | Length: 94 | Different characters: 19 | Repeating 2s: 9 | Repeating 3s: 1
METQKILVSGQVQGVGFRWSATRLAKQLTLTGTVRNLANGQVEIIATGESATLQQFCQQLKHGLSPWINVMTLTTHSIPTHQFADFRIII000000000000000000000000000000
Score: 329.80377197265625 | Length: 90 | Different characters: 19 | Repeating 2s: 6 | Repeating 3s: 1
MKRVHVIVEGRVQGVGFRYFVQHEALKRQLTGWVKNNDDGTVEMEVQGNESALQLFLDTIEAGTMFAKVARMHIEPRDVRSDEKQFRIMYGSGF00000000000000000000000000
Score: 350.7937927246094 | Length: 94 | Different characters: 19 | Repeating 2s: 2 | Repeating 3s: 0
MVTDAQQARLTAWVHGRVQGVGFRWWTRARALELGLAGSATNLPGNRVEVVAEGPRESCERLLEALRSPDTPGDVDHVAEQWSEPKGGLTGFVER0000000000000000000000000
Score: 365.7681579589844 | Length: 95 | Different characters: 18 | Repeating 2s: 5 | Repeating 3s: 0
MTQVCIAAYV

In [49]:
repeating_amino_acids_in_row("MARDTAILRVTGFVQGVGFRYTTKHVAYKYDISGTVKNLDDGSVEIHAIAEEENLNKFIDAIKKGPSPGCRIEHVYIYKGAPVEDRKTFDIVY")

5


In [50]:
repeating_amino_acids_in_row("MKAARARLYYYGVVQGVYYRYYRYYYTYYQRYGYVGWYWVRRDREVVVVVQQQWQQCDELLKRWLRYTQPRARDTDVDWYEYQGDDQYYYVE")

21
