## Loading preprosed data

In [3]:
PATH = "../../weights/wgan_{}/emb/version_1".format("v1")

In [4]:
import os
os.makedirs(PATH, exist_ok=True)

In [5]:
BATCH_SIZE=128
LAMBDA = 10
NUM_EPOCH = 1000
DATA_TYPE = "mini_sample"

In [6]:
from common.bio.smiles import *
from common.bio.amino_acid import *
from common.bio.blast import *
from common.bio.constants import *
from common.model.architecture import *

import tensorflow as tf
tf.__version__

'1.8.0'

In [7]:
import numpy as np
train_data = np.load("../../data/wgan/{}/train_features.npy".format(DATA_TYPE))

In [8]:
train_data = np.pad(train_data, [(0, 0), (0,8)], mode='constant', constant_values=0)

In [9]:
train_data.shape

(271, 128)

In [10]:
NUM_OF_ACIDS = 21
EMBEDDING_SIZE = 32

In [11]:
SEQUENCE_LENGTH=train_data.shape[1]
SEQUENCE_LENGTH

128

In [12]:
STEPS_PER_EPOCH = int(train_data.shape[0]/BATCH_SIZE)+1
STEPS_PER_EPOCH

3

# Model

## Discriminator

In [13]:
NUM_OF_LAYERS=6

In [14]:
def discriminator_layer(x, level, filters):
    conv = tf.layers.conv2d(
            inputs=x,
            filters=filters,
            kernel_size=[3,EMBEDDING_SIZE],
            strides=(2,1),
            padding="same",
            activation=tf.nn.leaky_relu,
            name = "dconv{}".format(level))
    bn = tf.layers.batch_normalization(conv, name = "dbn{}".format(level))
    print(bn.shape)
    return bn 

In [15]:
def discriminator(x, is_training):

    with tf.variable_scope('discriminator', reuse=tf.AUTO_REUSE) as scope:
        print('discriminator')
        layer = x
        for layer_id in range(NUM_OF_LAYERS):            
            layer = discriminator_layer(layer, layer_id, 2*(2**(layer_id+1)))
        flat = tf.layers.flatten(layer, name="dflat")
        print(flat.shape)
        
        output = tf.layers.dense(inputs=flat,
                                 activation=None,
                                 units=1,
                                 name="doutput")
        print(output.shape)
        output = tf.reshape(output, [-1])
        print(output.shape)
        return output

# Generator

In [16]:
def generator_layer(x, level, filters):
        bn = tf.layers.batch_normalization(x, name = "gbn{}".format(level))
        up = tf.keras.layers.UpSampling2D(size=(2, 1))(bn)
        print(up.shape)
        return tf.layers.conv2d(inputs=up, 
                                 filters=filters,
                                 kernel_size=[3,EMBEDDING_SIZE],
                                 padding="same",
                                 activation=tf.nn.relu,
                                 name = "conv{}".format(level))

In [17]:
import math
NUM_FILTERS=64
def generator(input_batch=None, is_training=True):
    with tf.variable_scope('generator') as scope:
        print('generator')
        if input_batch is None:
            input_batch = tf.cast(tf.random_normal([BATCH_SIZE, 32]), tf.float32)
        dim = math.floor(SEQUENCE_LENGTH/(2**NUM_OF_LAYERS))          
        print(input_batch.shape)
        dense1 = tf.layers.dense(inputs=input_batch,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                                 bias_initializer=tf.zeros_initializer (),
                                 units=dim*EMBEDDING_SIZE*NUM_FILTERS,
                                 activation=tf.nn.relu,
                                 name="dense1")
        layer = tf.reshape(dense1, shape=[-1, dim, EMBEDDING_SIZE, NUM_FILTERS], name='reshape1')
        for layer_id in range(NUM_OF_LAYERS):
            layer = generator_layer(layer, layer_id, NUM_FILTERS/(2**(layer_id+1)))
        final_conv = tf.layers.conv2d(inputs=layer, 
                         filters=1,
                         kernel_size=[3,EMBEDDING_SIZE],
                         padding="same",
                         activation=tf.nn.sigmoid,
                         name = "final_conv")    
        print(final_conv.shape)
        return final_conv

## Graph

In [18]:
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    with tf.variable_scope('input'):
        real_sequences = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='real_sequence')
        is_training = tf.placeholder(tf.bool, name='is_train')

    dataset = tf.data.Dataset.from_tensor_slices(real_sequences)
    dataset = dataset.shuffle(buffer_size=10000, reshuffle_each_iteration=True)
    dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE)).repeat(NUM_EPOCH)
    iterator = dataset.make_initializable_iterator()

#     weights_initializer = tf.constant_initializer(embedding_weights_array)
    embedding_weights = tf.get_variable(
                                name='embedding_weights', 
                                shape=(NUM_OF_ACIDS, EMBEDDING_SIZE), 
                                initializer=tf.zeros_initializer,
                                trainable=False)

    batch_real_sequences = iterator.get_next()

    embedded_real_sequences = tf.nn.embedding_lookup(embedding_weights, batch_real_sequences)
    embedded_real_sequences = tf.reshape(embedded_real_sequences, shape=[-1, SEQUENCE_LENGTH, EMBEDDING_SIZE, 1], name='embedded_real_sequences')

In [19]:
with graph.as_default():
    fake = generator(is_training=is_training)
    logits_real = discriminator(embedded_real_sequences, is_training)
    logits_fake = discriminator(fake, is_training)
    d_loss = tf.reduce_mean(logits_fake) - tf.reduce_mean(logits_real) # This optimizes the discriminator.
    g_loss = -tf.reduce_mean(logits_fake)  # This optimizes the generator.

    # # wgan-gp gradient panelty 
    with tf.name_scope("Gradient_penalty"):
        eps = tf.random_uniform([BATCH_SIZE,1, 1, 1], minval=0.0,maxval=1.0)
        interpolates = embedded_real_sequences + eps*(fake - embedded_real_sequences)

        gradients = tf.gradients(discriminator(interpolates, is_training), [interpolates])[0]
        slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), reduction_indices=[1]))
        gradient_penalty = tf.reduce_mean(tf.square(slopes - 1.))
        d_loss += 10 * gradient_penalty
        tf.summary.scalar("d_loss", d_loss)
        tf.summary.scalar("g_loss", g_loss)

    D_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,'discriminator')
    G_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'generator')

    trainer_d = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.5, beta2=0.9).minimize(d_loss, var_list=D_vars)
    trainer_g = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.5, beta2=0.9).minimize(g_loss, var_list=G_vars)

    summ = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=3)

generator
(128, 32)
(128, 4, 32, 64)
(128, 8, 32, 32)
(128, 16, 32, 16)
(128, 32, 32, 8)
(128, 64, 32, 4)
(128, 128, 32, 2)
(128, 128, 32, 1)
discriminator
(128, 64, 32, 4)
(128, 32, 32, 8)
(128, 16, 32, 16)
(128, 8, 32, 32)
(128, 4, 32, 64)
(128, 2, 32, 128)
(128, 8192)
(128, 1)
(128,)
discriminator
(128, 64, 32, 4)
(128, 32, 32, 8)
(128, 16, 32, 16)
(128, 8, 32, 32)
(128, 4, 32, 64)
(128, 2, 32, 128)
(128, 8192)
(128, 1)
(128,)
discriminator
(128, 64, 32, 4)
(128, 32, 32, 8)
(128, 16, 32, 16)
(128, 8, 32, 32)
(128, 4, 32, 64)
(128, 2, 32, 128)
(128, 8192)
(128, 1)
(128,)


In [20]:
with graph.as_default():
    print_model_summary()

All parameters: 4365067.0 (out of them 1454545 are trainable)

generator/dense1/kernel:0(32, 4096)
generator/dense1/bias:0(4096,)
generator/gbn0/gamma:0(64,)
generator/gbn0/beta:0(64,)
generator/conv0/kernel:0(3, 32, 64, 32)
generator/conv0/bias:0(32,)
generator/gbn1/gamma:0(32,)
generator/gbn1/beta:0(32,)
generator/conv1/kernel:0(3, 32, 32, 16)
generator/conv1/bias:0(16,)
generator/gbn2/gamma:0(16,)
generator/gbn2/beta:0(16,)
generator/conv2/kernel:0(3, 32, 16, 8)
generator/conv2/bias:0(8,)
generator/gbn3/gamma:0(8,)
generator/gbn3/beta:0(8,)
generator/conv3/kernel:0(3, 32, 8, 4)
generator/conv3/bias:0(4,)
generator/gbn4/gamma:0(4,)
generator/gbn4/beta:0(4,)
generator/conv4/kernel:0(3, 32, 4, 2)
generator/conv4/bias:0(2,)
generator/gbn5/gamma:0(2,)
generator/gbn5/beta:0(2,)
generator/conv5/kernel:0(3, 32, 2, 1)
generator/conv5/bias:0(1,)
generator/final_conv/kernel:0(3, 32, 1, 1)
generator/final_conv/bias:0(1,)
discriminator/dconv0/kernel:0(3, 32, 1, 4)
discriminator/dconv0/bias:0(4,)

## Helpers for training model

## Review generated examples

In [21]:
 def save_weights(saver, sess, path):
    save_path = saver.save(sess, path)
    print("Model saved in path: %s" % save_path)

In [22]:
def mean(l):
    if len(l) == 0:
        return 0
    else:
        return sum(l) / float(len(l))

In [23]:
def print_summary(steps, dLosses, gLosses):    
    if steps % int(STEPS_PER_EPOCH) == 0:
        print('steps:{} \td_loss:{:.4f} \tg_loss:{:.4f}'.format(steps, mean(dLosses), mean(gLosses)))
        dLosses, gLosses = [], [] 
    return dLosses, gLosses

In [24]:
def reverse_embedding_lookup(acid_embeddings, embedded_sequence):
    acid_embeddings_expanded = tf.tile(tf.expand_dims(acid_embeddings, axis = 0), [BATCH_SIZE, 1,1])
    emb_distances = tf.matmul(
        tf.nn.l2_normalize(acid_embeddings_expanded, axis=1),
        tf.nn.l2_normalize(embedded_sequence, axis=1),
        transpose_b=True)
    return tf.argmax(emb_distances, axis=1)

In [25]:
def display_sequence():
    sequences = reverse_embedding_lookup(embedding_weights, tf.squeeze(fake))
    generated_sequences, logits = sess.run([sequences, logits_fake], feed_dict={is_training: False})
    #indexToLetter = pretrained_emb["words"].to_dict()
    best_sequence = "".join([ ID_TO_AMINO_ACID[acid_index] for acid_index in generated_sequences[np.argmax(logits)]]) 
    worst_sequence = "".join([ ID_TO_AMINO_ACID[acid_index] for acid_index in generated_sequences[np.argmin(logits)]]) 
    print("{} | Discriminator value {}".format(best_sequence, logits[np.argmax(logits)]))
    print("{} | Discriminator value {}".format(worst_sequence, logits[np.argmin(logits)]))

In [30]:
import datetime
def save_model(saver, sess):
    # Epoch ended
    if steps % (STEPS_PER_EPOCH*50) == 0:
        display_sequence()
        print("Epoch {}. Fineshed at {}".format((steps/STEPS_PER_EPOCH), str(datetime.datetime.now()).split('.')[0]))
        save_weights(saver, sess, PATH)

## Running model

In [31]:
sess = tf.Session(graph=graph)

tb_writer = tf.summary.FileWriter("../../logs/wgan/", graph)
sess.run(init)
steps, gen_iterations = 0, 0

saver_restore = tf.train.Saver({"embedding/acid_embeddings": embedding_weights})
saver_restore.restore(sess, "../../logs/tcn_sequence/v1")
embeddings = sess.run(embedding_weights)
print("embeddings : %s" % embeddings[0])

INFO:tensorflow:Restoring parameters from ../../logs/tcn_sequence/v1
embeddings : [-0.19861157 -0.28935704 -0.12733226 -0.35233593  0.3301557   0.13117889
  0.16400765 -0.12967461  0.2070625  -0.1015301   0.10101962 -0.14296602
  0.3794662  -0.11086404 -0.20387682  0.24278641  0.2963467   0.24624394
  0.31158954 -0.25161746 -0.3626929   0.09720133 -0.04957089 -0.11402101
  0.27455717  0.13546558 -0.04326187 -0.16840957  0.11347007 -0.08396413
 -0.22988892 -0.3998772 ]


# CAUTION: Training the model

In [32]:
sess.run(iterator.initializer, feed_dict={real_sequences: train_data})
steps, gen_iterations

(0, 0)

In [33]:
print ("Start training with batch size: {}, epoch num: {}".format(BATCH_SIZE, NUM_EPOCH))
dLosses, gLosses = [], [] 
while True:
    try:
        d_iters = (100 if gen_iterations < 5 or gen_iterations % 500 == 0 else 5)
        for k in range(d_iters): # Discriminator
            _, dLoss = sess.run([trainer_d, d_loss], feed_dict={is_training: True})
            steps = steps + 1
            dLosses.append(dLoss)
            dLosses, gLosses = print_summary(steps, dLosses, gLosses)
            save_model(saver, sess)

        # Generator
        _, gLoss = sess.run([trainer_g, g_loss], feed_dict={is_training: True})
        gLosses.append(gLoss)
        steps = steps + 1
        gen_iterations = gen_iterations + 1
        dLosses, gLosses = print_summary(steps, dLosses, gLosses)
        save_model(saver, sess)
    except tf.errors.OutOfRangeError:
        print ("Training is finished")
        break;            

Start training with batch size: 128, epoch num: 1000
steps:3 	d_loss:9.7433 	g_loss:0.0000
steps:6 	d_loss:8.5584 	g_loss:0.0000
steps:9 	d_loss:3.1576 	g_loss:0.0000
steps:12 	d_loss:-19.6234 	g_loss:0.0000
steps:15 	d_loss:-98.6545 	g_loss:0.0000
steps:18 	d_loss:-296.6925 	g_loss:0.0000
steps:21 	d_loss:-542.5508 	g_loss:0.0000
steps:24 	d_loss:-629.3907 	g_loss:0.0000
steps:27 	d_loss:-691.1167 	g_loss:0.0000
steps:30 	d_loss:-732.4423 	g_loss:0.0000
steps:33 	d_loss:-761.8487 	g_loss:0.0000
steps:36 	d_loss:-789.6188 	g_loss:0.0000
steps:39 	d_loss:-812.9695 	g_loss:0.0000
steps:42 	d_loss:-838.8619 	g_loss:0.0000
steps:45 	d_loss:-848.6036 	g_loss:0.0000
steps:48 	d_loss:-861.7220 	g_loss:0.0000
steps:51 	d_loss:-873.3843 	g_loss:0.0000
steps:54 	d_loss:-891.9099 	g_loss:0.0000
steps:57 	d_loss:-891.3483 	g_loss:0.0000
steps:60 	d_loss:-903.9471 	g_loss:0.0000
steps:63 	d_loss:-915.6780 	g_loss:0.0000
steps:66 	d_loss:-913.8939 	g_loss:0.0000
steps:69 	d_loss:-926.0339 	g_loss:0.

steps:483 	d_loss:-992.0103 	g_loss:0.0000
steps:486 	d_loss:-987.3222 	g_loss:0.0000
steps:489 	d_loss:-980.9485 	g_loss:0.0000
steps:492 	d_loss:-985.5775 	g_loss:0.0000
steps:495 	d_loss:-987.4600 	g_loss:0.0000
steps:498 	d_loss:-992.2675 	g_loss:0.0000
steps:501 	d_loss:-985.1757 	g_loss:0.0000
steps:504 	d_loss:-986.8427 	g_loss:0.0000
steps:507 	d_loss:-956.1534 	g_loss:2047.9078
steps:510 	d_loss:-965.1577 	g_loss:0.0000
steps:513 	d_loss:-914.8094 	g_loss:1887.2635
steps:516 	d_loss:-915.8240 	g_loss:0.0000
steps:519 	d_loss:-862.0939 	g_loss:1688.0757
steps:522 	d_loss:-864.8016 	g_loss:0.0000
steps:525 	d_loss:-800.1861 	g_loss:1823.9565
steps:528 	d_loss:-807.6128 	g_loss:0.0000
steps:531 	d_loss:-737.6177 	g_loss:1688.0037
steps:534 	d_loss:-747.4608 	g_loss:0.0000
steps:537 	d_loss:-663.5185 	g_loss:1597.7861
steps:540 	d_loss:-681.6896 	g_loss:0.0000
steps:543 	d_loss:-619.3421 	g_loss:1422.4181
steps:546 	d_loss:-630.1660 	g_loss:0.0000
steps:549 	d_loss:-567.2601 	g_lo

steps:954 	d_loss:-139.4302 	g_loss:0.0000
steps:957 	d_loss:-139.5993 	g_loss:400.1296
steps:960 	d_loss:-139.4010 	g_loss:0.0000
steps:963 	d_loss:-139.6817 	g_loss:364.4210
steps:966 	d_loss:-142.4886 	g_loss:0.0000
steps:969 	d_loss:-145.1646 	g_loss:413.8026
steps:972 	d_loss:-143.0698 	g_loss:0.0000
steps:975 	d_loss:-142.0401 	g_loss:440.8730
steps:978 	d_loss:-142.3295 	g_loss:0.0000
steps:981 	d_loss:-140.9719 	g_loss:400.0787
steps:984 	d_loss:-143.1862 	g_loss:0.0000
steps:987 	d_loss:-144.3272 	g_loss:365.0956
steps:990 	d_loss:-144.7321 	g_loss:0.0000
steps:993 	d_loss:-143.9670 	g_loss:395.6391
steps:996 	d_loss:-146.6955 	g_loss:0.0000
steps:999 	d_loss:-146.2092 	g_loss:430.6588
steps:1002 	d_loss:-148.6627 	g_loss:0.0000
steps:1005 	d_loss:-147.1833 	g_loss:427.9850
steps:1008 	d_loss:-147.8728 	g_loss:0.0000
steps:1011 	d_loss:-146.5936 	g_loss:406.2425
steps:1014 	d_loss:-146.5149 	g_loss:0.0000
steps:1017 	d_loss:-147.6856 	g_loss:388.5936
steps:1020 	d_loss:-149.54

steps:1413 	d_loss:-168.4270 	g_loss:507.9502
steps:1416 	d_loss:-172.0216 	g_loss:0.0000
steps:1419 	d_loss:-170.0155 	g_loss:457.5886
steps:1422 	d_loss:-171.9737 	g_loss:0.0000
steps:1425 	d_loss:-173.1387 	g_loss:465.3860
steps:1428 	d_loss:-174.8952 	g_loss:0.0000
steps:1431 	d_loss:-175.6232 	g_loss:434.6534
steps:1434 	d_loss:-175.0420 	g_loss:0.0000
steps:1437 	d_loss:-176.5589 	g_loss:416.0647
steps:1440 	d_loss:-178.3911 	g_loss:0.0000
steps:1443 	d_loss:-177.9861 	g_loss:434.1572
steps:1446 	d_loss:-178.0418 	g_loss:0.0000
steps:1449 	d_loss:-177.3020 	g_loss:450.1169
steps:1452 	d_loss:-177.6227 	g_loss:0.0000
steps:1455 	d_loss:-177.0100 	g_loss:457.3556
steps:1458 	d_loss:-177.2684 	g_loss:0.0000
steps:1461 	d_loss:-173.7533 	g_loss:467.1082
steps:1464 	d_loss:-177.5878 	g_loss:0.0000
steps:1467 	d_loss:-175.1717 	g_loss:477.9495
steps:1470 	d_loss:-177.9889 	g_loss:0.0000
steps:1473 	d_loss:-182.7786 	g_loss:507.4893
steps:1476 	d_loss:-186.7900 	g_loss:0.0000
steps:1479

steps:1872 	d_loss:-176.5216 	g_loss:0.0000
steps:1875 	d_loss:-154.2145 	g_loss:478.7565
steps:1878 	d_loss:-170.7480 	g_loss:0.0000
steps:1881 	d_loss:-186.4995 	g_loss:558.8789
steps:1884 	d_loss:-199.8354 	g_loss:0.0000
steps:1887 	d_loss:-184.1506 	g_loss:656.5632
steps:1890 	d_loss:-191.9550 	g_loss:0.0000
steps:1893 	d_loss:-157.0772 	g_loss:603.4531
steps:1896 	d_loss:-168.5213 	g_loss:0.0000
steps:1899 	d_loss:-170.2429 	g_loss:494.3704
steps:1902 	d_loss:-174.8043 	g_loss:0.0000
steps:1905 	d_loss:-176.1062 	g_loss:478.6683
steps:1908 	d_loss:-178.3269 	g_loss:0.0000
steps:1911 	d_loss:-176.5230 	g_loss:483.4550
steps:1914 	d_loss:-178.0952 	g_loss:0.0000
steps:1917 	d_loss:-170.4415 	g_loss:491.2235
steps:1920 	d_loss:-173.5022 	g_loss:0.0000
steps:1923 	d_loss:-168.8286 	g_loss:495.9787
steps:1926 	d_loss:-176.9504 	g_loss:0.0000
steps:1929 	d_loss:-184.3558 	g_loss:559.1547
steps:1932 	d_loss:-186.2997 	g_loss:0.0000
steps:1935 	d_loss:-190.5090 	g_loss:633.8928
steps:1938

In [34]:
embeddings = sess.run(embedding_weights)
print("embeddings : %s" % embeddings[0])

embeddings : [-0.19861157 -0.28935704 -0.12733226 -0.35233593  0.3301557   0.13117889
  0.16400765 -0.12967461  0.2070625  -0.1015301   0.10101962 -0.14296602
  0.3794662  -0.11086404 -0.20387682  0.24278641  0.2963467   0.24624394
  0.31158954 -0.25161746 -0.3626929   0.09720133 -0.04957089 -0.11402101
  0.27455717  0.13546558 -0.04326187 -0.16840957  0.11347007 -0.08396413
 -0.22988892 -0.3998772 ]


In [35]:
generated= sess.run([fake], feed_dict={is_training: True})

## Validation of discriminator

In [36]:
with graph.as_default():
    val_real = discriminator(embedded_real_sequences, is_training=False)
    val_fake = discriminator(fake, is_training=False)
    val_loss = tf.reduce_mean(val_real-val_fake)
    real_predictions = tf.rint(val_real)
    fake_predictions = tf.rint(val_fake)
    correct_real_predictions = tf.equal(real_predictions, tf.zeros([BATCH_SIZE], dtype=tf.float32))
    correct_fake_predictions = tf.equal(fake_predictions, tf.ones([BATCH_SIZE], dtype=tf.float32))
    casted_real = tf.cast(correct_real_predictions, tf.float32)
    casted_fake = tf.cast(correct_fake_predictions, tf.float32)
    accuracy = (tf.reduce_mean(casted_real) + tf.reduce_mean(casted_fake))/2

discriminator
(128, 64, 32, 4)
(128, 32, 32, 8)
(128, 16, 32, 16)
(128, 8, 32, 32)
(128, 4, 32, 64)
(128, 2, 32, 128)
(128, 8192)
(128, 1)
(128,)
discriminator
(128, 64, 32, 4)
(128, 32, 32, 8)
(128, 16, 32, 16)
(128, 8, 32, 32)
(128, 4, 32, 64)
(128, 2, 32, 128)
(128, 8192)
(128, 1)
(128,)


In [None]:
#Validate discriminator by giving from validate data set and randomly generated
print ('validating discriminator...')
sess.run(iterator.initializer, 
         feed_dict={real_sequences: val_data, random_sequences: get_random_sequence(val_data.shape[0])})
losses = []
accuracies = []
while True:
    try:
        v_loss, v_accuracy = sess.run([val_loss, accuracy], feed_dict={is_training: False})
        losses.append(v_loss)
        accuracies.append(v_accuracy)
    except tf.errors.OutOfRangeError:
        print ('Validation g_loss:{:.4f} ,accuracy :{:.4f}'.format(mean(losses), mean(accuracies)))
        break

In [28]:
def restore_weights(saver, sess, path):
    saver.restore(sess, path)
    print("Model restored.")

In [29]:
restore_weights(saver, sess, PATH)

INFO:tensorflow:Restoring parameters from ../../weights/wgan_sequence/emb/version_3
Model restored.


## Review generated examples

In [115]:
def discriminator_score(session, sequence):
    with graph.as_default():
        test_seq = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='real_sequence')
        print (test_seq.shape)
        embedded_test_seq = tf.nn.embedding_lookup(embedding_weights, test_seq)
        embedded_test_seq = tf.reshape(embedded_test_seq, shape=[-1, SEQUENCE_LENGTH, EMBEDDING_SIZE, 1])
        print (embedded_test_seq.shape)
        test_score = discriminator(embedded_test_seq, False)
        
    score = session.run(test_score, feed_dict={test_seq:sequence})
    return score    

In [78]:
def repeating_amino_acids_in_row(input_sequence):
    repeating2_count = 0
    repeating3_count = 0
    seq = input_sequence.replace("0", "")
    for index in range(len(seq)):
        if index == 0:
            continue
        if seq[index-1] == seq[index]:
            repeating2_count +=1
        if index == len(seq)-1:
            continue   
        if seq[index-1] == seq[index] and seq[index+1] == seq[index]:
            repeating3_count +=1
    return repeating2_count, repeating3_count

In [116]:
def print_seq_stats(input_sequence, score):
    sequence = "".join([ indexToLetter[acid_index] for acid_index in input_sequence ])
    print(sequence)
    repeating2_count, repeating3_count = repeating_amino_acids_in_row(sequence)
    print("Score: {} | Length: {} | Different characters: {} | Repeating 2s: {} | Repeating 3s: {}".format(
        score, len(sequence.replace("0", "")), (len(set(sequence))-1), repeating2_count, repeating3_count))

In [87]:
sequences = reverse_embedding_lookup(embedding_weights, tf.squeeze(fake))
print (sequences.shape)

(16, 120)


In [88]:
print ('Generating sequences...')
generated_sequences = sess.run([sequences], feed_dict={is_training: False})
generated_sequences[0]

Generating sequences...


array([[11, 16,  6, ...,  0,  0,  0],
       [11, 16, 15, ...,  0,  0,  0],
       [11, 17,  9, ...,  0,  0,  0],
       ...,
       [11, 15,  7, ...,  0,  0,  0],
       [11, 15,  7, ...,  0,  0,  0],
       [11,  1, 14, ...,  0,  0,  0]])

In [119]:
scores = discriminator_score(sess, generated_sequences[0])
for index, s in enumerate(generated_sequences[0]):
    print_seq_stats(s, scores[index])

(?, 120)
(?, 120, 32, 1)
discriminator
(?, 60, 32, 64)
(?, 30, 32, 128)
(?, 15, 32, 256)
(?, 1)
(?,)
MSGDDLDERIETYYVRVRGVGYVYYTRKKVQWWYKLRNLGDGSVEVDAPGEEQQVEKMVDWMRRGPSKANVSQVEERQVKLEYDYFRIEYGG00Y0Y00000000000000000000000
Score: 285.4346008300781 | Length: 94 | Different characters: 18 | Repeating 2s: 11 | Repeating 3s: 0
MSRVCYRRYVYGVVQGVGYRYYTQRQAQRLGVTGWVRNCDDGSVEAVYEGDSERVEEPPYWQRRGPRRWRRGPGSVEETSTRYEARRGYSRYERY0Q00000000000000000000000
Score: 277.64739990234375 | Length: 96 | Different characters: 16 | Repeating 2s: 11 | Repeating 3s: 0
MTKKRRYSWVRGRRRQVGYYQYGYGRRYYRVREYAQWQPWVEYYQQGDYEAVERYYLDKLEKCQGGPPRARVEEVEWWEPRRGGEYYEYR000000000000000000000000000000
Score: 254.7123260498047 | Length: 90 | Different characters: 16 | Repeating 2s: 17 | Repeating 3s: 1
MSKVRRQYYVRGRVQGVGYRAWTTYQAQQLGLTGWVRNLDDGSVEVYYYGEPERVEAEYVTEWLRGPPTRRPRVDVVDWEWWTEEEQPGGFERRF0Q00000000000000000000000
Score: 289.6255187988281 | Length: 96 | Different characters: 17 | Repeating 2s: 15 | Repeating 3s: 2
MKTLKY

In [117]:
scores = discriminator_score(sess, train_data)
for index, example in enumerate(train_data):
    print_seq_stats(example, scores[index])

(?, 120)
(?, 120, 32, 1)
discriminator
(?, 60, 32, 64)
(?, 30, 32, 128)
(?, 15, 32, 256)
(?, 1)
(?,)
MPGPTVVRFTARVVGRVQGVGFRDYVRTRGRRLGLVGTATNMPDGAVVVIAEGGAPACQNLARLLVTGHTPGWTDRVEVVWQRAQGDLADFRRK00000000000000000000000000
Score: 363.3921203613281 | Length: 94 | Different characters: 19 | Repeating 2s: 9 | Repeating 3s: 1
METQKILVSGQVQGVGFRWSATRLAKQLTLTGTVRNLANGQVEIIATGESATLQQFCQQLKHGLSPWINVMTLTTHSIPTHQFADFRIII000000000000000000000000000000
Score: 329.80377197265625 | Length: 90 | Different characters: 19 | Repeating 2s: 6 | Repeating 3s: 1
MKRVHVIVEGRVQGVGFRYFVQHEALKRQLTGWVKNNDDGTVEMEVQGNESALQLFLDTIEAGTMFAKVARMHIEPRDVRSDEKQFRIMYGSGF00000000000000000000000000
Score: 350.7937927246094 | Length: 94 | Different characters: 19 | Repeating 2s: 2 | Repeating 3s: 0
MVTDAQQARLTAWVHGRVQGVGFRWWTRARALELGLAGSATNLPGNRVEVVAEGPRESCERLLEALRSPDTPGDVDHVAEQWSEPKGGLTGFVER0000000000000000000000000
Score: 365.7681579589844 | Length: 95 | Different characters: 18 | Repeating 2s: 5 | Repeating 3s: 0
MTQVCIAAYV

In [49]:
repeating_amino_acids_in_row("MARDTAILRVTGFVQGVGFRYTTKHVAYKYDISGTVKNLDDGSVEIHAIAEEENLNKFIDAIKKGPSPGCRIEHVYIYKGAPVEDRKTFDIVY")

5


In [50]:
repeating_amino_acids_in_row("MKAARARLYYYGVVQGVYYRYYRYYYTYYQRYGYVGWYWVRRDREVVVVVQQQWQQCDELLKRWLRYTQPRARDTDVDWYEYQGDDQYYYVE")

21
