## Loading preprosed data

In [1]:
LEVEL="Level_1"

In [2]:
import numpy as np
train_data = np.load("enzyme_class_data\\train_features_"+LEVEL+".npy")
train_label = np.load("enzyme_class_data\\train_labels_"+LEVEL+".npy")
val_data = np.load("enzyme_class_data\\val_features_"+LEVEL+".npy")
val_label = np.load("enzyme_class_data\\val_labels_"+LEVEL+".npy")

In [3]:
PATH = "saved_models/wgan_{}_v2/version1.ckpt".format(LEVEL)

In [4]:
train_data.shape, train_label.shape, val_data.shape, val_label.shape

((164674, 500), (164674,), (41169, 500), (41169,))

In [5]:
NUM_OF_ACIDS = 21
EMBEDDING_SIZE = 32
NUM_CLASSES = np.amax(val_label, axis=0)+1

In [6]:
NUM_EPOCH=3
BATCH_SIZE=64
LAMBDA = 10

In [7]:
import tensorflow as tf
tf.__version__

'1.6.0'

## Helpers

Random sequence

In [8]:
def get_random_sequence(batch_size):
    return np.random.randint(0, 20, size=(batch_size, 500))

In [9]:
def deconvolution(x, is_training, output_shape, iteration):
    bn = tf.layers.batch_normalization(x, training=is_training, name='bn'+str(iteration))
    act = tf.nn.relu(bn, name='act'+str(iteration))
    W_conv = tf.get_variable('g_wconv'+str(iteration), [5, output_shape[-1], int(act.get_shape()[-1])], 
                              initializer=tf.truncated_normal_initializer(stddev=0.1))
    b_conv = tf.get_variable('g_bconv'+str(iteration), [output_shape[-1]], initializer=tf.constant_initializer(.1))
    conv = conv1d_transpose(act, 
                            filter=W_conv, 
                            output_shape=output_shape, 
                            stride=2, 
                            padding="SAME",
                            name='conv'+str(iteration)) + b_conv
    return conv

In [10]:
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.framework import tensor_shape

def conv1d_transpose(value, filter, output_shape, stride, padding="SAME", name=None):
    with ops.name_scope(name, "conv1d_transpose", [value, filter, output_shape]) as name:
        output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
        if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(3)):
            raise ValueError("output_shape must have shape (3,), got {}".format(output_shape_.get_shape()))

    axis = 2
   
    if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[2]):
        raise ValueError("input channels does not match filter's input channels, "
                       "{} != {}".format(value.get_shape()[axis],
                                         filter.get_shape()[2]))

    if isinstance(output_shape, (list, np.ndarray)):
      # output_shape's shape should be == [3] if reached this point.
      if not filter.get_shape()[1].is_compatible_with(output_shape[axis]):
        raise ValueError("output_shape does not match filter's output channels, {} != {}".format(output_shape[axis],
                              filter.get_shape()[1]))

    if padding != "VALID" and padding != "SAME":
        raise ValueError("padding must be either VALID or SAME: {}".format(padding))

    # Reshape the input tensor to [batch, 1, in_width, in_channels]
    output_shape_ = array_ops.concat([output_shape_[:1], [1], output_shape_[1:]], axis=0)
    spatial_start_dim = 1
    strides = [1, 1, stride, 1]
    value = array_ops.expand_dims(value, spatial_start_dim)
    filter = array_ops.expand_dims(filter, 0)
    result = tf.nn.conv2d_backprop_input(
        input_sizes=output_shape_,
        filter=filter,
        out_backprop=value,
        strides=strides,
        padding=padding,
        data_format="NHWC",
        name=name)
    return array_ops.squeeze(result, [spatial_start_dim])

In [16]:
def discriminator(sequences, is_training, reuse=False):

    with tf.variable_scope('discriminator') as scope:
        if reuse:
            scope.reuse_variables()
        # Convolutional Layer #1
        conv1 = tf.layers.conv1d(
            inputs=sequences,
            filters=16,
            kernel_size=4,
            strides=2,
            padding="same",
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            activation=tf.nn.leaky_relu,
            name = "conv1")
        # Convolutional Layer #2
        conv2 = tf.layers.conv1d(
            inputs=conv1,
            filters=32,
            kernel_size=4,
            strides=2,
            padding="same",
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            activation=tf.nn.leaky_relu,
            name = "conv2")

        #Batch Norm #1
        bn3 = tf.layers.batch_normalization(conv2, training=is_training)

        # Convolutional Layer #3
        conv3 = tf.layers.conv1d(
            inputs=bn3,
            filters=64,
            kernel_size=4,
            strides=2,
            padding="same",
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            activation=tf.nn.leaky_relu,
            name = "conv3")

        #Batch Norm #2
        bn5 = tf.layers.batch_normalization(conv3, training=is_training)

        # Dense Layer
        flat = tf.layers.flatten(bn3)
        #dropout = tf.layers.dropout(inputs=dense, rate=0.4, training=is_training)
        output = tf.layers.dense(inputs=flat,
                                 activation=None,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                                 bias_initializer=tf.zeros_initializer(),
                                 units=1)


        return output

In [17]:
def generator(input_batch, is_training, reuse=False):
    with tf.variable_scope('generator') as scope:
        if reuse:
            scope.reuse_variables()   
        flat_conv1 = tf.layers.dense(inputs=input_batch,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                                 bias_initializer=tf.zeros_initializer (),
                                 units=32)
        conv1 = tf.reshape(flat_conv1, shape=[BATCH_SIZE, 32, 500], name='reshape')
        print(conv1.shape)
        conv2 = deconvolution(conv1, is_training, [BATCH_SIZE, 63, 256], 1)
        print(conv2.shape)
        conv3 = deconvolution(conv2, is_training, [BATCH_SIZE, 125, 128], 2)
        print(conv3.shape)
        conv4 = deconvolution(conv3, is_training, [BATCH_SIZE, 250, 64], 3)
        print(conv4.shape)
        conv5 = deconvolution(conv4, is_training, [BATCH_SIZE, 500, 32], 4)
        print(conv5.shape)
        act5 = tf.nn.tanh(conv5, name='act5')
        act5 = tf.reshape(act5, shape=[BATCH_SIZE, 500, 32], name='act5_reshape')
        return act5

In [18]:
tf.reset_default_graph()
with tf.variable_scope('input'):
    real_sequences = tf.placeholder(tf.int32, [None, 500], name='real_sequence')
    random_sequences = tf.placeholder(tf.int32, shape=[None, 500], name='random_sequence')
    is_training = tf.placeholder(tf.bool, name='is_train')

dataset = tf.data.Dataset.from_tensor_slices((real_sequences, random_sequences))
dataset = dataset.shuffle(buffer_size=10000, reshuffle_each_iteration=True)
dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE)).repeat(NUM_EPOCH)
iterator = dataset.make_initializable_iterator()

acid_embeddings = tf.get_variable("acid_embeddings", [NUM_OF_ACIDS, EMBEDDING_SIZE])

batch_real_sequences, batch_random_sequences = iterator.get_next()

embedded_real_sequences = tf.nn.embedding_lookup(acid_embeddings, batch_real_sequences)
embedded_random_sequences = tf.nn.embedding_lookup(acid_embeddings, batch_random_sequences)

In [19]:
embedded_fake_sequences = generator(embedded_random_sequences, is_training)
logits_real = discriminator(embedded_real_sequences, is_training)
logits_fake = discriminator(embedded_fake_sequences, is_training, reuse=True)
# wgan-gp loss is same as wgan loss
fake_mean = tf.reduce_mean(logits_fake)
real_mean = tf.reduce_mean(logits_real)
# AR d_loss = tf.reduce_mean( logits_real - logits_fake)  # This optimizes the discriminator.
d_loss = tf.reduce_mean( logits_real - logits_fake)  # This optimizes the discriminator. #AR
g_loss = tf.reduce_mean(-logits_fake)  # This optimizes the generator.

# wgan-gp gradient panelty 
with tf.name_scope("Gradient_penalty"):
    eps = tf.random_uniform([BATCH_SIZE,1,1], minval=0.0,maxval=1.0)
    interpolates = eps*embedded_real_sequences + (1-eps)*embedded_fake_sequences

    gradients = tf.gradients(discriminator(interpolates, is_training, reuse=True), [interpolates])[0]
    grad_norm = tf.norm(gradients[0], axis=1, ord='euclidean')
    gradient_penalty = tf.reduce_mean(tf.square(grad_norm - 1))
    d_loss += LAMBDA * gradient_penalty

D_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,'discriminator')
G_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'generator')
# AR trainer_d = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.5, beta2=0.9).minimize(d_loss, var_list=D_vars)
trainer_d = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.5, beta2=0.9).minimize(d_loss, var_list=D_vars) # AR
trainer_g = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.5, beta2=0.9).minimize(g_loss, var_list=G_vars)


(64, 32, 500)
(64, 63, 256)
(64, 125, 128)
(64, 250, 64)
(64, 500, 32)


# Model

In [20]:
sess = tf.Session()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

print ("Batch size: {}, epoch num: {}".format(BATCH_SIZE, NUM_EPOCH))
print ('start training...')
sess.run(iterator.initializer, 
         feed_dict={real_sequences: train_data, random_sequences: get_random_sequence(train_data.shape[0])})
steps = 0
while True:
    try:
        for k in range(5):
            # Update the discriminator
            _, dLoss, f_mean, r_mean = sess.run([trainer_d, d_loss, fake_mean, real_mean], feed_dict={is_training: True})
            steps = steps + 1

        # Update the generator, twice for good measure.
        _, gLoss = sess.run([trainer_g, g_loss], feed_dict={is_training: True})
        steps = steps + 1
        if steps % 100 == 0 :
            print ('steps:{}, d_loss:{:.4f} ,g_loss:{:.4f} | fake_mean: {:.4f}, real_mean: {:.4f}'.format(
                steps, dLoss, gLoss, f_mean, r_mean))
    except tf.errors.OutOfRangeError:
        print ("Training is finished")
        break;
            

Batch size: 64, epoch num: 3
start training...
steps:300, d_loss:-101.4620 ,g_loss:-106.7752 | fake_mean: 106.3761, real_mean: -4.3488
steps:600, d_loss:-221.1001 ,g_loss:-208.0482 | fake_mean: 207.6493, real_mean: -17.9700
steps:900, d_loss:-344.6225 ,g_loss:-310.6984 | fake_mean: 310.3452, real_mean: -36.3629
steps:1200, d_loss:-470.0674 ,g_loss:-418.9265 | fake_mean: 418.5522, real_mean: -53.2735
steps:1500, d_loss:-607.7389 ,g_loss:-532.8212 | fake_mean: 532.3080, real_mean: -76.7160
steps:1800, d_loss:-745.8331 ,g_loss:-652.2019 | fake_mean: 651.7377, real_mean: -96.4648
steps:2100, d_loss:-896.1913 ,g_loss:-776.7899 | fake_mean: 776.2832, real_mean: -120.4993
steps:2400, d_loss:-1049.7250 ,g_loss:-906.6599 | fake_mean: 906.0986, real_mean: -145.9856
steps:2700, d_loss:-1201.8959 ,g_loss:-1040.3845 | fake_mean: 1039.8555, real_mean: -172.7240
steps:3000, d_loss:-1341.6388 ,g_loss:-1178.7117 | fake_mean: 1178.2125, real_mean: -195.4733
steps:3300, d_loss:-1463.4080 ,g_loss:-1321.21

## Validation of discriminator

In [21]:
def mean(l):
    return sum(l) / float(len(l))

In [22]:
val_real = discriminator(embedded_real_sequences, is_training, reuse=True)
val_fake = discriminator(embedded_random_sequences, is_training, reuse=True)
val_loss = tf.reduce_mean(val_real-val_fake)
real_predictions = tf.rint(val_real)
fake_predictions = tf.rint(val_fake)
correct_real_predictions = tf.equal(real_predictions, tf.zeros([BATCH_SIZE], dtype=tf.float32))
correct_fake_predictions = tf.equal(fake_predictions, tf.ones([BATCH_SIZE], dtype=tf.float32))
casted_real = tf.cast(correct_real_predictions, tf.float32)
casted_fake = tf.cast(correct_fake_predictions, tf.float32)
accuracy = (tf.reduce_mean(casted_real) + tf.reduce_mean(casted_fake))/2

In [23]:
#Validate discriminator by giving from validate data set and randomly generated
print ('validating discriminator...')
sess.run(iterator.initializer, 
         feed_dict={real_sequences: val_data, random_sequences: get_random_sequence(val_data.shape[0])})
losses = []
accuracies = []
while True:
    try:
        v_loss, v_accuracy = sess.run([val_loss, accuracy], feed_dict={is_training: False})
        losses.append(v_loss)
        accuracies.append(v_accuracy)
    except tf.errors.OutOfRangeError:
        print ('Validation g_loss:{:.4f} ,accuracy :{:.4f}'.format(mean(losses), mean(accuracies)))
        break

validating discriminator...
Validation g_loss:-148.2925 ,accuracy :0.0010


## Review generated examples

In [24]:
def reverse_embedding_lookup(acid_embeddings, embedded_sequence):
    # assume embedded_chars.shape == (batch_size, length, embedding_size)
    # acid_embeddings.shape == (vocab_size, embedding_size)
    print (tf.nn.l2_normalize(acid_embeddings, dim=1).shape)
    acid_embeddings_expanded = tf.tile(tf.expand_dims(acid_embeddings, axis = 0), [64, 1,1])
    print (tf.nn.l2_normalize(acid_embeddings_expanded, dim=1).shape)
    print (tf.nn.l2_normalize(embedded_sequence, dim=1).shape)
    emb_distances = tf.matmul( # shape == (vocab_size, batch_size)
        tf.nn.l2_normalize(acid_embeddings_expanded, dim=1),
        tf.nn.l2_normalize(embedded_sequence, dim=1),
        transpose_b=True)
    print (emb_distances.shape)
    return tf.argmax(emb_distances, axis=1) # shape == (batch_size)

In [25]:
# Review generated examples
sequences = reverse_embedding_lookup(acid_embeddings, embedded_fake_sequences)
print (sequences.shape)

Instructions for updating:
dim is deprecated, use axis instead
(21, 32)
(64, 21, 32)
(64, 500, 32)
(64, 21, 500)
(64, 500)


In [26]:
print ('Generating sequences...')
sess.run(iterator.initializer, 
         feed_dict={real_sequences: val_data[:BATCH_SIZE], random_sequences: get_random_sequence(BATCH_SIZE)})
while True:
    try:
        generated_sequences = sess.run([sequences], feed_dict={is_training: False})
    except tf.errors.OutOfRangeError:
        print (len(generated_sequences[0][0]))
        print (generated_sequences[0])
        break

Generating sequences...
500
[[ 1 10 16 ...  0  0  0]
 [ 1 10 16 ...  0  0  0]
 [ 1 10 16 ...  0  0  0]
 ...
 [ 1 10 16 ...  0  0  0]
 [ 1 10 16 ...  0  0  0]
 [ 1 10 16 ...  0  0  0]]
