In [1]:
import os
import numpy as np

import tensorflow as tf
from tensorflow.python.framework import ops

from load_data import *
from models.customlayers import *
from models.activations import *
from training import *

import moviepy.editor as mpe
import models.ConvAE as cae
L = tf.layers

import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
data_dir = os.path.expanduser('~/Insight/video-representations/frames')
training_file = os.path.join(data_dir, 'training.tfrecords')
validation_file = os.path.join(data_dir, 'validation.tfrecords')
testing_file = os.path.join(data_dir, 'testing.tfrecords')

# Model

In [3]:
def invert_layer(input, invlayer_in, inv_layer_out):
    return tf.gradient(inv_layer_out, inv_layer_in, input)

def encoder(image):
    original_shape = image.get_shape().as_list()
    input_layer = tf.reshape(image, (-1, 60, 80, 3))
 
    conv1 = L.conv2d(
        input_layer, name='conv1',
        filters=16, kernel_size=3, activation=selu
    )
        
    conv2 = L.conv2d(
        conv1, name='conv2',
        filters=32, kernel_size=5, activation=selu
    )

    shape_ = conv2.get_shape().as_list()
    newdim = shape_[1] * shape_[2] * shape_[3]
    print(shape_, newdim)
    
    dense1 = dense_reshape(conv2, name='dense1', units=512, activation=selu)
    
    return tf.reshape(dense1, (-1, 64, 512))


def decoder(encoded):
    encoded_reshaped = tf.reshape(encoded, (-1, 512))
    
    dense1 = L.dense(encoded_reshaped, units=127872, activation=selu, name='dense1')

    dense1_reshaped = tf.reshape(dense1, (-1, 54, 74, 32))

    deconv3 = L.conv2d_transpose(
        dense1_reshaped, name='deconv3', 
        filters=32, kernel_size=5, activation=selu,
    )
    
    deconv4 = L.conv2d_transpose(
        deconv3, name='deconv4', 
        filters=3, kernel_size=3, activation=selu,
    )
    
    deconv4_reshaped = tf.transpose(deconv4, perm=(0, 3, 1, 2))

    return deconv4_reshaped

In [4]:
tf.reset_default_graph()

input_var = tf.placeholder(dtype=tf.float32, shape=(None, 3, 60, 80), name='input')
target_var = tf.placeholder(dtype=tf.float32, shape=(None, 3, 60, 80), name='target')

with tf.variable_scope('iencoder') as enc:
    encoded = encoder(input_var)
    
with tf.variable_scope('decoder') as dec:
    decoded = decoder(encoded)

[None, 54, 74, 32] 127872


In [4]:
def read_record(filepath_queue):
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filepath_queue)
    
    features = tf.parse_single_example(
        serialized_example,
        features={
            'height': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'video': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
            'length': tf.FixedLenFeature([], tf.int64)
        }
    )
    
    video = tf.decode_raw(features['video'], tf.uint8)   # feature may be renamed to video in future

    video_shape = tf.stack([-1, 60, 80, 3])
    video = tf.cast(tf.reshape(video, video_shape), tf.float32)
    video = tf.slice(video, [0, 0, 0, 0], [128, -1, -1, -1])
    
    return video
    
def inputs(split_type, batchsize, num_epochs): 
    if not num_epochs:
        num_epochs = None
        
    filepath = os.path.join(data_dir, '{}.tfrecords'.format(split_type))
    
    with tf.name_scope('input'):
        filepath_queue = tf.train.string_input_producer([filepath], num_epochs=num_epochs)
    
    video = read_record(filepath_queue)
    videos = tf.train.shuffle_batch(
        [video], batchsize, 
        capacity=128 + 2*batchsize, min_after_dequeue=128, num_threads=2
    )
    
    video_inputs = tf.slice(videos, begin=[0, 0, 0, 0, 0], size=[-1, 64, -1, -1, -1])
    video_outputs = tf.slice(videos, begin=[0, 1, 0, 0, 0], size=[-1, 64, -1, -1, -1])
    
    return video_inputs, video_outputs

In [5]:
def run_training(num_epochs, batchsize):
    video_inputs, video_outputs = inputs('training', batchsize, num_epochs)
    
    with tf.variable_scope('encoder'):
        encoded_input = encoder(video_inputs)
        encoded_input_reshaped = tf.reshape(encoded_input, (batchsize, -1, 512))
        
    with tf.variable_scope('encoder', reuse=True):
        encoded_output = encoder(video_outputs)
        encoded_output_reshaped = tf.reshape(encoded_output, (batchsize, -1, 512))
    
    with tf.variable_scope('lstm'):
        lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=128)
        transitioned = tf.nn.dynamic_rnn(lstm_cell, encoded_input_reshaped, dtype=tf.float32)
        transitioned_reshaped = tf.reshape(transitioned[0], (-1, 128))
        encoded_prediction = L.dense(transitioned_reshaped, 512, activation=selu)
        encoded_prediction_reshaped = tf.reshape(encoded_prediction, (batchsize, -1, 512))
    
    with tf.variable_scope('decoder'):
        decoded = decoder(encoded_prediction)
        decoded_reshaped = tf.reshape(decoded, (batchsize, -1, 60, 80, 3))
        
    l2_weight = .05
    l2_term = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name])
    
    loss = tf.reduce_mean(tf.pow(encoded_output_reshaped - encoded_prediction_reshaped, 2))
    train_step = tf.train.AdamOptimizer().minimize(loss + l2_weight*l2_term)
    
    decoder_loss = tf.reduce_mean(tf.pow(decoded_reshaped - video_outputs, 2))
    decoder_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder')
    decoder_train_step = tf.train.AdamOptimizer().minimize(decoder_loss, var_list=decoder_vars)
    
    saver = tf.train.Saver()
    init_global = tf.global_variables_initializer()
    init_local = tf.local_variables_initializer()
    
    coord = tf.train.Coordinator()
        
    with tf.Session() as sesh:
        sesh.run(init_global)
        sesh.run(init_local)
        threads = tf.train.start_queue_runners(sess=sesh, coord=coord)

        losses = []
        decoder_losses = []
            
        try:
            step = 0
            print(np.sum([np.prod(v.shape.as_list()) for v in tf.trainable_variables()]) / 1000000)

            while not coord.should_stop():
                _, loss_value = sesh.run([train_step, loss])
                _, decoder_loss_value = sesh.run([decoder_train_step, decoder_loss])
                losses.append(loss_value)
                decoder_losses.append(decoder_loss_value)
                if step % 500 == 0:
                    print('Step {} loss: {:.4f}'.format(step, loss_value))
                    print('Step {} decoder loss: {:.4f}'.format(step, decoder_loss_value))

                step += 1

        except tf.errors.OutOfRangeError:
            print('Done; loss: {:.4f}'.format(step, loss_value))

        finally:
            coord.request_stop()

        coord.join(threads)
        
        saver.save(sesh, 'prototype-lstm')
    return losses, decoder_losses

In [None]:
losses, decoder_losses = run_training(3, 4)

[256, 54, 74, 32] 127872
[256, 54, 74, 32] 127872
131.503331
Step 0 loss: 4799.3037
Step 0 decoder loss: 24723.8887


In [6]:
def run_inference(batchsize, num_epochs):
    video_inputs, video_outputs = inputs('training', batchsize, num_epochs)
    
    with tf.variable_scope('encoder'):
        encoded_input = encoder(video_inputs)
        encoded_input_reshaped = tf.reshape(encoded_input, (batchsize, -1, 512))
        
    with tf.variable_scope('encoder', reuse=True):
        encoded_output = encoder(video_outputs)
        encoded_output_reshaped = tf.reshape(encoded_output, (batchsize, -1, 512))
    
    with tf.variable_scope('lstm'):
        lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=128, activation=selu)
        transitioned = tf.nn.dynamic_rnn(lstm_cell, encoded_input_reshaped, dtype=tf.float32)
        transitioned_reshaped = tf.reshape(transitioned[0], (-1, 128))
        encoded_prediction = L.dense(transitioned_reshaped, 512, activation=selu)
        encoded_prediction_reshaped = tf.reshape(encoded_prediction, (batchsize, -1, 512))
    
    with tf.variable_scope('decoder'):
        decoded = decoder(encoded_prediction)
        decoded_reshaped = tf.reshape(decoded, (batchsize, -1, 60, 80, 3))

    saver = tf.train.Saver()
    
    loss = tf.reduce_mean(tf.pow(encoded_output_reshaped - encoded_prediction_reshaped, 2))
    decoder_loss = tf.reduce_mean(tf.pow(decoded_reshaped - video_outputs, 2))
    
    init_global = tf.global_variables_initializer()
    init_local = tf.local_variables_initializer()
    
    coord = tf.train.Coordinator()
    
    with tf.Session() as sesh:
        sesh.run(init_global)
        sesh.run(init_local)
        saver.restore(sesh, 'prototype-lstm')
        threads = tf.train.start_queue_runners(sess=sesh, coord=coord)

        losses = []
        predictions = []
            
        try:
            step = 0

            while not coord.should_stop():
                prediction, loss_value = sesh.run([decoded_reshaped, loss])
                losses.append(loss_value)

                if step % 100 == 0:
                    print('Step {} loss: {:.4f}'.format(step, loss_value))
                    predictions.append(prediction)

                step += 1

        except tf.errors.OutOfRangeError:
            print('Done; loss: {:.4f}'.format(step, loss_value))

        finally:
            coord.request_stop()

        coord.join(threads)
        
    return losses, predictions

In [None]:
losses, predictions = run_inference(4, 1)

[256, 54, 74, 32] 127872
[256, 54, 74, 32] 127872
INFO:tensorflow:Restoring parameters from prototype-lstm
Step 0 loss: 15485835.0000
Step 100 loss: 19480012.0000
Step 200 loss: 17056618.0000
Step 300 loss: 14340884.0000
Step 400 loss: 22057122.0000
Step 500 loss: 18438668.0000
Step 600 loss: 20790750.0000
Step 700 loss: 20148850.0000
Step 800 loss: 19212724.0000
Step 900 loss: 16599938.0000
Step 1000 loss: 16826722.0000
Step 1100 loss: 14351856.0000
Step 1200 loss: 24198440.0000
Step 1300 loss: 17847578.0000
Step 1400 loss: 15763869.0000
Step 1500 loss: 16696988.0000
Step 1600 loss: 15672744.0000
Step 1700 loss: 17134224.0000
Step 1800 loss: 16226702.0000
Step 1900 loss: 17119726.0000
Step 2000 loss: 13506564.0000


In [8]:
predictions[0][0].shape

(64, 60, 80, 3)

In [9]:
def render_movie(frame_array, output_file, fps, max_pixel=255):
    n_frames = frame_array.shape[0]
    clipped_frames = np.minimum(np.maximum(frame_array, 0), max_pixel)
    clip = mpe.ImageSequenceClip(list(clipped_frames), fps=fps)
    clip.write_videofile(output_file)
    return None

render_movie(predictions[1][0], 'test_lstm.mp4', 5)

[MoviePy] >>>> Building video test_lstm.mp4
[MoviePy] Writing video test_lstm.mp4


100%|██████████| 64/64 [00:00<00:00, 549.05it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: test_lstm.mp4 




