In [1]:
import os
import numpy as np

import tensorflow as tf
from tensorflow.python.framework import ops

from load_data import *
from models.customlayers import *
from models.activations import *
from training import *

import moviepy.editor as mpe
import models.ConvAE as cae
L = tf.layers

import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
data_dir = os.path.expanduser('~/Insight/video-representations/frames')
training_file = os.path.join(data_dir, 'training.tfrecords')
validation_file = os.path.join(data_dir, 'validation.tfrecords')
testing_file = os.path.join(data_dir, 'testing.tfrecords')

# Model

Try first:

- input full video, output full video
- lstm encoder-decoder: at each timestep, input frame, output prediction for next frame
- start with fixed size: 20 frames (later work with true recurrent net)

Below is wrong; just instantiate lstm outside of function for now


In [3]:
def encoder(image):
    original_shape = image.get_shape().as_list()
    input_layer = tf.reshape(image, (-1, 60, 80, 3))
    
    conv1 = L.conv2d(
        input_layer, name='conv1',
        filters=16, kernel_size=3, activation=selu
    )
        
    conv2 = L.conv2d(
        conv1, name='conv2',
        filters=32, kernel_size=3, activation=selu
    )
        
    conv3 = L.conv2d(
        conv2, name='conv3',
        filters=32, kernel_size=5, activation=selu
    )
        
    conv4 = L.conv2d(
        conv3, name='conv4',
        filters=32, kernel_size=5, activation=selu
    )
        
    shape = conv4.get_shape().as_list()
    print(shape)
    newdim = shape[1] * shape[2] * shape[3]
    print(newdim)
    conv4_flat = tf.reshape(conv4, (-1, newdim))
    print(conv4_flat.shape)

    dense1 = dense_reshape(
        conv4, name='dense1',
        units=512, activation=selu
    )
    
    return dense1


def decoder(encoded):
    dense1 = L.dense(encoded, units=104448, activation=selu, name='dense1')

    dense1_reshaped = tf.reshape(dense1, (-1, 48, 68, 32))

    deconv1 = L.conv2d_transpose(
        dense1_reshaped, name='deconv1',
        filters=64, kernel_size=5, activation=selu    
    )

    deconv2 = L.conv2d_transpose(
        deconv1, filters=32, kernel_size=5, activation=selu,
        name='deconv2'
    )

    deconv3 = L.conv2d_transpose(
        deconv2, name='deconv3', 
        filters=32, kernel_size=3, activation=selu,
    )
    
    deconv4 = L.conv2d_transpose(
        deconv3, name='deconv4', 
        filters=3, kernel_size=3, activation=selu,
    )
    
    deconv4_reshaped = tf.transpose(deconv4, perm=(0, 3, 1, 2))

    return deconv4_reshaped

In [4]:
tf.reset_default_graph()

input_var = tf.placeholder(dtype=tf.float32, shape=(None, 3, 60, 80), name='input')
target_var = tf.placeholder(dtype=tf.float32, shape=(None, 3, 60, 80), name='target')

with tf.variable_scope('encoder'):
    encoded = encoder(input_var)
    
with tf.variable_scope('decoder'):
    decoded = decoder(encoded)

[None, 48, 68, 32]
104448
(?, 104448)


In [4]:
def read_record(filepath_queue):
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filepath_queue)
    
    features = tf.parse_single_example(
        serialized_example,
        features={
            'height': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'video': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
            'length': tf.FixedLenFeature([], tf.int64)
        }
    )
    
    video = tf.decode_raw(features['video'], tf.uint8)   # feature may be renamed to video in future
    l = tf.cast(features['length'], tf.int32)
    h = tf.cast(features['height'], tf.int32)
    w = tf.cast(features['width'], tf.int32)

    video_shape = tf.stack([-1, 60, 80, 3])
    video = tf.cast(tf.reshape(video, video_shape), tf.float32)
    
    return video
    
def inputs(split_type, batchsize, num_epochs): 
    if not num_epochs:
        num_epochs = None
        
    filepath = os.path.join(data_dir, '{}.tfrecords'.format(split_type))
    
    with tf.name_scope('input'):
        filepath_queue = tf.train.string_input_producer([filepath], num_epochs=num_epochs)
    
    video = read_record(filepath_queue) # each video is a rank 4 tensor: [n_frames, x, y, c]
    print(video.get_shape().as_list())
    videos = tf.train.batch(
        [video], batchsize, 
        capacity=128 + 2*batchsize, 
        num_threads=2,
        dynamic_pad=True
    )
    
    return videos

In [5]:
def run_training(num_epochs, batchsize):
    videos = inputs('training', batchsize, num_epochs)
    video_inputs = tf.slice(videos, begin=[0, 0, 0, 0, 0], size=[-1, 127, -1, -1, -1])
    video_outputs = tf.slice(videos, begin=[0, 1, 0, 0, 0], size=[-1, 127, -1, -1, -1])
    
    with tf.variable_scope('encoder'):
        encoded = encoder(video_inputs)
        encoded_reshaped = tf.reshape(encoded, (1, -1, 512))
    
    with tf.variable_scope('lstm'):
        lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=256)
        transitioned = tf.nn.dynamic_rnn(lstm_cell, encoded_reshaped, dtype=tf.float32)
        transitioned_reshaped = tf.reshape(transitioned[0], (-1, 256))
        decoder_input = L.dense(transitioned_reshaped, 512, activation=selu)
        
    with tf.variable_scope('decoder'):
        decoded = decoder(decoder_input)
        
    l2_weight = .01
    l2_term = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name])
    
    videos_reshaped = tf.reshape(video_outputs, (-1, 3, 60, 80))

    loss = tf.reduce_mean(tf.pow(decoded - videos_reshaped, 2))
    train_step = tf.train.AdamOptimizer().minimize(loss + l2_weight*l2_term)
    saver = tf.train.Saver()
    init_global = tf.global_variables_initializer()
    init_local = tf.local_variables_initializer()
    
    coord = tf.train.Coordinator()
    
    with tf.Session() as sesh:
        sesh.run(init_global)
        sesh.run(init_local)
        threads = tf.train.start_queue_runners(sess=sesh, coord=coord)

        losses = []
        try:
            step = 0
            print(np.sum([np.prod(v.shape.as_list()) for v in tf.trainable_variables()]) / 1000000)

            while not coord.should_stop():
                _, loss_value = sesh.run([train_step, loss])
                losses.append(loss_value)
                if step % 500 == 0:
                    print('Step {} loss: {:.4f}'.format(step, loss_value))
                
                step += 1
                
        except tf.errors.OutOfRangeError:
            print('Done; loss: {:.4f}'.format(step, loss_value))
            
        finally:
            coord.request_stop()
            
        coord.join(threads)

In [None]:
run_training(2, 1)

[None, 60, 80, 3]
[127, 48, 68, 32]
104448
(127, 104448)
108.147715
Step 0 loss: 4770.2798
Step 500 loss: 2056.1392
Step 1000 loss: 1613.0112
Step 1500 loss: 3014.6636
Step 2000 loss: 2132.7944


In [None]:
def render_movie(frame_array, output_file, fps, max_pixel=256):
    n_frames = frame_array.shape[0]
    clipped_frames = np.minimum(np.maximum(frame_array, 0), max_pixel)
    clip = mpe.ImageSequenceClip(list(clipped_frames), fps=fps)
    clip.write_videofile(output_file)
    return None