# SignGAN

# Importing the Libraries

In [1]:
import tensorflow as tf
import numpy as np
import glob
import os

from bert_utils import Bert
from utils.video import Video
from utils.conv_attention import *
from utils.generator import *
from utils.discriminator import *
from utils.losses import *

In [2]:
# Session configuration with allow_growth enabled on the GPU options,
# built in a single expression and handed to a TF1-compat session.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
sess = tf.compat.v1.Session(config=config)

# Pretrained BERT Model
Multilingual Cased BERT is used

In [3]:
# Smoke test: run the BERT wrapper on a single sentence and show the shapes
# of the two embeddings it returns.
sample_sentences = ['sonst wechselhaft mit schauern und gewittern die uns auch am wochenende begleiten']

bert = Bert()
word_embeddings, sentence_embeddings = bert.predict(sample_sentences)
print(word_embeddings.shape, sentence_embeddings.shape)

Done loading 196 BERT weights from: models/multi_cased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x000002ED2903A888> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
(1, 64, 768) (1, 768)


# Global Variables

In [3]:
# Global configuration shared by the cells below.
# Derived values are computed from the primary ones so they cannot drift apart.
T = 16                                      # frames per clip (chosen value)
MAX_VIDEO_LENGTH = 512                      # 475 is the longest
num_clips = MAX_VIDEO_LENGTH // T           # 32 clips of T frames each
FRAME_DIM = (64, 64, 3)                     # presumably (height, width, channels) -- confirm against utils.video
VIDEO_DIM = (MAX_VIDEO_LENGTH, *FRAME_DIM)  # (512, 64, 64, 3)
data_dir = 'phoenix-2014-T.v3/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/'

# Video Object
with example from training set

In [4]:
video_obj = Video(T, MAX_VIDEO_LENGTH, FRAME_DIM, VIDEO_DIM, data_dir)


def _load_clips(split, name):
    """Load one video, preprocess it and split it with `divide_sequence`.

    Prints the shape after preprocessing and again after splitting.

    Args:
        split: dataset split passed to `video_obj.get_video` (e.g. 'train').
        name: video identifier passed to `video_obj.get_video`.

    Returns:
        (clips, sequence_length): the output of `divide_sequence`, and the
        size of axis 0 of the video as loaded, before preprocessing.
    """
    video = video_obj.get_video(split, name)
    sequence_length = video.shape[0]

    video = video_obj.preprocess_video(video)
    print(video.shape)

    video = video_obj.divide_sequence(video)
    print(video.shape)

    return video, sequence_length


video_real, real_sequence_length = _load_clips('train', '05January_2010_Tuesday_tagesschau-2664')
video_wrong, wrong_sequence_length = _load_clips('train', '03June_2011_Friday_tagesschau-7649')

# Kept for backward compatibility: the original cell left this name holding
# the length of the last video loaded (the "wrong" one).
current_sequence_length = wrong_sequence_length

(640, 64, 64, 3)
(40, 16, 64, 64, 3)


'\npad_mask = video_obj.padding_mask(current_sequence_length)\nprint(pad_mask.shape)\n\nlook_mask = video_obj.look_ahead_mask()\nprint(look_mask.shape)\n'

# Attention
* First, perform convolutions across the whole video, then go for separable self attention for the whole video masked, i.e, (40, 40T, H, W, C), outputs (40, 40T, H, W, C), where T=16 and 40=640//T. 640 being the length of the whole video.
* Convolution operations will be restricted to only T frames at a time. There will be no intermingling of 2 or more sets of T frames. Thus 3D CONV will take care of extracting local, temporal and spatial features only. Hack : Batched
* After the conv operations there will be MAX_SEQ_LENGTH // T i.e, 40 (here) elements each of size (T, H, W, C), making the output of convolution ops, to be of dim -> (40, T, H, W, C) converted to (40T, H, W, C). This will be passed through attention blocks and outputs (40, 40T, H, W, C) along with masking.
* This attention performs masked attention. We will have MAX_SEQ_LENGTH // T i.e, 40 (here) masks in total each for time, height, width. Each mask being of shape (40, 40T, H, W, H*W) for time, W*T for height and H*T for width.
* Only after the whole video is generated are the losses calculated, and backpropped.
* During testing, only the "start" token will be provided and the rest of the sequence will be just padding. Each time the generator produces T frames, those T frames will be concatenated with the "start" token and then convolved again to produce the next T frames.

### Attention mechanism will require residual connections otherwise gradients will vanish

## Word Frame Attention
* Last dimension of both masked_attention_output and semantic_word_matrix must be same
* The 2nd-last dimension, i.e. the 1st dimension of semantic_word_matrix, should be equal to H*W of the masked-separable-self-attention output
* that is, we bring both to a common semantic space using conv for frames and dense for embeddings

In [4]:
attention = Attention(channel_dimension=3)

num_clips = 32
t, h, w, c = 16, 8, 8, 3

x = tf.random.normal(shape=(t, h, w, c), mean=0.5, stddev=0.5)

# this step is done only before the 1st attention block
# Repeat along a NEW leading axis so that each of the num_clips rows is a full
# copy of x. Repeating along the existing axis 0 and then reshaping would
# instead interleave repeated frames (f0, f0, ..., f1, f1, ...), so no row
# would equal x. The result shape is (num_clips, t, h, w, c).
x = tf.repeat(x[tf.newaxis, ...], repeats=num_clips, axis=0)

# One look-ahead mask per attention axis; the last dimension differs per axis.
mask_t = look_ahead_mask(num_clips, (t, h, w, h*w))
mask_h = look_ahead_mask(num_clips, (t, h, w, t*w))
mask_w = look_ahead_mask(num_clips, (t, h, w, t*h))

# Drop size-1 dimensions from the word embeddings computed in the BERT cell.
word_embeddings = tf.squeeze(word_embeddings)

print(x.shape)

(4, 16, 8, 8, 3)


# Conv-Attention Block
* The input (640, 64, 64, 3) video will be explicitly split into (40, 16, 64, 64, 3).
* HACK : This reshaped input will be fed to 3D conv block with batch size being 40(hack), for local spatial and temporal feature extraction
* Conv Block output will be (40, 16, 8, 8, 64), which will be reshaped to (640, 8, 8, 64) and sent forward for masked-separable-self-attention followed by word-frame-attention for a few number of times (Attention Block), with addition and layer normalisation after each attention block
* Conv Block : 
    * Format : num_filters, kernel, strides, padding
    * {8, (3, 3, 3), (2, 2, 2), same} -> {16, (3, 3, 3), (2, 2, 2), same} -> {32, (3, 3, 3), (2, 2, 2), same} -> {out_channels(64), (3, 3, 3), (2, 2, 2), same}

In [5]:
# ConvAttn demo on a reduced number of clips.
num_clips = 4   # instead of 32
t, h, w, c = 16, 64, 64, 3
clip_batch_shape = (num_clips, t, h, w, c)
print(f"Overall Video :  {(num_clips * t, h, w, c)}")
print(f"Input Video Shape :  {clip_batch_shape}")

x = tf.random.normal(shape=clip_batch_shape, mean=0.5, stddev=0.5)
word_embeddings = tf.squeeze(word_embeddings)

# Run the random clips and the word embeddings through the block.
conv_attn = ConvAttn(num_attention_blocks=4)
x = conv_attn(x, word_embeddings)
print(f"Attention Output Shape :  {x.shape}")

Overall Video :  (64, 64, 64, 3)
Input Video Shape :  (4, 16, 64, 64, 3)
Attention Output Shape :  (4, 8, 8, 8, 64)


# Conditional GAN (3D)
* The conditions generated by the ConvAttn block will be used along with upscaled randomness to generate a video with T frames for each batch of size num_clips(40)
* use LayerNorm instead of BatchNorm, because BatchNorm outputs NaN because of the padding and masking
* In the future if this fails, even this block may have attention blocks in the intermediate layers
* Upsampling z : 
* Concatenate upsampled z with conv_attn_output
* upsample to produce the video
* this 'z' will be upscaled to num_clips x (num_clips x t, h, w, channels) and concatenated with conv_attn_output, along the channel dimension, which together will be upscaled using deconv to produce a 40 x (16, 64, 64, 3) video. Hack : Use 40 as batch size throughout
* We will feed 'z' from outside. If it is inside it'll stay constant and won't be random

In [5]:
# Feed fresh noise plus the ConvAttn output `x` through the conditional GAN.
# Note that `z` is rebound from the noise vector to the generated output.
z = tf.random.normal(shape=(1, 100))
cdcgan = CDCGAN()
z = cdcgan(z, x)
print(f"Output Shape :  {z.shape}")

# Merge the first two axes to report the shape of the whole video.
out_dims = tuple(z.shape)
print(f"Output Video Shape :  {(out_dims[0] * out_dims[1], *out_dims[2:])}")

Output Shape :  (4, 16, 64, 64, 3)
Output Video Shape :  (64, 64, 64, 3)


Thus output video is in the shape we wanted

# Full Generator


In [4]:
# End-to-end generator demo with the full clip count.
num_clips = 32   # full setting, unlike the 4 clips used in the ConvAttn demo
t, h, w, c = 16, 64, 64, 3
clip_batch_shape = (num_clips, t, h, w, c)

x = tf.random.normal(shape=clip_batch_shape, mean=0.5, stddev=0.5)
word_embeddings = tf.squeeze(word_embeddings)
z = tf.random.normal(shape=(1, 100))

# Random clips, word embeddings and noise go in; `x` is rebound to the output.
generator = Generator()
x = generator(x, word_embeddings, z)
print(x.shape)

(64, 768)
Conv1 Out Shape :  (32, 16, 64, 64, 8)
Conv2 Out Shape :  (32, 8, 32, 32, 16)
Conv3 Out Shape :  (32, 4, 16, 16, 32)
Conv4 Out Shape :  (32, 2, 8, 8, 64)
(32, 8, 8, 8, 256) (32, 8, 8, 8, 512)
(32, 16, 64, 64, 3)


# Discriminators
* Batched
* For discriminators we also consider the batch dimension
* Before the video goes into the discriminator we have to reshape the video to (1, num_clips * t, h, w, c), 1 -> batch_size

# Video Discriminator
* Gotta setup proper input pipelines for discriminator training
* 64, 64, 64, 3 -> 32, 64, 64, 16 -> 16, 64, 64, 32 -> 8, 32, 32, 64 -> 4, 16, 16, 128 -> 2, 8, 8, 256 -> 1, 4, 4, 512

In [3]:
# Video discriminator demo; the batch dimension is included here.
batch_size = 2
v = tf.random.normal((batch_size, 32 * 16, 64, 64, 3))
s = tf.random.normal((batch_size, 768))

video_disc = VideoDiscriminator()
vid_disc_out = video_disc(v, s)
print(vid_disc_out)     # one output per batch element (batch_size = 2)

tf.Tensor(
[[0.45877308]
 [0.6192273 ]], shape=(2, 1), dtype=float32)


# Frame Discriminator
* 2-fold
* Outputs "{0 ... 1}" for single frame level
* Outputs temporal (difference between 2 consecutive frames in euclidean norm, for each pair) downscaled as output, i.e, 1 number as output per pair of consecutive frames
* One part of both the discriminators are shared

In [4]:
# Total frames kept 8 instead of 640 because of ResourceExhaustError
v = tf.random.normal((2, 8, 64, 64, 3))    # kept batch_size = 2 here
s = tf.random.normal((2, 768))

frame_disc = FrameDiscriminator()
frame_disc_out, motion_disc_out = frame_disc(v, s)

print(frame_disc_out.shape, motion_disc_out.shape)
# Thus 2 outputs of each frame and motion disc, for batch_size=2

(2, 8) (2, 7)


# Full Discriminator

In [None]:
# Full discriminator demo: one forward pass, then the layer summary.
discriminator = Discriminator()

v = tf.random.normal((1, 32 * 16, 64, 64, 3))   # batch dimension considered 
s = tf.random.normal((1, 768))                  # keeping batch_size = 1

video_disc_out, frame_disc_out, motion_disc_out = discriminator(v, s)
print(video_disc_out.shape, frame_disc_out.shape, motion_disc_out.shape)

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
discriminator.summary()

# Losses
* Matching Aware Losses
* Output of motion discriminator is a bit high in value (though only used for the generator)
* Using Scheme 2 of Microsoft

In [6]:
# Doing with the same ones, but won't be the case
vid_loss = video_loss(vid_disc_out, vid_disc_out, vid_disc_out)
print(vid_loss)
fr_loss = frame_loss(frame_disc_out, frame_disc_out, frame_disc_out)
print(fr_loss)
mot_loss = motion_loss(motion_disc_out, motion_disc_out, motion_disc_out)
print(mot_loss)

tf.Tensor(0.7362369, shape=(), dtype=float32)
tf.Tensor(0.7079885, shape=(), dtype=float32)
tf.Tensor(0.68915033, shape=(), dtype=float32)


In [11]:
# Combine the three partial losses for the discriminator, then compute the
# generator loss from the raw discriminator outputs.
disc_loss_value = discriminator_loss(vid_loss, fr_loss, mot_loss)
print(disc_loss_value)

gen_loss_value = generator_loss(vid_disc_out, frame_disc_out, motion_disc_out)
print(gen_loss_value)

tf.Tensor(0.67633134, shape=(), dtype=float32)
tf.Tensor(0.20451142, shape=(), dtype=float32)


# Train Step

In [1]:
# Takes correct video, wrong video and word embeddings, sentence
# all of this must be preprocessed (padded and stuff)
# Video must be explicitly divided into T frames
def train_step(video_real, video_wrong, w, s):
    """Run one adversarial update of the generator and the discriminator.

    Uses the module-level `generator`, `discriminator`, `generator_optimizer`
    and `discriminator_optimizer`, which must exist before this is called.

    Args:
        video_real: the correct video, already split into clips; its shape is
            unpacked as (num_clips, t, height, width, channels).
        video_wrong: a mismatched video; it is reshaped with the dimensions
            of `video_real`, so it must hold the same number of elements.
        w: word embeddings; squeezed and passed to the generator.
        s: sentence embedding; passed to the discriminator.

    Returns:
        None. The gradients are applied to both models in place.
    """
    # Unpack into names that do NOT collide with the `w` parameter. The
    # original unpacked into `w`, which replaced the word embeddings with the
    # frame width before they reached the generator.
    num_clips, t, height, width, channels = video_real.shape

    word_emb = tf.squeeze(w)
    z = tf.random.normal(shape=(1, 100))

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        video_fake = generator(video_real, word_emb, z)

        # All frames put together with bs = 1
        flat_shape = (1, num_clips * t, height, width, channels)
        real_flat = tf.reshape(video_real, flat_shape)
        wrong_flat = tf.reshape(video_wrong, flat_shape)
        fake_flat = tf.reshape(video_fake, flat_shape)

        # Discriminator out
        disc_video_real, disc_frame_real, disc_motion_real = discriminator(real_flat, s)
        disc_video_wrong, disc_frame_wrong, disc_motion_wrong = discriminator(wrong_flat, s)
        disc_video_fake, disc_frame_fake, disc_motion_fake = discriminator(fake_flat, s)

        # Losses
        total_video_loss = video_loss(disc_video_real, disc_video_wrong, disc_video_fake)
        total_frame_loss = frame_loss(disc_frame_real, disc_frame_wrong, disc_frame_fake)
        total_motion_loss = motion_loss(disc_motion_real, disc_motion_wrong, disc_motion_fake)

        disc_loss = discriminator_loss(total_video_loss, total_frame_loss, total_motion_loss)
        gen_loss = generator_loss(disc_video_fake, disc_frame_fake, disc_motion_fake)

    # Each tape is non-persistent and is queried exactly once.
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))


# Optimizers and Checkpoint

In [None]:
# Fresh model instances for training; train_step reads these module-level names.
generator, discriminator = Generator(), Discriminator()

In [None]:
# One Adam optimizer per network, sharing a single learning rate;
# every other Adam hyper-parameter is left at its default.
learning_rate = 0.0002  # Following microsoft
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [None]:
# Checkpoint object tracking both models and both optimizers.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

tracked_objects = {
    "generator_optimizer": generator_optimizer,
    "discriminator_optimizer": discriminator_optimizer,
    "generator": generator,
    "discriminator": discriminator,
}
checkpoint = tf.train.Checkpoint(**tracked_objects)

In [None]:
# Single training step on the first 8 clips of the real and the wrong video.
clips_per_step = 8
train_step(
    video_real[:clips_per_step],
    video_wrong[:clips_per_step],
    word_embeddings,
    sentence_embeddings,
)