# CRAFT

## Speech to animation using CRAFT Model


### Load

Load the dataset, its annotations, and the train/val/test split JSON files.

In [1]:
import numpy as np
import json
from pprint import pprint


def load():
    """Load the annotations and the train/val/test split from JSON files.

    Reads ``flintstones_annotations_v1-0.json`` and
    ``train-val-test_split.json`` from the current working directory and
    prints a progress message for each file.

    Returns:
        A ``(flintstones_annotations, train_val_test_split)`` tuple holding
        the parsed contents of the two files, in that order.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    print("Loading annotations .... ", end='')
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open('flintstones_annotations_v1-0.json', encoding='utf-8') as annotations:
        flintstones_annotations = json.load(annotations)
    print("done")

    print("Loading test train val split files .... ", end='')
    with open('train-val-test_split.json', encoding='utf-8') as split:
        train_val_test_split = json.load(split)
    print("done")

    # The return statement has to be indented into the function body;
    # at column 0 it is a "return outside function" syntax error.
    return flintstones_annotations, train_val_test_split

def trainData(train_val_test_split):
    """Load every array listed under the ``'train'`` key of the split.

    Args:
        train_val_test_split: mapping whose ``'train'`` entry is an iterable
            of file names (without the ``.npy`` extension).

    Returns:
        A NumPy array built from the loaded arrays, in listing order.
    """
    clips = [
        np.load('flintstones_dataset/video_frames/' + clip_id + '.npy')
        for clip_id in train_val_test_split['train']
    ]
    return np.array(clips)

def valData(train_val_test_split):
    """Load every array listed under the ``'val'`` key of the split.

    Args:
        train_val_test_split: mapping whose ``'val'`` entry is an iterable
            of file names (without the ``.npy`` extension).

    Returns:
        A NumPy array built from the loaded arrays, in listing order.
    """
    clips = [
        np.load('flintstones_dataset/video_frames/' + clip_id + '.npy')
        for clip_id in train_val_test_split['val']
    ]
    return np.array(clips)

def testData(train_val_test_split):
    test = []
    for file in train_val_test_split['test']:
        video = np.load('flintstones_dataset/video_frames/'+file+'.npy')
        test.append(video)
    return np.array(test)

def getVideo(name):
    """Load and return the array stored for a single clip.

    Args:
        name: file name (without the ``.npy`` extension) inside
            ``flintstones_dataset/video_frames/``.

    Returns:
        The array read by ``np.load`` from that file.
    """
    path = 'flintstones_dataset/video_frames/' + name + '.npy'
    return np.load(path)

In [10]:
import tensorflow as tf
import keras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

MAX_LEN = 75  # Length (in tokens) that every word/tag sequence is padded to via pad_sequences(maxlen=...)

def preProcessData():
    """Turn the annotation POS tags into padded index sequences and split them.

    Calls ``load()`` and reads the ``(word, tag)`` pairs stored under
    ``["parse"]["pos_tags"]`` of every annotation. Words and tags are mapped
    to integer indices (``PAD`` is 0 for both, ``UNK`` is 1 for words), each
    sentence is padded to ``MAX_LEN``, tags are one-hot encoded, and the
    result is split 90/10 into train and test parts.

    Returns:
        A 7-tuple ``(X_tr, X_te, y_tr, y_te, n_words, n_tags, idx2tag)``:
        the padded word-index sequences and one-hot tag sequences for the
        train and test parts, the number of distinct words, the number of
        distinct tags, and the mapping from tag index back to tag string.
    """
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    from sklearn.model_selection import train_test_split

    # Only the annotations are needed here; the split file is ignored.
    annotation, _ = load()

    sentences = []
    sentences_tag = []
    for entry in annotation:
        pos_tags = entry["parse"]["pos_tags"]
        sentences.append([pair[0] for pair in pos_tags])
        sentences_tag.append([pair[1] for pair in pos_tags])

    # Sort the vocabularies: plain set iteration order changes between
    # interpreter runs, which would make the index mappings (and therefore
    # any saved model) irreproducible.
    words = sorted({word for sentence in sentences for word in sentence})
    tags = sorted({tag for sentence in sentences_tag for tag in sentence})

    word2idx = {w: i + 2 for i, w in enumerate(words)}
    word2idx["UNK"] = 1 # Unknown words
    word2idx["PAD"] = 0 # Padding

    tag2idx = {t: i + 1 for i, t in enumerate(tags)}
    tag2idx["PAD"] = 0

    idx2tag = {i: t for t, i in tag2idx.items()}

    # Debug print of one vocabulary index; fall back to UNK instead of
    # raising KeyError when the word is absent from the loaded annotations.
    print("Barney walks into the dining room and takes an apple out of a pig's mouth. The pig wakes up and speaks to him.: {}".format(word2idx.get("Barney", word2idx["UNK"])))

    # Convert each sentence from a list of tokens to a list of word indices.
    X = [[word2idx[word] for word in sentence] for sentence in sentences]
    X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])

    # Same for the tags.
    y = [[tag2idx[tag] for tag in sentence] for sentence in sentences_tag]
    y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["PAD"])
    print(len(tags))

    # One-hot encode; the extra class is PAD.
    y = [to_categorical(row, num_classes=len(tags) + 1) for row in y]

    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
    return X_tr, X_te, y_tr, y_te, len(words), len(tags), idx2tag

In [11]:
# %load lstm.py
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
import numpy as np

BATCH_SIZE = 512  # Number of examples used in each iteration
EPOCHS = 1  # Number of passes through entire dataset
MAX_LEN = 75  # Sequence length of the model input (tokens per padded sentence)
EMBEDDING = 40  # Dimension of word embedding vector


def lstm():
    """Build, compile and summarise the BiLSTM-CRF tagging model.

    Reads the module-level ``n_words``, ``n_tags``, ``MAX_LEN`` and
    ``EMBEDDING`` values, so those must be defined before this is called.
    Prints the model summary as a side effect.

    Returns:
        The compiled Keras ``Model``.
    """
    tokens = Input(shape=(MAX_LEN,))

    # Embedding table has n_words + 2 rows: the vocabulary plus PAD and UNK.
    embedded = Embedding(input_dim=n_words+2, output_dim=EMBEDDING,
                         input_length=MAX_LEN, mask_zero=True)(tokens)

    # Bidirectional LSTM with recurrent dropout, one output per time step.
    encoded = Bidirectional(LSTM(units=100, return_sequences=True,
                                 recurrent_dropout=0.1))(embedded)

    # Per-time-step dense layer in front of the CRF.
    hidden = TimeDistributed(Dense(50, activation="relu"))(encoded)

    # CRF output layer with n_tags + 1 classes (the extra one is PAD).
    crf = CRF(n_tags+1)
    scores = crf(hidden)

    tagger = Model(tokens, scores)
    tagger.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    tagger.summary()
    return tagger

# Prepare the data first: lstm() reads n_words and n_tags as globals.
X_tr, X_te, y_tr, y_te, n_words, n_tags, idx2tag = preProcessData()

# Build the tagger and train it, holding out 10% of the training part
# for validation.
model = lstm()
history = model.fit(
    x=X_tr,
    y=np.array(y_tr),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=2,
)

Loading annotations .... done
Loading test train val split files .... done
Barney walks into the dining room and takes an apple out of a pig's mouth. The pig wakes up and speaks to him.: 1135
42


In [None]:
# Eval: predict tags for the held-out part and print a per-tag report.
pred_cat = model.predict(X_te)
pred = np.argmax(pred_cat, axis=-1)
y_te_true = np.argmax(y_te, -1)


from sklearn_crfsuite.metrics import flat_classification_report

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_te_true_tag = [[idx2tag[i] for i in row] for row in y_te_true]

report = flat_classification_report(y_pred=pred_tag, y_true=y_te_true_tag)
print(report)



# Show true vs. predicted tags for one randomly chosen test sentence.
i = np.random.randint(0,X_te.shape[0]) # choose a random number between 0 and len(X_te)
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_te[i], -1)

print("Sample number {} of {} (Test Set)".format(i, X_te.shape[0]))
# Visualization
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
# The loop variable is named pred_idx so it does not overwrite the `pred`
# array of test-set predictions computed above.
for w, t, pred_idx in zip(X_te[i], true, p[0]):
    if w != 0:  # index 0 is padding
        # NOTE(review): `words` is a local of preProcessData() and is not
        # returned, so it is undefined here unless the notebook defines it
        # elsewhere. An index-to-word mapping has to be exposed to this cell.
        print("{:15}: {:5} {}".format(words[w-2], idx2tag[t], idx2tag[pred_idx]))

### CNN Backbone

In [None]:
from keras.models import Sequential, Model
from keras.layers import *

def cnnModel(F):
    """Build the fully-convolutional backbone.

    Args:
        F: multiplier for the input channel count; the model input has
            shape ``(128, 128, 3*F)``.

    Returns:
        An uncompiled Keras ``Model`` mapping that input to the output of a
        final 1x1 convolution with 2 filters.
    """
    frames = Input((128, 128, 3*F))

    # Two stride-2 3x3 convolutions.
    x = Conv2D(64, kernel_size=(3, 3), strides=(2, 2), dilation_rate=(1, 1),
               padding='same', activation='relu')(frames)
    x = Conv2D(128, kernel_size=(3, 3), strides=(2, 2), dilation_rate=(1, 1),
               padding='same', activation='relu')(x)

    # Two stride-1 3x3 convolutions with dilation 2 and 4.
    x = Conv2D(256, kernel_size=(3, 3), strides=(1, 1), dilation_rate=(2, 2),
               padding='same', activation='relu')(x)
    x = Conv2D(512, kernel_size=(3, 3), strides=(1, 1), dilation_rate=(4, 4),
               padding='same', activation='relu')(x)

    # 1x1 convolutions reducing the channel count to 100 and then 2.
    x = Conv2D(100, kernel_size=(1, 1), padding='same', activation='relu')(x)
    x = Conv2D(2, kernel_size=(1, 1), padding='same', activation='relu')(x)

    return Model(inputs=frames, outputs=x)

In [None]:
# %load layoutcomposer.py
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils import plot_model
from keras import backend as K
from cnn import *
import numpy as np

F = 8  # Passed to cnnModel(F) and used as the filter count of the location head; presumably frames per clip — TODO confirm

def bilinear_kernel(h, w, channels, use_bias = True, dtype = "float32") :
    """Return initial weights for a per-channel ``h`` x ``w`` averaging conv.

    Every spatial tap of the kernel holds ``identity(channels) / (h * w)``,
    so channel ``c`` of the output only reads channel ``c`` of the input and
    all taps carry the same weight.

    Args:
        h: kernel height.
        w: kernel width.
        channels: number of input channels, which is also the number of
            output channels.
        use_bias: when true, a zero bias vector is appended to the result.
        dtype: NumPy dtype of the returned arrays.

    Returns:
        ``[kernel]`` or ``[kernel, bias]``. ``kernel`` has shape
        ``(h, w, channels, channels)`` and ``bias`` has shape ``(channels,)``.
    """
    kernel = np.empty((h, w, channels, channels), dtype = dtype)
    # Broadcast the scaled identity over every (row, column) position.
    kernel[...] = np.identity(channels) / float(h * w)
    if not use_bias:
        return [kernel]
    # One bias entry per output channel. A fixed length-1 bias only matches
    # the kernel when channels == 1.
    return [kernel, np.zeros((channels,), dtype = dtype)]


def channelPool(x):
    """Return the maximum of ``x`` taken over its last axis."""
    pooled = K.max(x, axis=-1)
    return pooled

def createModel():
    """Build the (work-in-progress) layout composer model.

    Creates the backbone with ``cnnModel(F)``, stacks a 1x1-convolution
    location head on an intermediate backbone layer, and returns a model
    whose output is the channel-wise max of that head. The model summary is
    printed and a diagram is written to ``LAYOUTCOMPOSER.png``.

    Returns:
        The uncompiled Keras ``Model``.
    """


    cnn = cnnModel(F)
    # FULLY CONV. LOCATION MLP
    # Reads the backbone at layer index 4 rather than at its final output.
    # NOTE(review): with the cnnModel shown in this notebook that would be
    # the 512-filter convolution (index 0 being the input layer) — confirm.
    Ploc1 = Conv2D(256, kernel_size=(1, 1), padding='same', activation='relu')(cnn.layers[4].output) 
    Ploc2 = Conv2D(128, kernel_size=(1, 1), padding='same', activation='relu')(Ploc1)
    Ploc3 = Conv2D(128, kernel_size=(1, 1), padding='same', activation='relu')(Ploc2)
    Ploc4 = Conv2D(F, kernel_size=(1, 1), padding='same', activation='relu')(Ploc3)

    # UPSAMPLING FOR LOCATION OUTPUT
    #Ploc = UpSampling2D(size=(2, 2), data_format=None, interpolation='bilinear')

    #TODO
    # 4x4 stride-1 convolution initialised from bilinear_kernel().
    # NOTE(review): Ploc is not connected to the returned model, and with
    # stride 1 this layer does not change the spatial size — confirm intent.
    Ploc = Conv2D(filters = F, kernel_size = (4, 4), strides=(1,1), 
        activation = 'softmax', padding = 'same', use_bias = False,
        weights = bilinear_kernel(4, 4, F, False))(Ploc4)
    #print(CNN5.shape)
    # CHANNEL MAXPOOLING AND MERGE WITH CNN
    #TODO
    Max = Lambda(channelPool)(Ploc4)
    # NOTE(review): the functional `average` from keras.layers is documented
    # as taking one list of tensors; here two positional arguments are
    # passed, and the two tensors appear to differ in rank. Avg is also
    # unused below. Verify before relying on this line.
    Avg = average(cnn.output, Max)
    # SCALE MLP
    #TODO
    #mu1 = Dense(256, activation='relu')(attention)
    #mu2 = Dense(128, activation='relu')(mu1)
    #mu3 = Dense(2*F, activation='sigmoid')(mu2)

    # MODEL SUMMARY
    # Only the channel-max tensor is exposed as the model output.
    model = Model(inputs=cnn.input, outputs=Max)
    model.summary()
    plot_model(model, to_file='LAYOUTCOMPOSER.png', show_shapes=True)
    return model

# Build the model once when this cell runs (prints the summary, writes the PNG).
createModel()


In [None]:
# %load entityretriever.py
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils import plot_model
import numpy as np
from cnn import *

F = 8  # Number of per-frame backbones built by EmbeddingNetwork; presumably frames per clip — TODO confirm

# Returns the model of the query/target embedding network
# Input: [l1,V1,l2,V2.........lF,VF]
# Output: query/target embeddings

def EmbeddingNetwork(Entity, Query):
    """Build the (work-in-progress) query/target embedding network.

    Creates ``F`` independent ``cnnModel(1)`` backbones. For each one, the
    model inputs get a fresh ``(128, 128, 1)`` input followed by the
    backbone's own input, and the model outputs get the backbone output.
    Pooling, the Bi-LSTM, the text concatenation, the query MLP and the L2
    normalisation are still TODO, so the returned model currently exposes
    the raw backbone outputs. The summary is printed and a diagram is
    written to ``QUERYEMBEDDING.png`` or ``TARGETEMBEDDING.png``.

    Args:
        Entity: selects ROI pooling (true) or global pooling (false) once
            pooling is implemented; currently has no effect.
        Query: true for the query network, false for the target network.
            Currently only selects the diagram file name.

    Returns:
        The uncompiled Keras ``Model``.
    """
    # Query/target embedding CNNs, one per frame.
    cnn = [cnnModel(1) for _ in range(F)]

    # Model inputs, interleaved as [l1, V1, l2, V2, ...], and CNN outputs.
    # NOTE(review): the (128, 128, 1) inputs are not connected to any layer
    # yet — confirm how they are meant to be consumed.
    In = []
    CNN = []
    for backbone in cnn:
        In.append(Input((128, 128, 1)))
        In.append(backbone.input)
        CNN.append(backbone.output)

    # ROI/Global Pooling
    # Each branch needs a statement: a body holding only comments is a
    # syntax error.
    if Entity:
        pass  # TODO: ROI pooling
    else:
        pass  # TODO: global pooling

    # TODO: query video Bi-LSTM

    # TODO: concatenate the video LSTM and text LSTM outputs. Until that
    # exists there is nothing to feed the query MLP, so it is skipped
    # instead of failing on an undefined name.
    concat = None

    # Query MLP
    if Query and concat is not None:
        MLP = Dense(256, activation='relu')(concat)
        MLP = Dense(128, activation='relu')(MLP)

    # TODO: L2 normalize

    model = Model(inputs=In, outputs=CNN)
    model.summary()
    if Query:
        plot_model(model, to_file='QUERYEMBEDDING.png', show_shapes=True)
    else:
        plot_model(model, to_file='TARGETEMBEDDING.png', show_shapes=True)
    return model

# Returns Model of Entity/Background Retriever
# Input: [l1,V1,l2,V2, ... ,lF,VF, l1,V1,l2,V2, ... ,lF,VF]
# OUTPUT: [q . r]

def Retriever(Entity):
    """Build the (work-in-progress) entity or background retriever.

    Creates one query and one target network with ``EmbeddingNetwork`` and
    wraps them in a single model whose inputs are the query inputs followed
    by the target inputs. The summary is printed and a diagram is written to
    ``ENTITYRETRIEVER.png`` or ``BACKGROUNDRETRIEVER.png``.

    Args:
        Entity: true for the entity retriever, false for the background
            retriever; forwarded to both embedding networks.

    Returns:
        The uncompiled Keras ``Model``.
    """
    # Both networks come from EmbeddingNetwork; only the Query flag differs.
    query = EmbeddingNetwork(Entity, True)
    target = EmbeddingNetwork(Entity, False)

    # TODO: the intended output is the similarity q . r. Until the embedding
    # heads exist, expose the outputs of both networks side by side.
    model = Model(inputs=query.inputs + target.inputs,
                  outputs=query.outputs + target.outputs)
    model.summary()
    if Entity:
        plot_model(model, to_file='ENTITYRETRIEVER.png', show_shapes=True)
    else:
        plot_model(model, to_file='BACKGROUNDRETRIEVER.png', show_shapes=True)
    return model

