In [1]:
from data import load_data
import matplotlib.pyplot as plt
import os
import numpy as np

import time

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.recurrent import GRU
from keras.layers import Input
from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding3D
from keras.layers.core import Lambda, Dropout, Flatten, Dense, Activation
from keras.optimizers import Adam
from keras import backend as K

  (fname, cnt))
  (fname, cnt))
Using TensorFlow backend.


In [2]:
# Absolute location of the project's `preprocessing` directory.
# NOTE(review): this is a hard-coded, machine-specific path; it has to be
# edited before the notebook can run on another machine.
CURRENT_PATH = '/home/ubuntu/assignments/machine-lip-reading/preprocessing'
# Data directory, addressed relative to CURRENT_PATH (its sibling `data` folder).
DATA_PATH = CURRENT_PATH + '/../data'

In [3]:
def ctc_lambda_func(args):
    """Compute the per-sample CTC loss inside a Keras Lambda layer.

    Args:
        args: sequence `(y_pred, labels, input_length, label_length)`.
            `y_pred` is the batch-major softmax output of the network,
            `labels` the dense (padded) label matrix, and the two length
            tensors carry one length per sample with a trailing axis of
            size 1 (they come from `Input(shape=[1])` layers in `build_model`).

    Returns:
        The tensor returned by `tf.nn.ctc_loss` (one loss value per sample).
    """
    import tensorflow as tf
    y_pred, labels, input_length, label_length = args

    # Squeeze only the trailing axis: a bare tf.squeeze() would also drop the
    # batch axis when the batch holds a single sample, yielding a scalar
    # instead of a length vector.
    label_length = K.cast(tf.squeeze(label_length, axis=-1), 'int32')
    input_length = K.cast(tf.squeeze(input_length, axis=-1), 'int32')

    # Padding entries beyond label_length are dropped by the sparse conversion.
    sparse_labels = K.ctc_label_dense_to_sparse(labels, label_length)

    # tf.nn.ctc_loss applies softmax to its inputs itself, so it must not be
    # fed probabilities directly. Feeding log-probabilities makes its internal
    # softmax reproduce y_pred (same approach as Keras' ctc_batch_cost).
    log_probs = K.log(y_pred + K.epsilon())

    return tf.nn.ctc_loss(sparse_labels, log_probs, input_length,
                          ctc_merge_repeated=False,
                          ignore_longer_outputs_than_inputs=True,
                          time_major=False)

In [4]:
def CTC(name, args):
    """Apply `ctc_lambda_func` to `args` through a Keras Lambda layer called `name`."""
    ctc_layer = Lambda(ctc_lambda_func, output_shape=(1,), name=name)
    return ctc_layer(args)

In [5]:
def build_model(input_size, output_size = 28, max_string_len = 10):
    """Build the CTC training model: conv stack -> bidirectional GRU -> softmax -> CTC.

    Args:
        input_size: shape of a single input sample (without the batch axis).
            The layers used require four dimensions; presumably
            (frames, height, width, channels) - confirm against the caller.
        output_size: number of units of the final Dense/softmax layer.
        max_string_len: width of the padded label matrix fed to `the_labels`.

    Returns:
        A Keras `Model` with inputs `the_input`, `the_labels`, `label_length`
        and `input_length`, whose only output is the `ctc` loss layer.
        The model summary is printed as a side effect.
    """
    input_data = Input(name='the_input', shape=input_size, dtype='float32')
    x = ZeroPadding3D(padding=(0,2,2), name='padding1')(input_data)

    # Three identical conv -> max-pool -> dropout stages applied frame by frame;
    # only the number of filters differs. Each pooling layer gets its own name
    # (the original reused 'max1' for all three).
    for stage, n_filters in enumerate((32, 32, 4), start=1):
        x = TimeDistributed(Conv2D(filters=n_filters, kernel_size=5, strides=(2, 2),
                                   padding='same', activation='relu'))(x)
        x = TimeDistributed(MaxPooling2D(pool_size=(2,2), strides=None,
                                         name='max%d' % stage))(x)
        x = Dropout(0.5)(x)

    # Flatten the per-frame feature maps so the recurrent layer sees one
    # feature vector per time step.
    input_lstm = TimeDistributed(Flatten())(x)

    x_lstm = Bidirectional(GRU(256, return_sequences=True, kernel_initializer='Orthogonal', name='gru1'), merge_mode='concat')(input_lstm)
    x_lstm = Dense(output_size, kernel_initializer='he_normal', name='dense1')(x_lstm)
    y_pred = Activation('softmax', name='softmax')(x_lstm)

    # Extra inputs needed only to compute the CTC loss inside the graph.
    labels = Input(name='the_labels', shape = [max_string_len], dtype='int32')
    input_length = Input(name = 'input_length', shape =[1], dtype = 'int32')
    label_length = Input(name = 'label_length', shape = [1], dtype = 'int32')
    loss = CTC('ctc',[y_pred, labels, input_length, label_length])
    model = Model(inputs=[input_data, labels, label_length, input_length],
                  outputs = loss)
    model.summary()

    return model

In [6]:
def pad_labels(labels, max_string_len):
    """Right-pad a 2-D label matrix with -1 columns up to `max_string_len` columns.

    The filler block is floating point, so the concatenated result is too.
    """
    n_rows, n_cols = labels.shape[0], labels.shape[1]
    filler = np.full((n_rows, max_string_len - n_cols), -1.0)
    return np.concatenate([labels, filler], axis=1)

In [7]:
def train(model, x_train, y_train, label_len_train, input_len_train, batch_size=256, epochs=100,
          val_train_ratio=0.2, max_string_len=10):
    """Compile `model` with Adam and fit it on the CTC inputs.

    Args:
        model: model built by `build_model` (its output layer is named 'ctc').
        x_train: input samples fed to `the_input`.
        y_train: 2-D label matrix; padded with -1 to `max_string_len` columns
            when its width differs.
        label_len_train: per-sample label lengths fed to `label_length`.
        input_len_train: per-sample input lengths fed to `input_length`.
        batch_size: mini-batch size passed to `fit`.
        epochs: number of epochs passed to `fit`.
        val_train_ratio: fraction of the data held out via `validation_split`.
        max_string_len: label width expected by the model's `the_labels`
            input; must match the value given to `build_model`
            (previously hard-coded to 10 here).

    Returns:
        The `History` object returned by `model.fit`.
    """
    if y_train.shape[1] != max_string_len:
        y_train = pad_labels(y_train, max_string_len)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    # The network already outputs the CTC loss, so the Keras "loss" simply
    # passes that output through; the dummy zero targets are ignored.
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    history = model.fit(x = {'the_input':x_train, 'the_labels':y_train, 'label_length':label_len_train,
                             'input_length':input_len_train}, y = {'ctc': np.zeros([x_train.shape[0]])},
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=val_train_ratio,
                        shuffle=True,
                        verbose=1)

    return history

In [8]:
def read_data():
    """Load a small sample of the dataset and one-hot encode its word labels.

    Iterates over the first four items yielded by
    `load_data(DATA_PATH, verbose=False, framebyframe=False)`, keeps those whose
    image sequence has exactly 75 frames, and one-hot encodes their words.

    Returns:
        Tuple `(x, y)`: `x` is the stack of kept image sequences, `y` the
        stack of per-sample one-hot word matrices.

    Raises:
        ValueError: if none of the inspected items has 75 frames.
    """
    oh = OneHotEncoder()
    le = LabelEncoder()

    x = list()
    y = list()
    t = list()
    print("loading images...")
    for i, (img, words) in enumerate(load_data(DATA_PATH, verbose=False, framebyframe=False)):
        if img.shape[0] == 75:
            x.append(img)
            y.append(words)
            # `words` is assumed to be a numpy array of strings (it exposes tolist()).
            t += words.tolist()

        # The cap is checked for every item, accepted or not. Previously it sat
        # after a `continue`, so a rejected item at index 3 let the loop run
        # through the whole dataset.
        if i == 3:
            break

    if not x:
        raise ValueError("no 75-frame samples found among the inspected items")

    # Fit both encoders on the full vocabulary seen across the kept samples.
    t = le.fit_transform(t)
    oh.fit(t.reshape(-1, 1))

    print("convering to np array...")
    x = np.stack(x, axis=0)

    print("transforming y...")
    for i in range(len(y)):
        y_ = le.transform(y[i])
        y[i] = np.asarray(oh.transform(y_.reshape(-1, 1)).todense())
    y = np.stack(y, axis=0)

    return x, y

In [13]:
import numpy as np
import os
from align import read_align
from video import read_video
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import re

# Absolute location of the project's `preprocessing` directory.
# NOTE(review): hard-coded, machine-specific path; edit before running elsewhere.
CURRENT_PATH = '/home/ubuntu/assignments/machine-lip-reading/preprocessing'
# Data directory, addressed relative to CURRENT_PATH.
DATA_PATH = CURRENT_PATH + '/../data'
# File path of the 68-point face landmark predictor inside CURRENT_PATH.
PREDICTOR_PATH = CURRENT_PATH + '/shape_predictor_68_face_landmarks.dat'
# Directory path under the data folder for saved numpy results.
SAVE_NUMPY_PATH = CURRENT_PATH + '/../data/numpy_results'


def text_to_labels(text):
    """Encode `text` as integer labels: 'a'..'z' -> 0..25, space -> 26.

    Any other character is silently skipped.
    """
    space_label = 26
    base = ord('a')
    encoded = []
    for ch in text:
        if ch == ' ':
            encoded.append(space_label)
        elif 'a' <= ch <= 'z':
            encoded.append(ord(ch) - base)
    return encoded

def labels_to_text(labels):
    """Decode integer labels back to text: 0..25 -> 'a'..'z', 26 -> space.

    Every other value (e.g. 27, the CTC blank, or negative padding) is dropped.
    """
    pieces = []
    for label in labels:
        if label == 26:
            pieces.append(' ')
        elif 0 <= label < 26:
            pieces.append(chr(label + ord('a')))
    return ''.join(pieces)

In [22]:
import numpy as np
import os
from align import read_align
from video import read_video
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Absolute location of the project's `preprocessing` directory.
# NOTE(review): hard-coded, machine-specific path; the commented-out line
# below derives it from __file__, which is not defined inside a notebook.
CURRENT_PATH = '/home/ubuntu/assignments/machine-lip-reading/preprocessing'
# CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
# Data directory, addressed relative to CURRENT_PATH.
DATA_PATH = CURRENT_PATH + '/../data'
# Landmark predictor file handed to read_video() by load_data().
PREDICTOR_PATH = CURRENT_PATH + '/shape_predictor_68_face_landmarks.dat'


def text_to_labels(text):
    """Map each character of `text` to its label (a-z -> 0-25, ' ' -> 26).

    Characters outside that alphabet produce no label.
    """
    return [
        26 if ch == ' ' else ord(ch) - ord('a')
        for ch in text
        if ch == ' ' or 'a' <= ch <= 'z'
    ]

def labels_to_text(labels):
    """Turn a sequence of labels into a string (0-25 -> a-z, 26 -> ' ').

    Labels outside 0..26, such as the CTC blank 27, are ignored.
    """
    first_letter = ord('a')
    chars = []
    for value in labels:
        is_letter = value >= 0 and value < 26
        if is_letter:
            chars.append(chr(value + first_letter))
        elif value == 26:
            chars.append(' ')
    return ''.join(chars)

def _save_chunk(speaker, chunk_index, clips, words, word_len_list, input_len_list,
                max_len, max_word_len, ctc_encoding, le, oh):
    """Pad one chunk of word clips and labels to common shapes and save it.

    Writes three compressed archives into the current working directory, named
    `<speaker>_x_<chunk_index>`, `<speaker>_y_<chunk_index>` and
    `<speaker>_wi_<chunk_index>` (numpy appends the `.npz` extension).
    """
    if ctc_encoding:
        # Character labels, right-padded with -1 up to the longest word of the chunk.
        labels = []
        for word in words:
            padded_word = np.ones(max_word_len) * -1
            enc = np.array(text_to_labels(word))
            padded_word[:enc.shape[0]] = enc
            labels.append(padded_word)
        y = np.stack(labels, axis=0)
    else:
        # One-hot word classes. NOTE: the encoders are refit on every chunk, so
        # class indices are only consistent within a single chunk.
        y = le.fit_transform(words)
        y = oh.fit_transform(y.reshape(-1, 1)).todense()

    # Zero-pad every clip into a fixed (max_len, 50, 100, 3) buffer.
    padded_clips = []
    for clip in clips:
        result = np.zeros((max_len, 50, 100, 3))
        result[:clip.shape[0], :clip.shape[1], :clip.shape[2], :clip.shape[3]] = clip
        padded_clips.append(result)
    x = np.stack(padded_clips, axis=0)

    print('saving numpy')
    tag = str(chunk_index)
    np.savez_compressed(str(speaker) + '_x_' + tag, x=x)
    np.savez_compressed(str(speaker) + '_y_' + tag, y=y)
    np.savez_compressed(str(speaker) + '_wi_' + tag,
                        word_length=word_len_list, input_length=input_len_list)


def load_data(datapath, speaker, verbose=True, num_samples=1000, ctc_encoding=True):
    """Cut one speaker's videos into per-word clips and save them in chunks.

    Walks `<datapath>/<speaker>`, reads every `.mpg` file together with its
    alignment file from the sibling `align` directory, and collects one clip
    per aligned word (silence markers 'sil' and 'sp' are skipped). Every
    `num_samples` clips - and once more at the end for any remainder - the
    collected clips are padded and written to disk by `_save_chunk`.

    Args:
        datapath: root data directory containing one sub-directory per speaker.
        speaker: speaker identifier; selects the sub-directory and prefixes the
            output file names.
        verbose: print progress messages when True.
        num_samples: number of word clips per saved chunk.
        ctc_encoding: when True labels are saved as -1-padded character codes,
            otherwise as one-hot encoded word classes.

    Returns:
        None. Results are written as `.npz` files in the working directory,
        numbered 0, 1, 2, ... in the order the chunks are produced.
    """
    oh = OneHotEncoder()
    le = LabelEncoder()

    counter = 0
    chunk_index = 0

    max_len = 0
    max_word_len = 0

    x = list()
    y = list()

    word_len_list = []
    input_len_list = []

    # Only walk the requested speaker's directory; walking `datapath` itself
    # would mix every speaker's videos into files named after `speaker`.
    path = os.path.join(datapath, str(speaker))
    for root, dirs, files in os.walk(path):
        for name in files:
            if '.mpg' not in name:
                continue

            video_path = os.path.join(root, name)
            if verbose is True:
                print(str(counter) + ": reading - " + video_path)

            video = read_video(video_path, PREDICTOR_PATH)
            alignments = read_align(os.path.join(root, '../align/', name.split(".")[0] + ".align"))

            for start, stop, word in alignments:
                if word == 'sil' or word == 'sp':
                    continue

                clip = video[start:stop]

                # All clips of a chunk must share their frame dimensions so
                # they can be stacked; compare against the previous clip.
                if len(x) > 0 and clip.shape[1:4] != x[-1].shape[1:4]:
                    if verbose is True:
                        print("different size, skip")
                    continue

                x.append(clip)
                y.append(word)

                max_word_len = max(max_word_len, len(word))
                max_len = max(max_len, stop-start)

                word_len_list.append(len(word))
                input_len_list.append(stop-start)

                counter += 1
                if counter % num_samples == 0:
                    # `counter % num_samples` is always 0 here, so using it as
                    # the file suffix overwrote the same files on every chunk;
                    # a running chunk index keeps each chunk.
                    _save_chunk(speaker, chunk_index, x, y, word_len_list, input_len_list,
                                max_len, max_word_len, ctc_encoding, le, oh)
                    chunk_index += 1

                    max_len = 0
                    max_word_len = 0

                    x = list()
                    y = list()

                    word_len_list = []
                    input_len_list = []

    # Flush the trailing partial chunk, which was previously dropped silently.
    if x:
        _save_chunk(speaker, chunk_index, x, y, word_len_list, input_len_list,
                    max_len, max_word_len, ctc_encoding, le, oh)
    

In [None]:
load_data(DATA_PATH, 's1')

0: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videopwaj8p.mpg
6: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videobbifzp.mpg
12: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videobras7s.mpg
18: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videosbwh8p.mpg
24: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videolwwm1s.mpg
30: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videobgbb2p.mpg
36: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videolrik4p.mpg
42: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videolgamzp.mpg
48: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videopwwq8n.mpg
54: reading - /home/ubuntu/assignments/machine-lip-reading/preprocessing/../data/s1/videolrar3a.mpg
60