## define our video data loader

In [1]:
import os
import torch
from torch.utils.data import Dataset
import numpy as np
from PIL import Image
import albumentations as A
import torchvision.transforms as transforms

# Data augmentation pipeline used as the default `aug` of the training dataset.
# It chains four A.OneOf groups (noise, blur, contrast/sharpness, colour/geometry),
# each group being given p=0.9.
# NOTE(review): the IAA* transforms are imgaug-backed and, as far as I know, are
# not shipped by recent albumentations releases — confirm the pinned version.
augmentation = A.Compose([
    # group 1: additive noise
    A.OneOf([
        A.IAAAdditiveGaussianNoise(p=0.9),
        A.GaussNoise(p=0.9),
    ], p=0.9),
    # group 2: blur variants
    A.OneOf([
        A.MotionBlur(p=0.9),
        A.MedianBlur(blur_limit=3, p=0.9),
        A.Blur(blur_limit=4, p=0.9),
    ], p=0.9),
    # group 3: local contrast / sharpening / brightness
    A.OneOf([
        A.CLAHE(clip_limit=2, p=0.9),
        A.IAASharpen(p=0.9),
        A.IAAEmboss(p=0.9),
        A.RandomBrightnessContrast(p=0.95),
    ], p=0.9),
    # group 4: colour shifts and a (low-weight, p=0.05) perspective warp
    A.OneOf(
        [
            A.HueSaturationValue(p=0.9),
            A.RandomGamma(p=0.9),
            A.IAAPerspective(p=0.05),
        ], p=0.9,
    )
])


def build_transform(shape):
    """Build the per-frame preprocessing pipeline.

    :param shape: sequence whose first two items are handed to ``Resize`` as the target size
    :return: a ``transforms.Compose`` that resizes a frame and then converts it to a tensor
    """
    target_size = (shape[0], shape[1])
    steps = [transforms.Resize(target_size), transforms.ToTensor()]
    return transforms.Compose(steps)


class VideoDataset(Dataset):
    """Dataset of video clips stored as folders of ``.jpg`` frames.

    The label of a clip is taken from its folder name: the text after the last
    ``_`` is encoded character by character with ``char_dict``, terminated with
    ``<eos>`` and padded with ``<blank>`` up to ``fixed_max_len`` entries.
    """

    def __init__(self, folder_list, char_dict,
                 fixed_frame_num=200, fixed_max_len=6,
                 image_shape=(100, 100),
                 aug=augmentation):
        """
        :param folder_list: video image folders for training or validation
        :param char_dict: mapping from character to class index; must also
            contain the ``"<eos>"`` and ``"<blank>"`` entries
        :param fixed_frame_num: number of frames returned per clip; longer
            clips are truncated, shorter ones are padded with black frames
        :param fixed_max_len: length of the encoded label, including ``<eos>``
            and ``<blank>`` padding
        :param image_shape: (height, width) every frame is resized to
        :param aug: whether to use data augmentation or not, None or augmentation object
        """
        # Shuffle a private copy so the caller's list is not reordered as a
        # side effect of building the dataset.
        self.folders = list(folder_list)
        np.random.shuffle(self.folders)
        self.fixed_frame_num = fixed_frame_num
        self.char_dict = char_dict
        self.fixed_max_len = fixed_max_len
        self.augmentation = aug
        self.image_shape = image_shape
        self.transform = build_transform(shape=self.image_shape)

    def __len__(self):
        """Return the number of clips (folders)."""
        return len(self.folders)

    def _encode_label(self, image_folder):
        """Turn the folder name's label suffix into ``fixed_max_len`` class indices."""
        # normpath drops a trailing separator, which would otherwise yield an
        # empty folder name and therefore an empty label.
        folder_name = os.path.basename(os.path.normpath(image_folder))
        label = folder_name.split("_")[-1].strip(" ")
        # encode the char text to class index for training
        label_digit = [self.char_dict[ch] for ch in label]
        if len(label_digit) >= self.fixed_max_len:
            # One slot must stay free for <eos>.
            raise ValueError(
                "label {!r} of folder {!r} has {} characters, but at most {} fit "
                "next to <eos>".format(label, image_folder, len(label_digit),
                                       self.fixed_max_len - 1)
            )
        label_digit.append(self.char_dict["<eos>"])
        padding = self.fixed_max_len - len(label_digit)
        label_digit += [self.char_dict["<blank>"]] * padding
        return label_digit

    def _load_frame(self, path):
        """Read one frame, optionally augment it, and apply the resize/tensor transform."""
        with Image.open(path) as handle:
            img = handle.convert("RGB")
        if self.augmentation is not None:
            # The augmentation object is invoked once per frame.
            img = self.augmentation(image=np.array(img, dtype=np.uint8))["image"]
            img = Image.fromarray(img)
        return self.transform(img)

    def __getitem__(self, index):
        """Return ``(x, y)`` for the clip at ``index``.

        :return: ``x`` stacks ``fixed_frame_num`` transformed frames along a new
            first dimension; ``y`` is a ``torch.long`` tensor holding the
            ``fixed_max_len`` encoded label indices
        :raises ValueError: if the label does not fit into ``fixed_max_len``
        """
        image_folder = self.folders[index]
        label_digit = self._encode_label(image_folder)

        # NOTE(review): frames are ordered by a plain string sort of their
        # paths; this presumably relies on zero-padded file names — confirm.
        frame_paths = sorted(
            os.path.join(image_folder, name)
            for name in os.listdir(image_folder)
            if name.endswith(".jpg")
        )
        # due to GPU limitation, we can not generate too huge videos,
        # so we have to set a fixed max frame number
        frame_paths = frame_paths[:self.fixed_frame_num]
        images = [self._load_frame(path) for path in frame_paths]

        # if the image number is lower than fixed frame number, we pad it with default RGB images
        missing = self.fixed_frame_num - len(images)
        if missing > 0:
            # The padding frame is identical every time, so build it once.
            blank = Image.new("RGB", (self.image_shape[1], self.image_shape[0]))
            images.extend([self.transform(blank)] * missing)

        x = torch.stack(images)
        y = torch.tensor(label_digit, dtype=torch.long)
        return x, y






## Build the 3D-CNN + RNN lip-reading model

In [2]:
import torch
import torch.nn.functional as F


class BidirectionalLSTM(torch.nn.Module):
    """Bidirectional LSTM followed by a linear projection applied at every step.

    :param nIn: feature size of each input step
    :param nHidden: hidden size of the LSTM (per direction)
    :param nOut: feature size of each output step
    """

    def __init__(self, nIn, nHidden, nOut):
        super().__init__()

        # batch_first=True: the recurrent layer works on (batch, steps, features).
        self.rnn = torch.nn.LSTM(nIn, nHidden, bidirectional=True, batch_first=True)
        # Both directions are concatenated, hence 2 * nHidden input features.
        self.embedding = torch.nn.Linear(nHidden * 2, nOut)

    def forward(self, inputs):
        """Map ``inputs`` to raw (un-normalised) scores with the same two leading dimensions."""
        recurrent = self.rnn(inputs)[0]
        batch, steps, width = recurrent.size()

        # Collapse batch and step axes so the linear layer sees one row per step.
        flat = recurrent.reshape(batch * steps, width)
        projected = self.embedding(flat)

        return projected.reshape(batch, steps, -1)


class VideoModel(torch.nn.Module):
    """Four 3D-convolution blocks followed by a bidirectional LSTM decoder."""

    def __init__(self, number_classes=28, max_len=6, image_shape=(100, 100), frame_num=200):
        """

        :param number_classes:
        our char dictionary is:
        0: <blank>
        1: a
        2: b
        3: c
        ...
        26: z
        27: <eos>
        number_classes = 28, 26 characters + <eos> + <blank>
        :param max_len: number of output positions, max_len = 6.
        Suppose we said abcde,
        then the label should be abcde<eos>
        abc -> abc<eos><blank><blank>
        :param image_shape: (height, width) of the input frames
        :param frame_num: number of frames per clip fed to ``forward``; the
        default of 200 reproduces the previously hard-coded LSTM input sizes
        (21504 for 100x100 frames, 5376 for 60x60 frames)
        :raises ValueError: if the input is too small for four pooling stages
        or the encoder output cannot be split evenly into ``max_len`` steps
        """
        super().__init__()
        self.number_classes = number_classes
        self.max_len = max_len
        self.conv_block_1 = self._conv_block(3, 32)
        self.conv_block_2 = self._conv_block(32, 64)
        self.conv_block_3 = self._conv_block(64, 128)
        self.conv_block_4 = self._conv_block(128, 256)
        # Derive the LSTM input size from the input geometry instead of a
        # lookup table that only knew two resolutions.
        nIn = self._sequence_feature_size(image_shape, frame_num, 256, max_len)
        self.lstm_decoder = BidirectionalLSTM(nIn=nIn,
                                              nHidden=256,
                                              nOut=number_classes)

    @staticmethod
    def _sequence_feature_size(image_shape, frame_num, channels, max_len, num_blocks=4):
        """Return the per-step feature size after ``num_blocks`` conv blocks."""
        height = image_shape[0]
        # A single-entry shape is treated as square.
        width = image_shape[1] if len(image_shape) > 1 else height
        frames = frame_num
        for _ in range(num_blocks):
            # Per block: the two convolutions (kernel (3, 3, 2), padding 1)
            # keep height/width and each add one step on the frame axis; the
            # (2, 2, 2) max-pool then floor-halves every axis.
            height //= 2
            width //= 2
            frames = (frames + 2) // 2
        if min(height, width, frames) < 1:
            raise ValueError(
                "image_shape={} with frame_num={} is too small for {} pooling "
                "stages".format(tuple(image_shape), frame_num, num_blocks)
            )
        flattened = channels * height * width * frames
        if max_len < 1 or flattened % max_len:
            raise ValueError(
                "encoder output of {} features cannot be split into max_len={} "
                "equal steps".format(flattened, max_len)
            )
        return flattened // max_len

    def _conv_block(self, input_c, output_c):
        """Build one block: two Conv3d + LeakyReLU pairs followed by a 2x2x2 max-pool."""
        conv_block = torch.nn.Sequential(
            torch.nn.Conv3d(input_c, output_c, kernel_size=(3, 3, 2), padding=1),
            torch.nn.LeakyReLU(),
            torch.nn.Conv3d(output_c, output_c, kernel_size=(3, 3, 2), padding=1),
            torch.nn.LeakyReLU(),
            torch.nn.MaxPool3d((2, 2, 2))
        )
        return conv_block

    def forward(self, x):
        """Return raw class scores of shape (bs, max_len, number_classes).

        :param x: batch of clips; axis 1 is moved to the end before the
            convolutions, so the expected layout is presumably
            (bs, frames, channels, height, width) — verify against the dataset
        """
        x = x.permute(dims=(0, 2, 3, 4, 1))
        x = self.conv_block_1(x)
        x = self.conv_block_2(x)
        x = self.conv_block_3(x)
        x = self.conv_block_4(x)
        # e.g. bs, 256, 3, 3, 14 for 60x60 frames and 200 frames per clip
        batch_size = x.size(0)
        # reshape (rather than view) also copes with non-contiguous outputs
        x = x.reshape(batch_size, self.max_len, -1)  # bs, max_len, rest
        x = self.lstm_decoder(x)
        return x

## define the dataloader in this task

In [3]:
import os

# Expose only the GPU with index 2 to this process.
# NOTE(review): the index is hard-coded; on a host without that GPU the
# availability check below presumably falls back to 'cpu' — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
from torch.utils.data import DataLoader
import string
import time

# Use CUDA when it is available, otherwise run on the CPU; the choice is printed.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


def make_char_dict():
    """Build, print and return the character-to-class-index mapping.

    ``<blank>`` is index 0, the lowercase ASCII letters follow from 1, and
    ``<eos>`` takes the next free index.
    """
    char_dict = {"<blank>": 0}
    char_dict.update(
        (letter, position)
        for position, letter in enumerate(string.ascii_lowercase, start=1)
    )
    char_dict["<eos>"] = len(char_dict)
    print(char_dict)
    return char_dict


def get_train_test_folders():
    """Read the train/eval list files and return the corresponding clip folders.

    Each non-empty line of ``data/train_lst.txt`` and ``data/eval_lst.txt`` is
    taken as a folder name below ``data/data_aligned``. The number of entries
    of both splits is printed.

    :return: ``(train_folders, test_folders)``, two lists of folder paths
    """

    def _read_folders(list_path):
        # `with` closes the file deterministically; blank lines (e.g. a
        # trailing empty line) are skipped so they do not become folder entries.
        with open(list_path, "r", encoding="utf-8") as handle:
            names = (line.rstrip("\r\n") for line in handle)
            return [os.path.join("data", "data_aligned", name) for name in names if name]

    test_folders = _read_folders("data/eval_lst.txt")
    train_folders = _read_folders("data/train_lst.txt")
    print("train videos:{}".format(len(train_folders)))
    print("test videos:{}".format(len(test_folders)))
    return train_folders, test_folders


# Frame size shared by both datasets and the model.
image_shape = (60, 60)

char_dict = make_char_dict()
train_folders, test_folders = get_train_test_folders()

# Training split: uses the dataset's default augmentation and is reshuffled
# by the loader on every pass.
train_dataset = VideoDataset(
    folder_list=train_folders,
    char_dict=char_dict,
    fixed_frame_num=200,
    fixed_max_len=6,
    image_shape=image_shape,
)
batch_size = 10
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

# Evaluation split: no augmentation, and no per-pass reshuffling so every
# evaluation pass batches the clips the same way (the metrics are averaged
# per batch, which is order-sensitive when the last batch is smaller).
test_dataset = VideoDataset(
    folder_list=test_folders,
    char_dict=char_dict,
    fixed_frame_num=200,
    fixed_max_len=6,
    aug=None,  # No need to do data augmentation in testing dataset
    image_shape=image_shape,
)
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False
)

cuda
{'<blank>': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '<eos>': 27}
train videos:171
test videos:20


## init our model

In [4]:
# One output class per dictionary entry; the network is moved to the selected
# device right after construction and its layout is printed.
model = VideoModel(
    number_classes=len(char_dict),
    max_len=6,
    image_shape=image_shape,
).to(device)
print(model)

VideoModel(
  (conv_block_1): Sequential(
    (0): Conv3d(3, 32, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): LeakyReLU(negative_slope=0.01)
    (2): Conv3d(32, 32, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (3): LeakyReLU(negative_slope=0.01)
    (4): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block_2): Sequential(
    (0): Conv3d(32, 64, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): LeakyReLU(negative_slope=0.01)
    (2): Conv3d(64, 64, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (3): LeakyReLU(negative_slope=0.01)
    (4): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block_3): Sequential(
    (0): Conv3d(64, 128, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): LeakyReLU(negative_slope=0.01)
    (2): Conv3d(128, 128, kernel_size=(3, 3, 2), stride=(1, 1, 1), pad

## setup the loss function and optimizer

In [5]:
# Cross-entropy with default settings: no ignore_index is passed, so the
# <blank> padding positions of the labels count towards the loss as well.
criterion = torch.nn.CrossEntropyLoss().to(device)
# SGD with momentum over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

## setup the training epochs

In [7]:
# In the real training process we use 300 epochs with early stopping;
# here we only run 10 epochs to show the entire pipeline for your reference.
epochs = 10  # for easy running

## setup learning rate scheduler for training

In [8]:
# Multiply the learning rate by `factor` once the monitored value has not
# improved for `patience` scheduler steps; mode='min' because the scheduler is
# stepped with a loss value.
# NOTE(review): `verbose` is, as far as I know, deprecated in recent PyTorch
# releases — confirm against the installed version.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='min',
                                                       verbose=True,
                                                       factor=0.1,
                                                       patience=5,
                                                       threshold=0.00001)

## define the acc metric function of our task

In [9]:
def compute_val_acc(scores, y):
    """Return the fraction of rows whose highest-scoring class equals the target.

    :param scores: 2-D tensor with one row of class scores per sample
    :param y: tensor of target class indices, one per row of ``scores``
    :return: scalar float tensor, matches divided by the number of rows
    """
    hits = (scores.argmax(dim=1) == y).sum()
    return hits.float() / scores.size(0)

## summarize the training process as a function

In [10]:
def train_process():
    """Run one training pass over ``train_dataloader``.

    Relies on the module-level ``model``, ``optimizer``, ``criterion``,
    ``device`` and ``train_dataloader``, and reads the module-level ``epoch``
    (set by the surrounding epoch loop) for the progress message, so it must
    be called from within that loop.

    :return: ``(running_loss, num_batches)``: the summed batch losses and the
        number of batches processed
    """
    running_loss = 0
    num_batches = 0

    model.train()
    for step, (x, y) in enumerate(train_dataloader, start=1):
        optimizer.zero_grad()

        batch, seq_len = y.size(0), y.size(1)
        x = x.to(device)
        y = y.to(device)

        # The input is deliberately NOT marked with requires_grad: only the
        # parameter gradients are needed, and tracking the input would make
        # autograd compute an extra, unused gradient for every batch.
        scores = model(x)

        # Flatten (batch, position) so the loss sees one row per label position.
        scores = scores.reshape(batch * seq_len, -1)
        y = y.reshape(batch * seq_len)
        loss = criterion(scores, y)

        loss.backward()
        # Clip the global gradient norm to 5 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1
        print("time:{}, epoch: {} step: {}, avg running loss is {}".format(
            time.ctime(), epoch + 1, step, running_loss / num_batches
        ))
    return running_loss, num_batches

## summarize the validation process

In [11]:
def testing_process():
    """Evaluate the module-level ``model`` on ``test_dataloader`` without gradients.

    :return: ``(running_loss, num_batches, running_acc)``: the summed batch
        losses, the number of batches, and the summed per-batch accuracies
    """
    loss_total = 0
    batch_count = 0
    acc_total = 0
    model.eval()
    with torch.no_grad():
        for frames, targets in test_dataloader:
            # one row per (sample, label position)
            n_rows = targets.size(0) * targets.size(1)
            frames = frames.to(device)
            targets = targets.to(device)

            flat_scores = model(frames).view(n_rows, -1)
            flat_targets = targets.view(n_rows)

            loss_total += criterion(flat_scores, flat_targets).item()
            batch_count += 1
            acc_total += compute_val_acc(flat_scores, flat_targets)
    return loss_total, batch_count, acc_total


## training and evaluate models for each epoch  
#### plus: using learning rate scheduler to improve the performance
#### plus: using early stopping for real training task to avoid over-fitting

In [13]:
# Best average validation loss seen so far and the epoch it occurred in.
lowest_loss = 100000000
lowest_loss_epoch = 0
c = 0  # consecutive epochs without improvement
patiences = 10
for epoch in range(epochs):
    running_loss, num_batches = train_process()
    test_running_loss, test_num_batches, running_acc = testing_process()

    separator = "*" * 100
    print(separator)
    avg_val_loss = test_running_loss / test_num_batches
    summary = "epoch: {}, avg training loss:{}, avg validation loss:{}, validation acc: {}"
    print(summary.format(epoch + 1,
                         running_loss / num_batches,
                         avg_val_loss,
                         running_acc / test_num_batches))
    # The plateau scheduler monitors the average validation loss.
    scheduler.step(avg_val_loss)
    print(separator)

    # early stopping
    improved = avg_val_loss < lowest_loss
    if not improved:
        c += 1
    else:
        c = 0
        lowest_loss = avg_val_loss
        lowest_loss_epoch = epoch
        snapshot = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': lowest_loss,
        }
        torch.save(snapshot, "best_3D_model.pt")
        print("save best model in best_3D_model.pt")
    if c == patiences:
        print("no improvement for {} epochs, model is stopped".format(patiences))
        break

time:Wed Nov 17 21:45:39 2021, epoch: 1 step: 1, avg running loss is 3.3444902896881104
time:Wed Nov 17 21:45:46 2021, epoch: 1 step: 2, avg running loss is 3.3380552530288696
time:Wed Nov 17 21:45:53 2021, epoch: 1 step: 3, avg running loss is 3.330162763595581
time:Wed Nov 17 21:46:01 2021, epoch: 1 step: 4, avg running loss is 3.311287820339203
time:Wed Nov 17 21:46:08 2021, epoch: 1 step: 5, avg running loss is 3.292432117462158
time:Wed Nov 17 21:46:16 2021, epoch: 1 step: 6, avg running loss is 3.271060347557068
time:Wed Nov 17 21:46:24 2021, epoch: 1 step: 7, avg running loss is 3.2505810260772705
time:Wed Nov 17 21:46:31 2021, epoch: 1 step: 8, avg running loss is 3.2124876379966736
time:Wed Nov 17 21:46:38 2021, epoch: 1 step: 9, avg running loss is 3.1843248473273382
time:Wed Nov 17 21:46:46 2021, epoch: 1 step: 10, avg running loss is 3.140131688117981
time:Wed Nov 17 21:46:51 2021, epoch: 1 step: 11, avg running loss is 3.075522921302102
time:Wed Nov 17 21:46:57 2021, epoch

## inference process  
### load model from check point
### do evaluation 

In [16]:
# Rebuild the network and move it to the target device *before* creating the
# optimizer, so the optimizer is bound to the parameters on that device.
model = VideoModel(number_classes=len(char_dict),
                   max_len=6,
                   image_shape=image_shape)
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# map_location remaps the stored tensors to the current device, so a
# checkpoint written on a GPU also loads on a CPU-only host.
checkpoint = torch.load("best_3D_model.pt", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
print(model)

# Evaluate the restored model: accuracy is computed per batch over the
# flattened (sample, label position) rows and averaged over the batches.
model.eval()
acc = 0
count = 0
with torch.no_grad():
    for x, y in test_dataloader:
        size = y.size()
        x = x.to(device)
        y = y.to(device)
        scores = model(x)

        scores = scores.reshape(size[0] * size[1], -1)
        y = y.reshape(size[0] * size[1])
        acc += compute_val_acc(scores, y)
        count += 1

print("Acc in inference process is {}".format(acc / count))


VideoModel(
  (conv_block_1): Sequential(
    (0): Conv3d(3, 32, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): LeakyReLU(negative_slope=0.01)
    (2): Conv3d(32, 32, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (3): LeakyReLU(negative_slope=0.01)
    (4): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block_2): Sequential(
    (0): Conv3d(32, 64, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): LeakyReLU(negative_slope=0.01)
    (2): Conv3d(64, 64, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (3): LeakyReLU(negative_slope=0.01)
    (4): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block_3): Sequential(
    (0): Conv3d(64, 128, kernel_size=(3, 3, 2), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): LeakyReLU(negative_slope=0.01)
    (2): Conv3d(128, 128, kernel_size=(3, 3, 2), stride=(1, 1, 1), pad