# Define Concatenation Image Data Loader

In [8]:
import os
import torch
from torch.utils.data import Dataset
import numpy as np
from PIL import Image
import albumentations as A
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import string
import math

# Default per-frame augmentation pipeline used by VideoDataset.
# It chains four OneOf groups (noise, blur, contrast/sharpness, colour/geometry);
# each group is applied with probability 0.9.
# NOTE(review): per the albumentations docs, OneOf normalises the inner `p`
# values and uses them as selection weights, so e.g. IAAPerspective(p=0.05)
# is picked far less often than its siblings — confirm this is intended.
# NOTE(review): the IAA* transforms rely on imgaug and are, as far as I know,
# deprecated/removed in newer albumentations releases — confirm the pinned
# version before upgrading.
augmentation = A.Compose([
    A.OneOf([
        A.IAAAdditiveGaussianNoise(p=0.9),
        A.GaussNoise(p=0.9),
    ], p=0.9),
    A.OneOf([
        A.MotionBlur(p=0.9),
        A.MedianBlur(blur_limit=3, p=0.9),
        A.Blur(blur_limit=4, p=0.9),
    ], p=0.9),
    A.OneOf([
        A.CLAHE(clip_limit=2, p=0.9),
        A.IAASharpen(p=0.9),
        A.IAAEmboss(p=0.9),
        A.RandomBrightnessContrast(p=0.95),
    ], p=0.9),
    A.OneOf(
        [
            A.HueSaturationValue(p=0.9),
            A.RandomGamma(p=0.9),
            A.IAAPerspective(p=0.05),
        ], p=0.9,
    )
])


def build_transform(shape):
    """Return the image-to-tensor pipeline applied to the tiled frame image.

    ``shape`` is accepted for interface compatibility but is not used: the
    pipeline consists of a single ``transforms.ToTensor()`` step.
    """
    steps = [transforms.ToTensor()]
    return transforms.Compose(steps)


class VideoDataset(Dataset):
    """Dataset that tiles the frames of one clip folder into a single image.

    Every entry of ``folder_list`` is a directory of ``.jpg`` frames. The
    label is the text after the last ``_`` in the directory name; it is
    encoded with ``char_dict``, terminated with ``<eos>`` and right-padded
    with ``<blank>`` up to ``fixed_max_len`` entries.

    Parameters
    ----------
    folder_list : sequence of str
        Clip directories. A shuffled copy is stored; the caller's sequence
        is left untouched.
    char_dict : dict
        Maps each label character, ``"<eos>"`` and ``"<blank>"`` to an int.
    fixed_frame_num : int
        Stored for compatibility; not used when building items.
    fixed_max_len : int
        Length of the encoded label, including ``<eos>`` and padding.
    image_shape : tuple
        ``(height, width)`` of one tile in the composite image.
    aug : callable or None
        Albumentations-style callable invoked as ``aug(image=array)["image"]``
        on every loaded frame; ``None`` disables augmentation.
    grid_shape : tuple
        ``(rows, cols)`` of the tile grid; ``rows * cols`` frames are used.
    debug_save_path : str or None
        When set, the composite image is also written to this path (JPEG
        quality 50) on every item access. Disabled by default because it is
        a disk write per sample.
    """

    def __init__(self, folder_list, char_dict,
                 fixed_frame_num=200, fixed_max_len=6,
                 image_shape=(100, 100),
                 aug=augmentation,
                 grid_shape=(4, 4),
                 debug_save_path=None):
        rows, cols = grid_shape
        if rows < 1 or cols < 1:
            raise ValueError("grid_shape must contain positive integers")

        # Shuffle a private copy so the caller's list keeps its order.
        self.folders = list(folder_list)
        np.random.shuffle(self.folders)
        self.fixed_frame_num = fixed_frame_num
        self.char_dict = char_dict
        self.fixed_max_len = fixed_max_len
        self.augmentation = aug
        self.image_shape = image_shape
        self.grid_shape = (rows, cols)
        self.debug_save_path = debug_save_path
        self.transform = build_transform(shape=self.image_shape)

    def __len__(self):
        return len(self.folders)

    def __getitem__(self, index):
        """Return ``(x, y)``: the transformed tile grid and the label tensor."""
        image_folder = self.folders[index]
        label_digit = self._encode_label(image_folder)

        rows, cols = self.grid_shape
        # PIL sizes are (width, height) while image_shape is (height, width).
        tile_size = (self.image_shape[1], self.image_shape[0])
        frame_paths = self._select_frame_paths(image_folder, rows * cols)
        tiles = [self._load_tile(path, tile_size) for path in frame_paths]

        canvas = Image.new("RGB", (tile_size[0] * cols, tile_size[1] * rows))
        for tile_index, tile in enumerate(tiles):
            # Fill the grid row by row, left to right.
            row, col = divmod(tile_index, cols)
            canvas.paste(tile, (tile_size[0] * col, tile_size[1] * row))

        if self.debug_save_path is not None:
            canvas.save(self.debug_save_path, quality=50)

        x = self.transform(canvas)
        y = torch.tensor(label_digit, dtype=torch.long)
        return x, y

    def _encode_label(self, image_folder):
        """Encode the folder-name label as ``fixed_max_len`` class indices."""
        # normpath drops a trailing separator; basename handles the OS separator.
        folder_name = os.path.basename(os.path.normpath(image_folder))
        label = folder_name.split("_")[-1].strip(" ")
        label_digit = [self.char_dict[ch] for ch in label]
        assert len(label_digit) < self.fixed_max_len, (
            "label {!r} does not leave room for <eos> within "
            "fixed_max_len={}".format(label, self.fixed_max_len)
        )
        label_digit.append(self.char_dict["<eos>"])
        padding = self.fixed_max_len - len(label_digit)
        label_digit.extend([self.char_dict["<blank>"]] * padding)
        return label_digit

    @staticmethod
    def _select_frame_paths(image_folder, tile_count):
        """Pick ``tile_count`` frame paths, using ``None`` for padding tiles."""
        # NOTE(review): plain string sort; assumes frame names sort in
        # temporal order (e.g. zero-padded numbers) — confirm.
        frame_paths = sorted(
            os.path.join(image_folder, name)
            for name in os.listdir(image_folder)
            if name.endswith(".jpg")
        )
        frame_count = len(frame_paths)
        if frame_count <= tile_count:
            return frame_paths + [None] * (tile_count - frame_count)
        # Evenly spaced indices over the whole clip. When frame_count is a
        # multiple of tile_count this equals a fixed stride; otherwise it
        # avoids discarding the tail of the clip.
        return [
            frame_paths[(slot * frame_count) // tile_count]
            for slot in range(tile_count)
        ]

    def _load_tile(self, path, tile_size):
        """Load, optionally augment, and resize one frame; black if padding."""
        if path is None:
            return Image.new("RGB", tile_size)
        # The context manager closes the file handle once pixels are decoded.
        with Image.open(path) as frame:
            img = frame.convert("RGB")
        if self.augmentation is not None:
            augmented = self.augmentation(image=np.array(img, dtype=np.uint8))["image"]
            img = Image.fromarray(augmented)
        return img.resize(tile_size)


# Build 2D-CNN + RNN lip-reading model

In [2]:
import torch
import torch.nn.functional as F


class BidirectionalLSTM(torch.nn.Module):
    """Bidirectional LSTM followed by a linear projection of every step.

    The LSTM is built with ``batch_first=True`` and the projection maps the
    concatenated forward/backward state (``2 * nHidden``) to ``nOut`` values.
    The output keeps the two leading dimensions of the LSTM output.
    """

    def __init__(self, nIn, nHidden, nOut):
        super().__init__()
        self.rnn = torch.nn.LSTM(nIn, nHidden, bidirectional=True, batch_first=True)
        self.embedding = torch.nn.Linear(nHidden * 2, nOut)

    def forward(self, inputs):
        hidden_states, _ = self.rnn(inputs)
        dim0, dim1, width = hidden_states.size()
        # Flatten the two leading dimensions so the linear layer sees a 2-D
        # matrix, then restore them on the projected result.
        projected = self.embedding(hidden_states.reshape(dim0 * dim1, width))
        return projected.reshape(dim0, dim1, -1)


class VideoModel(torch.nn.Module):
    """2D CNN over a tiled frame image followed by a bidirectional LSTM.

    Four conv blocks (each ending in a 2x2 max-pool) reduce the input image;
    the resulting feature map is flattened and split into ``max_len`` equal
    chunks, which are fed to the LSTM as a sequence of ``max_len`` steps.
    """

    # Input size the LSTM was originally hard-coded to.
    _LEGACY_LSTM_INPUT_SIZE = 9600
    # Channels produced by the last conv block and total down-sampling factor
    # of the four 2x2 max-pools.
    _FINAL_CHANNELS = 256
    _DOWNSAMPLE = 16

    def __init__(self, number_classes=28, max_len=6, image_shape=(60, 60),
                 grid_shape=None):
        """
        :param number_classes:
        our char dictionary is:
        0: <blank>
        1: a
        2: b
        3: c
        ...
        26: z
        27: <eos>
        number_classes = 28, 26 characters + <eos> + <blank>
        :param max_len: max_len = 6,
        Suppose the spoken word is abcde,
        then the label should be abcde<eos>
        abc -> abc<eos><blank><blank>
        :param image_shape: (height, width) of one tile of the input image.
        Only used when ``grid_shape`` is given.
        :param grid_shape: optional (rows, cols) of the tile grid. When
        given, the LSTM input size is derived from ``image_shape``,
        ``grid_shape`` and ``max_len``. When ``None`` (default) the
        original fixed size of 9600 is kept, which matches 60x60 tiles
        in a 4x4 grid with max_len = 6.
        :raises ValueError: if ``grid_shape`` is given and the conv features
        cannot be split evenly into ``max_len`` chunks.
        """
        super().__init__()
        self.number_classes = number_classes
        self.max_len = max_len

        self.conv_block_1 = self._cnn2d_block_2_conv_layer(3, 32)
        self.conv_block_2 = self._cnn2d_block_2_conv_layer(32, 64)
        self.conv_block_3 = self._cnn2d_block_2_conv_layer(64, 128)
        self.conv_block_4 = self._cnn2d_block_2_conv_layer(128, self._FINAL_CHANNELS)

        if grid_shape is None:
            lstm_input_size = self._LEGACY_LSTM_INPUT_SIZE
        else:
            lstm_input_size = self._lstm_input_size(image_shape, grid_shape, max_len)

        self.lstm_decoder = BidirectionalLSTM(nIn=lstm_input_size,
                                              nHidden=256,
                                              nOut=number_classes)

    @staticmethod
    def _lstm_input_size(image_shape, grid_shape, max_len):
        """Return the per-step feature size for the given tile/grid layout."""
        tile_height, tile_width = image_shape
        rows, cols = grid_shape
        # Each max-pool floors the spatial size; four of them floor-divide by 16.
        feat_height = (tile_height * rows) // VideoModel._DOWNSAMPLE
        feat_width = (tile_width * cols) // VideoModel._DOWNSAMPLE
        total = VideoModel._FINAL_CHANNELS * feat_height * feat_width
        if total == 0 or total % max_len:
            raise ValueError(
                "conv features ({} values) cannot be split into max_len={} "
                "equal chunks".format(total, max_len)
            )
        return total // max_len

    def _cnn2d_block_2_conv_layer(self, input_size, output_size):
        """Two 3x3 conv + ReLU layers followed by a 2x2 max-pool."""
        conv2d_block = torch.nn.Sequential(
            torch.nn.Conv2d(input_size, output_size, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(output_size, output_size, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, 2)
        )
        return conv2d_block

    def _cnn2d_block_1_conv_layer(self, input_size, output_size):
        """One 3x3 conv + ReLU layer followed by a 2x2 max-pool (unused here)."""
        conv2d_block = torch.nn.Sequential(
            torch.nn.Conv2d(input_size, output_size, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, 2)
        )
        return conv2d_block

    def forward(self, x):
        x = self.conv_block_1(x)
        x = self.conv_block_2(x)
        x = self.conv_block_3(x)
        x = self.conv_block_4(x)
        shape = x.size()
        # Flatten the conv features and cut them into max_len equal chunks;
        # each chunk is one LSTM step: (batch, max_len, features_per_step).
        x = x.view(shape[0], self.max_len, -1)
        x = self.lstm_decoder(x)
        return x

# Define the dataloader in this task

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from cnn2d_image_generator import VideoDataset
from image_2dcrnn import VideoModel
import torch
from torch.utils.data import DataLoader
import string
import time

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


def make_char_dict():
    """Build, print and return the character-to-index mapping.

    ``"<blank>"`` is 0, the lowercase ASCII letters follow as 1..26 in
    alphabetical order, and ``"<eos>"`` takes the next free index.
    """
    mapping = {"<blank>": 0}
    mapping.update(
        (letter, position)
        for position, letter in enumerate(string.ascii_lowercase, start=1)
    )
    mapping["<eos>"] = len(mapping)
    print(mapping)
    return mapping


def get_train_test_folders():
    """Read the train/eval list files and return the clip folder paths.

    ``data/eval_lst.txt`` and ``data/train_lst.txt`` are read as UTF-8, one
    folder name per line. Each name is joined onto ``data/data_aligned``.
    Blank lines are skipped so they do not turn into bogus folder entries.
    The number of train and test folders is printed.

    :return: ``(train_folders, test_folders)`` as two lists of paths.
    """
    def read_folders(list_path):
        # The context manager closes the file even if reading fails.
        with open(list_path, "r", encoding="utf-8") as handle:
            names = (line.strip("\n") for line in handle)
            return [
                os.path.join("data", "data_aligned", name)
                for name in names
                if name.strip()
            ]

    test_folders = read_folders("data/eval_lst.txt")
    train_folders = read_folders("data/train_lst.txt")
    print("train videos:{}".format(len(train_folders)))
    print("test videos:{}".format(len(test_folders)))
    return train_folders, test_folders


# Size of one frame tile passed to both datasets and to the model.
image_shape = (60, 60)

char_dict = make_char_dict()
train_folders, test_folders = get_train_test_folders()
# Training dataset: no `aug` argument, so the dataset's default augmentation
# applies (VideoDataset is imported from cnn2d_image_generator here).
train_dataset = VideoDataset(
    folder_list=train_folders,
    char_dict=char_dict,
    fixed_frame_num=200,
    fixed_max_len=6,
    image_shape=image_shape,
)
# Shared by both loaders below.
batch_size = 10
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
test_dataset = VideoDataset(
    folder_list=test_folders,
    char_dict=char_dict,
    fixed_frame_num=200,
    fixed_max_len=6,
    aug=None,  # No need to do data augmentation in testing dataset
    image_shape=image_shape,
)
# NOTE(review): shuffle=True on the evaluation loader is not needed for the
# averaged metrics computed later — consider shuffle=False for reproducibility.
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=True
)

cpu
{'<blank>': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '<eos>': 27}
train videos:171
test videos:20


# Init model

In [9]:
# One output class per entry of char_dict (letters plus <blank> and <eos>);
# max_len matches the fixed_max_len=6 used for the datasets.
model = VideoModel(number_classes=len(list(char_dict.keys())),
                   max_len=6,
                   image_shape=image_shape)
model = model.to(device)
print(model)

VideoModel(
  (conv_block_1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block_2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block_3): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block_4): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1,

# Set up for training

In [11]:
# Cross-entropy over the flattened (sample, label position) predictions.
criterion = torch.nn.CrossEntropyLoss().to(device)
# NOTE(review): lr=1 with momentum 0.9 is very aggressive; the captured log
# shows the epoch-5 average loss above the epoch-1 loss — consider a smaller
# learning rate.
optimizer = torch.optim.SGD(model.parameters(), lr=1, momentum=0.9)

# NOTE(review): the literal 10 duplicates batch_size, and steps_per_epoch is
# not referenced by the training code visible in this notebook.
steps_per_epoch = len(train_folders) // 10 + 1
epochs = 10
# Multiplies the learning rate by `factor` when the monitored value (the
# average validation loss passed to scheduler.step) stops decreasing.
# NOTE(review): `verbose` is, as far as I know, deprecated in recent PyTorch
# schedulers — confirm against the installed version.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='min',
                                                       verbose=True,
                                                       factor=0.1,
                                                       patience=5,
                                                       threshold=0.00001)

# Define training process

In [12]:
def train_process():
    """Run one training pass over ``train_dataloader``.

    Relies on the notebook globals ``model``, ``train_dataloader``,
    ``optimizer``, ``criterion`` and ``device``, and reads the global loop
    variable ``epoch`` for the progress message, so it must be called from
    inside the epoch loop.

    For every batch the scores and labels are flattened to one row per
    label position, the loss is back-propagated, gradients are clipped to a
    norm of 5 and the optimizer is stepped.

    :return: ``(running_loss, num_batches)`` — the summed batch losses and
        the number of batches processed.
    """
    running_loss = 0
    num_batches = 0

    model.train()
    for idx, (x, y) in enumerate(train_dataloader):
        optimizer.zero_grad()

        size = y.size()
        # The inputs need no gradient: only the model parameters are
        # optimised, so asking autograd for d(loss)/d(x) is wasted work.
        x = x.to(device)
        y = y.to(device)

        scores = model(x)

        # One row per (sample, label position) for CrossEntropyLoss.
        scores = scores.reshape(size[0] * size[1], -1)
        y = y.reshape(size[0] * size[1])
        loss = criterion(scores, y)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
        print("time:{}, epoch: {} step: {}, avg running loss is {}".format(
            time.ctime(), epoch + 1, idx + 1, running_loss / num_batches
        ))
    return running_loss, num_batches

# Define validation process

In [13]:
def testing_process():
    """Evaluate the global ``model`` on ``test_dataloader`` without gradients.

    Uses the notebook globals ``model``, ``test_dataloader``, ``criterion``
    and ``device``. Scores and labels are flattened to one row per label
    position before the loss is computed.

    :return: ``(running_loss, num_batches)`` — the summed batch losses and
        the number of batches processed.
    """
    model.eval()
    loss_total = 0
    batch_count = 0

    with torch.no_grad():
        for inputs, targets in test_dataloader:
            flat_len = targets.size(0) * targets.size(1)
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs).view(flat_len, -1)
            batch_loss = criterion(logits, targets.view(flat_len))

            loss_total += batch_loss.item()
            batch_count += 1

    return loss_total, batch_count

# Train

In [14]:
# Main loop: one training pass and one validation pass per epoch.
# `epoch` is also read as a global inside train_process for its log line.
for epoch in range(epochs):
    running_loss, num_batches = train_process()
    test_running_loss, test_num_batches = testing_process()
    print("*" * 100)
    print("epoch: {}, avg training loss:{}, avg validation loss:{}".format(epoch + 1, running_loss / num_batches,
                                                                           test_running_loss / test_num_batches))
    # The plateau scheduler monitors the average validation loss.
    scheduler.step(test_running_loss / test_num_batches)
    print("*" * 100)

time:Thu Nov 18 16:31:22 2021, epoch: 1 step: 1, avg running loss is 3.3331379890441895
time:Thu Nov 18 16:31:29 2021, epoch: 1 step: 2, avg running loss is 3.23698627948761
time:Thu Nov 18 16:31:35 2021, epoch: 1 step: 3, avg running loss is 3.0955111980438232
time:Thu Nov 18 16:31:40 2021, epoch: 1 step: 4, avg running loss is 2.996290445327759
time:Thu Nov 18 16:31:45 2021, epoch: 1 step: 5, avg running loss is 2.969486379623413
time:Thu Nov 18 16:31:50 2021, epoch: 1 step: 6, avg running loss is 2.854193329811096
time:Thu Nov 18 16:31:56 2021, epoch: 1 step: 7, avg running loss is 2.8410747732434953
time:Thu Nov 18 16:32:02 2021, epoch: 1 step: 8, avg running loss is 2.8046317100524902
time:Thu Nov 18 16:32:08 2021, epoch: 1 step: 9, avg running loss is 2.768410470750597
time:Thu Nov 18 16:32:13 2021, epoch: 1 step: 10, avg running loss is 2.741999077796936
time:Thu Nov 18 16:32:18 2021, epoch: 1 step: 11, avg running loss is 2.7109274213964287
time:Thu Nov 18 16:32:23 2021, epoch:

time:Thu Nov 18 16:40:15 2021, epoch: 5 step: 10, avg running loss is 3.665429949760437
time:Thu Nov 18 16:40:21 2021, epoch: 5 step: 11, avg running loss is 3.6101166768507524
time:Thu Nov 18 16:40:25 2021, epoch: 5 step: 12, avg running loss is 3.5473761359850564
time:Thu Nov 18 16:40:30 2021, epoch: 5 step: 13, avg running loss is 3.5040150605715237
time:Thu Nov 18 16:40:35 2021, epoch: 5 step: 14, avg running loss is 3.5232543775013516
time:Thu Nov 18 16:40:40 2021, epoch: 5 step: 15, avg running loss is 3.4981461842854817
time:Thu Nov 18 16:40:45 2021, epoch: 5 step: 16, avg running loss is 3.4872216433286667
time:Thu Nov 18 16:40:49 2021, epoch: 5 step: 17, avg running loss is 3.4649929299074063
time:Thu Nov 18 16:40:50 2021, epoch: 5 step: 18, avg running loss is 3.553756594657898
****************************************************************************************************
epoch: 5, avg training loss:3.553756594657898, avg validation loss:3.7243123054504395
**************

time:Thu Nov 18 16:47:13 2021, epoch: 9 step: 18, avg running loss is 2.3701753748787775
****************************************************************************************************
epoch: 9, avg training loss:2.3701753748787775, avg validation loss:2.55965518951416
****************************************************************************************************
time:Thu Nov 18 16:47:22 2021, epoch: 10 step: 1, avg running loss is 1.9023246765136719
time:Thu Nov 18 16:47:28 2021, epoch: 10 step: 2, avg running loss is 2.592995524406433
time:Thu Nov 18 16:47:34 2021, epoch: 10 step: 3, avg running loss is 2.4896981716156006
time:Thu Nov 18 16:47:39 2021, epoch: 10 step: 4, avg running loss is 2.4361849427223206
time:Thu Nov 18 16:47:44 2021, epoch: 10 step: 5, avg running loss is 2.375188636779785
time:Thu Nov 18 16:47:51 2021, epoch: 10 step: 6, avg running loss is 2.3305323918660483
time:Thu Nov 18 16:47:56 2021, epoch: 10 step: 7, avg running loss is 2.3427252769470215
tim

In [18]:
# The file name encodes the tile count (k_col * k_row) and the epoch count.
# NOTE(review): 5 x 5 here disagrees with the 4 x 4 grid hard-coded in the
# VideoDataset class shown in this notebook — confirm which grid the saved
# model was actually trained on.
k_col, k_row = 5, 5
save_name = '2dcrnn_model_'+str(k_col*k_row)+'_epoch_'+str(epochs)+'.pkl'
# Saves the whole module object (pickled), not just its state_dict, so loading
# requires the same class definitions to be importable.
torch.save(model, save_name)

# Load model

In [19]:
# Restores the full pickled module written by torch.save(model, save_name).
# NOTE(review): torch.load unpickles arbitrary objects — only load files you
# trust. Newer PyTorch releases presumably default to weights_only=True, which
# would reject a fully pickled module; confirm against the installed version.
model = torch.load(save_name)

# Test accuracy

In [21]:
def compute_val_acc(scores, y):
    """Return the fraction of rows whose arg-max class equals the label.

    :param scores: 2-D tensor with one row of class scores per prediction.
    :param y: 1-D tensor of target class indices, one per row of ``scores``.
    :return: 0-dim float tensor: matching rows divided by ``scores.size(0)``.
    """
    hits = (scores.argmax(dim=1) == y).sum()
    return hits.float() / scores.size(0)


# Accuracy over the test loader, computed without gradients.
# Predictions are compared per label position (scores and labels are
# flattened to one row per position), not per whole word.
model.eval()
acc = 0
count = 0
with torch.no_grad():
    for idx, data in enumerate(test_dataloader):
        x, y = data
        size = y.size()
        x = x.to(device)
        y = y.to(device)
        scores = model(x)

        scores = scores.view(size[0] * size[1], -1)
        y = y.view(size[0] * size[1])
        # Per-batch accuracy; the sum is divided by the batch count below, so
        # a smaller final batch weighs as much as a full one.
        acc += compute_val_acc(scores, y)
        count += 1

print("Acc in inference process is {}".format(acc / count))

Acc in inference process is 0.32499998807907104
