In [1]:
import json
import os
import pandas as pd
import numpy as np
import collections
import re
import copy
import tensorflow as tf

from transformer import PositionalEncoding
from model import ImageCaptionModel, accuracy_function, loss_function
from decoder import TransformerDecoder, RNNDecoder

In [2]:
# Dataset layout: every path below is built from the YouCookII root folder.
root_path = './YouCookII/'
anno_path = f"{root_path}annotations/youcookii_annotations_trainval.json"
feat_path = f"{root_path}features/feat_csv/"

# Name of the feature sub-directory (under feat_path) for each data split.
feat_path_keywords = dict(
    train='train_frame_feat_csv',
    val='val_frame_feat_csv',
    test='test_frame_feat_csv',
)

In [3]:
# Parse the train/val annotation JSON. The context manager closes the file
# handle as soon as parsing is done (a bare open() left it open for the
# lifetime of the kernel).
with open(anno_path, 'rb') as f:
    anno_dict = json.load(f)

In [4]:
# Build a lookup from the first column of label_foodtype.csv to the second.
# NOTE(review): presumably column 0 is the recipe-type id and column 1 its
# human-readable name - confirm against the CSV.
food_types = pd.read_csv(root_path + 'label_foodtype.csv', header=None)
idx = food_types[0]
types = food_types[1]
idx2type = dict(zip(idx, types))

In [5]:
# Feature directory of each data split.
train_dir = f"{feat_path}{feat_path_keywords['train']}"
val_dir = f"{feat_path}{feat_path_keywords['val']}"
test_dir = f"{feat_path}{feat_path_keywords['test']}"

# Recipe type '101' inside the training split; used below as a small demo set.
demo_dir = f"{train_dir}/101"

# Feature table of one example video, loaded for inspection.
feat = pd.read_csv(
    f"{demo_dir}/0O4bxhpFX9o/0001/resnet_34_feat_mscoco.csv",
    header=None,
)

In [6]:
# Display the annotation record of one training video: its duration, subset,
# recipe type, captioned segments and source URL.
anno_dict['database']['0O4bxhpFX9o']

{'duration': 311.77,
 'subset': 'training',
 'recipe_type': '101',
 'annotations': [{'segment': [41, 54],
   'id': 0,
   'sentence': 'place the bacon slices on a baking pan and cook them in an oven'},
  {'segment': [84, 122],
   'id': 1,
   'sentence': 'cut the tomatoes into thin slices'},
  {'segment': [130, 135],
   'id': 2,
   'sentence': 'toast the bread slices in the toaster'},
  {'segment': [147, 190],
   'id': 3,
   'sentence': 'spread mayonnaise on the bread and place bacon slices lettuce and tomato slices on top'},
  {'segment': [192, 195], 'id': 4, 'sentence': 'top the sandwich with bread'}],
 'video_url': 'https://www.youtube.com/watch?v=0O4bxhpFX9o'}

# Preprocessing Captions

In [16]:
# Caption length budget: preprocess_captions keeps at most window_size - 1
# words per caption (plus '<start>'/'<end>'), and pad_captions pads every
# caption to window_size + 1 tokens.
window_size = 20

In [17]:
def preprocess_captions(captions, window_size):
    """Tokenize raw caption strings and wrap them in sentinel tokens.

    Each caption is split on whitespace. Purely alphabetic tokens are kept
    unchanged, purely numeric tokens become '<num>', and every other token
    (mixed alphanumerics, tokens carrying punctuation) is dropped. At most
    ``window_size - 1`` tokens are kept, then '<start>' is prepended and
    '<end>' appended.

    Tokenization approach adapted from:
    https://towardsdatascience.com/image-captions-with-attention-in-tensorflow-step-by-step-927dad3569fa

    Args:
        captions: iterable of caption strings.
        window_size: length budget; see truncation rule above.

    Returns:
        A list with one token list per caption.
    """
    # TODO: this step could possibly be handled by the Keras tokenizer.
    def _tokenize(sentence):
        # isalpha() is tested first so a token satisfying both predicates is
        # kept as a word rather than replaced by '<num>'.
        return [
            token if token.isalpha() else '<num>'
            for token in sentence.split()
            if token.isalpha() or token.isnumeric()
        ]

    return [
        ['<start>'] + _tokenize(sentence)[:window_size - 1] + ['<end>']
        for sentence in captions
    ]

In [18]:
def pad_captions(captions, window_size):
    """Return a copy of ``captions`` right-padded with '<pad>' tokens.

    Every token list shorter than ``window_size + 1`` is extended to exactly
    that length; lists that are already that long or longer are left as is.
    The input is deep-copied, so the caller's lists are never modified.

    Args:
        captions: list of token lists.
        window_size: padded captions have ``window_size + 1`` tokens.

    Returns:
        The padded deep copy of ``captions``.
    """
    target_len = window_size + 1
    padded = copy.deepcopy(captions)
    for tokens in padded:
        # A non-positive shortfall yields an empty list, i.e. no padding.
        shortfall = target_len - len(tokens)
        tokens.extend(['<pad>'] * shortfall)
    return padded

In [19]:
def get_data(dir_name, frames_per_video=500, frame_step=10):
    """Collect (frame feature, caption) pairs for every video under a folder.

    Expected layout: ``dir_name/<video id>/<version>/resnet_34_feat_mscoco.csv``.
    For each annotated segment of a video (looked up in the module-level
    ``anno_dict``), the segment's start/end times are converted to row indices
    of the feature table and every ``frame_step``-th row in that range is
    emitted together with the segment's sentence.

    Args:
        dir_name: directory containing one sub-directory per video id.
        frames_per_video: number of rows the video duration is divided by to
            map seconds to row indices.
            NOTE(review): assumes each feature CSV holds this many uniformly
            sampled frames - confirm against the dataset.
        frame_step: stride between sampled rows inside a segment.

    Returns:
        ``(frames, captions)`` as numpy arrays with one entry per sampled row.

    Raises:
        KeyError: if a video directory has no entry in ``anno_dict``.
    """
    all_frames = []
    all_caps = []
    for vid in os.listdir(dir_name):
        vid_dir = os.path.join(dir_name, vid)
        # Skip stray files such as '.DS_Store'; only directories hold videos.
        if not os.path.isdir(vid_dir):
            continue
        for version in os.listdir(vid_dir):
            version_dir = os.path.join(vid_dir, version)
            if not os.path.isdir(version_dir):
                continue
            vid_feat = pd.read_csv(
                os.path.join(version_dir, 'resnet_34_feat_mscoco.csv'),
                header=None,
            )
            vid_info = anno_dict['database'][vid]
            # Seconds of video represented by one row of the feature table.
            samp_rate = vid_info['duration'] / frames_per_video
            last_row = len(vid_feat) - 1
            for segment in vid_info['annotations']:
                start, end = segment['segment']
                start_fr = int(np.ceil(start / samp_rate))
                # A segment that ends at the video's duration maps to row
                # index `frames_per_video`, one past the last row; clamp so
                # that .iloc cannot raise IndexError.
                end_fr = min(int(np.floor(end / samp_rate)), last_row)
                for frame_num in range(start_fr, end_fr + 1, frame_step):
                    all_frames.append(vid_feat.iloc[frame_num].to_numpy())
                    all_caps.append(segment['sentence'])
    return np.array(all_frames), np.array(all_caps)

In [20]:
def unk_captions(captions, word_count, minimum_frequency):
    """Replace infrequent tokens with '<unk>' without touching the input.

    A token is replaced when ``word_count[token] <= minimum_frequency``; note
    that a token occurring exactly ``minimum_frequency`` times is masked too.

    Args:
        captions: list of token lists.
        word_count: mapping from token to its occurrence count, indexed with
            ``word_count[token]`` (a ``collections.Counter`` yields 0 for
            unseen tokens).
        minimum_frequency: counts at or below this value are masked.

    Returns:
        A new list of token lists with rare tokens replaced by '<unk>'.
    """
    return [
        ['<unk>' if word_count[token] <= minimum_frequency else token
         for token in tokens]
        for tokens in captions
    ]

In [21]:
def preproc_all(captions_train, captions_test, window_size, minimum_frequency=50):
    """Turn raw train/test captions into padded arrays of vocabulary indices.

    Pipeline: tokenize (``preprocess_captions``), mask rare words with
    '<unk>' based on *training* frequencies (``unk_captions``), pad
    (``pad_captions``), then map tokens to integer ids. Ids are assigned in
    order of first appearance in the padded training captions.

    Differences from a naive pipeline, all of which only matter in cases that
    previously produced wrong output or a KeyError:
      * '<start>' and '<end>' are never masked, even when there are no more
        than ``minimum_frequency`` training captions.
      * '<pad>' and '<unk>' are always present in the vocabulary (appended
        after the training tokens if training never produced them).
      * Test tokens missing from the vocabulary map to '<unk>'.

    Args:
        captions_train: iterable of training caption strings.
        captions_test: iterable of test/validation caption strings.
        window_size: caption length budget forwarded to the helpers.
        minimum_frequency: tokens whose training count is at or below this
            value are replaced by '<unk>'.

    Returns:
        ``(train_ids, test_ids, word2idx)``: two numpy arrays of token ids and
        the token -> id dictionary.
    """
    clean_caps_train = preprocess_captions(captions_train, window_size)
    clean_caps_test = preprocess_captions(captions_test, window_size)

    word_count = collections.Counter()
    for caption in clean_caps_train:
        word_count.update(caption)
    # Sentinels occur once per caption, so on small training sets their count
    # falls at or below the threshold; lift it so they survive masking.
    for token in ('<start>', '<end>'):
        word_count[token] = max(word_count[token], minimum_frequency + 1)

    masked_caps_train = unk_captions(clean_caps_train, word_count, minimum_frequency)
    masked_caps_test = unk_captions(clean_caps_test, word_count, minimum_frequency)

    padded_caps_train = pad_captions(masked_caps_train, window_size)
    padded_caps_test = pad_captions(masked_caps_test, window_size)

    # First-appearance order over the training captions defines the ids.
    word2idx = {}
    for caption in padded_caps_train:
        for word in caption:
            word2idx.setdefault(word, len(word2idx))
    # Downstream code indexes word2idx['<pad>']; test captions may need
    # '<unk>'/'<pad>' even if training never produced them.
    for token in ('<pad>', '<unk>'):
        word2idx.setdefault(token, len(word2idx))

    unk_idx = word2idx['<unk>']
    train_ids = [[word2idx[word] for word in caption] for caption in padded_caps_train]
    test_ids = [
        [word2idx.get(word, unk_idx) for word in caption]
        for caption in padded_caps_test
    ]

    return np.array(train_ids), np.array(test_ids), word2idx

In [22]:
# Frame features and raw captions: training uses the recipe-type '101' folder
# of the training split (demo_dir), validation the same recipe type from the
# validation split.
train_frames, train_caps = get_data(demo_dir)
val_frames, val_caps = get_data(f"{val_dir}/101")

# Replace the raw caption strings with padded arrays of vocabulary indices.
train_caps, val_caps, word2idx = preproc_all(
    captions_train=train_caps,
    captions_test=val_caps,
    window_size=window_size,
)

In [23]:
def train_model(train, word2idx, epochs, batch_size, hidden_size, window_size, valid=None, learning_rate=0.001):
    """Build a TransformerDecoder captioning model and train it.

    Args:
        train: pair whose two items are passed, in order, as the first two
            positional arguments of ``model.train`` (the notebook passes
            ``(captions, frames)``).
        word2idx: token -> id dictionary; must contain '<pad>'. Its size is
            used as the decoder's vocabulary size.
        epochs: number of passes over the training data.
        batch_size: forwarded to ``model.train`` / ``model.test``.
        hidden_size: forwarded to ``TransformerDecoder``.
        window_size: forwarded to ``TransformerDecoder``.
        valid: optional pair laid out like ``train``; when given (and
            truthy), ``model.test`` is run on it after every epoch.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        ``(model, stats)``: the trained model and a list holding the value
        returned by ``model.train`` for each epoch. Previously both were
        discarded and the function returned None, so the trained model could
        not be used afterwards.
    """
    decoder = TransformerDecoder(
        vocab_size  = len(word2idx),
        hidden_size = hidden_size,
        window_size = window_size
    )

    model = ImageCaptionModel(decoder)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer   = optimizer,
        loss        = loss_function,
        metrics     = [accuracy_function]
    )

    pad_idx = word2idx['<pad>']
    stats = []
    for _ in range(epochs):
        stats.append(model.train(train[0], train[1], pad_idx, batch_size=batch_size))
        if valid:
            model.test(valid[0], valid[1], pad_idx, batch_size=batch_size)

    return model, stats

In [24]:
# Train for 5 epochs on the (captions, frames) pairs, validating after each.
train_model(
    (train_caps, train_frames),
    word2idx,
    epochs=5,
    batch_size=50,
    hidden_size=64,
    window_size=window_size,
    valid=(val_caps, val_frames),
)

[Training 80/80]	 loss=3.516	 acc: 0.228	 perp: 33.641
[Valid 2/2]	 loss=3.197	 acc: 0.253	 perp: 24.456
[Training 80/80]	 loss=1.855	 acc: 0.585	 perp: 6.3911
[Valid 2/2]	 loss=3.345	 acc: 0.248	 perp: 28.370
[Training 80/80]	 loss=0.974	 acc: 0.796	 perp: 2.649
[Valid 2/2]	 loss=3.599	 acc: 0.223	 perp: 36.561
[Training 80/80]	 loss=0.520	 acc: 0.902	 perp: 1.682
[Valid 2/2]	 loss=4.011	 acc: 0.200	 perp: 55.221
[Training 80/80]	 loss=0.293	 acc: 0.955	 perp: 1.341
[Valid 2/2]	 loss=4.345	 acc: 0.191	 perp: 77.113
