In [1]:
import json
import os
import pandas as pd
import numpy as np
import collections
import re
import copy
import tensorflow as tf
import random

from transformer import PositionalEncoding
from model import ImageCaptionModel, accuracy_function, loss_function
from decoder import TransformerDecoder, RNNDecoder

In [2]:
%load_ext autoreload
%autoreload 2
import transformer, model, decoder
%aimport transformer, model, decoder

In [3]:
root_path = './YouCookII/'
anno_path = root_path + "annotations/youcookii_annotations_trainval.json"
feat_path = root_path + 'features/feat_csv/'

feat_path_keywords = {'train': 'train_frame_feat_csv', 'val': 'val_frame_feat_csv', 'test': 'test_frame_feat_csv'}

In [4]:
f = open(anno_path, 'rb')
anno_dict = json.load(f)

In [5]:
food_types = pd.read_csv(root_path + 'label_foodtype.csv', header=None)
idx, types = food_types[0], food_types[1]
idx2type = {i: t for i, t in zip(idx, types)}

In [6]:
train_dir = feat_path + feat_path_keywords['train']
val_dir = feat_path + feat_path_keywords['val']
test_dir = feat_path + feat_path_keywords['test']

In [7]:
anno_dict['database']['0O4bxhpFX9o']

{'duration': 311.77,
 'subset': 'training',
 'recipe_type': '101',
 'annotations': [{'segment': [41, 54],
   'id': 0,
   'sentence': 'place the bacon slices on a baking pan and cook them in an oven'},
  {'segment': [84, 122],
   'id': 1,
   'sentence': 'cut the tomatoes into thin slices'},
  {'segment': [130, 135],
   'id': 2,
   'sentence': 'toast the bread slices in the toaster'},
  {'segment': [147, 190],
   'id': 3,
   'sentence': 'spread mayonnaise on the bread and place bacon slices lettuce and tomato slices on top'},
  {'segment': [192, 195], 'id': 4, 'sentence': 'top the sandwich with bread'}],
 'video_url': 'https://www.youtube.com/watch?v=0O4bxhpFX9o'}

In [8]:
def get_data(dirs):
    all_frames = []
    all_caps = []
    c_length = []
    for dir_name in dirs:
        all_vid_names = os.listdir(dir_name)
        for vid in all_vid_names:
            if vid == '.DS_Store':
                continue
            vid_dir = dir_name + '/' + vid
            vid_feat1 = pd.read_csv(dir_name + '/' + vid + '/0001/resnet_34_feat_mscoco.csv',
                                    header=None)
            feats = [vid_feat1]
            for vid_feat in feats:
                vid_segs = anno_dict['database'][vid]['annotations']
                vid_len = anno_dict['database'][vid]['duration']
                samp_rate = vid_len / 500  # 一般来说这采样率 = length / 一秒多少帧
                num_segs = len(vid_segs)
                for segments in vid_segs:
                    start, end = segments['segment']
                    cap = segments['sentence']
                    start_fr = int(np.ceil(start / samp_rate))
                    end_fr = int(np.floor(end / samp_rate))
                    c_length.append(end_fr-start_fr+1)
                    frame_idx = []
                    # random sample返回的时间顺序会是错的 && 不等距 -> 0 + i * frame gap
                    # if end_fr - start_fr < 3:
                    #     frame_idx = list(range(start_fr, end_fr + 1))
                    #     while len(frame_idx) < 3:
                    #         frame_idx += random.sample(list(range(start_fr, end_fr + 1)), 1)
                    # else:
                    #     frame_idx = random.sample(list(range(start_fr, end_fr + 1)), 3)
                    frame_gap = (end_fr - start_fr + 1) // min(3, end_fr - start_fr + 1)
                    i = 0
                    while i * frame_gap + start_fr <= end_fr:
                        frame_idx.append(i * frame_gap + start_fr)
                        i += 1
                    # 一个segment的frame对应一个caption，而不是把每一帧都当做一个caption
                    sub_frames = []
                    for frame_num in frame_idx:
                        frame = vid_feat.iloc[frame_num]
                        sub_frames.append(frame.to_numpy())
                    # print(len(sub_frames))
                    max_len = 15
                    sub_frames = sub_frames[0:max_len]
                    padd_num = max_len-len(sub_frames)
                    if padd_num > 0:
                        for k in range(padd_num):
                            padd_instant = np.zeros_like(sub_frames[0])
                            sub_frames.append(padd_instant)
                    sub_frames = np.array(sub_frames)
                    all_frames.append(sub_frames)
                    all_caps.append(cap)
    print("avg_length {}".format(np.mean(c_length)))
    return np.array(all_frames), np.array(all_caps)

# Preprocessing Captions

In [9]:
window_size = 20

In [10]:
def preprocess_captions(captions, window_size):
    caps_ret = []
    for i, caption in enumerate(captions):
        # Taken from:
        # https://towardsdatascience.com/image-captions-with-attention-in-tensorflow-step-by-step-927dad3569fa

        # Convert the caption to lowercase, and then remove all special characters from it
        # caption_nopunct = re.sub(r"[^a-zA-Z0-9]+", ' ', caption.lower())
        # TODO: this step can be handled with keras tokenizer?

        # Split the caption into separate words, and collect all words which are more than 
        # one character and which contain only alphabets (ie. discard words with mixed alpha-numerics)
        clean_words = []
        for word in caption.split():
            if word.isalpha():
                clean_words.append(word)
            elif word.isnumeric():
                clean_words.append('<num>')

        # Join those words into a string
        caption_new = ['<start>'] + clean_words[:window_size - 1] + ['<end>']

        # Replace the old caption in the captions list with this new cleaned caption
        caps_ret.append(caption_new)
    return caps_ret

In [11]:
def build_vocab(captions, vocab_size):
    word_count = collections.Counter()
    for caption in captions:
        word_count.update(caption)
    
    vocab = [word for word, count in word_count.most_common(vocab_size)]
    return vocab

In [12]:
def unk_captions(captions, vocab):
    temp = copy.deepcopy(captions)
    for caption in temp:
        for index, word in enumerate(caption):
            if word not in vocab:
                caption[index] = '<unk>'
    return temp

In [13]:
def pad_captions(captions, window_size):
    pad_cap = copy.deepcopy(captions)
    for caption in pad_cap:
        caption += (window_size + 1 - len(caption)) * ['<pad>']
    return pad_cap

In [14]:
def preproc_all(captions_train, captions_test, window_size):
    clean_caps_train = preprocess_captions(captions_train, window_size)
    clean_caps_test = preprocess_captions(captions_test, window_size)
    
    vocab = build_vocab(clean_caps_train, 1800)
    
    masked_caps_train = unk_captions(clean_caps_train, vocab)
    masked_caps_test = unk_captions(clean_caps_test, vocab)
    
    padded_caps_train = pad_captions(masked_caps_train, window_size)
    padded_caps_test = pad_captions(masked_caps_test, window_size)
    
    word2idx = {}
    vocab_size = 0
    for caption in padded_caps_train:
        for index, word in enumerate(caption):
            if word in word2idx:
                caption[index] = word2idx[word]
            else:
                word2idx[word] = vocab_size
                caption[index] = vocab_size
                vocab_size += 1
    for caption in padded_caps_test:
        for index, word in enumerate(caption):
            caption[index] = word2idx[word]
    
    return np.array(padded_caps_train), np.array(padded_caps_test), word2idx

In [15]:
train_subdir = os.listdir(train_dir)
# train_subdir.remove('.DS_Store')
train_paths = [os.path.join(train_dir, subdir) for subdir in train_subdir]
val_paths = [os.path.join(val_dir, subdir) for subdir in train_subdir]

In [16]:
train_frames, train_caps = get_data(train_paths)
val_frames, val_caps = get_data(val_paths)

avg_length 31.420334719938086
avg_length 32.19730813287514


In [17]:
train_caps_token, val_caps_token, word2idx = preproc_all(train_caps, val_caps, window_size)

In [18]:
print(train_caps.shape, val_caps.shape)
print(train_caps_token.shape, val_caps_token.shape)

(10337,) (3492,)
(10337, 21) (3492, 21)


In [19]:
# 最好用TFVisionEncoderDecoderModel
def train_model(train, word2idx, epochs, batch_size, hidden_size, window_size, valid=None):
    print('train[0] shape', train[0].shape)
    print('train[1] shape', train[1].shape)

    decoder = TransformerDecoder(
        vocab_size  = len(word2idx), 
        hidden_size = hidden_size, 
        window_size = window_size
    )

    model = ImageCaptionModel(decoder)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(
        optimizer   = optimizer,
        loss        = loss_function,
        metrics     = [accuracy_function]
    )

    stats = []
    for epoch in range(epochs):
        stats += [model.train(train[0], train[1], word2idx['<pad>'], batch_size=batch_size)]
        if valid:
            model.test(valid[0], valid[1], word2idx['<pad>'], batch_size=batch_size)
    return model

In [20]:
model = train_model((train_caps_token, train_frames),
                    word2idx,
                    epochs=60,
                    batch_size=64,
                    hidden_size=128, #256 #block num <=2
                    window_size=window_size,
                    valid=(val_caps_token, val_frames))

train[0] shape (10337, 21)
train[1] shape (10337, 15, 512)
Metal device set to: Apple M1


2022-12-08 13:29:19.252328: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-08 13:29:19.252962: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-12-08 13:29:20.181212: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-08 13:29:20.189026: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-08 13:29:20.637192: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[Valid 1/54]	 loss=6.006	 acc: 0.177	 perp: 405.8553263

2022-12-08 13:29:36.683857: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[Valid 54/54]	 loss=5.981	 acc: 0.173	 perp: 396.001
[Valid 54/54]	 loss=5.130	 acc: 0.222	 perp: 169.02130
[Valid 54/54]	 loss=4.635	 acc: 0.225	 perp: 103.06958
[Valid 54/54]	 loss=4.302	 acc: 0.248	 perp: 73.82934
[Valid 54/54]	 loss=4.085	 acc: 0.268	 perp: 59.44277
[Valid 54/54]	 loss=3.925	 acc: 0.285	 perp: 50.63004
[Valid 54/54]	 loss=3.790	 acc: 0.299	 perp: 44.27133
[Valid 54/54]	 loss=3.681	 acc: 0.308	 perp: 39.68376
[Valid 54/54]	 loss=3.596	 acc: 0.321	 perp: 36.44635
[Valid 54/54]	 loss=3.525	 acc: 0.332	 perp: 33.96209
[Valid 54/54]	 loss=3.473	 acc: 0.336	 perp: 32.23327
[Valid 54/54]	 loss=3.422	 acc: 0.343	 perp: 30.61905
[Valid 54/54]	 loss=3.386	 acc: 0.347	 perp: 29.54064
[Valid 54/54]	 loss=3.348	 acc: 0.352	 perp: 28.45313
[Valid 54/54]	 loss=3.322	 acc: 0.355	 perp: 27.71689
[Valid 54/54]	 loss=3.295	 acc: 0.358	 perp: 26.97737
[Valid 54/54]	 loss=3.279	 acc: 0.361	 perp: 26.56138
[Valid 54/54]	 loss=3.256	 acc: 0.363	 perp: 25.93841
[Valid 54/54]	 loss=3.239	 

In [24]:
train_frames.shape

(10337, 15, 512)

In [31]:
def gen_caption_temperature(model, image_embedding, tokens, wordToIds, padID, temp, window_length):
    """
    Function used to generate a caption using an ImageCaptionModel given
    an image embedding. 
    """
    print('tokens', tokens[1:])  # answer
    idsToWords = {id: word for word, id in wordToIds.items()}
    unk_token = wordToIds['<unk>']
    caption_so_far = [wordToIds['<start>']]
    teacher_based_out = []
    inp = np.array([tokens[0:-1]])
    logits = model(np.expand_dims(image_embedding, 0), inp)
    probs = tf.nn.softmax(logits).numpy()
    teacher_based_out = np.argmax(probs, axis=2)
    print("tea", list(teacher_based_out[0]))  # prediction
    # 前几个word的预测效果很差导致最后采样生成的句子偏差很大, 
    # 1. 模型：teaching forcing loss本身的缺陷 
    # 2. 预处理：图片信息的预处理有问题，信息没有被有效的利用
    # 3. dataset本身：可能作者自己都没有跑过这个task，只提出了dataset，他们自己也不知道baseline；或者这个dataset本身就不可解
    while len(caption_so_far) < window_length and caption_so_far[-1] != wordToIds['<end>']:
        caption_input = np.array([caption_so_far + ((window_length - len(caption_so_far)) * [padID])])
        logits = model(np.expand_dims(image_embedding, 0), caption_input)
        logits = logits[0][len(caption_so_far) - 1]
        probs = tf.nn.softmax(logits / temp).numpy()
        next_token = unk_token
        attempts = 0
        while next_token == unk_token and attempts < 5:
            next_token = np.random.choice(len(probs), p=probs)
            next_token =np.argmax(probs)
            attempts += 1
        caption_so_far.append(next_token)
    return ' '.join([idsToWords[x] for x in caption_so_far][1:])


temperature = .05
for i in range(10):
    t = gen_caption_temperature(model, train_frames[i], train_caps_token[i], word2idx, word2idx['<pad>'], temperature, window_size)
    #print(t)

tokens [1 2 3 4 5 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
tea [1, 4, 3, 4, 145, 6, 6, 9, 9, 9, 9, 435, 435, 435, 546, 546, 546, 546, 9, 6]
tokens [ 1  8  9 10  3  4  2  6  7  7  7  7  7  7  7  7  7  7  7  7]
tea [14, 4, 25, 10, 3, 4, 65, 6, 6, 3, 9, 435, 435, 9, 546, 546, 546, 546, 3, 6]
tokens [ 1 11  3  4  2  6  7  7  7  7  7  7  7  7  7  7  7  7  7  7]
tea [14, 4, 3, 4, 108, 6, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 546, 9, 9, 6]
tokens [12 13  4  5  6  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7]
tea [14, 13, 4, 253, 9, 31, 9, 9, 9, 9, 10, 10, 10, 10, 10, 546, 546, 546, 9, 6]
tokens [14  4 12 15 16 17 18  6  7  7  7  7  7  7  7  7  7  7  7  7]
tea [14, 4, 1097, 31, 4, 18, 18, 6, 193, 9, 9, 9, 9, 10, 9, 546, 546, 9, 9, 6]
tokens [19  4  2 20  4  5  6  7  7  7  7  7  7  7  7  7  7  7  7  7]
tea [1, 4, 181, 25, 4, 253, 6, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
tokens [21  4  5 22  6  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7]
tea [14, 4, 1096, 22, 6, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 6]
token

In [30]:
train_caps[:10]

array(['add rice to the seaweed', 'add avocado and cucumber to the rice',
       'add crab to the rice', 'roll up the seaweed',
       'cut the roll with a sharp knife',
       'spread the rice onto the seaweed', 'flip the seaweed over',
       'place the crab and celery on the seaweed in a line',
       'roll up and press down with the mat', 'cut the roll into pieces'],
      dtype='<U226')