In [1]:
import json
import os
import pandas as pd
import numpy as np
import collections
import re
import copy
import tensorflow as tf

In [2]:
# Dataset layout: every path below is derived from the YouCookII root directory.
root_path = './YouCookII/'
anno_path = f'{root_path}annotations/youcookii_annotations_trainval.json'
feat_path = f'{root_path}features/feat_csv/'

# Sub-directory of feat_path that holds the frame features of each split.
feat_path_keywords = dict(
    train='train_frame_feat_csv',
    val='val_frame_feat_csv',
    test='test_frame_feat_csv',
)

In [3]:
# Parse the train/val annotation JSON. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open for the
# lifetime of the kernel.
with open(anno_path, 'rb') as f:
    anno_dict = json.load(f)

In [4]:
# label_foodtype.csv has no header row: column 0 is used as the key and
# column 1 as the value of the idx2type lookup.
food_types = pd.read_csv(f'{root_path}label_foodtype.csv', header=None)
idx = food_types[0]
types = food_types[1]
idx2type = dict(zip(idx, types))

In [5]:
# Directory '101' inside the training-split feature folder, and the
# header-less feature table of one example video stored in it.
demo_dir = f"{feat_path}{feat_path_keywords['train']}/101"
feat = pd.read_csv(f'{demo_dir}/0O4bxhpFX9o/0001/resnet_34_feat_mscoco.csv', header=None)

In [6]:
# Display the raw annotation record of the example video: its duration, subset,
# recipe type and the list of captioned segments.
anno_dict['database']['0O4bxhpFX9o']

{'duration': 311.77,
 'subset': 'training',
 'recipe_type': '101',
 'annotations': [{'segment': [41, 54],
   'id': 0,
   'sentence': 'place the bacon slices on a baking pan and cook them in an oven'},
  {'segment': [84, 122],
   'id': 1,
   'sentence': 'cut the tomatoes into thin slices'},
  {'segment': [130, 135],
   'id': 2,
   'sentence': 'toast the bread slices in the toaster'},
  {'segment': [147, 190],
   'id': 3,
   'sentence': 'spread mayonnaise on the bread and place bacon slices lettuce and tomato slices on top'},
  {'segment': [192, 195], 'id': 4, 'sentence': 'top the sandwich with bread'}],
 'video_url': 'https://www.youtube.com/watch?v=0O4bxhpFX9o'}

In [7]:
# os.listdir returns entries in arbitrary order; sorting makes the order in
# which videos are processed (and therefore the row order of the resulting
# frame/caption lists) reproducible across runs and machines.
all_vid_names = sorted(os.listdir(demo_dir))
# Per-video container keyed by video id.
vid2cap = {}

In [8]:
# Number of feature rows assumed per video when converting seconds to row indices.
# NOTE(review): 500 is the value hard-coded in the original loop — confirm it
# matches how the feature CSVs were sampled.
FRAMES_PER_VIDEO = 500

# Build two aligned lists: one feature vector per frame that falls inside an
# annotated segment, and the caption of that segment repeated for each frame.
all_frames = []
all_caps = []
for vid in all_vid_names:
    # Skip the macOS metadata entry *before* registering the video, so that
    # vid2cap never receives a bogus '.DS_Store' key.
    if vid == '.DS_Store':
        continue
    vid2cap[vid] = {'frames': [], 'cap': []}
    vid_feat = pd.read_csv(demo_dir + '/' + vid + '/0001/resnet_34_feat_mscoco.csv', header=None)
    vid_segs = anno_dict['database'][vid]['annotations']
    vid_len = anno_dict['database'][vid]['duration']
    # Seconds of video covered by one feature row.
    samp_rate = vid_len / FRAMES_PER_VIDEO
    last_row = len(vid_feat) - 1
    for segments in vid_segs:
        start, end = segments['segment']
        cap = segments['sentence']
        # Keep only rows that lie fully inside [start, end].
        start_fr = int(np.ceil(start / samp_rate))
        # Clamp to the last available row: a segment ending at (or rounding to)
        # the end of the video would otherwise index one past the table.
        end_fr = min(int(np.floor(end / samp_rate)), last_row)
        for frame_num in range(start_fr, end_fr + 1):
            frame = vid_feat.iloc[frame_num]
            all_frames.append(frame.to_numpy())
            all_caps.append(cap)

# Preprocessing Captions

In [35]:
def preprocess_captions(captions):
    """Split captions into token lists wrapped in '<start>' / '<end>' markers.

    Adapted from:
    https://towardsdatascience.com/image-captions-with-attention-in-tensorflow-step-by-step-927dad3569fa

    Each caption is split on whitespace. Purely alphabetic tokens are kept
    unchanged (no lower-casing happens here), purely numeric tokens become
    '<num>', and every other token (punctuation attached, mixed
    alpha-numerics, ...) is dropped.

    Args:
        captions: iterable of caption strings.

    Returns:
        A new list with one token list per caption; the input is not modified.
    """
    # TODO: this step can be handled with keras tokenizer?

    def _map_token(token):
        # Returns the token to emit, or None when the token must be discarded.
        if token.isalpha():
            return token
        if token.isnumeric():
            return '<num>'
        return None

    return [
        ['<start>']
        + [kept for kept in map(_map_token, caption.split()) if kept is not None]
        + ['<end>']
        for caption in captions
    ]

In [36]:
# Turn every per-frame caption string into a '<start>' ... '<end>' token list.
clean_caps = preprocess_captions(all_caps)

In [37]:
# One Keras tokenizer instance with default settings; the following cells fit it
# and read its word_counts / word_index.
tokenizer = tf.keras.preprocessing.text.Tokenizer()

In [38]:
# Fit on a copy so the tokenizer never holds references to the caption lists.
tokenizer.fit_on_texts(copy.deepcopy(clean_caps))

# Rank tokens by frequency (most frequent first) and list the first 50.
ranked_counts = sorted(tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True)
top_50 = ranked_counts[:50]
for rank, (word, count) in enumerate(top_50, start=1):
    print(f'{rank}. {word}: {count}')

1. <start>: 3650
2. <end>: 3650
3. the: 3140
4. and: 2438
5. on: 2204
6. bread: 1405
7. bacon: 1381
8. of: 1290
9. a: 1094
10. slices: 1093
11. top: 1037
12. place: 783
13. pan: 624
14. lettuce: 607
15. spread: 586
16. with: 571
17. some: 561
18. in: 514
19. tomato: 509
20. add: 469
21. slice: 426
22. mayonnaise: 388
23. put: 387
24. into: 366
25. to: 308
26. toast: 305
27. fry: 291
28. tomatoes: 288
29. other: 285
30. pieces: 270
31. cut: 251
32. cheese: 238
33. it: 236
34. sandwich: 217
35. take: 216
36. lemon: 215
37. oil: 209
38. cook: 203
39. side: 202
40. pepper: 201
41. salt: 195
42. half: 189
43. one: 187
44. cover: 178
45. mustard: 178
46. onto: 173
47. them: 171
48. chicken: 170
49. juice: 164
50. avocado: 163


In [39]:
# Tokens that occur at most this many times are replaced by '<unk>'.
RARE_WORD_MAX_COUNT = 30

# Count tokens directly from clean_caps instead of reading tokenizer.word_counts:
# the tokenizer is fit again in later cells, which changes its counts, so
# re-running this cell would otherwise mask a different set of words.
# Keys are lower-cased to mirror the Keras tokenizer's default lower=True, and a
# Counter yields 0 for unseen keys rather than raising KeyError on tokens that
# contain capital letters.
word_count = collections.Counter(
    word.lower() for cap in clean_caps for word in cap
)

# Same captions as clean_caps with every rare token swapped for '<unk>'.
masked_caps = [
    ['<unk>' if word_count[word.lower()] <= RARE_WORD_MAX_COUNT else word for word in cap]
    for cap in clean_caps
]

In [40]:
# Fit the tokenizer on the masked captions so that '<unk>' enters its vocabulary.
# NOTE(review): this is the same `tokenizer` object that was already fit on
# clean_caps; if fit_on_texts accumulates across calls, the rare words that were
# just masked remain in word_index and the counts are inflated — confirm intended.
tokenizer.fit_on_texts(copy.deepcopy(masked_caps))

In [41]:
# Length of the longest masked caption (0 when there are no captions).
max_seq_len = max(map(len, masked_caps), default=0)

In [42]:
def pad_captions(captions, pad_len):
    """Return a deep copy of `captions` right-padded with '<pad>' tokens.

    Args:
        captions: list of token lists.
        pad_len: target length; captions that are already this long or longer
            are returned unchanged (nothing is truncated).

    Returns:
        A new list of token lists; the input is left untouched.
    """
    padded = copy.deepcopy(captions)
    for tokens in padded:
        shortfall = pad_len - len(tokens)
        # A non-positive shortfall multiplies to an empty list, i.e. a no-op.
        tokens.extend(['<pad>'] * shortfall)
    return padded


# Pad every masked caption up to the length of the longest one.
padded_caps = pad_captions(masked_caps, max_seq_len)

In [43]:
# Fit on the padded captions so that '<pad>' enters the tokenizer's vocabulary.
# NOTE(review): `tokenizer` was already fit on clean_caps and masked_caps in
# earlier cells; if fit_on_texts accumulates across calls, word_index still
# contains the rare words that were masked to '<unk>' — confirm this is intended.
tokenizer.fit_on_texts(copy.deepcopy(padded_caps))

# Forward (word -> id) and reverse (id -> word) lookups from the tokenizer's index.
word2idx = dict(tokenizer.word_index)
idx2word = {position: token for token, position in word2idx.items()}

In [44]:
# Stack the per-frame feature vectors and integer-encode the padded captions;
# both arrays have one row per collected frame.
frames_mat = np.asarray(all_frames)
caps_mat = np.asarray(tokenizer.texts_to_sequences(padded_caps))

In [45]:
# Sanity check: caption matrix shape first, frame-feature matrix shape second.
print(caps_mat.shape, frames_mat.shape, sep='\n')

(3650, 29)
(3650, 512)


In [46]:
from transformer import PositionalEncoding
from model import ImageCaptionModel, accuracy_function, loss_function
from decoder import TransformerDecoder, RNNDecoder

In [47]:
# Training hyper-parameters.
epochs = 5
batch_size = 100
hidden_size = 64
# NOTE(review): 29 equals the padded caption length (second dimension of
# caps_mat) in this run — confirm whether it should be derived from the data.
window_size = 29

decoder = TransformerDecoder(
    # The Keras tokenizer numbers words starting at 1 (index 0 is reserved), so
    # the largest id that can appear in caps_mat is len(word2idx). Reserve one
    # extra slot so that every id is a valid index into a vocab_size-sized table.
    vocab_size  = len(word2idx) + 1,
    hidden_size = hidden_size,
    window_size = window_size
)

model = ImageCaptionModel(decoder)

optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(
    optimizer   = optimizer,
    loss        = loss_function,
    metrics     = [accuracy_function]
)

# One entry per completed epoch, as returned by model.train.
stats = []
try:
    for epoch in range(epochs):
        stats.append(model.train(caps_mat, frames_mat, word2idx['<pad>'], batch_size=batch_size))
except KeyboardInterrupt:
    # An interrupt after the first epoch keeps the partially trained model;
    # an interrupt during the first epoch is propagated unchanged.
    if epoch > 0:
        print("Key-value interruption. Trying to early-terminate. Interrupt again to not do that!")
    else:
        raise

2022-12-01 15:56:04.289640: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-01 15:56:04.295505: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-01 15:56:04.880786: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[Training 36/36]	 loss=4.469	 acc: 0.104	 perp: 87.2369
[Training 36/36]	 loss=3.125	 acc: 0.306	 perp: 22.760
[Training 36/36]	 loss=2.030	 acc: 0.527	 perp: 7.6153
[Training 36/36]	 loss=1.253	 acc: 0.711	 perp: 3.500
[Training 36/36]	 loss=0.802	 acc: 0.823	 perp: 2.229
