In [108]:
import json
import os
import pandas as pd
import numpy as np
import collections
import re

In [9]:
# Dataset locations, all expressed relative to the YouCookII root folder.
root_path = './YouCookII/'
anno_path = f"{root_path}annotations/youcookii_annotations_trainval.json"
feat_path = f"{root_path}features/feat_csv/"

# Name of the per-split sub-folder that holds the frame-feature CSVs.
feat_path_keywords = {
    split: f"{split}_frame_feat_csv" for split in ('train', 'val', 'test')
}

In [7]:
# Load the annotation JSON. The context manager closes the file handle even
# if parsing raises, instead of leaving it open for the life of the kernel.
with open(anno_path, 'rb') as f:
    anno_dict = json.load(f)

In [24]:
# The label file has no header row: column 0 is used as the key and column 1
# as the value of the lookup (presumably label id -> food type name; verify
# against the CSV).
food_types = pd.read_csv(root_path + 'label_foodtype.csv', header=None)
idx = food_types[0]
types = food_types[1]
idx2type = dict(zip(idx, types))

In [44]:
# Work on a single folder of the training split ('101') as a demo, and load
# the feature CSV of one sample video from it.
train_feat_dir = feat_path + feat_path_keywords['train']
demo_dir = train_feat_dir + '/101'
sample_feat_file = demo_dir + '/0O4bxhpFX9o/0001/resnet_34_feat_mscoco.csv'
feat = pd.read_csv(sample_feat_file, header=None)

In [57]:
# Inspect the annotation record of one sample video (the id whose feature CSV
# is loaded as the demo file).
anno_dict['database']['0O4bxhpFX9o']

{'duration': 311.77,
 'subset': 'training',
 'recipe_type': '101',
 'annotations': [{'segment': [41, 54],
   'id': 0,
   'sentence': 'place the bacon slices on a baking pan and cook them in an oven'},
  {'segment': [84, 122],
   'id': 1,
   'sentence': 'cut the tomatoes into thin slices'},
  {'segment': [130, 135],
   'id': 2,
   'sentence': 'toast the bread slices in the toaster'},
  {'segment': [147, 190],
   'id': 3,
   'sentence': 'spread mayonnaise on the bread and place bacon slices lettuce and tomato slices on top'},
  {'segment': [192, 195], 'id': 4, 'sentence': 'top the sandwich with bread'}],
 'video_url': 'https://www.youtube.com/watch?v=0O4bxhpFX9o'}

In [89]:
# os.listdir returns entries in arbitrary, platform-dependent order; sort them
# so the frame/caption lists built from this listing are reproducible.
all_vid_names = sorted(os.listdir(demo_dir))
vid2cap = {}

In [90]:
# Parallel accumulators: all_frames[i] is the feature row captioned by all_caps[i].
all_frames, all_caps = [], []

In [91]:
# Pair every feature row that falls inside an annotated segment with that
# segment's caption, appending to the parallel lists all_frames / all_caps.

# Divisor used to turn a timestamp into a feature-row index.
# NOTE(review): presumably the number of sampled frames per video; confirm
# against the feature files.
FRAMES_PER_VIDEO = 500

for vid in all_vid_names:
    # Skip the macOS metadata entry *before* registering the video, so it
    # never receives a bogus record in vid2cap.
    if vid == '.DS_Store':
        continue
    # Registered with empty lists; this loop does not populate them.
    vid2cap[vid] = {'frames': [], 'cap': []}

    vid_feat = pd.read_csv(demo_dir + '/' + vid + '/0001/resnet_34_feat_mscoco.csv', header=None)
    vid_info = anno_dict['database'][vid]
    vid_segs = vid_info['annotations']
    vid_len = vid_info['duration']
    samp_rate = vid_len / FRAMES_PER_VIDEO  # video duration covered by one sampled frame
    last_row = len(vid_feat) - 1

    for segments in vid_segs:
        start, end = segments['segment']
        cap = segments['sentence']
        start_fr = int(np.ceil(start / samp_rate))
        # A segment ending at (or past) the video duration maps to an index one
        # past the last feature row; clamp so .iloc cannot raise IndexError.
        end_fr = min(int(np.floor(end / samp_rate)), last_row)
        for frame_num in range(start_fr, end_fr + 1):
            all_frames.append(vid_feat.iloc[frame_num])
            all_caps.append(cap)

In [102]:
def preprocess_captions(captions, window_size):
    """Clean and tokenise captions, wrapping each in <start>/<end> markers.

    Adapted from:
    https://towardsdatascience.com/image-captions-with-attention-in-tensorflow-step-by-step-927dad3569fa

    Args:
        captions: Iterable of caption strings.
        window_size: At most ``window_size - 1`` cleaned words are kept per
            caption; values below 1 keep no words at all.

    Returns:
        A new list holding one token list per input caption, of the form
        ``['<start>', word, ..., '<end>']``. The input is not modified.
    """
    # Clamp the slice bound: a negative bound would silently drop words from
    # the END of the caption instead of truncating it.
    max_words = max(window_size - 1, 0)

    caps_ret = []
    for caption in captions:
        # Lowercase, then collapse every run of non-alphanumeric characters
        # into a single space.
        caption_nopunct = re.sub(r"[^a-zA-Z0-9]+", ' ', caption.lower())

        # Keep only purely alphabetic words longer than one character
        # (drops digits, mixed alphanumerics and single letters).
        clean_words = [
            word for word in caption_nopunct.split()
            if len(word) > 1 and word.isalpha()
        ]

        # Truncate and add the sequence boundary tokens.
        caps_ret.append(['<start>'] + clean_words[:max_words] + ['<end>'])
    return caps_ret

In [105]:
# Clean every collected caption, keeping at most 19 words between the
# <start>/<end> markers.
clean_caps = preprocess_captions(captions=all_caps, window_size=20)

In [109]:
# Tally how often each token occurs across all cleaned captions.
word_count = collections.Counter(
    token for caption in clean_caps for token in caption
)

In [111]:
# Show the 50 most frequent tokens together with their counts.
word_count.most_common(50)

[('<start>', 3650),
 ('<end>', 3650),
 ('the', 3130),
 ('and', 2438),
 ('on', 2204),
 ('bread', 1405),
 ('bacon', 1381),
 ('of', 1290),
 ('slices', 1093),
 ('top', 1037),
 ('place', 783),
 ('pan', 624),
 ('lettuce', 607),
 ('spread', 586),
 ('with', 571),
 ('some', 561),
 ('in', 514),
 ('tomato', 509),
 ('add', 469),
 ('slice', 426),
 ('mayonnaise', 388),
 ('put', 387),
 ('into', 356),
 ('toast', 305),
 ('fry', 291),
 ('tomatoes', 288),
 ('other', 285),
 ('to', 271),
 ('pieces', 260),
 ('cut', 251),
 ('cheese', 238),
 ('it', 236),
 ('take', 216),
 ('lemon', 215),
 ('oil', 209),
 ('sandwich', 207),
 ('side', 202),
 ('pepper', 201),
 ('salt', 195),
 ('half', 189),
 ('one', 187),
 ('cover', 178),
 ('mustard', 178),
 ('onto', 173),
 ('them', 171),
 ('chicken', 170),
 ('cook', 167),
 ('juice', 164),
 ('avocado', 163),
 ('dressing', 154)]