In [3]:
import ast
import csv
import json
import os
import re

import cv2, numpy, youtube_dl
import pandas as pd

# activity net util

### NOTE: the file organization has changed since these cells were run, so the hardcoded file paths may no longer work

In [202]:
# Load the raw ActivityNet metadata. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open('raw_data/activity_net.json') as f:
    data = json.load(f)

# Flatten the `database` mapping (video id -> metadata dict) into one list per
# column; every list follows the mapping's iteration order.
videos = data['database']
d = {"id": list(videos)}
for column in ("duration", "subset", "resolution", "url", "annotations"):
    d[column] = [meta[column] for meta in videos.values()]

# Show the first video's raw annotation list so its structure is visible
# next to the table below.
if d['annotations']:
    print(d['annotations'][0])

activitynet_df = pd.DataFrame(d)
activitynet_df.head()

[{'segment': [0.01, 123.42336739937599], 'label': 'Fun sliding down'}]


Unnamed: 0,id,duration,subset,resolution,url,annotations
0,sJFgo9H6zNo,139.042,training,640x360,https://www.youtube.com/watch?v=sJFgo9H6zNo,"[{'segment': [0.01, 123.42336739937599], 'labe..."
1,V1zhqaGFY2A,136.98,testing,658x480,https://www.youtube.com/watch?v=V1zhqaGFY2A,[]
2,JDg--pjY5gg,126.178,validation,1920x1080,https://www.youtube.com/watch?v=JDg--pjY5gg,"[{'segment': [61.420442338881465, 64.560145614..."
3,KsFid_YVsn0,235.13,testing,1920x1080,https://www.youtube.com/watch?v=KsFid_YVsn0,[]
4,-TmWR_keSfI,104.955,testing,1280x720,https://www.youtube.com/watch?v=-TmWR_keSfI,[]


In [195]:
# Serialize each annotation list to a JSON string before the frame is written
# to CSV. Values that are already strings are left untouched: without this
# guard, re-running the cell would JSON-encode the previous output again.
activitynet_df['annotations'] = activitynet_df['annotations'].apply(
    lambda annotation: annotation if isinstance(annotation, str) else json.dumps(annotation)
)
activitynet_df.head()

Unnamed: 0,id,duration,subset,resolution,url,annotations
0,sJFgo9H6zNo,139.042,training,640x360,https://www.youtube.com/watch?v=sJFgo9H6zNo,"[{""segment"": [0.01, 123.42336739937599], ""labe..."
1,V1zhqaGFY2A,136.98,testing,658x480,https://www.youtube.com/watch?v=V1zhqaGFY2A,[]
2,JDg--pjY5gg,126.178,validation,1920x1080,https://www.youtube.com/watch?v=JDg--pjY5gg,"[{""segment"": [61.420442338881465, 64.560145614..."
3,KsFid_YVsn0,235.13,testing,1920x1080,https://www.youtube.com/watch?v=KsFid_YVsn0,[]
4,-TmWR_keSfI,104.955,testing,1280x720,https://www.youtube.com/watch?v=-TmWR_keSfI,[]


In [198]:
# Persist the ActivityNet table as CSV (default to_csv settings, so the index
# is written as an extra first column).
# NOTE(review): this writes to the working directory, while the build_images
# call further down reads 'csv/activity_net.csv' — confirm which location is
# intended.
activitynet_df.to_csv('activity_net.csv')

In [115]:
# Look up one video's annotations as (row index, annotations) pairs.
list(activitynet_df.loc[activitynet_df['id'] == '-1IBHYS3L-Y', 'annotations'].items())

[(15994,
  [{'segment': [0, 184.9797935803432], 'label': 'Removing ice from car'}])]

# save frame util

In [2]:
# stackoverflow: https://stackoverflow.com/questions/66272740/extract-specific-frames-of-youtube-video-without-downloading-video
# youtubedl github: https://github.com/ytdl-org/youtube-dl/blob/master/docs/module_guide.rst
# opencv-python docs https://docs.opencv.org/3.4/d8/dfe/classcv_1_1VideoCapture.html#a9ac7f4b1cdfe624663478568486e6712

def save_frames(yt_id_list: list, time_list: list, out_dir, res='144p'):
    '''
    Save one frame per YouTube video as `<out_dir>/<video id>.png`.

    The stream is opened through its direct URL (metadata is fetched with
    download=False), so the video file itself is not downloaded.

    yt_id_list: YouTube video ids.
    time_list: for each id, the timestamp in seconds of the frame to grab
        (same length and order as yt_id_list).
    out_dir: directory the images are written to; created if it is missing.
    res: `format_note` of the stream to read (e.g. '144p'). None accepts the
        first stream that reports a frame rate.

    Videos whose metadata cannot be fetched (private, removed, ...) and videos
    for which no frame could be decoded are reported and skipped instead of
    aborting the whole batch. A summary line with the number of saved frames
    is printed at the end.
    '''
    assert len(yt_id_list) == len(time_list)

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ydl = youtube_dl.YoutubeDL()
    ydl.add_default_info_extractors()

    count_saved = 0
    for _id, time in zip(yt_id_list, time_list):
        watch_url = 'https://www.youtube.com/watch?v=' + _id
        try:
            info = ydl.extract_info(watch_url, download=False)
        except youtube_dl.utils.YoutubeDLError as err:
            print(f'skipped {_id}: {err}')
            continue

        # os.path.join works whether or not out_dir ends with a separator
        out_path = os.path.join(out_dir, f'{_id}.png')
        for f in info.get('formats') or []:
            if res is not None and f.get('format_note') != res:
                continue
            fps = f.get('fps')
            if not fps:
                # e.g. audio-only streams: without a frame rate the timestamp
                # cannot be converted to a frame index
                continue

            cap = cv2.VideoCapture(f['url'])
            try:
                # frame shown at `time`; clamped so time == 0 does not seek to -1
                frame_num = max(fps * time - 1, 0)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
                ok, frame = cap.read()
            finally:
                # always free the decoder / network connection
                cap.release()

            # only report success when a frame was decoded AND written
            if ok and cv2.imwrite(out_path, frame):
                print(f'saved {_id}')
                count_saved += 1
                break
        else:
            print(f'could not save a frame for {_id}')
    print(f'{len(yt_id_list)} total videos, {count_saved} saved')

def build_images(hellaswag_csv, activity_net_csv, out_dir='hellaswag_images/'):
    '''
    Save one frame for every ActivityNet video referenced by a hellaswag csv.

    hellaswag_csv: csv with a `source_id` column of the form
        `activitynet~v_<youtube id>`.
    activity_net_csv: csv with `id` and `annotations` columns.
    out_dir: directory handed to save_frames.

    The frame timestamp of each video comes from get_time. A video referenced
    by several hellaswag rows is processed only once, since every row would
    produce the same `<id>.png`.

    Raises ValueError when a source_id is not an ActivityNet id or when its
    video id is missing from activity_net_csv.
    '''
    hellaswag_df = pd.read_csv(hellaswag_csv)
    activity_net_df = pd.read_csv(activity_net_csv)

    # one dict lookup per video instead of scanning the whole frame per row
    annotations_by_id = dict(zip(activity_net_df['id'], activity_net_df['annotations']))

    # insertion-ordered: keeps the first-seen order of the videos
    time_by_id = {}
    for source_id in hellaswag_df['source_id']:
        match = re.search(r'activitynet~v_(.*)', str(source_id))
        if match is None:
            raise ValueError(f'not an ActivityNet source_id: {source_id!r}')
        yt_id = match.group(1)
        if yt_id in time_by_id:
            continue
        if yt_id not in annotations_by_id:
            raise ValueError(f'video id {yt_id!r} not found in {activity_net_csv}')
        time_by_id[yt_id] = get_time(annotations_by_id[yt_id])

    print('start saving frames...')
    save_frames(list(time_by_id), list(time_by_id.values()), out_dir)
    print('...end saving frames')

def get_time(annotations, duration=None):
    '''
    return the middle (in seconds) of the time range specified in the first
    annotation.
    !!! assumes first annotation is used in hellaswag.

    annotations: the annotation list of one video, either already parsed or
        serialized as a string (JSON, or the Python repr that ends up in a
        csv when the list is written without json.dumps).
    duration: optional length of the video in seconds. if there are no
        annotations, the middle of the full video (duration / 2) is returned.

    raises IndexError if there are no annotations and no duration is given.

    example annotation:
    [{'segment': [0.01, 123.42336739937599], 'label': 'Fun sliding down'}]
    '''
    if isinstance(annotations, str):
        try:
            annotations = json.loads(annotations)
        except json.JSONDecodeError:
            # Python-repr strings use single quotes. literal_eval parses them
            # safely, whereas swapping every quote character would corrupt
            # any label that contains an apostrophe.
            annotations = ast.literal_eval(annotations)

    if not annotations:
        if duration is None:
            raise IndexError('video has no annotations and no duration was given')
        return duration / 2

    segment = annotations[0]['segment']
    start, end = segment[0], segment[1]
    return (start + end) / 2



In [241]:
# Grab one frame for every video referenced by the tiny training sample.
build_images(
    hellaswag_csv='csv/tiny_hellaswag_train.csv',
    activity_net_csv='csv/activity_net.csv',
)

start saving frames...
[youtube] z8VqGGu5vPc: Downloading webpage
saved z8VqGGu5vPc
[youtube] 8FSKFy1tPQc: Downloading webpage
saved 8FSKFy1tPQc
[youtube] sxQbiXWFdKs: Downloading webpage
saved sxQbiXWFdKs
[youtube] pev7rvOE8eM: Downloading webpage
[youtube] pev7rvOE8eM: Downloading MPD manifest
saved pev7rvOE8eM
[youtube] 6dc-fQCzOiw: Downloading webpage
saved 6dc-fQCzOiw
[youtube] DW7Zm9DzEDk: Downloading webpage
saved DW7Zm9DzEDk
[youtube] AVL31l6H1uI: Downloading webpage
saved AVL31l6H1uI
[youtube] Pv6oIFroaCQ: Downloading webpage
saved Pv6oIFroaCQ
[youtube] krFle3KU4Ts: Downloading webpage
[youtube] krFle3KU4Ts: Downloading MPD manifest
saved krFle3KU4Ts
[youtube] ATk8OkvNHHQ: Downloading webpage
saved ATk8OkvNHHQ
10 total videos, 10 saved
...end saving frames


In [212]:

# Probe a single video to see how youtube_dl reports one that cannot be fetched.
ydl = youtube_dl.YoutubeDL()
ydl.add_default_info_extractors()
url = 'https://www.youtube.com/watch?v=-2dxp-mv2zo'
try:
    # download=False: only the video's metadata is requested
    info = ydl.extract_info(url, download=False)
except youtube_dl.utils.DownloadError:
    # NOTE(review): the message assumes the failure means "private";
    # presumably DownloadError is raised for other failures too — confirm.
    print('private!')


[youtube] -2dxp-mv2zo: Downloading webpage


ERROR: Private video
Sign in if you've been granted access to this video


private!


# hellaswag util

In [135]:
def create_hellaswag_csv(jsonl_filepath, out_file):
    '''
    Preprocess the hellaswag jsonl file at `jsonl_filepath` and save the
    result as a csv at `out_file`.

    Preprocessing:
    * Keep only ActivityNet prompts, i.e. rows whose `source_id` contains
      'activitynet'. Rows without a source_id are dropped.
    '''
    with open(jsonl_filepath, 'r') as f:
        df = pd.read_json(f, lines=True)

    # preprocessing
    # na=False: a missing source_id counts as "not ActivityNet" instead of
    # putting NaN into the boolean mask, which .loc refuses to index with.
    # regex=False: 'activitynet' is meant as a plain substring.
    is_activitynet = df['source_id'].str.contains('activitynet', na=False, regex=False)
    df.loc[is_activitynet].to_csv(out_file)


In [137]:
# Convert every raw hellaswag split into its filtered csv.
for name in ('train', 'test', 'val'):
    create_hellaswag_csv(
        jsonl_filepath=f'raw_data/hellaswag_{name}.jsonl',
        out_file=f'csv/hellaswag_{name}.csv',
    )

In [138]:
# Load the full hellaswag training split; lines=True reads one JSON object per line.
with open('raw_data/hellaswag_train.jsonl', 'r') as f:
    hellaswag_train_df = pd.read_json(f, lines=True)

# number of rows before any filtering
print(hellaswag_train_df.shape[0])
hellaswag_train_df.head()

39905


Unnamed: 0,ind,activity_label,ctx_a,ctx_b,ctx,split,split_type,label,endings,source_id
0,4,Removing ice from car,"Then, the man writes over the snow covering th...",then,"Then, the man writes over the snow covering th...",train,indomain,3,"[, the man adds wax to the windshield and cuts...",activitynet~v_-1IBHYS3L-Y
1,8,Baking cookies,A female chef in white uniform shows a stack o...,the pans,A female chef in white uniform shows a stack o...,train,indomain,3,"[contain egg yolks and baking soda., are then ...",activitynet~v_-2dxp-mv2zo
2,9,Baking cookies,A female chef in white uniform shows a stack o...,a knife,A female chef in white uniform shows a stack o...,train,indomain,3,[is seen moving on a board and cutting out its...,activitynet~v_-2dxp-mv2zo
3,12,Baking cookies,A tray of potatoes is loaded into the oven and...,a large tray of meat,A tray of potatoes is loaded into the oven and...,train,indomain,3,"[is placed onto a baked potato., , ls, and pic...",activitynet~v_-2dxp-mv2zo
4,27,Getting a haircut,The man in the center is demonstrating a hairs...,the man in the blue shirt,The man in the center is demonstrating a hairs...,train,indomain,2,[is standing on the sponge cutting the hair of...,activitynet~v_-JqLjPz-07E


In [139]:
# Keep only the rows whose source_id mentions 'activitynet', then report how many remain.
hellaswag_train_df = hellaswag_train_df[hellaswag_train_df['source_id'].str.contains('activitynet')]
print(len(hellaswag_train_df))
hellaswag_train_df.head()

14740


Unnamed: 0,ind,activity_label,ctx_a,ctx_b,ctx,split,split_type,label,endings,source_id
0,4,Removing ice from car,"Then, the man writes over the snow covering th...",then,"Then, the man writes over the snow covering th...",train,indomain,3,"[, the man adds wax to the windshield and cuts...",activitynet~v_-1IBHYS3L-Y
1,8,Baking cookies,A female chef in white uniform shows a stack o...,the pans,A female chef in white uniform shows a stack o...,train,indomain,3,"[contain egg yolks and baking soda., are then ...",activitynet~v_-2dxp-mv2zo
2,9,Baking cookies,A female chef in white uniform shows a stack o...,a knife,A female chef in white uniform shows a stack o...,train,indomain,3,[is seen moving on a board and cutting out its...,activitynet~v_-2dxp-mv2zo
3,12,Baking cookies,A tray of potatoes is loaded into the oven and...,a large tray of meat,A tray of potatoes is loaded into the oven and...,train,indomain,3,"[is placed onto a baked potato., , ls, and pic...",activitynet~v_-2dxp-mv2zo
4,27,Getting a haircut,The man in the center is demonstrating a hairs...,the man in the blue shirt,The man in the center is demonstrating a hairs...,train,indomain,2,[is standing on the sponge cutting the hair of...,activitynet~v_-JqLjPz-07E


In [142]:
# Write the first 10 rows of hellaswag_train_df to a small csv.
# (The former `.head(20)` line was a no-op: it was not the cell's last
# expression, so its result was neither displayed nor stored.)
tiny_hellaswag_train_df = hellaswag_train_df.iloc[:10]
tiny_hellaswag_train_df.to_csv('csv/tiny_hellaswag_train.csv')

In [237]:
import random

def create_sampled_hellaswag_csv(hellaswag_csv, out_file, n, seed=None):
    '''
    randomly sample rows from hellaswag csv to create a new csv of length n.
    every sampled row has been checked for its corresponding video being
    reachable (not private / removed).

    hellaswag_csv: csv with a `source_id` column of the form
        `activitynet~v_<youtube id>`.
    out_file: path of the csv to write; rows keep their order from the
        source csv.
    n: number of rows to sample.
    seed: optional seed for a reproducible sample. With None the module-level
        `random` state is used.

    raises ValueError if the csv has fewer than n rows, or fewer than n rows
    whose video is reachable.
    '''
    df = pd.read_csv(hellaswag_csv)
    total = df.shape[0]
    if n > total:
        raise ValueError(f'cannot sample {n} rows from a csv with {total} rows')

    ydl = youtube_dl.YoutubeDL()
    ydl.add_default_info_extractors()

    # Visit every row at most once, in random order. This always terminates
    # and never produces an out-of-range row index.
    rng = random if seed is None else random.Random(seed)
    candidates = list(range(total))
    rng.shuffle(candidates)

    reachable = {}  # yt_id -> bool, so each video is checked only once
    chosen = []
    for m in candidates:
        if len(chosen) >= n:
            break
        match = re.search(r'activitynet~v_(.*)', str(df.iloc[m]['source_id']))
        if match is None:
            # no video id to verify
            continue
        yt_id = match.group(1)
        if yt_id not in reachable:
            url = f'https://www.youtube.com/watch?v={yt_id}'
            try:
                ydl.extract_info(url, download=False)
            except youtube_dl.utils.YoutubeDLError:
                reachable[yt_id] = False
            else:
                reachable[yt_id] = True
        if reachable[yt_id]:
            chosen.append(m)

    if len(chosen) < n:
        raise ValueError(
            f'only {len(chosen)} of the {total} rows have a reachable video, {n} requested'
        )

    sampled_df = df.iloc[sorted(chosen)]
    # the index column of a previous to_csv; absent if the csv was written without it
    sampled_df = sampled_df.drop(columns='Unnamed: 0', errors='ignore')
    sampled_df.to_csv(out_file)

In [238]:
# Replace the tiny training sample with 10 random rows whose videos can be fetched.
create_sampled_hellaswag_csv(
    hellaswag_csv='csv/hellaswag_train.csv',
    out_file='csv/tiny_hellaswag_train.csv',
    n=10,
)

[youtube] z8VqGGu5vPc: Downloading webpage
[youtube] DW7Zm9DzEDk: Downloading webpage
[youtube] o-BGGr-DU5g: Downloading webpage


ERROR: Video unavailable


[youtube] krFle3KU4Ts: Downloading webpage
[youtube] krFle3KU4Ts: Downloading MPD manifest
[youtube] sxQbiXWFdKs: Downloading webpage
[youtube] ATk8OkvNHHQ: Downloading webpage
[youtube] a0hFDXB1RgY: Downloading webpage


ERROR: Video unavailable


[youtube] qr5vqi5tTL8: Downloading webpage


ERROR: Private video
Sign in if you've been granted access to this video


[youtube] 8xsLp6lqijo: Downloading webpage


ERROR: Video unavailable


[youtube] 8FSKFy1tPQc: Downloading webpage
[youtube] AVL31l6H1uI: Downloading webpage
[youtube] pev7rvOE8eM: Downloading webpage
[youtube] pev7rvOE8eM: Downloading MPD manifest
[youtube] JVfhBvlv0IY: Downloading webpage


ERROR: Video unavailable
This video is no longer available because the YouTube account associated with this video has been terminated.


[youtube] Xbk1XePzTIA: Downloading webpage


ERROR: Private video
Sign in if you've been granted access to this video


[youtube] Pv6oIFroaCQ: Downloading webpage
[youtube] 6dc-fQCzOiw: Downloading webpage


In [234]:
# Re-read the training csv, drop the index column left by the earlier to_csv,
# and write the result to a scratch file.
with open('csv/hellaswag_train.csv', 'r') as f:
    df = pd.read_csv(f).drop(columns='Unnamed: 0')
df.to_csv('test.csv')
df.head()

Unnamed: 0,ind,activity_label,ctx_a,ctx_b,ctx,split,split_type,label,endings,source_id
0,4,Removing ice from car,"Then, the man writes over the snow covering th...",then,"Then, the man writes over the snow covering th...",train,indomain,3,"[', the man adds wax to the windshield and cut...",activitynet~v_-1IBHYS3L-Y
1,8,Baking cookies,A female chef in white uniform shows a stack o...,the pans,A female chef in white uniform shows a stack o...,train,indomain,3,"['contain egg yolks and baking soda.', 'are th...",activitynet~v_-2dxp-mv2zo
2,9,Baking cookies,A female chef in white uniform shows a stack o...,a knife,A female chef in white uniform shows a stack o...,train,indomain,3,['is seen moving on a board and cutting out it...,activitynet~v_-2dxp-mv2zo
3,12,Baking cookies,A tray of potatoes is loaded into the oven and...,a large tray of meat,A tray of potatoes is loaded into the oven and...,train,indomain,3,"['is placed onto a baked potato.', ', ls, and ...",activitynet~v_-2dxp-mv2zo
4,27,Getting a haircut,The man in the center is demonstrating a hairs...,the man in the blue shirt,The man in the center is demonstrating a hairs...,train,indomain,2,['is standing on the sponge cutting the hair o...,activitynet~v_-JqLjPz-07E


In [243]:
# Print the number of lines in each split's csv.
for name in ('train', 'test', 'val'):
    with open(f'csv/hellaswag_{name}.csv', 'r') as f:
        print(f'{name}: {sum(1 for _ in f)}')

train: 14741
test: 3522
val: 3244


In [4]:
# save frame from single video

save_frames(
    yt_id_list=['-1IBHYS3L-Y'],
    time_list=[92.5],
    out_dir='hellaswag_images/',
)

[youtube] -1IBHYS3L-Y: Downloading webpage
saved -1IBHYS3L-Y
1 total videos, 1 saved
