Activate conda w251_project3

In [1]:
import cv2
import os
import urllib
import pandas as pd
import re
import numpy as np

In [53]:
project_path = r'D:\UCBerkeley\CourseWork\202001\W251\Homework\Project'
excel_name = 'SignVideoInventory.xlsx'
excel_sheet_name = 'select19_input'
video_url_prefix = 'http://csr.bu.edu/ftp/asl/asllvd/asl-data2/quicktime/'
dir_original_video = 'original_videos'
dir_trimmed_video = 'trimmed_videos'
dir_train = 'train_videos'
dir_val = 'val_videos'
dir_train_val = 'train_val_videos'
validation_pct = 0.4

In [3]:
# Read Excel file 
inventory_df = pd.read_excel(os.path.join(project_path, excel_name), sheet_name=excel_sheet_name)
# Construct video names
inventory_video_name_df = inventory_df['Session'].map(str) + '/scene' \
                          + inventory_df['Scene'].map(str) + '-camera1.mov'
# Get a list of all videos
inventory_video_name_lst = inventory_video_name_df.tolist()
# Get a list of unique videos
inventory_video_name_set = set(inventory_video_name_lst)
# Get a list of tuples with start and end frames
inventory_start_end_frame_lst = inventory_df[['Start', 'End']].apply(tuple, axis=1).tolist()
# Get a list of words
inventory_word_lst = inventory_df['Main New Gloss'].tolist()
# Get a list of variants
inventory_variant_lst = inventory_df['Variant'].tolist()

# Sanity check
# 95 original videos should be downloaded
print(len(inventory_video_name_set))

95


In [4]:
def download_original_videos(p_path, out_dir, video_name_set):
    for video_name in video_name_set:
        # Use regular expression to replace / with _
        saved_video_name = re.sub('/', '_', video_name)
        # Download only if it doesn't exist yet
        print(saved_video_name)
        if not os.path.isfile(os.path.join(p_path, out_dir, saved_video_name)):
            # Create an URL to download each video
            video_url = video_url_prefix + video_name
            # Download videos
            urllib.request.urlretrieve(video_url, os.path.join(p_path, out_dir, saved_video_name)) 

download_original_videos(project_path, dir_original_video, inventory_video_name_set)

ASL_2011_06_08_Brady_scene29-camera1.mov
ASL_2008_05_12a_scene29-camera1.mov
ASL_2008_05_29a_scene1-camera1.mov
ASL_2008_05_12b_scene15-camera1.mov
ASL_2006_10_10_scene5-camera1.mov
ASL_2008_08_06_scene22-camera1.mov
ASL_2011_06_08_Brady_scene30-camera1.mov
ASL_2008_08_04_scene18-camera1.mov
ASL_2008_08_04_scene25-camera1.mov
ASL_2008_02_15_scene45-camera1.mov
ASL_2008_01_11_scene9-camera1.mov
ASL_2008_05_12a_scene18-camera1.mov
ASL_2006_10_10_scene6-camera1.mov
ASL_2008_05_12b_scene6-camera1.mov
ASL_2008_08_06_scene6-camera1.mov
ASL_2011_06_08_Brady_scene36-camera1.mov
ASL_2011_06_14_Brady_scene2-camera1.mov
ASL_2008_08_04_scene28-camera1.mov
ASL_2008_01_11_scene81-camera1.mov
ASL_2011_06_08_Brady_scene18-camera1.mov
ASL_2011_07_19_Brady_scene39-camera1.mov
ASL_2007_05_24_scene12-camera1.mov
ASL_2011_06_08_Brady_scene37-camera1.mov
ASL_2008_05_12a_scene44-camera1.mov
ASL_2008_08_06_scene15-camera1.mov
ASL_2011_06_08_Brady_scene28-camera1.mov
ASL_2008_01_18_scene15-camera1.mov
ASL_2008

In [5]:
def trim_one_video (p_path, in_dir, in_video, our_dir, out_video, start_frame, end_frame):
    cap = cv2.VideoCapture(os.path.join(p_path, in_dir, in_video))
    fps = cap.get(cv2.CAP_PROP_FPS)
    width_frame  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height_frame = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 
    
    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    out = cv2.VideoWriter(os.path.join(p_path, our_dir, out_video),fourcc, fps, (width_frame, height_frame))
    counter_frame = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if ret:
            # frame = cv2.flip(frame,0)
            # write the flipped frame
            if start_frame <= counter_frame <= end_frame:
                out.write(frame)
                # cv2.imshow('Preview',frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            counter_frame += 1
        else:
            break
    # Release everything if job is finished
    cap.release()
    out.release()
    cv2.destroyAllWindows()

In [9]:
def trim_all_videos(p_path, in_dir, out_dir, word_lst, video_name_lst, start_end_frame_lst, variant_lst):
    out_video_name_lst = []
    previous_word = ''
    counter = 0
    for word, video_name, (start_frame, end_frame), variant \
            in zip(word_lst, video_name_lst, start_end_frame_lst, variant_lst):
        if word == previous_word:
            counter += 1
        else:
            counter = 0
        original_video_name = re.sub('/', '_', video_name)
        # Construct saved video name: ASL_2008_01_11_scene2-camera1_[word]_v[variant]_[counter].mov
        new_video_name = re.sub('/', '_', video_name)[:-4] + '_' + word \
                           + '_v' + str(variant) + '_' + str(counter) + '.mov'
        previous_word = word
        print(new_video_name)
        out_video_name_lst.append(new_video_name)
        if not os.path.isfile(os.path.join(p_path, out_dir, new_video_name)):
            trim_one_video (p_path, in_dir, original_video_name, out_dir, new_video_name, start_frame, end_frame)
    return out_video_name_lst
    
trimmed_video_name_lst = trim_all_videos(project_path, dir_original_video, dir_trimmed_video, 
                                       inventory_word_lst, inventory_video_name_lst, 
                                       inventory_start_end_frame_lst, inventory_variant_lst)

ASL_2008_01_11_scene9-camera1_AGAIN_v0_0.mov
ASL_2008_01_11_scene9-camera1_AGAIN_v2_1.mov
ASL_2008_05_12a_scene8-camera1_AGAIN_v0_2.mov
ASL_2008_05_12a_scene8-camera1_AGAIN_v1_3.mov
ASL_2008_08_04_scene7-camera1_AGAIN_v0_4.mov
ASL_2008_08_04_scene7-camera1_AGAIN_v2_5.mov
ASL_2011_06_08_Brady_scene9-camera1_AGAIN_v0_6.mov
ASL_2011_06_14_Brady_scene2-camera1_AGAIN_v3_7.mov
ASL_2011_07_19_Brady_scene116-camera1_AGAIN_v0_8.mov
ASL_2006_10_10_scene2-camera1_AGAIN_v0_9.mov
ASL_2007_05_24_scene5-camera1_AGAIN_v0_10.mov
ASL_2008_01_11_scene11-camera1_ALL_v0_0.mov
ASL_2008_05_12a_scene9-camera1_ALL_v0_1.mov
ASL_2008_08_04_scene8-camera1_ALL_v0_2.mov
ASL_2011_06_08_Brady_scene10-camera1_ALL_v0_3.mov
ASL_2006_10_10_scene2-camera1_ALL_v0_4.mov
ASL_2007_05_24_scene5-camera1_ALL_v0_5.mov
ASL_2008_01_11_scene27-camera1_AWKWARD_v0_0.mov
ASL_2008_05_12a_scene18-camera1_AWKWARD_v0_1.mov
ASL_2008_08_04_scene18-camera1_AWKWARD_v0_2.mov
ASL_2011_06_08_Brady_scene14-camera1_AWKWARD_v0_3.mov
ASL_2006_10_10_s

In [31]:
def manual_stratified_sampling(word_lst, video_lst, val_pct):
    # Default: all videos are for training, will switch x% to validation later
    out_train_val_dict = dict.fromkeys(video_lst , 'train')
    video_by_word_lst = []
    previous_word = ''
    for (word, video) in zip(word_lst, video_lst):
        if word == previous_word:
            video_by_word_lst.append(video)
        else:
            # If video_by_word_lst is empty, don't perform the following steps
            # This is to handle the first iteration of the loop
            if len(video_by_word_lst) != 0:
                if len(video_by_word_lst) == 1:
                    print('ALERT: only one video for', previous_word)
                # For a given word, calculate the number of videos for validation, floored at 1 
                val_count = max(int(len(video_by_word_lst)*val_pct), 1)
                video_by_word_val_lst = np.random.choice(video_by_word_lst, val_count, replace=False)
                for i in video_by_word_val_lst:
                    out_train_val_dict[i] = 'val'
            video_by_word_lst = [video]
            previous_word = word
    # Handle the last word      
    if len(video_by_word_lst) == 1:
        print('ALERT: only one video for', previous_word)
    val_count = int(len(video_by_word_lst)*val_pct)
    video_by_word_val_lst = np.random.choice(video_by_word_lst, val_count, replace=False)
    for i in video_by_word_val_lst:
        out_train_val_dict[i] = 'val'
        
    return out_train_val_dict

video_train_val_dict = manual_stratified_sampling(inventory_word_lst, trimmed_video_name_lst, validation_pct)

In [32]:
video_train_val_dict



{'ASL_2008_01_11_scene9-camera1_AGAIN_v0_0.mov': 'train',
 'ASL_2008_01_11_scene9-camera1_AGAIN_v2_1.mov': 'val',
 'ASL_2008_05_12a_scene8-camera1_AGAIN_v0_2.mov': 'train',
 'ASL_2008_05_12a_scene8-camera1_AGAIN_v1_3.mov': 'train',
 'ASL_2008_08_04_scene7-camera1_AGAIN_v0_4.mov': 'train',
 'ASL_2008_08_04_scene7-camera1_AGAIN_v2_5.mov': 'val',
 'ASL_2011_06_08_Brady_scene9-camera1_AGAIN_v0_6.mov': 'val',
 'ASL_2011_06_14_Brady_scene2-camera1_AGAIN_v3_7.mov': 'train',
 'ASL_2011_07_19_Brady_scene116-camera1_AGAIN_v0_8.mov': 'train',
 'ASL_2006_10_10_scene2-camera1_AGAIN_v0_9.mov': 'train',
 'ASL_2007_05_24_scene5-camera1_AGAIN_v0_10.mov': 'val',
 'ASL_2008_01_11_scene11-camera1_ALL_v0_0.mov': 'val',
 'ASL_2008_05_12a_scene9-camera1_ALL_v0_1.mov': 'train',
 'ASL_2008_08_04_scene8-camera1_ALL_v0_2.mov': 'train',
 'ASL_2011_06_08_Brady_scene10-camera1_ALL_v0_3.mov': 'val',
 'ASL_2006_10_10_scene2-camera1_ALL_v0_4.mov': 'train',
 'ASL_2007_05_24_scene5-camera1_ALL_v0_5.mov': 'train',
 'ASL_

In [52]:
# Save training and validation videos in separate folders
def save_train_val(p_path, in_dir, out_train_dir, out_val_dir, in_train_val_dict):
    for k, v in in_train_val_dict.items():
        if v == 'train':
            if not os.path.isfile(os.path.join(p_path, out_train_dir, k)):
                !copy {os.path.join(p_path, in_dir, k)} {os.path.join(p_path, out_train_dir)}
        else:
            if not os.path.isfile(os.path.join(p_path, out_val_dir, k)):
                !copy {os.path.join(p_path, in_dir, k)} {os.path.join(p_path, out_val_dir)}

save_train_val(project_path, dir_trimmed_video, dir_train, dir_val, video_train_val_dict)

        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 fi

In [54]:
# Save training and validation videos in the same folder
def save_train_val2(p_path, in_dir, out_train_val_dir, in_train_val_dict):
    for k, v in in_train_val_dict.items():
        if v == 'train':
            video_name = k[:-4] + '_train.mov'
            if not os.path.isfile(os.path.join(p_path, out_train_val_dir, video_name)):
                !copy {os.path.join(p_path, in_dir, k)} {os.path.join(p_path, out_train_val_dir, video_name)}
        else:
            video_name = k[:-4] + '_val.mov'
            if not os.path.isfile(os.path.join(p_path, out_train_val_dir, video_name)):
                !copy {os.path.join(p_path, in_dir, k)} {os.path.join(p_path, out_train_val_dir, video_name)}

save_train_val2(project_path, dir_trimmed_video, dir_train_val, video_train_val_dict)

        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 file(s) copied.
        1 fi