In [11]:
import os
import numpy as np
import tqdm
import random
import pathlib
import itertools
import collections

import cv2 # process video files
import einops # perform more complex tensor operations
import remotezip as rz

import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
import keras
from keras import layers

## Load and Preprocess Video Data

In [19]:
# List the files in each class of the dataset given a URL with the zip file
def list_files_from_zip_url(zip_url):
    """Return the names of all entries in the remote zip archive at ``zip_url``.

    Only the archive's entry list is queried here; nothing is extracted.
    Names are returned in the order ``infolist()`` reports them.
    """
    with rz.RemoteZip(zip_url) as archive:
        return [entry.filename for entry in archive.infolist()]

# Retrieve the name of the class given a filename
def get_class(fname):
    """Return the class name embedded in ``fname``.

    The name is split on underscores and the third token from the end is
    returned, e.g. ``v_ApplyEyeMakeup_g01_c01.avi`` -> ``ApplyEyeMakeup``.
    Raises IndexError when the name has fewer than three such tokens.
    """
    tokens = fname.split('_')
    return tokens[-3]

# Retrieve the files that belong to each class
def get_files_per_class(files):
    """Group file names by the class name that ``get_class`` extracts from each.

    Returns a ``collections.defaultdict(list)`` mapping class name to the file
    names of that class, in the order they appear in ``files``.
    """
    grouped = collections.defaultdict(list)
    for path in files:
        grouped[get_class(path)].append(path)
    return grouped

# Download the contents of the zip file from the zip URL
def download_from_zip(zip_url, to_dir, file_names):
    """Download the given archive members into per-class folders under ``to_dir``.

    Each member is extracted beneath ``to_dir / <class name>`` and then moved
    so that it sits directly in that class folder under its base file name.
    Any intermediate directories created by the extraction are left in place.

    Args:
        zip_url: URL of the remote zip archive.
        to_dir: Destination root; it is joined with ``/`` and the results are
            moved with ``replace`` (a ``pathlib.Path`` in this notebook).
        file_names: Archive member names to download.
    """
    # Named `archive` so the built-in zip() is not shadowed.
    with rz.RemoteZip(zip_url) as archive:
        for member in tqdm.tqdm(file_names):
            class_dir = to_dir / get_class(member)
            # extract() recreates the member's archive-internal path below class_dir.
            archive.extract(member, str(class_dir))
            extracted_file = class_dir / member

            # Flatten: keep only the base file name inside the class folder.
            output_file = class_dir / pathlib.Path(member).parts[-1]
            # replace() overwrites an existing target on every platform, whereas
            # rename() raises FileExistsError on Windows when the cell is re-run.
            extracted_file.replace(output_file)

# Returns the list of files belonging to a subset of data as well as the remainder of files that need to be downloaded
def split_class_lists(files_for_class, count):
    """Take the first ``count`` files of every class and report what is left.

    Args:
        files_for_class: Mapping of class name to a list of file names.
        count: Number of files to take from the front of each class list.

    Returns:
        A tuple ``(selected, leftover)``: ``selected`` is one flat list with
        the taken files of all classes, ``leftover`` maps each class to the
        files that were not taken. The input mapping is not modified.
    """
    selected = []
    leftover = {}
    for cls, cls_files in files_for_class.items():
        selected += cls_files[:count]
        leftover[cls] = cls_files[count:]
    return selected, leftover

# Download a subset of the UCF101 dataset and split it into parts, such as training, validation, and test
def download_ufc_101_subset(zip_url, num_classes, splits, download_dir):
    """Download a subset of the UCF101 dataset, divided into named splits.

    Args:
        zip_url: URL of the remote zip archive holding the videos.
        num_classes: Number of classes to keep; the first ones encountered in
            the archive listing are used.
        splits: Mapping of split name (e.g. ``"train"``) to the number of
            files per class to place in that split. Splits are filled in
            iteration order and never share a file.
        download_dir: Root directory; it is joined with each split name using
            ``/`` (a ``pathlib.Path`` in this notebook).

    Returns:
        A dict mapping each split name to the directory its files were
        downloaded into.
    """
    # Keep only entries that name a file: directory entries in a zip listing
    # end with "/". A new list is built instead of calling files.remove()
    # inside a loop over `files`, because removing during iteration skips the
    # element that follows each removal. The earlier path-depth test
    # (`len(tokens) <= 2`) is dropped as well, since it also matched ordinary
    # "dir/file" entries.
    files = [f for f in list_files_from_zip_url(zip_url) if not f.endswith('/')]

    files_for_class = get_files_per_class(files)

    classes = list(files_for_class.keys())[:num_classes]

    # Shuffle in place so every split receives a random sample of its class.
    for cls in classes:
        random.shuffle(files_for_class[cls])

    # Only use the number of classes you want in the dictionary
    files_for_class = {cls: files_for_class[cls] for cls in classes}

    dirs = {}
    for split_name, split_count in splits.items():
        print(split_name, ":")
        split_dir = download_dir / split_name
        # The remainder is carried forward so a later split cannot reuse a
        # file already assigned to an earlier one.
        split_files, files_for_class = split_class_lists(files_for_class, split_count)
        download_from_zip(zip_url, split_dir, split_files)
        dirs[split_name] = split_dir

    return dirs

In [20]:
# Remote archive with the videos, and the local folder that receives the subset.
URL = "https://storage.googleapis.com/thumos14_files/UCF101_videos.zip"
download_dir = pathlib.Path("./UCF101_subset/")

# 10 classes, with 30 / 10 / 10 videos per class for train / val / test.
subset_paths = download_ufc_101_subset(
    URL,
    num_classes=10,
    splits={"train": 30, "val": 10, "test": 10},
    download_dir=download_dir,
)

train :


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [03:47<00:00,  1.32it/s]


val :


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:58<00:00,  1.70it/s]


test :


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:58<00:00,  1.71it/s]
