### CricShot10 Video Classification

(Use this with permission)
- `Source` - https://drive.google.com/drive/folders/1DPHURwQk5R8blgjM8VNz6Q68LqckxljX?usp=drive_link


This notebook builds the baseline model for classifying CricShot10 cricket-shot videos.

In [1]:
# Loading Dependencies
from pathlib import Path
from typing import Tuple

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

# Contains helper functions to parse the dataset
import utils

In [14]:
# Global file paths.
# These values are specific to the original author's system;
# adjust them before running the notebook.
DATASET_NAME = "CricShot10"
# Passed to utils.unzip_files together with DATASET_NAME
ZIP_FILES_PATH = "zipped-data"
# Root folder of the extracted dataset (same name as the dataset itself)
DATASET_ROOT_PATH = Path(DATASET_NAME)
# Destination directory for the train/test directory structure
TO_DIR = "dataset"

In [10]:
# Fraction of the videos used for training the model (80%)
TRAIN_SET_RATIO = 0.8
# The remaining fraction (20%) is left for the test set
TEST_SET_RATIO = 1 - TRAIN_SET_RATIO

In [17]:
# The majority of videos are 25 FPS, with a few being 30 FPS
# Uniformly sample 10 frames from each video
N_FRAMES = 10
# Resize each frame (which is basically a picture) to (height, width) = (224, 224)
FRAME_SHAPE = (224, 224)
# Each batch contains 16 videos
BATCH_SIZE = 16

In [18]:
# Extract the zipped dataset archives.
# This only has to happen once: the step is skipped whenever something
# already exists at DATASET_ROOT_PATH.
if not DATASET_ROOT_PATH.exists():
    utils.unzip_files(ZIP_FILES_PATH, DATASET_NAME)

In [21]:
# Setup the directory structure with defined train and test directories
# Only needs to run once to setup the dataset directory

# `samples_per_class` has to exist before the call below. It is computed here
# (smallest number of .avi files found in any class directory) so that this
# cell also works on a fresh kernel with Restart Kernel -> Run All.
samples_per_class = min(
    len(list(class_dir.glob("*.avi")))
    for class_dir in DATASET_ROOT_PATH.iterdir()
    if class_dir.is_dir()
)

utils.setup_dataset_structure(
    from_dir=DATASET_ROOT_PATH,
    to_dir=TO_DIR,
    train_ratio=TRAIN_SET_RATIO,
    samples_per_class=samples_per_class,
)

Found dataset at: CricShot10
LOG: Creating directory 'dataset'
LOG: Creating directory 'dataset\train'
LOG: Creating directory 'dataset\test'


In [22]:
# The dataset is not uniformly distributed across its 10 classes.
# Take the size of the smallest class as the per-class sample count, so that
# only the minimum number of videos is wasted.
samples_per_class = min(
    sum(1 for _ in class_dir.glob("*.avi"))
    for class_dir in filter(Path.is_dir, Path(DATASET_ROOT_PATH).iterdir())
)
print("Minimum Number of samples per class taken:", samples_per_class)

Minimum Number of samples per class taken: 0


In [23]:
# Define train and test directories
root_dir = Path(TO_DIR)
train_dir = root_dir / "train"
test_dir = root_dir / "test"

In [24]:
# Collect every .avi file located one directory level below each split directory
train_paths = [*train_dir.glob("*/*.avi")]
test_paths = [*test_dir.glob("*/*.avi")]
print("No. of video files in training set:", len(train_paths))
print("No. of video files in testing set:", len(test_paths))

No. of video files in training set: 1528
No. of video files in testing set: 360


In [25]:
# Get all the classes in the dataset
class_names, classes_to_idx = utils.get_classes(train_dir)
idx_to_classes = {idx: class_name for class_name, idx in classes_to_idx.items()}
print("All Classes in dataset:", class_names)

All Classes in dataset: ['cover', 'defense', 'flick', 'hook', 'late_cut', 'lofted', 'pull', 'square_cut', 'straight', 'sweep']


In [26]:
class CricShot10(torch.utils.data.Dataset):
    """Video dataset that yields a fixed number of frames per ``.avi`` clip.

    Clips are discovered with the glob pattern ``*/*.avi`` below ``target_dir``.
    Each item is a ``(video, class_idx)`` pair, where ``video`` is a NumPy
    array laid out as ``(C, D, H, W)`` (channels, frames, height, width) with
    RGB channel order.
    """

    def __init__(
        self,
        target_dir: Path,
        transform=None,
        n_frames: int = 16,
        target_size: Tuple[int, int] = (224, 224),
    ):
        """
        Args:
            target_dir: Directory searched for ``*/*.avi`` files. A plain
                string is accepted as well and converted to ``Path``.
            transform: Optional callable applied to the loaded video array in
                ``__getitem__``.
            n_frames: Number of frames sampled uniformly from every video.
            target_size: ``(height, width)`` every frame is resized to.

        Raises:
            ValueError: If ``n_frames`` is smaller than 1.
        """
        if n_frames < 1:
            raise ValueError(f"n_frames must be at least 1, got {n_frames}")

        # `.glob` requires a Path, so normalise string input up front.
        self.target_dir = Path(target_dir)
        # Sorted so that the index -> video mapping does not depend on the
        # order in which the filesystem happens to list the files.
        self.paths = sorted(self.target_dir.glob("*/*.avi"))

        self.transform = transform
        self.n_frames = n_frames
        self.target_size = target_size
        self.class_names, self.class_to_idx = utils.get_classes(self.target_dir)

    def __len__(self):
        """Return the number of video files found."""
        return len(self.paths)

    def _blank_frame(self) -> np.ndarray:
        """Return an all-zero ``uint8`` frame of shape ``(height, width, 3)``."""
        height, width = self.target_size
        return np.zeros((height, width, 3), dtype=np.uint8)

    def load_video(self, idx: int) -> np.ndarray:
        """Read ``n_frames`` uniformly spaced frames of the video at ``idx``.

        A frame that cannot be read is replaced by the previously read frame,
        or by an all-zero frame when nothing has been read yet.

        Args:
            idx: Position of the video in ``self.paths``.

        Returns:
            Array of shape ``(C, D, H, W)`` holding the resized RGB frames.
        """
        video_path = str(self.paths[idx])
        height, width = self.target_size

        cap = cv2.VideoCapture(video_path)
        try:
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Clamp so that a reported frame count of 0 does not produce
            # negative frame positions.
            last_position = max(frame_count - 1, 0)
            positions = np.linspace(0, last_position, self.n_frames, dtype="int")

            frames = []
            for position in positions:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(position))
                ok, frame = cap.read()

                if not ok:
                    frames.append(frames[-1] if frames else self._blank_frame())
                    continue

                # cv2.resize expects dsize as (width, height).
                frame = cv2.resize(frame, (width, height))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
        finally:
            # Release the capture even if decoding raised.
            cap.release()

        # The loop appends exactly one frame per sampled position, so
        # `frames` always holds `n_frames` entries here.
        # Stacked frames are (D, H, W, C); PyTorch requires (C, D, H, W).
        return np.transpose(np.stack(frames), (3, 0, 1, 2))

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, int]:
        """Return ``(video, class_idx)`` for the video at ``idx``.

        The class name is taken from the file name: everything before the
        last underscore (e.g. ``late_cut_0001.avi`` -> ``late_cut``). When a
        ``transform`` was given, its output is returned in place of the raw
        array.

        Raises:
            KeyError: If the derived class name is not in ``class_to_idx``.
        """
        video = self.load_video(idx)
        class_name = "_".join(self.paths[idx].name.split("_")[:-1])
        class_idx = self.class_to_idx[class_name]

        if self.transform:
            return self.transform(video), class_idx
        return video, class_idx

In [27]:
# train_transform = transforms.Compose(
#     [
#         transforms.ToTensor(),
#         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
#     ]
# )

# test_transform = transforms.Compose(
#     [
#         transforms.ToTensor(),
#         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
#     ]
# )

In [28]:
# Build the train and test datasets with identical frame-sampling settings.
# No transform is passed to either dataset.
train_dataset, test_dataset = (
    CricShot10(split_dir, n_frames=N_FRAMES, target_size=FRAME_SHAPE)
    for split_dir in (train_dir, test_dir)
)

In [29]:
# Create dataloaders
# A full batch has shape [BATCH_SIZE, C, N_FRAMES, H, W] = [16, 3, 10, 224, 224]
# Only the training loader shuffles its samples.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [31]:
class Conv2Plus1D(nn.Module):
    """(2+1)D convolution: a spatial convolution followed by a temporal one.

    A full 3D kernel ``(kt, kh, kw)`` is decomposed into a ``(1, kh, kw)``
    convolution over height/width and a ``(kt, 1, 1)`` convolution over the
    frame axis. Input is expected as ``(N, C, D, H, W)``.
    """

    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 16,
        kernel_size: Tuple[int, int, int] = (3, 7, 7),
        padding: str = "same",
    ):
        """
        Args:
            in_channels: Number of channels of the input.
            out_channels: Number of channels produced by both convolutions.
            kernel_size: ``(temporal, height, width)`` kernel extents.
            padding: Padding mode handed to both ``nn.Conv3d`` layers,
                ``"same"`` (default) or ``"valid"``.
        """
        # `super` has to be called to obtain the proxy object; the bare
        # `super.__init__()` raises a TypeError.
        super().__init__()
        temporal_k, height_k, width_k = kernel_size
        self.block = nn.Sequential(
            # Spatial decomposition
            nn.Conv3d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(1, height_k, width_k),
                padding=padding,
            ),
            # Temporal decomposition
            nn.Conv3d(
                in_channels=out_channels,
                out_channels=out_channels,
                kernel_size=(temporal_k, 1, 1),
                padding=padding,
            ),
        )

    def forward(self, x):
        """Apply the spatial and then the temporal convolution to ``x``."""
        return self.block(x)


class ResidualMain(nn.Module):
    """Main branch of a residual block built from two (2+1)D convolutions.

    Layout: ``Conv2Plus1D -> norm -> ReLU -> Conv2Plus1D -> norm``. Input is
    expected as ``(N, C, D, H, W)``; the output has ``out_channels`` channels.
    """

    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 16,
        kernel_size: Tuple[int, int, int] = (3, 7, 7),
    ):
        """
        Args:
            in_channels: Number of channels of the input.
            out_channels: Number of channels produced by the block.
            kernel_size: ``(temporal, height, width)`` kernel extents passed
                to both ``Conv2Plus1D`` layers.
        """
        # `super` has to be called to obtain the proxy object; the bare
        # `super.__init__()` raises a TypeError.
        super().__init__()
        self.block = nn.Sequential(
            # No `padding` argument is passed: Conv2Plus1D's constructor does
            # not require one and applies "same" padding itself.
            Conv2Plus1D(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
            ),
            # `nn.LayerNorm()` cannot be built without a normalized shape,
            # which is unknown here (it depends on D, H, W). GroupNorm with a
            # single group normalises every sample over (C, D, H, W) and only
            # needs the channel count.
            nn.GroupNorm(num_groups=1, num_channels=out_channels),
            nn.ReLU(),
            # The second convolution receives the `out_channels` feature maps
            # produced by the first one, not the original input.
            Conv2Plus1D(
                in_channels=out_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
            ),
            nn.GroupNorm(num_groups=1, num_channels=out_channels),
        )

    def forward(self, x):
        """Run ``x`` through the conv/norm stack."""
        return self.block(x)


class Project(nn.Module):
    """Linear projection followed by layer normalisation.

    ``nn.Linear`` and ``nn.LayerNorm`` both act on the last dimension of the
    input, mapping it from ``in_features`` to ``out_features``.
    """

    def __init__(self, in_features: int, out_features: int):
        """
        Args:
            in_features: Size of the last dimension of the input.
            out_features: Size of the last dimension of the output.
        """
        # `super` has to be called to obtain the proxy object; the bare
        # `super.__init__()` raises a TypeError.
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=out_features),
            # LayerNorm requires the shape it normalises over: the feature
            # dimension produced by the linear layer.
            nn.LayerNorm(out_features),
        )

    def forward(self, x):
        """Project ``x`` and normalise the result."""
        return self.block(x)

In [30]:
class Model(nn.Module):
    def __init__(
        self, in_channels: int = 3, kernel_size: Tuple[int, int, int] = (3, 7, 7)
    ):
        super.__init__()