In [52]:
import os
import random
import torch
import torchvision.transforms as transforms
import torchvision.transforms.v2 as transformsv2
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_video
from sklearn.model_selection import train_test_split

In [53]:
class UCF101Dataset(Dataset):
    def __init__(self, video_list, labels, root_dir, transform=None):
        self.video_list = video_list
        self.labels = labels
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.video_list)

    def __getitem__(self, idx):
        video_name = self.video_list[idx]
        label = self.labels[video_name]
        video_path = os.path.join(self.root_dir, video_name)
        video_frames, _, _ = read_video(video_path, pts_unit="sec")

        if self.transform:
            video_frames = torch.stack([self.transform(frame) for frame in video_frames])

        return video_frames, label

In [None]:
# Load the dataset
root_dir = "/kaggle/input/ucf101/UCF101/UCF-101"

# Assign labels to the videos
labels = {}
videos = []

for action_folder in os.listdir(root_dir):
    action_folder_path = os.path.join(root_dir, action_folder)
    
    if os.path.isdir(action_folder_path):
        for video in os.listdir(action_folder_path):
            video_path = os.path.join(action_folder_path, video)
            
            if os.path.isfile(video_path):
                labels[video_path] = action_folder
                videos.append(video_path)

# Create train-test splits
train_videos, test_videos = train_test_split(videos, test_size=0.2, random_state=42)

In [None]:
#Resize
resize = 112

# Color Jetter transformation
coljit = 0.1

# normalization parameters
mean = (0.5, 0.5, 0.5)
std = (0.5, 0.5, 0.5)

inner_transforms = transformsv2.Compose([
    transformsv2.Resize(resize),
    transformsv2.Normalize(mean, std),
    transformsv2.ToImageTensor(),
    transformsv2.ConvertImageDtype(torch.float32)
])

outer_transforms = transformsv2.Compose([
    transformsv2.RandomHorizontalFlip(),
    transformsv2.ColorJitter(brightness=coljit, contrast=coljit, saturation=coljit, hue=coljit),
])

# define the v2 transformations to be applied to the images
transform_val = transforms.Compose([
    inner_transforms
])

transform_train = transforms.Compose([
    outer_transforms,
    inner_transforms
])

In [None]:
selected_actions = ['ApplyEyeMakeup', 'BenchPress', 'CliffDiving']  # Replace these with the actual action names you want to keep

filtered_train_videos = [video_path for video_path in train_videos if labels[video_path] in selected_actions]
filtered_train_labels = [label for label in labels if label in selected_actions]

filtered_test_videos = [video_path for video_path in test_videos if labels[video_path] in selected_actions]
filtered_test_labels = [label for label in labels if label in selected_actions]

# Create the train and test datasets
train_dataset = UCF101Dataset(filtered_train_videos, filtered_train_labels, root_dir, transform_train)
test_dataset = UCF101Dataset(filtered_test_videos, filtered_test_labels, root_dir, transform_val)

In [57]:
batch_size = 2

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [58]:
import torchvision.models as models

# Load the pre-trained 3D ResNet-18 model
r3d_18 = models.video.r3d_18(weights='DEFAULT')

# Remove the last fully connected layer to use the model for feature extraction
r3d_18 = torch.nn.Sequential(*list(r3d_18.children())[:-1])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
r3d_18 = r3d_18.to(device)

In [59]:
# extract_video_features function
def extract_video_features(model, loader):
    model.eval()
    features = []
    labels = []

    with torch.no_grad():
        for i, (inputs, target) in enumerate(loader):
            print(f'Extracting video features: {i * batch_size}/{len(loader.dataset)}', end='\r')
            inputs = inputs.to(device)
            labels.append(target)

            # extract features
            features.append(model(inputs).squeeze())

    return torch.cat(features), torch.cat(labels)

In [60]:
# extract features from the train and test datasets
train_features, train_labels = extract_video_features(r3d_18, train_loader)

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/tmp/ipykernel_32/1326434759.py", line 13, in __getitem__
    label = self.labels[video_name]
TypeError: list indices must be integers or slices, not str


In [None]:
print(train_labels)