In [1]:
import cv2

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from torch.nn import functional as F
from torch.utils.data._utils.collate import default_collate

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

import seaborn as sns
import matplotlib.pyplot as plt

import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory paths
VIDEO_DIR = "/kaggle/input/ucf11-action-recognize/UCF11_updated_mpg"
OUTPUT_DIR = "/kaggle/working/temp_folder"

# Data Preprocessing

In [3]:
categories = os.listdir(VIDEO_DIR)
categories

['biking',
 'trampoline_jumping',
 'swing',
 'walking',
 'golf_swing',
 'soccer_juggling',
 'tennis_swing',
 'volleyball_spiking',
 'basketball',
 'horse_riding',
 'diving']

In [4]:
num_classes = len(categories)
num_classes

11

In [5]:
def create_dataset(input_folder):
    groups = []
    classes = os.listdir(input_folder)
    index = 0
    for class_name in sorted(classes):
        print(f'Extracting Data of Class: {class_name}')
        
        label_folder_path = os.path.join(input_folder, class_name)
        if os.path.isdir(label_folder_path):
            group_folders = os.listdir(label_folder_path)
            for group_folder in group_folders:
                if group_folder != 'Annotation':
                    video_dir = os.path.join(label_folder_path,group_folder)
                    for video_file in os.listdir(video_dir):
                        groups.append([os.path.join(video_dir, video_file), class_name])
            index += 1
    return groups

In [6]:
groups = create_dataset(VIDEO_DIR)

Extracting Data of Class: basketball
Extracting Data of Class: biking
Extracting Data of Class: diving
Extracting Data of Class: golf_swing
Extracting Data of Class: horse_riding
Extracting Data of Class: soccer_juggling
Extracting Data of Class: swing
Extracting Data of Class: tennis_swing
Extracting Data of Class: trampoline_jumping
Extracting Data of Class: volleyball_spiking
Extracting Data of Class: walking


In [7]:
groups[:5]

[['/kaggle/input/ucf11-action-recognize/UCF11_updated_mpg/basketball/v_shooting_05/v_shooting_05_01.mpg',
  'basketball'],
 ['/kaggle/input/ucf11-action-recognize/UCF11_updated_mpg/basketball/v_shooting_05/v_shooting_05_04.mpg',
  'basketball'],
 ['/kaggle/input/ucf11-action-recognize/UCF11_updated_mpg/basketball/v_shooting_05/v_shooting_05_03.mpg',
  'basketball'],
 ['/kaggle/input/ucf11-action-recognize/UCF11_updated_mpg/basketball/v_shooting_05/v_shooting_05_02.mpg',
  'basketball'],
 ['/kaggle/input/ucf11-action-recognize/UCF11_updated_mpg/basketball/v_shooting_09/v_shooting_09_05.mpg',
  'basketball']]

In [8]:
len(groups)

1600

In [9]:
# Convert to DataFrame
df = pd.DataFrame(groups, columns=["videos", "Category"])

# Display the DataFrame
df

Unnamed: 0,videos,Category
0,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball
1,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball
2,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball
3,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball
4,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball
...,...,...
1595,/kaggle/input/ucf11-action-recognize/UCF11_upd...,walking
1596,/kaggle/input/ucf11-action-recognize/UCF11_upd...,walking
1597,/kaggle/input/ucf11-action-recognize/UCF11_upd...,walking
1598,/kaggle/input/ucf11-action-recognize/UCF11_upd...,walking


In [10]:
# Map category names to numeric labels
class_mapping = {category: idx for idx, category in enumerate(categories)}

print("Category to Label Mapping:")
class_mapping

Category to Label Mapping:


{'biking': 0,
 'trampoline_jumping': 1,
 'swing': 2,
 'walking': 3,
 'golf_swing': 4,
 'soccer_juggling': 5,
 'tennis_swing': 6,
 'volleyball_spiking': 7,
 'basketball': 8,
 'horse_riding': 9,
 'diving': 10}

### Label Encoding

In [11]:
df['label'] = df['Category'].map(class_mapping)
df

Unnamed: 0,videos,Category,label
0,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball,8
1,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball,8
2,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball,8
3,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball,8
4,/kaggle/input/ucf11-action-recognize/UCF11_upd...,basketball,8
...,...,...,...
1595,/kaggle/input/ucf11-action-recognize/UCF11_upd...,walking,3
1596,/kaggle/input/ucf11-action-recognize/UCF11_upd...,walking,3
1597,/kaggle/input/ucf11-action-recognize/UCF11_upd...,walking,3
1598,/kaggle/input/ucf11-action-recognize/UCF11_upd...,walking,3


# Video Preprocessing

### Extracting RGB Frames and Optical Flow

In [12]:
def extract_frames(video_path, frame_folder, SEQUENCE_LENGTH = 10):
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    skip_frames_window = max(int(frame_count / SEQUENCE_LENGTH), 1)
    frames = []
    
    for i in range(SEQUENCE_LENGTH):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * skip_frames_window)
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
        cv2.imwrite(f"{frame_folder}/frame_{i}.jpg", frame)
    
    cap.release()

    if not frames:
        raise ValueError(f"No frames found in video: {video_path}")
         
    return frames

In [13]:
def calculate_optical_flow(frames):
    if len(frames) < 2:
        raise ValueError("Insufficient frames for optical flow calculation.")
        
    flow_frames = []
    prev_frame = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
    
    for i in range(1, len(frames)):
        next_frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)
        flow = cv2.calcOpticalFlowFarneback(prev_frame, next_frame, None, 0.5, 3, 15, 3, 5, 1.2, 0)
        flow_frames.append(flow)
        prev_frame = next_frame
    
    return flow_frames

# Dataset Preparation

### Define a custom dataset class that loads RGB frames, optical flow, and labels.

In [14]:
class VideoDataset(Dataset):
    def __init__(self, data, labels, transform=False, rgb_transform=None, flow_transform=None):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame containing video paths and labels.
            transform (callable, optional): Optional transform to be applied on frames.
        """
        # self.dataframe = dataframe
        self.data = data
        self.labels = labels
        self.transform = transform
        self.rgb_transform = rgb_transform
        self.flow_transform = flow_transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # video_path = self.dataframe.iloc[idx]['videos']
        # label_list = self.dataframe.iloc[idx]['label']
        video_path = self.data[idx]
        label = self.labels[idx]

        try:
            # Extract RGB frames and optical flow
            parts = video_path.split("/")
            video_name = parts[-1].split('.')[0]
            frame_folder = f"frames/{video_name}"  # Temporary folder to store frames
            os.makedirs(frame_folder, exist_ok=True)
    
            rgb_frames = extract_frames(video_path, frame_folder)
            if len(rgb_frames) < 10:
                # print(f"Skipping video {video_path} due to insufficient frames.")
                return None
            
            flow_frames = calculate_optical_flow(rgb_frames)
            if len(flow_frames) < 1:
                # print(f"Skipping video {video_path} due to no optical flow.")
                return None
    
            # Apply transformations if provided (e.g., resizing, normalization)
            if self.transform:
                rgb_frames = [self.rgb_transform(frame) for frame in rgb_frames]  # Use RGB-specific transform
                flow_frames = [self.flow_transform(flow) for flow in flow_frames]  # Use flow-specific transform
    
            # Convert frames to tensors (stacked along the time axis)
            rgb_tensor = torch.stack([torch.tensor(frame, dtype=torch.float32) for frame in rgb_frames])
            flow_tensor = torch.stack([torch.tensor(flow, dtype=torch.float32) for flow in flow_frames])

            # label_tensor = torch.tensor(label_list, dtype=torch.float32)  # Convert to tensor
    
            return rgb_tensor, flow_tensor, label

        except Exception as e:
            # print(f"Error processing video {video_path}: {e}")
            return None

# Define the Three-Stream Network Architecture

### Spatial and Temporal CNN

In [15]:
class CNNStream(nn.Module):
    def __init__(self, input_channels=3, num_classes=11):
        super(CNNStream, self).__init__()
        
        # Convolutional Block 1
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=input_channels, out_channels=96, kernel_size=6, stride=2, padding=0),
            nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        # Convolutional Block 2
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=0),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        # Convolutional Block 3
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )
        
        # Convolutional Block 4
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )
        
        # Convolutional Block 5
        self.conv5 = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Dynamically compute the flattened size
        self.flatten_size = self._compute_flatten_size(input_channels)
        
        # Fully Connected Layers
        self.fc6 = nn.Sequential(
            nn.Linear(self.flatten_size, 2048),  # Adjust based on input size
            nn.Dropout(0.5),
            nn.ReLU()
        )
        self.fc7 = nn.Sequential(
            nn.Linear(2048, num_classes),
            nn.Dropout(0.5)
        )

    def _compute_flatten_size(self, input_channels):
        """
        Calculate the size of the tensor after convolution and pooling.
        """
        with torch.no_grad():
            dummy_input = torch.zeros(1, input_channels, 128, 128)  # Example input size
            x = self.conv1(dummy_input)
            x = self.conv2(x)
            x = self.conv3(x)
            x = self.conv4(x)
            x = self.conv5(x)
            return x.numel()  # Flatten size
    
    def forward(self, x):
        # Reshape to merge batch and sequence dimensions
        batch_size, seq_len, channels, height, width = x.shape
        x = x.view(batch_size * seq_len, channels, height, width)  # [batch_size * seq_len, 3, 128, 128]
        
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        
        # Flatten for fully connected layers
        x = x.view(x.size(0), -1)
        x = self.fc6(x)
        x = self.fc7(x)

        # Reshape back to separate batch and sequence dimensions
        x = x.view(batch_size, seq_len, -1)  # [batch_size, seq_len, feature_dim]
        
        return x

### Sequential LSTM Network

In [16]:
class LSTMStream(nn.Module):
    def __init__(self, input_channels=3, height=128, width=128, hidden_size=256, num_layers=1, num_classes = 11):
        super(LSTMStream, self).__init__()

        # Calculate the correct input_size by flattening spatial dimensions (channels * height * width)
        input_size = input_channels * height * width  # This will be the correct input size for LSTM
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        batch_size, seq_len, channels, height, width = x.shape

        # Reshape the input from [batch_size, seq_len, channels, height, width]
        x = x.view(batch_size, seq_len, -1)  # Flatten the spatial dimensions (channels * height * width)
        
        # Pass through LSTM
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # Last time step output
        out = self.fc(out)
        return out

# Combine the Three Streams

In [17]:
class ThreeStreamNetwork(nn.Module):
    def __init__(self, num_classes = 11):
        super(ThreeStreamNetwork, self).__init__()
        self.spatial_stream = CNNStream(input_channels=3, num_classes = 11)
        self.temporal_stream = CNNStream(input_channels=2, num_classes = 11)  # Optical flow has 2 channels (u, v)
        self.sequential_stream = LSTMStream(input_channels=3, height=128, width=128, hidden_size=512, num_layers=1, num_classes = 11)

        # self.fc_fusion = nn.Linear(1024 + 1024 + 512, 256)
        self.fc_mlp = nn.Linear(num_classes, 256)
        self.fc_output = nn.Linear(256, num_classes)  # 11: For UCF11 dataset

    def forward(self, rgb, flow):
        spatial_features = self.spatial_stream(rgb)
        temporal_features = self.temporal_stream(flow)

        sequential_features = self.sequential_stream(rgb)  # For LSTM

        # Option: Average over sequence length (dimension 1)
        spatial_features_avg = spatial_features.mean(dim=1)  # shape [32, 11]
        temporal_features_avg = temporal_features.mean(dim=1)  # shape [32, 11]
        
        # Sequential features already have shape [32, 11]
        fused_features = (spatial_features_avg + temporal_features_avg + sequential_features) / 3
        
        x = F.relu(self.fc_mlp(fused_features))
        x = self.fc_output(x)
        
        return x

In [18]:
def get_kfold_data(dataset, labels, n_splits=10, shuffle=True):
    """
    Split dataset into k folds.
    Args:
        dataset: List or array of data.
        labels: Corresponding labels.
        n_splits: Number of folds for cross-validation.
        shuffle: Whether to shuffle the data before splitting.
    Returns:
        List of train-test splits.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=42)
    folds = []
    
    for train_idx, test_idx in skf.split(dataset, labels):
        train_data = [dataset[i] for i in train_idx]
        test_data = [dataset[i] for i in test_idx]
        train_labels = [labels[i] for i in train_idx]
        test_labels = [labels[i] for i in test_idx]
        folds.append((train_data, train_labels, test_data, test_labels))
    
    return folds


# Training Loop

In [19]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=10):
    model.train()
    
    for epoch in range(num_epochs):
        
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        
        running_loss = 0.0 
        train_accuracy = 0.0
        correct = 0
        total = 0

        for batch in train_loader:
            if batch is None:  # Skip empty batches
                continue
            rgb, flow, labels = batch
            rgb, flow, labels = rgb.to(device), flow.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(rgb, flow)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        train_accuracy = correct / total

        end_event.record()
        torch.cuda.synchronize()  # Wait for all GPU operations to finish
        training_time = start_event.elapsed_time(end_event) / 1000  # Convert to seconds
        print("Epoch time: {:.2f} seconds".format(training_time))

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}, Train Accuracy: {train_accuracy:.4f} ")

# Evaluation

In [20]:
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in test_loader:
            if batch is None:  # Skip empty batches
                continue
            rgb, flow, labels = batch
            rgb, flow, labels = rgb.to(device), flow.to(device), labels.to(device)
            
            outputs = model(rgb, flow)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    avg_loss = total_loss / len(test_loader)
    
    return accuracy, avg_loss


In [21]:
def safe_collate_fn(batch):
    """Remove None samples from the batch."""
    batch = [b for b in batch if b is not None]
    if not batch:
        return None  # Return None if the batch is empty
    return default_collate(batch)


# Cross Validation

In [22]:
def cross_validate(model_class, num_classes, dataset, labels, n_splits=10, device="cuda"):
    """
    Perform k-fold cross-validation.
    Args:
        model_class: The neural network class to be instantiated.
        dataset: Full dataset.
        labels: Corresponding labels.
        n_splits: Number of folds for cross-validation.
        device: Device to run the training (e.g., "cuda" or "cpu").
    Returns:
        Average accuracy and loss across folds.
    """
    folds = get_kfold_data(dataset, labels, n_splits=n_splits)
    fold_accuracies = []
    fold_losses = []


    # Define transforms
    rgb_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((128, 128)),  # Resize frames to 128x128
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # For RGB normalization
    ])
    
    # For optical flow (u, v channels)
    flow_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize((128, 128)), # Resize frames to 128x128
        transforms.Normalize(mean=[0.5, 0.5], std=[0.5, 0.5])  # 2 channels
    ])


    for fold_idx, (train_data, train_labels, test_data, test_labels) in enumerate(folds):
        print(f"Starting fold {fold_idx + 1}/{n_splits}...")
        
        # Prepare data loaders
        train_dataset = VideoDataset(data=train_data, 
                                     labels=train_labels, 
                                     transform=True,
                                     rgb_transform=rgb_transform, 
                                     flow_transform=flow_transform)
        test_dataset = VideoDataset(data=test_data,
                                    labels=test_labels,
                                    transform=True,
                                    rgb_transform=rgb_transform, 
                                    flow_transform=flow_transform)
        
        train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=safe_collate_fn, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=safe_collate_fn, shuffle=False)


        # Initialize model, loss function, and optimizer
        model = model_class(num_classes=num_classes).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.000125)


        # Train the model
        train_model(model, train_loader, criterion, optimizer, device)

        # Evaluate the model
        accuracy, loss = evaluate_model(model, test_loader, criterion, device)
        fold_accuracies.append(accuracy)
        fold_losses.append(loss)

        print(f"Fold {fold_idx + 1} - Accuracy: {accuracy:.4f}, Loss: {loss:.4f}")

    # Compute average performance
    avg_accuracy = np.mean(fold_accuracies)
    avg_loss = np.mean(fold_losses)
    print(f"10-Fold CV - Average Accuracy: {avg_accuracy:.4f}, Average Loss: {avg_loss:.4f}")
    
    return avg_accuracy, avg_loss


In [23]:
# Hyperparameters and setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

cuda
Tesla P100-PCIE-16GB


### Training

In [24]:
data = df['videos']
labels = df['label']

# Assuming dataset and labels are preprocessed lists
avg_accuracy, avg_loss = cross_validate(ThreeStreamNetwork, num_classes, data, labels, n_splits=10, device=device)

Starting fold 1/10...
Epoch time: 365.11 seconds
Epoch [1/10], Loss: 2.3205, Train Accuracy: 0.1573 
Epoch time: 346.04 seconds
Epoch [2/10], Loss: 1.9084, Train Accuracy: 0.3013 
Epoch time: 344.75 seconds
Epoch [3/10], Loss: 1.6725, Train Accuracy: 0.3890 
Epoch time: 350.02 seconds
Epoch [4/10], Loss: 1.5077, Train Accuracy: 0.4600 
Epoch time: 349.07 seconds
Epoch [5/10], Loss: 1.3314, Train Accuracy: 0.5198 
Epoch time: 349.17 seconds
Epoch [6/10], Loss: 1.1507, Train Accuracy: 0.5894 
Epoch time: 344.84 seconds
Epoch [7/10], Loss: 1.0334, Train Accuracy: 0.6360 
Epoch time: 345.41 seconds
Epoch [8/10], Loss: 0.9238, Train Accuracy: 0.6771 
Epoch time: 344.32 seconds
Epoch [9/10], Loss: 0.8211, Train Accuracy: 0.7244 
Epoch time: 344.22 seconds
Epoch [10/10], Loss: 0.7824, Train Accuracy: 0.7370 
Fold 1 - Accuracy: 0.6937, Loss: 1.0712
Starting fold 2/10...
Epoch time: 345.38 seconds
Epoch [1/10], Loss: 2.2802, Train Accuracy: 0.1907 
Epoch time: 339.97 seconds
Epoch [2/10], Loss: