# Sports Action Recognition Using an I3D(`Inflated 3D ConvNet`) Architecture on the UCF101 10 Sports actions

## Dependencies

In [1]:
import os
import re
import time
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    top_k_accuracy_score,
    classification_report,
    confusion_matrix,
)

import imageio
import cv2
from IPython.display import Image

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Conv3D,
    MaxPool3D,
    BatchNormalization,
    Input,
    Flatten,
    Dense,
    Dropout,
    GRU,
    Bidirectional,
)

import kagglehub

## Dataset Loading

Download latest version of the ucf101-action-recognition dataset from Kaggle

In [2]:
path = kagglehub.dataset_download("matthewjansen/ucf101-action-recognition")

In [3]:
print("Path to dataset files: \n", path)
print("\nFiles in dataset directory:\n", os.listdir(path))

Path to dataset files: 
 /Users/mzitoh/.cache/kagglehub/datasets/matthewjansen/ucf101-action-recognition/versions/4

Files in dataset directory:
 ['test', 'val.csv', 'test.csv', 'train', 'train.csv', 'val']


**Class Definition**

In [4]:
sports_actions = [
    "SkyDiving",
    "Biking",
    "HorseRace",
    "Surfing",
    "TennisSwing",
    "Punch",
    "Basketball",
    "JumpRope",
    "Archery",
    "Skiing",
]

Utility to transform video paths

In [5]:
def load_dataset(dataset_type):
    dataset_path = os.path.join(path, f"{dataset_type}.csv")
    dataset = pd.read_csv(dataset_path)

    # Filter dataset to only include the specified sports actions
    filtered_dataset = dataset[dataset["label"].isin(sports_actions)]

    return pd.DataFrame(
        {
            "label": filtered_dataset["label"],
            "video_path": filtered_dataset["clip_path"].apply(lambda x: f"{path}{x}"),
        }
    )

Load the train and test datasets

In [6]:
train_df = load_dataset("train")
val_df = load_dataset("val")
test_df = load_dataset("test")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for validation: {len(val_df)}")
print(f"Total videos for testing: {len(test_df)}")

Total videos for training: 1128
Total videos for validation: 189
Total videos for testing: 192


In [7]:
print("Number of unique classes in training set: ", len(train_df["label"].unique()))

Number of unique classes in training set:  10


In [8]:
train_df.sample(10)

Unnamed: 0,label,video_path
103,SkyDiving,/Users/mzitoh/.cache/kagglehub/datasets/matthe...
152,SkyDiving,/Users/mzitoh/.cache/kagglehub/datasets/matthe...
3011,Archery,/Users/mzitoh/.cache/kagglehub/datasets/matthe...
4464,JumpRope,/Users/mzitoh/.cache/kagglehub/datasets/matthe...
2846,Punch,/Users/mzitoh/.cache/kagglehub/datasets/matthe...
2962,Archery,/Users/mzitoh/.cache/kagglehub/datasets/matthe...
2928,Punch,/Users/mzitoh/.cache/kagglehub/datasets/matthe...
787,TennisSwing,/Users/mzitoh/.cache/kagglehub/datasets/matthe...
9801,Surfing,/Users/mzitoh/.cache/kagglehub/datasets/matthe...
4512,JumpRope,/Users/mzitoh/.cache/kagglehub/datasets/matthe...


## Configs & Hyperparameters

In [9]:
MODEL_NAME = "i3d_v1_100f"
MODEL_BASE_PATH = f"../../models/{MODEL_NAME}"

FRAME_SIZE = 224
BATCH_SIZE = 32
EPOCHS = 60
LEARNING_RATE = 1e-4

MAX_SEQ_LENGTH = 20
NUM_FEATURES = 512

## Label Encoding

Encode the class labels as integers using the Keras StringLookup layer

In [10]:
np.unique(train_df["label"])

array(['Archery', 'Basketball', 'Biking', 'HorseRace', 'JumpRope',
       'Punch', 'Skiing', 'SkyDiving', 'Surfing', 'TennisSwing'],
      dtype=object)

In [11]:
label_processor = tf.keras.layers.StringLookup(num_oov_indices=0, vocabulary=sports_actions)

class_vocab = label_processor.get_vocabulary()

print(f"Vocabulary: {class_vocab}")
print(f"Number of classes: {len(class_vocab)}")

Vocabulary: ['SkyDiving', 'Biking', 'HorseRace', 'Surfing', 'TennisSwing', 'Punch', 'Basketball', 'JumpRope', 'Archery', 'Skiing']
Number of classes: 10


Utility to convert string labels to one-hot encoded format

In [12]:
def encode_labels(labels: np.ndarray) -> np.ndarray:
    integer_labels = tf.keras.ops.convert_to_numpy(label_processor(labels[..., None]))
    return tf.keras.utils.to_categorical(integer_labels, num_classes=len(class_vocab))

## Video preprocessing

 Function to resize the video frames to a square shape without distorting their content

In [13]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]  # Get the height (y) and width (x) of the image
    min_dim = min(y, x)       # Find the smallest dimension (either height or width)
    start_x = (x // 2) - (min_dim // 2)  # Calculate the horizontal starting point for the crop
    start_y = (y // 2) - (min_dim // 2)  # Calculate the vertical starting point for the crop

    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]  # Return the cropped square

Crop,resize, and reorder color channels

In [14]:
def load_and_preprocess_video(video_path):    
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame) # Crop center square
            frame = cv2.resize(frame, (FRAME_SIZE,FRAME_SIZE)) # Resize the image (In this case to 224x224)
            frame = frame[:, :, [2, 1, 0]] # Reorder the color channels from OpenCV BGR to RGB
            frame = frame.astype('float32') / 255.0 # Normalize the pixel values
            
            # For Pre-trained I3D normalization 
            # mean = [0.485, 0.456, 0.406]  # For RGB channels
            # std = [0.229, 0.224, 0.225]   # For RGB channels
            # frame = (frame - mean) / std

            frames.append(frame)

            if len(frames) == MAX_SEQ_LENGTH:
                break
    finally:
        cap.release()
   
    return np.array(frames)

Load the video frames

In [15]:
def load_single_video(video_path):
    print(f"Loading video from {video_path}")
    if isinstance(video_path, bytes):
        video_path = video_path.decode('utf-8')
    
    # Load and preprocess video frames
    frames = load_and_preprocess_video(video_path)
    
    # Handle sequence length
    if frames.shape[0] > MAX_SEQ_LENGTH:
        # Sample frames uniformly
        frame_indices = np.linspace(0, frames.shape[0] - 1, MAX_SEQ_LENGTH, dtype=int)
        frames = frames[frame_indices]
    elif frames.shape[0] < MAX_SEQ_LENGTH:
        # Repeat frames if video is too short
        repeat_factor = MAX_SEQ_LENGTH // frames.shape[0] + 1
        frames = np.tile(frames, (repeat_factor, 1, 1, 1))[:MAX_SEQ_LENGTH]
    
    frames = tf.convert_to_tensor(frames, dtype=tf.float32)
    frames.set_shape((MAX_SEQ_LENGTH, FRAME_SIZE, FRAME_SIZE, 3))
    
    return frames

A generator that yields batches of video frames and labels.

In [16]:
def create_video_dataset(df, batch_size=32, shuffle=True, cache=False):
    video_paths = df["video_path"].values
    labels = tf.keras.ops.convert_to_numpy(
        label_processor(df["label"].values[..., None])
    )

    # Create a dataset of paths and labels
    dataset = tf.data.Dataset.from_tensor_slices((video_paths, labels))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(video_paths))

    dataset = dataset.map(
        lambda x, y: (
            tf.py_function(func=load_single_video, inp=[x], Tout=tf.float32),
            y,
        ),
        num_parallel_calls=tf.data.AUTOTUNE,
    )

    if cache:
        dataset = dataset.cache()

    # Batch and prefetch
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset

In [17]:
def prepare_train_val_datasets(train_df, val_df, batch_size=32):
    train_dataset = create_video_dataset(
        train_df, batch_size=batch_size, shuffle=True, cache=True
    )

    val_dataset = create_video_dataset(
        val_df, batch_size=batch_size, shuffle=False, cache=True
    )

    return train_dataset, val_dataset

## Inflated 3D Model Architecture

In [18]:
def build_inflated_3d_model():
    # Input layer with explicit batch size if provided
    video_input = Input(shape=(MAX_SEQ_LENGTH, FRAME_SIZE, FRAME_SIZE, 3), 
                       batch_size=BATCH_SIZE,
                       name='video_input')
    
    # Inflated 3D Convolutional layers
    x = Conv3D(64, (3, 7, 7), strides=(1, 2, 2), padding="same", activation="relu")(video_input)
    x = BatchNormalization()(x)
    x = Conv3D(128, (3, 3, 3), strides=(1, 2, 2), padding="same", activation="relu")(x)
    x = BatchNormalization()(x)
    x = Conv3D(256, (3, 3, 3), strides=(1, 2, 2), padding="same", activation="relu")(x)
    x = BatchNormalization()(x)
    
    # Global pooling with explicit reshaping
    x = tf.keras.layers.TimeDistributed(tf.keras.layers.GlobalAveragePooling2D())(x)
    
    # Ensure proper shape before GRU layer
    x = tf.keras.layers.Reshape((-1, x.shape[-1]))(x)
    
    # Bidirectional GRU layer
    x = Bidirectional(GRU(128, return_sequences=False, 
                         dropout=0.2, 
                         recurrent_dropout=0.2))(x)
    
    # Dense layers
    x = Dropout(0.4)(x)
    x = Dense(64, activation="relu")(x)
    x = Dense(32, activation="relu")(x)
    
    # Output layer
    output = Dense(len(class_vocab), activation="softmax", name='classification_output')(x)
    
    # Create and compile model
    model = Model(inputs=video_input, outputs=output, name="inflated_3d_model")
    
    # Learning rate schedule
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.01,
        decay_steps=10000,
        decay_rate=0.9
    )
    
    # Optimizer
    adam_optimizer = tf.keras.optimizers.AdamW(learning_rate=lr_schedule)
    
    # Compile with proper loss and metrics
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        optimizer=adam_optimizer,
        metrics=["accuracy"],
    )
    
    return model

Build the model

In [19]:
model = build_inflated_3d_model()
model.summary()

## Model Training

In [20]:
def get_new_model_version():
    model_version = 1
    while os.path.exists(f"{MODEL_BASE_PATH}/v_{model_version}"):
        model_version += 1
    return model_version

In [21]:
def get_model_version_path():
    model_version = get_new_model_version()
    model_version_path = f"{MODEL_BASE_PATH}/v_{model_version}"

    return model_version_path

In [22]:
def get_model_path(model_version_path):
    model_path = os.path.join(model_version_path, f"{MODEL_NAME}.keras")
    
    return model_path

Utility to run the experiment

In [23]:
def train_model(model, model_path):
    callbacks = [
        ModelCheckpoint(
            model_path,
            save_weights_only=False,
            save_best_only=True,
            monitor="val_accuracy",
            verbose=1,
        ),
        EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=3),
    ]

    train_dataset, val_dataset = prepare_train_val_datasets(
        train_df, val_df, BATCH_SIZE
    )

    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        steps_per_epoch=len(train_df) // BATCH_SIZE,
        validation_steps=len(val_df) // BATCH_SIZE,
        epochs=EPOCHS,
        callbacks=callbacks,
        verbose=1,
    )

    return model, history

Train the model

In [24]:
model_version_path = get_model_version_path()
model_path = get_model_path(model_version_path)

i3d_model, history = train_model(model, model_path)

Epoch 1/60


ValueError: as_list() is not defined on an unknown TensorShape.

In [None]:
if os.path.exists(model_path):
    print(f"Model saved at: {model_path}")
    model_size = os.path.getsize(model_path) / (1024 * 1024)
    print(f"Model size: {model_size:.2f} MB")

### Training Metrics

Visualise the training and validation loss and accuracy

In [None]:
def visualize_training_metrics(history, metrics_path):
    train_loss = history.history['loss']
    val_loss = history.history['val_loss']
    train_accuracy = history.history['accuracy']
    val_accuracy = history.history['val_accuracy']
    
    plt.figure(figsize=(12, 6))
    
    # Loss plot
    plt.subplot(1, 2, 1)
    plt.plot(train_loss, label='Training Loss')
    plt.plot(val_loss, label='Validation Loss')
    plt.title("Training and Validation Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()

    # Accuracy plot
    plt.subplot(1, 2, 2)
    plt.plot(train_accuracy, label='Training Accuracy')
    plt.plot(val_accuracy, label='Validation Accuracy')
    plt.title("Training and Validation Accuracy")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()
    
    plt.savefig(metrics_path)
    print(f"Training metrics plot saved at: {metrics_path}")

    plt.show()
    
    plt.close()

In [None]:
def get_model_image_path(metric_type, model_version_path):
    image_path = f"{model_version_path}/{metric_type}.png"
    return image_path

In [None]:
image_path = get_model_image_path("loss", model_version_path)
visualize_training_metrics(history, image_path)

## Model Evaluation and Testing

#### Evaluate on the entire test dataset

In [None]:
test_gen = data_generator(test_df, batch_size=32)

i3d_model.load_weights(model_path) # Load the best weights
test_loss, test_accuracy = i3d_model.evaluate(test_gen, steps=len(test_df) // 32)

In [None]:
print(f"Test Loss: {test_loss :.4f}")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

#### Evaluation with single sample

Make predictions on a single video

In [None]:
def predict(model, video_frames, true_label):
    # Expand the dimensions to match model input shape
    video_frames = np.expand_dims(
        video_frames, axis=0
    )  # Shape (1, num_frames, height, width, channels)

    # Get model predictions
    y_pred = model.predict(video_frames)

    # Get the predicted label (index of the highest probability)
    predicted_label_index = np.argmax(y_pred, axis=1)[0]
    predicted_label = class_vocab[predicted_label_index]  # Map index to label

    print(f"Prediction probabilities: {y_pred}")
    print(f"Predicted label index: {predicted_label_index}")

    return true_label, y_pred, predicted_label

Display predicted image as GIF

In [None]:
def display_as_gif(frames, model_version_path, save=False):
    gif_path = f"{model_version_path}/test_animation.gif"
    converted_images = frames.astype(np.uint8)

    if save:
        imageio.mimsave(gif_path, converted_images, duration=100)
        print(f"GIF saved at {gif_path}")

    return Image.open(gif_path)  # Display the gif

Choosing a random video to use for prediction

In [None]:
def get_test_video(test_df, test_labels, save_gif=False):
    # Select a random test video
    random_index = np.random.randint(len(test_df))
    
    # Get the test video path
    test_video = test_df["video_path"].values[random_index]

    # Get the true label of the test video
    true_label_index = test_labels.tolist()[random_index][0]
    true_label = class_vocab[true_label_index]

    # Load video frames
    test_video_frames = load_video_frames(test_video)

    print(f"Test video path: {test_video}")
    print(f"Label: {true_label}")
    
    # Display the shape of the test video frames
    print(f"""
    Test video frames shape:
      - {test_video_frames.shape[0]} frames
      - {test_video_frames.shape[1]} pixels (height) x {test_video_frames.shape[2]} pixels (width)
      - {test_video_frames.shape[3]} color channels
    """)

    display_as_gif(test_video_frames[:MAX_SEQ_LENGTH], model_version_path, save_gif)

    return test_video_frames, true_label

Run the prediction on the test video

In [None]:
test_labels = encode_labels(test_df["label"].values)

test_video_frames, true_label = get_test_video(test_df, test_labels, save_gif=True)
y_true, y_pred, predicted_label = predict(i3d_model, test_video_frames, true_label)

print(f"\nTrue label: {true_label}")
print(f"Predicted label: {predicted_label}")

print(f"y_true: {y_true}")
print(f"y_pred: {y_pred}")

#### Evaluation metrics

In [None]:
def predict_test_videos(model, test_gen, batch_size=32):
    y_true = []
    y_pred = []

    # Calculate the number of steps per epoch
    steps = len(test_gen) // batch_size

    for i in range(steps):
        # Get a batch of frames and labels
        batch_frames, batch_labels = next(test_gen)

        # Predict using the model
        predictions = model.predict(batch_frames, batch_size=batch_size)

        # Get the predicted class labels
        batch_pred_labels = np.argmax(predictions, axis=1)

        # Append the true labels and predicted labels
        # Convert one-hot labels to integers
        y_true.extend(np.argmax(batch_labels, axis=1))
        y_pred.extend(batch_pred_labels)

        # Print progress every 100 batches
        if i % 50 == 0:
            print(f"Processed {i+1}/{steps} batches")

    return np.array(y_true), np.array(y_pred), predictions

Utility to save classification metrics

In [None]:
def save_classification_report(report, save_path):
    report_data = []
    lines = report.split("\n")

    for line in filter(None, lines[2:-3]):  # Remove empty lines and headers/footers
        row = line.split()
        name = row[0]
        stats = row[1:]

        # Convert stats to float, handling support as int
        stats = [float(val) for val in stats[:-1]] + [int(stats[-1])]
        report_data.append([name] + stats)

    report_df = pd.DataFrame(
        report_data, columns=["Class", "Precision", "Recall", "F1-score", "Support"]
    )

    report_df.to_csv(save_path, index=False)

    print(f"Classification report saved at: {save_path}")

Utility to display evaluation Metrics

In [None]:
def display_evaluation_metrics(y_true, y_pred, predictions, target_classes):
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Top-1 Accuracy: {accuracy * 100:.2f}%")

    k = 2
    top_k_acc = top_k_accuracy_score(y_true, predictions, k=k)
    print(f"Top-{k} Accuracy: {top_k_acc * 100:.2f}%")

    report = classification_report(y_true, y_pred, target_names=target_classes)
    print("\nClassification Report:\n", report)

    save_classification_report(report, f'{model_version_path}/classification_metrics.csv')

In [None]:
y_true, y_pred, predictions = predict_test_videos(i3d_model, test_gen)

# y_pred = np.argmax(predictions, axis=1)

# Print the results
print(f"True Labels: {y_true}")
print(f"Predicted Labels: {y_pred}")

# Calculate accuracy
accuracy = np.sum(y_true == y_pred) / len(y_true)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [None]:
display_evaluation_metrics(y_true, y_pred, predictions, class_vocab)

#### Confusion Matrix

In [None]:
def display_confusion_matrix(y_true, y_pred, target_classes, plot_path, show_plot=False):
    conf_matrix = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        fmt="d",
        annot=True,
        cmap="Blues",
        cbar=True,
        xticklabels=target_classes,
        yticklabels=target_classes,
    )
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")

    if show_plot:
        plt.show()    
    else:
        print(conf_matrix)

    plt.savefig(plot_path)
    print(f"\nConfusion matrix saved at: {plot_path}")

    plt.close()

In [None]:
true_labels = np.array(test_labels).flatten()

cm_plot_path = get_model_image_path("confusion_matrix", model_version_path)
display_confusion_matrix(true_labels, y_pred, class_vocab, cm_plot_path, show_plot=False)