In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Data Preparation

In [2]:
!unzip "/content/drive/MyDrive/data_trimmed_clean.zip" -d /content/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/data_trimmed/Train/Shooting/Shooting042_x264_1030.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting020_x264_2680.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting029_x264_1260.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting009_x264_2690.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting014_x264_2740.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_4510.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_11010.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_9020.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting005_x264_1860.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting052_x264_4560.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting009_x264_130.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting027_x264_140.png  


In [3]:
# ONLY USED FOR TESTING AND DEBUGGING - for final model, we will use the whole dataset

import os
import shutil

def create_debug_subset_sequential(source_dir, dest_dir, train_limit=2000, test_limit=600):
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.makedirs(dest_dir, exist_ok=True)

    for split, limit in [("Train", train_limit), ("Test", test_limit)]:
        src_split_path = os.path.join(source_dir, split)
        dst_split_path = os.path.join(dest_dir, split)
        os.makedirs(dst_split_path, exist_ok=True)

        for class_name in os.listdir(src_split_path):
            class_src = os.path.join(src_split_path, class_name)
            class_dst = os.path.join(dst_split_path, class_name)
            os.makedirs(class_dst, exist_ok=True)

            valid_images = sorted([f for f in os.listdir(class_src) if f.endswith(".png") and not f.startswith("._")])
            selected_images = valid_images[:limit]

            for img in selected_images:
                shutil.copy(os.path.join(class_src, img), os.path.join(class_dst, img))

create_debug_subset_sequential("/content/data_trimmed", "/content/data_trimmed_debug", train_limit=2000, test_limit=600)

In [7]:
# RUN JUST ONCE

!pip install transformers wandb --quiet

[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 578, in _determine_conflicts
    return check_install_conflicts(to_install)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/operations/check.py", line 101, in check_install_conflicts
    package_set, _

The following `load_dataset` function does the following:


*   Simulates videos using grouped frames.
*   Creates temporal windows (sequences of window_size frames).
*   Encodes each window using DINOv2's preprocessor.
*   Reduces temporal dimension via mean pooling.
*   Prepares final NumPy arrays for training.

In [4]:
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
from transformers import AutoImageProcessor

# backbone
image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")

def load_dataset(data_dir, window_size=8):
    X, y = [], []
    class_labels = sorted(os.listdir(data_dir))
    label_map = {cls: idx for idx, cls in enumerate(class_labels)}

    for label in tqdm(class_labels, desc=f"Processing classes from {data_dir}", ncols=100):
        folder_path = os.path.join(data_dir, label)
        video_dict = {}

        for img_name in sorted(os.listdir(folder_path)):
            if not (img_name.endswith(".png") and not img_name.startswith("._")):
                continue
            vid_id = "_".join(img_name.split("_")[:2])
            video_dict.setdefault(vid_id, []).append(os.path.join(folder_path, img_name))

        for vid, frames in video_dict.items():
            if len(frames) < window_size:
                continue
            for i in range(0, len(frames) - window_size + 1, window_size):
                window = frames[i:i + window_size]
                tensor_batch = []
                for frame_path in window:
                    try:
                        img = Image.open(frame_path).convert("RGB").resize((224, 224))
                        pixel_values = image_processor(images=img, return_tensors="pt").pixel_values[0].numpy()
                        tensor_batch.append(pixel_values)
                    except Exception as e:
                        continue
                if len(tensor_batch) == window_size:
                    tensor_batch = np.array(tensor_batch)                         # (window_size, 3, 224, 224)
                    video_features = np.mean(tensor_batch, axis=0)                # (3, 224, 224)
                    video_features = np.transpose(video_features, (1, 2, 0))      # ➡ (224, 224, 3)
                    X.append(video_features)
                    y.append(label_map[label])

    return np.array(X), to_categorical(y, num_classes=len(class_labels))

# paths
train_dir = "/content/data_trimmed_debug/Train"
test_dir = "/content/data_trimmed_debug/Test"

# load data
X_train, y_train = load_dataset(train_dir, window_size=8)
X_test, y_test = load_dataset(test_dir, window_size=8)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Processing classes from /content/data_trimmed_debug/Train: 100%|██████| 8/8 [01:07<00:00,  8.40s/it]
Processing classes from /content/data_trimmed_debug/Test: 100%|███████| 8/8 [00:20<00:00,  2.61s/it]


X_train shape: (1971, 224, 224, 3)
y_train shape: (1971, 8)
X_test shape: (591, 224, 224, 3)
y_test shape: (591, 8)


### Compile and train backbone model

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Dense, Flatten, BatchNormalization, Activation, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# input layer based on shape of processed features
input_tensor = Input(shape=X_train.shape[1:])

# block 1
x = Conv2D(32, (3, 3), padding='same', kernel_regularizer=l2(0.001))(input_tensor)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2))(x)

# block 2
x = Conv2D(64, (3, 3), padding='same', kernel_regularizer=l2(0.001))(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2))(x)

# block 3 (deeper layer)
x = Conv2D(128, (3, 3), padding='same', kernel_regularizer=l2(0.001))(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2))(x)

x = GlobalAveragePooling2D()(x)

# dense layers with dropout
x = Dropout(0.5)(x)
x = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(x)
x = Dropout(0.5)(x)
out = Dense(y_train.shape[1], activation='softmax')(x)

model = Model(inputs=input_tensor, outputs=out)
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [6]:
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint

wandb.init(project="inm705-cw-video-classification", config={"epochs": 15, "batch_size": 16})

model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=16,
    callbacks=[
        WandbMetricsLogger(),  # only logs metrics (was crashing if graphs were included)
        WandbModelCheckpoint("model.keras")
    ]
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfareesahhussain[0m ([33mfareesahhussain-city-university-of-london[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/20
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 90ms/step - accuracy: 0.2451 - loss: 2.3080 - val_accuracy: 0.2555 - val_loss: 2.4554
Epoch 2/20
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.4625 - loss: 1.8140 - val_accuracy: 0.2843 - val_loss: 2.9033
Epoch 3/20
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.5202 - loss: 1.6150 - val_accuracy: 0.2690 - val_loss: 3.1014
Epoch 4/20
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.5389 - loss: 1.5947 - val_accuracy: 0.2555 - val_loss: 3.3489
Epoch 5/20
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.5792 - loss: 1.4506 - val_accuracy: 0.2555 - val_loss: 3.2806
Epoch 6/20
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.6027 - loss: 1.3849 - val_accuracy: 0.2775 - val_loss: 3.1968
Epoch 7/20
[1m124/12

<keras.src.callbacks.history.History at 0x7ada1051fa90>