In [1]:
import os
import shutil

Data Trimming 

In [3]:
import os
import shutil

def create_debug_subset_sequential(source_dir, dest_dir, train_limit=2000, test_limit=600):
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.makedirs(dest_dir, exist_ok=True)

    for split, limit in [("Train", train_limit), ("Test", test_limit)]:
        src_split_path = os.path.join(source_dir, split)
        dst_split_path = os.path.join(dest_dir, split)
        os.makedirs(dst_split_path, exist_ok=True)

        for class_name in os.listdir(src_split_path):
            class_src = os.path.join(src_split_path, class_name)
            class_dst = os.path.join(dst_split_path, class_name)
            os.makedirs(class_dst, exist_ok=True)

            valid_images = sorted([f for f in os.listdir(class_src) if f.endswith(".png") and not f.startswith("._")])
            selected_images = valid_images[:limit]

            for img in selected_images:
                shutil.copy(os.path.join(class_src, img), os.path.join(class_dst, img))

# Correct folder name used here
create_debug_subset_sequential("data_trimmed", "data_trimmed_debug", train_limit=2000, test_limit=600)


In [8]:
import os
import numpy as np
from PIL import Image
from tqdm import tqdm


In [2]:
from tensorflow.keras.utils import to_categorical
from transformers import AutoImageProcessor

## Backbone

In [3]:
image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


-Looks inside the main folder to find different categories  

-Gives each category a number so the computer can understand it.

-Goes into each category folder and groups the images that belong to the same video.

-Skips videos that don’t have enough images.

-Breaks the video into small parts (like clips of 8 images).

-Opens and resizes each image so they’re all the same size.

-Processes the images to turn them into numbers the computer can use.

-Averages the 8 images in each clip to make one final image that represents the clip.

-Saves this final image and its category.

-At the end, returns all the images and their categories in a format ready for training a computer model.

In [4]:
def load_dataset(data_dir, window_size=8):
    X, y = [], []
    class_labels = sorted(os.listdir(data_dir))
    label_map = {cls: idx for idx, cls in enumerate(class_labels)}

    for label in tqdm(class_labels, desc=f"Processing classes from {data_dir}", ncols=100):
        folder_path = os.path.join(data_dir, label)
        video_dict = {}

        for img_name in sorted(os.listdir(folder_path)):
            if not (img_name.endswith(".png") and not img_name.startswith("._")):
                continue
            vid_id = "_".join(img_name.split("_")[:2])
            video_dict.setdefault(vid_id, []).append(os.path.join(folder_path, img_name))

        for vid, frames in video_dict.items():
            if len(frames) < window_size:
                continue
            for i in range(0, len(frames) - window_size + 1, window_size):
                window = frames[i:i + window_size]
                tensor_batch = []
                for frame_path in window:
                    try:
                        img = Image.open(frame_path).convert("RGB").resize((224, 224))
                        pixel_values = image_processor(images=img, return_tensors="pt").pixel_values[0].numpy()
                        tensor_batch.append(pixel_values)
                    except Exception as e:
                        continue
                if len(tensor_batch) == window_size:
                    tensor_batch = np.array(tensor_batch)                         # (window_size, 3, 224, 224)
                    video_features = np.mean(tensor_batch, axis=0)                # (3, 224, 224)
                    video_features = np.transpose(video_features, (1, 2, 0))      # ➡ (224, 224, 3)
                    X.append(video_features)
                    y.append(label_map[label])

    return np.array(X), to_categorical(y, num_classes=len(class_labels))

In [5]:
train_dir = "data_trimmed_debug/Train"
test_dir = "data_trimmed_debug/Test"

In [9]:
# 🚀 Load data
X_train, y_train = load_dataset(train_dir, window_size=8)
X_test, y_test = load_dataset(test_dir, window_size=8)

Processing classes from data_trimmed_debug/Train: 100%|███████████████| 8/8 [06:42<00:00, 50.35s/it]
Processing classes from data_trimmed_debug/Test: 100%|████████████████| 8/8 [02:00<00:00, 15.00s/it]


In [10]:
# 📊 Check shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (1971, 224, 224, 3)
y_train shape: (1971, 8)
X_test shape: (591, 224, 224, 3)
y_test shape: (591, 8)


## Train and compile the Backbone 



- Imports necessary TensorFlow and Keras components for building a convolutional neural network (CNN).
- Defines the input layer based on the shape of the training image data.
- Constructs three convolutional blocks, each containing:
  - A convolutional layer to extract image features.
  - Batch normalization to stabilize and accelerate training.
  - ReLU activation for non-linearity.
  - Max pooling to reduce spatial dimensions.
- Applies Global Average Pooling to compress the feature maps while retaining important information.
- Adds a fully connected (dense) layer with dropout for regularization and to prevent overfitting.
- Defines the output layer with softmax activation for multi-class classification.
- Compiles the model using the Adam optimizer and categorical crossentropy loss function, tracking accuracy as a performance metric.
- Displays a summary of the model architecture, including the number of layers and trainable parameters.

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Dense, Flatten, BatchNormalization, Activation, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [12]:
# input layer based on shape of processed features
input_tensor = Input(shape=X_train.shape[1:])

In [13]:
# block 1
x = Conv2D(32, (3, 3), padding='same', kernel_regularizer=l2(0.001))(input_tensor)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2))(x)

In [14]:
# block 2
x = Conv2D(64, (3, 3), padding='same', kernel_regularizer=l2(0.001))(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2))(x)

In [15]:
# block 3 (deeper layer)
x = Conv2D(128, (3, 3), padding='same', kernel_regularizer=l2(0.001))(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2))(x)

x = GlobalAveragePooling2D()(x)

In [16]:
# dense layers with dropout
x = Dropout(0.5)(x)
x = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(x)
x = Dropout(0.5)(x)
out = Dense(y_train.shape[1], activation='softmax')(x)

model = Model(inputs=input_tensor, outputs=out)
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [19]:
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint

wandb.init(project="inm705-cw-video-classification", config={"epochs": 15, "batch_size": 16})

model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=16,
    callbacks=[
        WandbMetricsLogger(),  # only logs metrics (was crashing if graphs were included)
        WandbModelCheckpoint("model.keras")
    ]
)

wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

wandb: No netrc file found, creating one.
wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\Dell\_netrc
wandb: Currently logged in as: marium-waseem (marium-waseem-city-university-of-london) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


Epoch 1/20
124/124 ━━━━━━━━━━━━━━━━━━━━ 17:08 8s/step - accuracy: 0.0625 - loss: 3.14 ━━━━━━━━━━━━━━━━━━━━ 1:38 809ms/step - accuracy: 0.1094 - loss: 3.02 ━━━━━━━━━━━━━━━━━━━━ 1:36 797ms/step - accuracy: 0.1285 - loss: 2.96 ━━━━━━━━━━━━━━━━━━━━ 1:35 796ms/step - accuracy: 0.1393 - loss: 2.91 ━━━━━━━━━━━━━━━━━━━━ 1:35 802ms/step - accuracy: 0.1440 - loss: 2.88 ━━━━━━━━━━━━━━━━━━━━ 1:36 815ms/step - accuracy: 0.1495 - loss: 2.85 ━━━━━━━━━━━━━━━━━━━━ 1:34 812ms/step - accuracy: 0.1536 - loss: 2.83 ━━━━━━━━━━━━━━━━━━━━ 1:33 803ms/step - accuracy: 0.1549 - loss: 2.81 ━━━━━━━━━━━━━━━━━━━━ 1:31 799ms/step - accuracy: 0.1547 - loss: 2.80 ━━━━━━━━━━━━━━━━━━━━ 1:30 797ms/step - accuracy: 0.1542 - loss: 2.79 ━━━━━━━━━━━━━━━━━━━━ 1:30 797ms/step - accuracy: 0.1531 - loss: 2.77 ━━━━━━━━━━━━━━━━━━━━ 1:28 793ms/step - accuracy: 0.1521 - loss: 2.76 ━━━━━━━━━━━━━━━━━━━━ 1:27 789ms/step - accuracy: 0.1507 - loss: 2.75 ━━━━━━━━━━━━━━━━━━━━ 1:26 788ms/step - accuracy: 0.1495 - loss: 2.75 ━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2224840b620>