<a href="https://colab.research.google.com/github/fareesah28/INM705-CW/blob/main/MariumColabFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Extraction

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip "/content/drive/MyDrive/data_trimmed_clean.zip" -d /content/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/data_trimmed/Train/Shooting/Shooting042_x264_1030.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting020_x264_2680.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting029_x264_1260.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting009_x264_2690.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting014_x264_2740.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_4510.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_11010.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_9020.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting005_x264_1860.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting052_x264_4560.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting009_x264_130.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting027_x264_140.png  


Data Reconstructing

In [3]:
import os
import shutil
from collections import defaultdict

def extract_video_id(filename):
    """
    Extracts the video ID by removing the frame suffix (assumes last underscore + digits is the frame number).
    For example:
        Normal_Videos_003_x264_0.png → Normal_Videos_003_x264
        Assault_001_frame_010.png → Assault_001
    """
    parts = filename.rsplit("_", 1)
    return parts[0] if len(parts) == 2 else filename.split("_frame")[0]

def restructure_dataset(src_dir, dst_dir):
    os.makedirs(dst_dir, exist_ok=True)

    for class_name in os.listdir(src_dir):
        class_path = os.path.join(src_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        print(f"Processing class: {class_name}")
        video_frame_dict = defaultdict(list)

        for fname in os.listdir(class_path):
            if not fname.endswith('.png'):
                continue

            video_id = extract_video_id(fname)
            video_frame_dict[video_id].append(fname)

        for video_id, frames in video_frame_dict.items():
            video_folder_path = os.path.join(dst_dir, class_name, video_id)
            os.makedirs(video_folder_path, exist_ok=True)

            for frame in frames:
                src = os.path.join(class_path, frame)
                dst = os.path.join(video_folder_path, frame)
                shutil.copy2(src, dst)

    print(f"Done restructuring: {src_dir} → {dst_dir}\n")

train_dir = "/content/data_trimmed/Train"
test_dir = "/content/data_trimmed/Test"

train_dst = "/content/data_trimmed_restructured/Train"
test_dst = "/content/data_trimmed_restructured/Test"

# Run restructuring
restructure_dataset(train_dir, train_dst)
restructure_dataset(test_dir, test_dst)

Processing class: NormalVideos
Processing class: Burglary
Processing class: Explosion
Processing class: Assault
Processing class: Fighting
Processing class: Shooting
Processing class: Arrest
Processing class: Arson
Done restructuring: /content/data_trimmed/Train → /content/data_trimmed_restructured/Train

Processing class: NormalVideos
Processing class: Burglary
Processing class: Explosion
Processing class: Assault
Processing class: Fighting
Processing class: Shooting
Processing class: Arrest
Processing class: Arson
Done restructuring: /content/data_trimmed/Test → /content/data_trimmed_restructured/Test



# Pad  video to 16 frames , stack into a (16, 64, 64, 3) array

In [4]:
IMG_HEIGHT = 64
IMG_WIDTH = 64
SEQUENCE_LENGTH = 16
BATCH_SIZE = 8
CLASS_NAMES = ['Arrest','Arson','Assault','Burglary','Explosion','Fighting','NormalVideos','Shooting']
NUM_CLASSES = len(CLASS_NAMES)

# one sequence of frames for each video folder


In [5]:
import tensorflow as tf
import numpy as np
import os
from glob import glob
from tensorflow.keras.utils import to_categorical
from PIL import Image

def load_video_frames(video_dir, sequence_length, img_size):
    # Get sorted list of frame paths
    frame_paths = sorted(glob(os.path.join(video_dir, "*.png")))

    frames = []
    for path in frame_paths[:sequence_length]:
        img = Image.open(path).resize(img_size)
        frame = np.array(img).astype("float32") / 255.0  # Normalize to [0, 1]
        frames.append(frame)


    while len(frames) < sequence_length:
        frames.append(np.zeros((img_size[1], img_size[0], 3), dtype="float32"))

    return np.stack(frames)

def get_video_paths_and_labels(base_dir, class_names):
    video_paths = []
    labels = []

    for class_index, class_name in enumerate(class_names):
        class_path = os.path.join(base_dir, class_name)
        for video_folder in os.listdir(class_path):
            video_path = os.path.join(class_path, video_folder)
            if os.path.isdir(video_path):
                video_paths.append(video_path)
                labels.append(class_index)

    return video_paths, labels

def build_video_dataset(base_dir, sequence_length, img_size, batch_size, class_names, shuffle=True):
    video_paths, labels = get_video_paths_and_labels(base_dir, class_names)

    def generator():
        for video_path, label in zip(video_paths, labels):
            frames = load_video_frames(video_path, sequence_length, img_size)
            yield frames, to_categorical(label, num_classes=len(class_names))

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(sequence_length, img_size[1], img_size[0], 3), dtype=tf.float32),
            tf.TensorSpec(shape=(len(class_names),), dtype=tf.float32)
        )
    )

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(video_paths))

    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [6]:
train_dir = "data_trimmed_restructured/Train"
test_dir = "data_trimmed_restructured/Test"
train_dataset = build_video_dataset(
    base_dir=train_dir,
    sequence_length=SEQUENCE_LENGTH,
    img_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=BATCH_SIZE,
    class_names=CLASS_NAMES,
    shuffle=True
)

test_dataset = build_video_dataset(
    base_dir=test_dir,
    sequence_length=SEQUENCE_LENGTH,
    img_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=BATCH_SIZE,
    class_names=CLASS_NAMES,
    shuffle=False
)

In [7]:
for x, y in train_dataset.take(1):
    print(x.shape)  # (8, 16, 64, 64, 3)
    print(y.shape)  # (8, 8)

(8, 16, 64, 64, 3)
(8, 8)


DINOv2 + Transformer Model - Basic

In [8]:
!pip install transformers




DINOv2 + Transformer model - Basic

In [31]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight


In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
dinov2 = AutoModel.from_pretrained("facebook/dinov2-base").to(device)
dinov2.eval()


Dinov2Model(
  (embeddings): Dinov2Embeddings(
    (patch_embeddings): Dinov2PatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): Dinov2Encoder(
    (layer): ModuleList(
      (0-11): 12 x Dinov2Layer(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attention): Dinov2Attention(
          (attention): Dinov2SelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
          )
          (output): Dinov2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (layer_scale1): Dinov2LayerScale()
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06,

In [33]:
class VideoDataset(Dataset):
    def __init__(self, video_paths, labels, sequence_length, img_size, class_names):
        self.video_paths = video_paths
        self.labels = labels
        self.sequence_length = sequence_length
        self.img_size = img_size
        self.class_names = class_names

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        frame_paths = sorted([
            os.path.join(video_path, f)
            for f in os.listdir(video_path)
            if f.endswith(".png")
        ])

        frames = []
        for path in frame_paths[:self.sequence_length]:
            img = Image.open(path).resize(self.img_size)
            inputs = processor(images=img, return_tensors="pt").to(device)
            with torch.no_grad():
                features = dinov2(**inputs).last_hidden_state.mean(dim=1).squeeze(0)
            frames.append(features)

        while len(frames) < self.sequence_length:
            frames.append(torch.zeros_like(frames[0]))

        sequence = torch.stack(frames)
        return sequence, torch.tensor(label, dtype=torch.long)


In [34]:
class VideoTransformerClassifier(nn.Module):
    def __init__(self, feature_dim, num_classes, sequence_length, dropout=0.3):
        super().__init__()
        self.positional_encoding = nn.Parameter(torch.randn(1, sequence_length, feature_dim))

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=feature_dim,
                nhead=4,
                dim_feedforward=512,
                dropout=dropout,
                activation='gelu',
                batch_first=True,
                norm_first=True
            ),
            num_layers=2
        )

        self.cls_head = nn.Sequential(
            nn.LayerNorm([sequence_length, feature_dim]),
            nn.Flatten(),
            nn.Dropout(dropout),
            nn.Linear(feature_dim * sequence_length, 256),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = x + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer(x)
        return self.cls_head(x)


In [35]:

train_video_paths, train_labels = get_video_paths_and_labels("/content/data_trimmed_restructured/Train", CLASS_NAMES)
test_video_paths, test_labels = get_video_paths_and_labels("/content/data_trimmed_restructured/Test", CLASS_NAMES)

#  class weights from training labels
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(len(CLASS_NAMES)),
    y=train_labels
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

#  datasets and loaders
train_dataset = VideoDataset(train_video_paths, train_labels, sequence_length=16, img_size=(64, 64), class_names=CLASS_NAMES)
test_dataset = VideoDataset(test_video_paths, test_labels, sequence_length=16, img_size=(64, 64), class_names=CLASS_NAMES)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [36]:
model = VideoTransformerClassifier(feature_dim=768, num_classes=len(CLASS_NAMES), sequence_length=16).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    train_loss, train_preds, train_targets = 0, [], []

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_x)

        # weighted cross-entropy
        loss = F.cross_entropy(outputs, batch_y, weight=class_weights_tensor)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * batch_x.size(0)
        train_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        train_targets.extend(batch_y.cpu().numpy())

    train_loss /= len(train_loader.dataset)
    train_acc = accuracy_score(train_targets, train_preds)
    try:
        train_auc = roc_auc_score(
            F.one_hot(torch.tensor(train_targets), num_classes=len(CLASS_NAMES)),
            F.one_hot(torch.tensor(train_preds), num_classes=len(CLASS_NAMES)),
            multi_class='ovr'
        )
    except:
        train_auc = 0.0

    # evaluate on test
    model.eval()
    test_loss, test_preds, test_targets = 0, [], []
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            loss = F.cross_entropy(outputs, batch_y)

            test_loss += loss.item() * batch_x.size(0)
            test_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            test_targets.extend(batch_y.cpu().numpy())

    test_loss /= len(test_loader.dataset)
    test_acc = accuracy_score(test_targets, test_preds)
    try:
        test_auc = roc_auc_score(
            F.one_hot(torch.tensor(test_targets), num_classes=len(CLASS_NAMES)),
            F.one_hot(torch.tensor(test_preds), num_classes=len(CLASS_NAMES)),
            multi_class='ovr'
        )
    except:
        test_auc = 0.0

    print(f"Epoch {epoch+1:02d} | "
          f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, AUC: {train_auc:.4f} | "
          f"Test Loss: {test_loss:.4f}, Acc: {test_acc:.4f}, AUC: {test_auc:.4f}")




Epoch 01 | Train Loss: 2.1852, Acc: 0.1882, AUC: 0.5185 | Test Loss: 1.8555, Acc: 0.3538, AUC: 0.5822
Epoch 02 | Train Loss: 1.7250, Acc: 0.3972, AUC: 0.6221 | Test Loss: 1.8181, Acc: 0.4000, AUC: 0.6502
Epoch 03 | Train Loss: 1.3988, Acc: 0.5401, AUC: 0.7258 | Test Loss: 1.7962, Acc: 0.2923, AUC: 0.5683
Epoch 04 | Train Loss: 1.1552, Acc: 0.5958, AUC: 0.7585 | Test Loss: 1.9141, Acc: 0.4308, AUC: 0.6471
Epoch 05 | Train Loss: 0.8628, Acc: 0.7282, AUC: 0.8420 | Test Loss: 2.0010, Acc: 0.3385, AUC: 0.5956
Epoch 06 | Train Loss: 0.6644, Acc: 0.7700, AUC: 0.8651 | Test Loss: 2.1854, Acc: 0.3846, AUC: 0.6238
Epoch 07 | Train Loss: 0.5011, Acc: 0.8188, AUC: 0.9010 | Test Loss: 2.4514, Acc: 0.3846, AUC: 0.6381
Epoch 08 | Train Loss: 0.3723, Acc: 0.9024, AUC: 0.9492 | Test Loss: 2.6220, Acc: 0.3846, AUC: 0.6123
Epoch 09 | Train Loss: 0.1941, Acc: 0.9547, AUC: 0.9760 | Test Loss: 2.8186, Acc: 0.3692, AUC: 0.6213
Epoch 10 | Train Loss: 0.1327, Acc: 0.9721, AUC: 0.9845 | Test Loss: 3.4921, Acc: 