## Data Loading and Pre-processing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip "/content/drive/MyDrive/data_trimmed_clean.zip" -d /content/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/data_trimmed/Train/Shooting/Shooting042_x264_1030.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting020_x264_2680.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting029_x264_1260.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting009_x264_2690.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting014_x264_2740.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_4510.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_11010.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_9020.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting005_x264_1860.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting052_x264_4560.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting009_x264_130.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting027_x264_140.png  


In [131]:
# ONLY USED FOR TESTING AND DEBUGGING - for final model we will use the whole dataset

import os
import shutil

def create_debug_subset_sequential(source_dir, dest_dir, train_limit=10000, test_limit=4000):
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.makedirs(dest_dir, exist_ok=True)

    for split, limit in [("Train", train_limit), ("Test", test_limit)]:
        src_split_path = os.path.join(source_dir, split)
        dst_split_path = os.path.join(dest_dir, split)
        os.makedirs(dst_split_path, exist_ok=True)

        for class_name in os.listdir(src_split_path):
            class_src = os.path.join(src_split_path, class_name)
            class_dst = os.path.join(dst_split_path, class_name)
            os.makedirs(class_dst, exist_ok=True)

            valid_images = sorted([f for f in os.listdir(class_src) if f.endswith(".png") and not f.startswith("._")])
            selected_images = valid_images[:limit]

            for img in selected_images:
                shutil.copy(os.path.join(class_src, img), os.path.join(class_dst, img))

create_debug_subset_sequential("/content/data_trimmed", "/content/data_trimmed_debug", train_limit=10000, test_limit=4000)

# paths
train_dir = "/content/data_trimmed_debug/Train"
test_dir = "/content/data_trimmed_debug/Test"

In [3]:
import os
import shutil
from collections import defaultdict

def extract_video_id(filename):
    """
    Extracts the video ID by removing the frame suffix (assumes last underscore + digits is the frame number).
    For example:
        Normal_Videos_003_x264_0.png → Normal_Videos_003_x264
        Assault_001_frame_010.png → Assault_001
    """
    parts = filename.rsplit("_", 1)
    return parts[0] if len(parts) == 2 else filename.split("_frame")[0]

def restructure_dataset(src_dir, dst_dir):
    os.makedirs(dst_dir, exist_ok=True)

    for class_name in os.listdir(src_dir):
        class_path = os.path.join(src_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        print(f"Processing class: {class_name}")
        video_frame_dict = defaultdict(list)

        for fname in os.listdir(class_path):
            if not fname.endswith('.png'):
                continue

            video_id = extract_video_id(fname)
            video_frame_dict[video_id].append(fname)

        for video_id, frames in video_frame_dict.items():
            video_folder_path = os.path.join(dst_dir, class_name, video_id)
            os.makedirs(video_folder_path, exist_ok=True)

            for frame in frames:
                src = os.path.join(class_path, frame)
                dst = os.path.join(video_folder_path, frame)
                shutil.copy2(src, dst)

    print(f"Done restructuring: {src_dir} → {dst_dir}\n")

# Paths
train_dir = "/content/data_trimmed/Train"
test_dir = "/content/data_trimmed/Test"

train_dst = "/content/data_trimmed_restructured/Train"
test_dst = "/content/data_trimmed_restructured/Test"

# Run restructuring
restructure_dataset(train_dir, train_dst)
restructure_dataset(test_dir, test_dst)

Processing class: Arson
Processing class: Arrest
Processing class: Shooting
Processing class: Burglary
Processing class: Explosion
Processing class: NormalVideos
Processing class: Assault
Processing class: Fighting
Done restructuring: /content/data_trimmed/Train → /content/data_trimmed_restructured/Train

Processing class: Arson
Processing class: Arrest
Processing class: Shooting
Processing class: Burglary
Processing class: Explosion
Processing class: NormalVideos
Processing class: Assault
Processing class: Fighting
Done restructuring: /content/data_trimmed/Test → /content/data_trimmed_restructured/Test



In [26]:
IMG_HEIGHT = 64
IMG_WIDTH = 64
SEQUENCE_LENGTH = 16  # Pad/truncate each video to this many frames
BATCH_SIZE = 4
CLASS_NAMES = ['Arrest','Arson','Assault','Burglary','Explosion','Fighting','NormalVideos','Shooting']
NUM_CLASSES = len(CLASS_NAMES)

In [27]:
import tensorflow as tf
import numpy as np
import os
from glob import glob
from tensorflow.keras.utils import to_categorical
from PIL import Image

def load_video_frames(video_dir, sequence_length, img_size):
    # Get sorted list of frame paths
    frame_paths = sorted(glob(os.path.join(video_dir, "*.png")))

    frames = []
    for path in frame_paths[:sequence_length]:
        img = Image.open(path).resize(img_size)
        frame = np.array(img).astype("float32") / 255.0  # Normalize to [0, 1]
        frames.append(frame)

    # Pad with zeros if not enough frames
    while len(frames) < sequence_length:
        frames.append(np.zeros((img_size[1], img_size[0], 3), dtype="float32"))

    return np.stack(frames)

def get_video_paths_and_labels(base_dir, class_names):
    video_paths = []
    labels = []

    for class_index, class_name in enumerate(class_names):
        class_path = os.path.join(base_dir, class_name)
        for video_folder in os.listdir(class_path):
            video_path = os.path.join(class_path, video_folder)
            if os.path.isdir(video_path):
                video_paths.append(video_path)
                labels.append(class_index)

    return video_paths, labels

def build_video_dataset(base_dir, sequence_length, img_size, batch_size, class_names, shuffle=True):
    video_paths, labels = get_video_paths_and_labels(base_dir, class_names)

    def generator():
        for video_path, label in zip(video_paths, labels):
            frames = load_video_frames(video_path, sequence_length, img_size)
            yield frames, to_categorical(label, num_classes=len(class_names))

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(sequence_length, img_size[1], img_size[0], 3), dtype=tf.float32),
            tf.TensorSpec(shape=(len(class_names),), dtype=tf.float32)
        )
    )

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(video_paths))

    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [28]:
train_seq_dir = "/content/data_trimmed_restructured/Train"
test_seq_dir = "/content/data_trimmed_restructured/Test"

train_dataset = build_video_dataset(
    base_dir=train_seq_dir,
    sequence_length=SEQUENCE_LENGTH,
    img_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=BATCH_SIZE,
    class_names=CLASS_NAMES,
    shuffle=True
)

test_dataset = build_video_dataset(
    base_dir=test_seq_dir,
    sequence_length=SEQUENCE_LENGTH,
    img_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=BATCH_SIZE,
    class_names=CLASS_NAMES,
    shuffle=False
)

In [137]:
for x, y in train_dataset.take(1):
    print(x.shape)  # (8, 16, 64, 64, 3)
    print(y.shape)  # (8, 8)

(8, 16, 64, 64, 3)
(8, 8)


## CNN + LSTM with DenseNet121

In [88]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, TimeDistributed, GlobalAveragePooling2D, LSTM,
    Dropout, Dense
)
from tensorflow.keras.applications import DenseNet121

def build_cnn_lstm_model(sequence_length, img_height, img_width, num_classes):
    input_shape = (sequence_length, img_height, img_width, 3)
    inputs = Input(shape=input_shape)

    # 1. Feature extraction with DenseNet121 per frame
    base_cnn = DenseNet121(
        include_top=False,
        weights="imagenet",
        input_shape=(img_height, img_width, 3)
    )
    base_cnn.trainable = False  # freeze CNN

    x = TimeDistributed(base_cnn)(inputs)
    x = TimeDistributed(GlobalAveragePooling2D())(x)

    # 2. LSTM for temporal modeling
    x = LSTM(128, return_sequences=False)(x)
    x = Dropout(0.5)(x)

    # 3. Final classifier
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs, outputs)
    return model

model = build_cnn_lstm_model(
    sequence_length=SEQUENCE_LENGTH,
    img_height=IMG_HEIGHT,
    img_width=IMG_WIDTH,
    num_classes=NUM_CLASSES
)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[
        tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
        tf.keras.metrics.AUC(name="auc")
    ]
)

model.summary()

In [69]:
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=8
)

Epoch 1/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 6s/step - accuracy: 0.2417 - auc: 0.5376 - loss: 2.3326 - val_accuracy: 0.2500 - val_auc: 0.5935 - val_loss: 2.0526
Epoch 2/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 288ms/step - accuracy: 0.3805 - auc: 0.7832 - loss: 1.7020 - val_accuracy: 0.2045 - val_auc: 0.6160 - val_loss: 2.0069
Epoch 3/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 289ms/step - accuracy: 0.4149 - auc: 0.8383 - loss: 1.5346 - val_accuracy: 0.1591 - val_auc: 0.5892 - val_loss: 2.1610
Epoch 4/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 281ms/step - accuracy: 0.5169 - auc: 0.8558 - loss: 1.4076 - val_accuracy: 0.2500 - val_auc: 0.6009 - val_loss: 2.0446
Epoch 5/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 282ms/step - accuracy: 0.7567 - auc: 0.9660 - loss: 0.9979 - val_accuracy: 0.2727 - val_auc: 0.5963 - val_loss: 2.1172
Epoch 6/8
[1m14/14[0m [32m━━━━━━━

## CNN + Transformer (DenseNet121 + Transformer Encoder)

In [89]:
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import (
    Input, TimeDistributed, GlobalAveragePooling2D,
    Dense, Dropout, LayerNormalization, Add, GlobalAveragePooling1D
)
from tensorflow.keras.models import Model

# Custom Transformer Encoder block
def transformer_encoder(inputs, head_size=64, num_heads=4, ff_dim=128, dropout=0.1):
    # Multi-head self-attention
    x = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(inputs, inputs)
    x = tf.keras.layers.Dropout(dropout)(x)
    x = Add()([x, inputs])
    x = LayerNormalization()(x)

    # Feedforward
    x_ff = tf.keras.layers.Dense(ff_dim, activation='relu')(x)
    x_ff = tf.keras.layers.Dropout(dropout)(x_ff)
    x = Add()([x, x_ff])
    x = LayerNormalization()(x)

    return x

def build_cnn_transformer_model(sequence_length, img_height, img_width, num_classes):
    input_shape = (sequence_length, img_height, img_width, 3)
    inputs = Input(shape=input_shape)

    # Frame-level feature extraction
    base_cnn = DenseNet121(include_top=False, weights="imagenet", input_shape=(img_height, img_width, 3))
    base_cnn.trainable = False  # Freeze DenseNet
    x = TimeDistributed(base_cnn)(inputs)
    x = TimeDistributed(GlobalAveragePooling2D())(x)  # (batch, time, features)

    # Transformer encoder
    x = transformer_encoder(x, head_size=64, num_heads=4, ff_dim=1024, dropout=0.1)

    # Sequence pooling
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.4)(x)

    # Final classifier
    outputs = Dense(num_classes, activation='softmax')(x)

    return Model(inputs, outputs)

model = build_cnn_transformer_model(
    sequence_length=SEQUENCE_LENGTH,
    img_height=IMG_HEIGHT,
    img_width=IMG_WIDTH,
    num_classes=NUM_CLASSES
)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[
        tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
        tf.keras.metrics.AUC(name="auc")
    ]
)

model.summary()

In [71]:
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=8
)

Epoch 1/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m519s[0m 18s/step - accuracy: 0.1202 - auc: 0.5042 - loss: 3.8438 - val_accuracy: 0.1818 - val_auc: 0.5947 - val_loss: 2.8060
Epoch 2/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 81ms/step - accuracy: 0.5316 - auc: 0.8628 - loss: 1.4349 - val_accuracy: 0.1364 - val_auc: 0.6026 - val_loss: 3.0943
Epoch 3/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 80ms/step - accuracy: 0.5468 - auc: 0.9175 - loss: 1.1435 - val_accuracy: 0.1364 - val_auc: 0.4816 - val_loss: 3.8098
Epoch 4/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 82ms/step - accuracy: 0.8259 - auc: 0.9845 - loss: 0.4664 - val_accuracy: 0.0909 - val_auc: 0.5721 - val_loss: 3.6555
Epoch 5/8
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 79ms/step - accuracy: 0.8257 - auc: 0.9869 - loss: 0.4322 - val_accuracy: 0.0909 - val_auc: 0.5174 - val_loss: 3.8544
Epoch 6/8
[1m14/14[0m [32m━━━━━━━━━━

## DINOv2 + LSTM (using PyTorch)

In [None]:
## run just once on colab!
pip install transformers datasets timm

In [29]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image
from torchvision.transforms import Compose, Resize, ToTensor
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
from glob import glob
from sklearn.preprocessing import LabelEncoder
import random

# Constants
SEQUENCE_LENGTH = 16
IMG_SIZE = 224
NUM_CLASSES = 8
BATCH_SIZE = 4
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CLASS_NAMES = ['Arrest','Arson','Assault','Burglary','Explosion','Fighting','NormalVideos','Shooting']

# Label encoder
label_encoder = LabelEncoder()
label_encoder.fit(CLASS_NAMES)

In [30]:
class VideoDataset(Dataset):
    def __init__(self, base_dir, sequence_length=16, image_size=224):
        self.sequence_length = sequence_length
        self.image_size = image_size
        self.samples = []
        self.processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')

        for class_name in os.listdir(base_dir):
            class_path = os.path.join(base_dir, class_name)
            if not os.path.isdir(class_path):
                continue
            for video_folder in os.listdir(class_path):
                video_path = os.path.join(class_path, video_folder)
                if os.path.isdir(video_path):
                    frame_paths = sorted(glob(os.path.join(video_path, '*.png')))
                    if len(frame_paths) > 0:
                        self.samples.append((frame_paths, label_encoder.transform([class_name])[0]))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        frame_paths, label = self.samples[idx]
        num_frames = len(frame_paths)

        # Sample or pad frames
        if num_frames >= self.sequence_length:
            selected = sorted(random.sample(frame_paths, self.sequence_length))
        else:
            selected = sorted(frame_paths)
            while len(selected) < self.sequence_length:
                selected.append(selected[-1])

        images = [Image.open(p).convert("RGB").resize((self.image_size, self.image_size)) for p in selected]
        processed = self.processor(images=images, return_tensors="pt")
        pixel_values = processed['pixel_values'].squeeze(0)  # shape: (T, 3, 224, 224)
        label = torch.tensor(label, dtype=torch.long)
        return pixel_values, label

# Paths
train_dir = "/content/data_trimmed_restructured/Train"
test_dir = "/content/data_trimmed_restructured/Test"

# Datasets & loaders
train_dataset = VideoDataset(train_dir, sequence_length=SEQUENCE_LENGTH, image_size=IMG_SIZE)
test_dataset = VideoDataset(test_dir, sequence_length=SEQUENCE_LENGTH, image_size=IMG_SIZE)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [92]:
class DINOv2_LSTM(nn.Module):
    def __init__(self, model_name='facebook/dinov2-base', hidden_dim=128, num_classes=NUM_CLASSES):
        super().__init__()
        self.vit = AutoModel.from_pretrained(model_name)
        self.vit.eval()
        for param in self.vit.parameters():
            param.requires_grad = False

        self.lstm = nn.LSTM(input_size=768, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):  # x: (B, T, 3, 224, 224)
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)

        with torch.no_grad():
            vit_out = self.vit(pixel_values=x).last_hidden_state[:, 0]  # CLS token

        x_seq = vit_out.view(B, T, -1)  # (B, T, 768)
        x_seq, _ = self.lstm(x_seq)
        x_seq = self.dropout(x_seq[:, -1, :])  # last time step
        return self.fc(x_seq)

# Model
model = DINOv2_LSTM().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [75]:
from sklearn.metrics import accuracy_score, roc_auc_score
import torch.nn.functional as F

def evaluate(model, loader, criterion, split='Test'):
    model.eval()
    total_loss = 0.0
    total_preds = []
    total_probs = []
    total_labels = []

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            logits = model(x)
            probs = F.softmax(logits, dim=1)
            loss = criterion(logits, y)

            total_loss += loss.item()
            total_probs.append(probs.cpu())
            total_preds.append(torch.argmax(probs, dim=1).cpu())
            total_labels.append(y.cpu())

    y_true = torch.cat(total_labels).numpy()
    y_pred = torch.cat(total_preds).numpy()
    y_probs = torch.cat(total_probs).numpy()

    acc = accuracy_score(y_true, y_pred)

    try:
        auc = roc_auc_score(y_true, y_probs, multi_class='ovr', average='macro')
    except ValueError:
        auc = 0.0

    avg_loss = total_loss / len(loader)
    print(f"{split} Loss: {avg_loss:.4f} | Accuracy: {acc:.4f} | AUC: {auc:.4f}")


for epoch in range(8):
    model.train()
    running_loss = 0.0
    for x, y in train_loader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)
    print(f"\nEpoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")
    evaluate(model, train_loader, criterion, split='Train')
    evaluate(model, test_loader, criterion, split='Test')


Epoch 1, Train Loss: 2.0689
Train Loss: 1.9803 | Accuracy: 0.2571 | AUC: 0.6444
Test Loss: 2.0931 | Accuracy: 0.0909 | AUC: 0.4720

Epoch 2, Train Loss: 1.9436
Train Loss: 1.8665 | Accuracy: 0.4571 | AUC: 0.7944
Test Loss: 2.0594 | Accuracy: 0.1591 | AUC: 0.5475

Epoch 3, Train Loss: 1.8572
Train Loss: 1.7593 | Accuracy: 0.5524 | AUC: 0.8828
Test Loss: 2.0561 | Accuracy: 0.2273 | AUC: 0.5443

Epoch 4, Train Loss: 1.7310
Train Loss: 1.6553 | Accuracy: 0.6667 | AUC: 0.9441
Test Loss: 2.0582 | Accuracy: 0.2273 | AUC: 0.5469

Epoch 5, Train Loss: 1.6297
Train Loss: 1.5382 | Accuracy: 0.7524 | AUC: 0.9676
Test Loss: 2.0655 | Accuracy: 0.2273 | AUC: 0.5024

Epoch 6, Train Loss: 1.5484
Train Loss: 1.4716 | Accuracy: 0.7905 | AUC: 0.9791
Test Loss: 2.0226 | Accuracy: 0.2500 | AUC: 0.5799

Epoch 7, Train Loss: 1.4737
Train Loss: 1.3904 | Accuracy: 0.8286 | AUC: 0.9875
Test Loss: 2.0191 | Accuracy: 0.2500 | AUC: 0.5863

Epoch 8, Train Loss: 1.3600
Train Loss: 1.2838 | Accuracy: 0.9143 | AUC: 0.

## DINOv2 + Transformer Encoder (using PyTorch)

In [98]:
import torch
import torch.nn as nn
from transformers import AutoModel

class DINOv2_Transformer(nn.Module):
    def __init__(self, model_name='facebook/dinov2-base', hidden_dim=768, num_classes=8, num_heads=4, ff_dim=1024, dropout=0.1):
        super().__init__()

        # DINOv2 backbone
        self.vit = AutoModel.from_pretrained(model_name)
        self.vit.eval()
        for param in self.vit.parameters():
            param.requires_grad = False

        # Transformer Encoder block
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)

        # Final classifier
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):  # x: (B, T, 3, 224, 224)
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)

        with torch.no_grad():
            vit_out = self.vit(pixel_values=x).last_hidden_state[:, 0]  # CLS token

        x_seq = vit_out.view(B, T, -1)  # (B, T, 768)
        x_encoded = self.transformer_encoder(x_seq)  # (B, T, 768)
        x_pooled = x_encoded.mean(dim=1)  # average over time
        return self.fc(self.dropout(x_pooled))

model = DINOv2_Transformer().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [99]:
for epoch in range(20):
    model.train()
    running_loss = 0.0
    for x, y in train_loader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)
    print(f"\nEpoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")
    evaluate(model, train_loader, criterion, split='Train')
    evaluate(model, test_loader, criterion, split='Test')


Epoch 1, Train Loss: 2.0450
Train Loss: 1.2645 | Accuracy: 0.6284 | AUC: 0.9366
Test Loss: 1.8811 | Accuracy: 0.2941 | AUC: 0.6961

Epoch 2, Train Loss: 1.2145
Train Loss: 0.7473 | Accuracy: 0.8176 | AUC: 0.9917
Test Loss: 1.7950 | Accuracy: 0.3725 | AUC: 0.7245

Epoch 3, Train Loss: 0.7136
Train Loss: 0.4388 | Accuracy: 0.9595 | AUC: 0.9994
Test Loss: 1.8521 | Accuracy: 0.4118 | AUC: 0.7210

Epoch 4, Train Loss: 0.4567
Train Loss: 0.2259 | Accuracy: 1.0000 | AUC: 1.0000
Test Loss: 1.8400 | Accuracy: 0.3922 | AUC: 0.7417

Epoch 5, Train Loss: 0.2278
Train Loss: 0.1187 | Accuracy: 1.0000 | AUC: 1.0000
Test Loss: 1.9477 | Accuracy: 0.3725 | AUC: 0.7240

Epoch 6, Train Loss: 0.1242
Train Loss: 0.0764 | Accuracy: 1.0000 | AUC: 1.0000
Test Loss: 2.0737 | Accuracy: 0.4118 | AUC: 0.7354

Epoch 7, Train Loss: 0.0844
Train Loss: 0.0583 | Accuracy: 0.9932 | AUC: 1.0000
Test Loss: 2.0547 | Accuracy: 0.3725 | AUC: 0.7494

Epoch 8, Train Loss: 0.0508
Train Loss: 0.0325 | Accuracy: 1.0000 | AUC: 1.

## 3D CNN

In [31]:
class VideoDataset3D(VideoDataset):  # inherits from your existing DINO dataset
    def __getitem__(self, idx):
        pixel_values, label = super().__getitem__(idx)  # (T, 3, H, W)
        pixel_values = pixel_values.permute(1, 0, 2, 3)  # (3, T, H, W)
        return pixel_values, label

train_dataset = VideoDataset3D(train_dir, sequence_length=SEQUENCE_LENGTH, image_size=IMG_SIZE)
test_dataset = VideoDataset3D(test_dir, sequence_length=SEQUENCE_LENGTH, image_size=IMG_SIZE)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [103]:
class Simple3DCNN(nn.Module):
    def __init__(self, num_classes=8):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv3d(3, 32, kernel_size=(3, 5, 5), stride=1, padding=(1, 2, 2)),
            nn.BatchNorm3d(32),
            nn.ReLU(),
            nn.MaxPool3d((1, 2, 2)),

            nn.Conv3d(32, 64, kernel_size=(3, 3, 3), stride=1, padding=1),
            nn.BatchNorm3d(64),
            nn.ReLU(),
            nn.MaxPool3d((2, 2, 2)),

            nn.Conv3d(64, 128, kernel_size=(3, 3, 3), stride=1, padding=1),
            nn.BatchNorm3d(128),
            nn.ReLU(),
            nn.AdaptiveAvgPool3d((1, 1, 1)),  # output shape: (B, 128, 1, 1, 1)
        )
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):  # x: (B, 3, T, H, W)
        x = self.model(x)  # (B, 128, 1, 1, 1)
        x = x.view(x.size(0), -1)  # (B, 128)
        return self.fc(x)

model = Simple3DCNN(num_classes=NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [96]:
for epoch in range(30):
    model.train()
    running_loss = 0.0
    for x, y in train_loader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)
    print(f"\nEpoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")
    evaluate(model, train_loader, criterion, split='Train')
    evaluate(model, test_loader, criterion, split='Test')


Epoch 1, Train Loss: 2.0921
Train Loss: 2.0085 | Accuracy: 0.1892 | AUC: 0.6303
Test Loss: 2.1123 | Accuracy: 0.2157 | AUC: 0.5528

Epoch 2, Train Loss: 2.0162
Train Loss: 1.9523 | Accuracy: 0.2500 | AUC: 0.6641
Test Loss: 2.0611 | Accuracy: 0.1765 | AUC: 0.5854

Epoch 3, Train Loss: 1.9936
Train Loss: 1.9248 | Accuracy: 0.2635 | AUC: 0.6819
Test Loss: 2.0474 | Accuracy: 0.1961 | AUC: 0.5825

Epoch 4, Train Loss: 1.9649
Train Loss: 1.9134 | Accuracy: 0.2770 | AUC: 0.6743
Test Loss: 2.0175 | Accuracy: 0.1961 | AUC: 0.6129

Epoch 5, Train Loss: 1.9463
Train Loss: 1.8929 | Accuracy: 0.2973 | AUC: 0.6972
Test Loss: 1.9990 | Accuracy: 0.2941 | AUC: 0.6445

Epoch 6, Train Loss: 1.9338
Train Loss: 1.8371 | Accuracy: 0.3446 | AUC: 0.7204
Test Loss: 1.9375 | Accuracy: 0.2941 | AUC: 0.6634

Epoch 7, Train Loss: 1.9094
Train Loss: 1.8376 | Accuracy: 0.2500 | AUC: 0.7240
Test Loss: 1.9472 | Accuracy: 0.3137 | AUC: 0.6661

Epoch 8, Train Loss: 1.8591
Train Loss: 1.8020 | Accuracy: 0.3716 | AUC: 0.