In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip "/content/drive/MyDrive/data_trimmed_clean.zip" -d /content/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/data_trimmed/Train/Shooting/Shooting042_x264_1030.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting020_x264_2680.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting029_x264_1260.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting009_x264_2690.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting014_x264_2740.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_4510.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_11010.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting006_x264_9020.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting005_x264_1860.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting052_x264_4560.png  
  inflating: /content/data_trimmed/Train/Shooting/Shooting009_x264_130.png  
 extracting: /content/data_trimmed/Train/Shooting/Shooting027_x264_140.png  


In [3]:
# ONLY USED FOR TESTING AND DEBUGGING - for final model we will use the whole dataset

import os
import shutil

def create_debug_subset_sequential(source_dir, dest_dir, train_limit=4000, test_limit=1500):
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.makedirs(dest_dir, exist_ok=True)

    for split, limit in [("Train", train_limit), ("Test", test_limit)]:
        src_split_path = os.path.join(source_dir, split)
        dst_split_path = os.path.join(dest_dir, split)
        os.makedirs(dst_split_path, exist_ok=True)

        for class_name in os.listdir(src_split_path):
            class_src = os.path.join(src_split_path, class_name)
            class_dst = os.path.join(dst_split_path, class_name)
            os.makedirs(class_dst, exist_ok=True)

            valid_images = sorted([f for f in os.listdir(class_src) if f.endswith(".png") and not f.startswith("._")])
            selected_images = valid_images[:limit]

            for img in selected_images:
                shutil.copy(os.path.join(class_src, img), os.path.join(class_dst, img))

create_debug_subset_sequential("/content/data_trimmed", "/content/data_trimmed_debug", train_limit=4000, test_limit=1500)

In [7]:
# RUN JUST ONCE ON COLAB!!

!pip install -q torch torchvision transformers accelerate peft

### Dataset & DataLoader

In [39]:
import torch, random, os
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoImageProcessor, Dinov2Model
import torch.nn as nn
from tqdm.auto import tqdm

# define clip-based dataset
class CrimeClipDataset(Dataset):
    def __init__(self, root_dir, clip_len=8, stride=8, jitter=2):
        self.clip_len = clip_len
        self.stride   = stride
        self.jitter   = jitter

        classes = sorted(d for d in os.listdir(root_dir)
                         if os.path.isdir(os.path.join(root_dir, d)))
        self.cls2idx = {c:i for i,c in enumerate(classes)}
        self.clips = []
        for cls in classes:
            cls_idx = self.cls2idx[cls]
            frames = sorted(f for f in os.listdir(os.path.join(root_dir, cls))
                            if f.lower().endswith('.png'))
            for start in range(0, len(frames) - clip_len + 1, stride):
                paths = [os.path.join(root_dir, cls, frames[i])
                         for i in range(start, start + clip_len)]
                self.clips.append((paths, cls_idx))

    def __len__(self):
        return len(self.clips)

    def __getitem__(self, idx):
        paths, label = self.clips[idx]
        # temporal jitter augmentation
        if self.jitter > 0:
            shift = random.randint(-self.jitter, self.jitter)
            if shift > 0:
                paths = paths[shift:] + [paths[-1]] * shift
            elif shift < 0:
                paths = [paths[0]] * (-shift) + paths[:shift]
        # load frames and process
        imgs = [Image.open(p).convert('RGB') for p in paths]
        pixel_values = processor(images=imgs, return_tensors="pt").pixel_values
        return pixel_values, label

processor = AutoImageProcessor.from_pretrained("facebook/dinov2-small")

train_ds = CrimeClipDataset(
    root_dir="/content/data_trimmed_debug/Train",
    clip_len=16, stride=8, jitter=2
)
val_ds = CrimeClipDataset(
    root_dir="/content/data_trimmed_debug/Test",
    clip_len=16, stride=8, jitter=0
)

train_loader = DataLoader(
    train_ds, batch_size=4, shuffle=True,
    num_workers=4, pin_memory=True
)
val_loader = DataLoader(
    val_ds, batch_size=4, shuffle=False,
    num_workers=4, pin_memory=True
)

print(f"# train clips: {len(train_ds)}, # val clips: {len(val_ds)}")
b, t, c, h, w = next(iter(train_loader))[0].shape
print(f"One batch shape [B, T, C, H, W]: {[b, t, c, h, w]}")

# train clips: 3901, # val clips: 1454
One batch shape [B, T, C, H, W]: [4, 16, 3, 224, 224]


### Dino v2 backbone (frozen) + Bi‑LSTM

In [40]:
backbone = Dinov2Model.from_pretrained("facebook/dinov2-small")
for p in backbone.parameters():
    p.requires_grad = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
backbone.to(device).eval()

# define temporal head
class TemporalCrimeModel(nn.Module):
    def __init__(self,
                 feat_dim=backbone.config.hidden_size,
                 hidden=256,
                 num_classes=8,
                 lstm_layers=1,
                 dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            feat_dim, hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout=(dropout if lstm_layers>1 else 0.0)
        )
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden*2, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden, num_classes)
        )

    def forward(self, cls_seq):
        # cls_seq: [B, T, feat_dim]
        out, _ = self.lstm(cls_seq)         # [B, T, hidden*2]
        last   = out[:, -1]                 # [B, hidden*2]
        logits = self.classifier(last)      # [B, num_classes]
        return logits

temporal_model = TemporalCrimeModel().to(device)

In [42]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from tqdm.auto import tqdm
import wandb

# initialize wandb
wandb.init(
    project="crime_action_classification",
    name="dinov2_lstm_clip16",
    config={
        "clip_len":      16,
        "stride":        8,
        "batch_size":    4,
        "lr":            1e-3,
        "weight_decay":  1e-3,
        "hidden_size":   256,
        "dropout":       0.3,
        "num_epochs":    10,
        "patience":      3,
        "step_size":     3,
        "gamma":         0.5
    }
)
config = wandb.config


criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = AdamW(temporal_model.parameters(),
                  lr=config.lr,
                  weight_decay=config.weight_decay)
scheduler = StepLR(optimizer,
                   step_size=config.step_size,
                   gamma=config.gamma)

best_val_loss = float('inf')
no_improve = 0

for epoch in range(1, config.num_epochs + 1):
    # train
    temporal_model.train()
    train_loss = train_correct = train_total = 0
    train_iter = tqdm(train_loader, desc=f"Epoch {epoch:02d} [Train]", leave=False)
    for pix, labels in train_iter:
        B, T, C, H, W = pix.shape
        flat = pix.view(B*T, C, H, W).to(device)
        labels = labels.to(device)

        with torch.no_grad():
            out   = backbone(pixel_values=flat)
            feats = out.last_hidden_state[:, 0, :]

        seq    = feats.view(B, T, -1)
        logits = temporal_model(seq)
        loss   = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss   += loss.item() * B
        preds        = logits.argmax(dim=1)
        train_correct+= (preds == labels).sum().item()
        train_total  += B

        train_iter.set_postfix({
            'loss': f"{loss.item():.3f}",
            'acc':  f"{train_correct/train_total:.3f}"
        })

    train_loss /= train_total
    train_acc  = train_correct / train_total

    # validate
    temporal_model.eval()
    val_loss = val_correct = val_total = 0
    val_iter = tqdm(val_loader, desc=f"Epoch {epoch:02d} [Valid]", leave=False)
    with torch.no_grad():
        for pix, labels in val_iter:
            B, T, C, H, W = pix.shape
            flat = pix.view(B*T, C, H, W).to(device)
            labels = labels.to(device)

            out   = backbone(pixel_values=flat)
            feats = out.last_hidden_state[:, 0, :]
            seq   = feats.view(B, T, -1)
            logits= temporal_model(seq)
            loss  = criterion(logits, labels)

            val_loss    += loss.item() * B
            preds        = logits.argmax(dim=1)
            val_correct += (preds == labels).sum().item()
            val_total   += B

            val_iter.set_postfix({
                'loss': f"{loss.item():.3f}",
                'acc':  f"{val_correct/val_total:.3f}"
            })

    val_loss /= val_total
    val_acc   = val_correct / val_total

    # log to wandb
    wandb.log({
        "epoch":       epoch,
        "train/loss":  train_loss,
        "train/acc":   train_acc,
        "val/loss":    val_loss,
        "val/acc":     val_acc,
        "lr":          scheduler.get_last_lr()[0],
    })

    # print and checkpoint
    print(f"Epoch {epoch:02d} | "
          f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.3f} | "
          f"Val Loss:   {val_loss:.4f}, Acc: {val_acc:.3f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improve = 0
        torch.save(temporal_model.state_dict(), "best_temporal_model.pth")
        print(" → New best model saved.")
    else:
        no_improve += 1
        if no_improve >= config.patience:
            print("Early stopping triggered.")
            break

    scheduler.step()

print("Training complete. Best val loss:", best_val_loss)
wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfareesahhussain[0m ([33mfareesahhussain-city-university-of-london[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 01 [Train]:   0%|          | 0/976 [00:00<?, ?it/s]

Epoch 01 [Valid]:   0%|          | 0/364 [00:00<?, ?it/s]

Epoch 01 | Train Loss: 0.6709, Acc: 0.939 | Val Loss:   2.1466, Acc: 0.207
 → New best model saved.


Epoch 02 [Train]:   0%|          | 0/976 [00:00<?, ?it/s]

Epoch 02 [Valid]:   0%|          | 0/364 [00:00<?, ?it/s]

Epoch 02 | Train Loss: 0.5421, Acc: 0.988 | Val Loss:   2.1751, Acc: 0.221


Epoch 03 [Train]:   0%|          | 0/976 [00:00<?, ?it/s]

Epoch 03 [Valid]:   0%|          | 0/364 [00:00<?, ?it/s]

Epoch 03 | Train Loss: 0.5248, Acc: 0.996 | Val Loss:   2.3110, Acc: 0.204


Epoch 04 [Train]:   0%|          | 0/976 [00:00<?, ?it/s]

Epoch 04 [Valid]:   0%|          | 0/364 [00:00<?, ?it/s]

Epoch 04 | Train Loss: 0.5040, Acc: 0.998 | Val Loss:   2.2441, Acc: 0.193
Early stopping triggered.
Training complete. Best val loss: 2.146610297149936


0,1
epoch,▁▃▆█
lr,███▁
train/acc,▁▇██
train/loss,█▃▂▁
val/acc,▄█▄▁
val/loss,▁▂█▅

0,1
epoch,4.0
lr,0.0005
train/acc,0.99846
train/loss,0.50396
val/acc,0.19326
val/loss,2.24405


Next TO DO:

**Alternate Temporal Heads**
* TCN (Temporal Convolutional Net): 3–5 dilated 1D conv layers (kernel=3), receptive field ≈T.
* Transformer: 2–4 encoder layers, 4–8 heads, dropout=0.1.