### Cell A — Imports & Config

In [1]:
# === Cell A — Imports, paths, reproducibility ===
import os, random, time
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, WeightedRandomSampler
import sys
from pathlib import Path
from torchvision.models.video import r2plus1d_18, R2Plus1D_18_Weights

# Optional: avoid some multiprocessing headaches.
# This is best-effort: a failure is reported but must not stop the notebook.
try:
    import torch.multiprocessing as mp
    mp.set_start_method("spawn", force=True)
    torch.multiprocessing.set_sharing_strategy("file_system")
except Exception as exc:
    print(f"[warn] multiprocessing setup skipped: {exc!r}")

# Paths
# `root` is resolved from the kernel's working directory, so this assumes the
# notebook runs from a folder one level below the project root.
root = Path("..").resolve()
for extra_path in (root, root / "src"):
    # Append only once so re-running the cell does not grow sys.path.
    if str(extra_path) not in sys.path:
        sys.path.append(str(extra_path))

print("Root added to PYTHONPATH:", root)

data_dir = root / "data" / "wlasl_preprocessed"
roi_manifest = data_dir / "manifest_nslt2000_roi_full_resplit_70_15_15_min7.csv"
assert roi_manifest.exists(), f"Missing manifest: {roi_manifest}"

ckpt_dir = root / "checkpoints"
# parents=True keeps this working even if an intermediate directory is missing.
ckpt_dir.mkdir(parents=True, exist_ok=True)

# Device & seeds
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# Speed over strict determinism: cuDNN autotuning is on and deterministic
# kernels are not enforced, so runs may differ slightly despite the seeds.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

print("Device:", device)
print("Manifest:", roi_manifest)


Root added to PYTHONPATH: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL
Device: cuda
Manifest: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/wlasl_preprocessed/manifest_nslt2000_roi_full_resplit_70_15_15_min7.csv


### Cell B — Load manifest, dataset, Kinetics normalization, loaders

In [2]:
# === Cell B — Dataset, Kinetics normalization, DataLoaders ===
from src.data.wlasl_ds import WLASLDataset
import src.data.wlasl_ds as wds_mod

# Load the ROI manifest and check that the columns this notebook uses exist.
df = pd.read_csv(roi_manifest)
required_cols = {"path", "gloss", "label_new", "split"}
assert required_cols.issubset(df.columns), df.columns

# Make sure the column used by WLASLDataset ("label") is the new contiguous one
df = df.copy()
df["label"] = df["label_new"]

num_classes = df["label"].nunique()
# The classifier head is sized from `num_classes`, so labels must be exactly
# 0 .. num_classes-1. A gap would otherwise surface later as an out-of-range
# target inside the loss.
assert df["label"].min() == 0 and df["label"].max() == num_classes - 1, (
    f"labels are not contiguous: min={df['label'].min()} "
    f"max={df['label'].max()} unique={num_classes}"
)
print("num_classes:", num_classes)
print("split counts:", df["split"].value_counts().to_dict())

# Kinetics-400 normalization, using the per-channel statistics published with
# the torchvision R(2+1)D-18 KINETICS400_V1 weights.
def kinetics_normalize(x):
    """Standardize video frames with the Kinetics-400 channel mean and std.

    Args:
        x: float tensor laid out as [T, C, H, W] with C == 3 and values in
           [0, 1] (as stated by the original author; not checked here).

    Returns:
        Tensor with the same shape, dtype and device as ``x``.

    Note:
        The constants were previously rounded, and the third std was 0.223
        instead of the preset's 0.216989. Checkpoints trained with the old
        values saw slightly different inputs.
    """
    # Shape [1, C, 1, 1] so the statistics broadcast over time and space.
    mean = torch.tensor((0.43216, 0.394666, 0.37645), dtype=x.dtype, device=x.device)[None, :, None, None]
    std  = torch.tensor((0.22803, 0.22145, 0.216989), dtype=x.dtype, device=x.device)[None, :, None, None]
    return (x - mean) / std

# Replace the dataset module's `_normalize` with the Kinetics version.
# NOTE(review): Cell A forces the "spawn" start method, so DataLoader workers
# import src.data.wlasl_ds afresh. A patch applied only in this process may not
# be visible inside workers when NUM_WORKERS > 0 — confirm that batches really
# are Kinetics-normalized (e.g. inspect one batch's channel statistics).
setattr(wds_mod, "_normalize", kinetics_normalize)

# Clip sampling and loader settings
CLIP_LEN, STRIDE = 32, 2
BATCH, NUM_WORKERS = 8, 4

# One frame per split, each re-indexed from 0
train_df, val_df, test_df = (
    df[df["split"] == split_name].reset_index(drop=True)
    for split_name in ("train", "val", "test")
)

# Only the training dataset is constructed with train=True
train_ds, val_ds, test_ds = (
    WLASLDataset(split_df, clip_len=CLIP_LEN, stride=STRIDE, train=is_train)
    for split_df, is_train in ((train_df, True), (val_df, False), (test_df, False))
)

# Toggle: draw training samples with inverse class-frequency weights
use_weighted_sampler = True

# Settings shared by the training loader in both configurations
_train_loader_kwargs = dict(batch_size=BATCH, num_workers=NUM_WORKERS, pin_memory=True)

if use_weighted_sampler:
    # Weight of each training row = 1 / (number of rows sharing its label)
    counts = train_df["label_new"].value_counts().to_dict()
    weights = (1.0 / train_df["label_new"].map(counts)).to_numpy(dtype=np.float32)
    sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)
    train_loader = DataLoader(train_ds, sampler=sampler, **_train_loader_kwargs)
else:
    train_loader = DataLoader(train_ds, shuffle=True, **_train_loader_kwargs)

# Evaluation loaders iterate in dataset order
val_loader, test_loader = (
    DataLoader(eval_ds, batch_size=BATCH, shuffle=False,
               num_workers=NUM_WORKERS, pin_memory=True)
    for eval_ds in (val_ds, test_ds)
)

# Sanity summary: dataset sizes and the label range seen in each split.
print(f"Splits | train={len(train_ds)} val={len(val_ds)} test={len(test_ds)}")
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    # The arrow was mis-encoded in the original strings; restored to "→".
    print(f"{split_name} label range:", split_df["label"].min(), "→", split_df["label"].max())
print("num_classes:", num_classes)


num_classes: 647
split counts: {'train': 3286, 'test': 1350, 'val': 656}
Splits | train=3286 val=656 test=1350
train label range: 0 â†’ 646
val label range: 0 â†’ 646
test label range: 0 â†’ 646
num_classes: 647


### Cell C — R(2+1)D-18 model (Kinetics-400)

In [3]:
# === Cell C — R(2+1)D-18 Kinetics-400 model ===

class R2Plus1D18WithPermute(nn.Module):
    """r2plus1d_18 with a resized classifier head, fed time-major clips.

    The wrapped network is called with [B, C, T, H, W]; this module takes
    [B, T, C, H, W] and swaps the time and channel axes before forwarding.

    Args:
        num_classes: output size of the replacement ``fc`` layer.
        pretrained: if true, load ``R2Plus1D_18_Weights.KINETICS400_V1``;
            otherwise build the network with ``weights=None``.
    """

    def __init__(self, num_classes, pretrained=True):
        super().__init__()
        net = r2plus1d_18(
            weights=R2Plus1D_18_Weights.KINETICS400_V1 if pretrained else None
        )
        # Replace the stock head with one sized for this label set.
        net.fc = nn.Linear(net.fc.in_features, num_classes)
        self.backbone = net

    def forward(self, x):
        """Return the backbone output for ``x`` of shape [B, T, C, H, W]."""
        # Swap T and C -> [B, C, T, H, W], materialized contiguously.
        channels_first = x.transpose(1, 2).contiguous()
        return self.backbone(channels_first)

# Select the "high" float32 matmul precision mode before building the model.
torch.set_float32_matmul_precision("high")

# Build the K400-pretrained network with a head sized for this label set,
# then move it to the selected device.
model = R2Plus1D18WithPermute(num_classes=num_classes, pretrained=True)
model = model.to(device)
print("Model: R(2+1)D-18 K400")


Model: R(2+1)D-18 K400


### Cell D — Optimizer, AMP, training utilities

In [4]:
# === Cell D — Optimizer, AMP, helpers ===
from torch.optim import AdamW
from torch.amp import GradScaler

# Training hyperparameters
EPOCHS = 20
LR     = 1e-4
WD     = 1e-5
AMP_ON = True   # toggles autocast in run_epoch and gradient scaling below

opt    = AdamW(model.parameters(), lr=LR, weight_decay=WD)
# Gate the scaler on CUDA explicitly. Cell A can fall back to CPU, where the
# default CUDA scaler would otherwise be requested with no CUDA device present.
scaler = GradScaler(enabled=AMP_ON and device.type == "cuda")
criterion = nn.CrossEntropyLoss(label_smoothing=0.0)  # or 0.1 if you like
best_val_acc = -1.0  # sentinel below any real accuracy

@torch.no_grad()
def top1_acc(logits, y):
    """Return the fraction of rows whose arg-max over dim 1 equals ``y``.

    Args:
        logits: score tensor; the prediction is the arg-max along dim 1.
        y: target tensor compared element-wise with the predictions.

    Returns:
        Mean accuracy as a Python float.
    """
    predicted = logits.argmax(1)
    hits = predicted.eq(y)
    return hits.float().mean().item()

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return size-weighted mean metrics.

    Uses the notebook-level ``model``, ``opt``, ``scaler``, ``criterion``,
    ``device`` and ``AMP_ON``.

    Args:
        loader: iterable yielding ``(x, y, _)`` batches; the third item is
            ignored.
        train: if true, put the model in train mode and take an optimizer
            step per batch; otherwise put it in eval mode and only measure.

    Returns:
        ``(mean_loss, mean_top1_accuracy)``, each averaged over samples.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.train() if train else model.eval()
    total_loss = 0.0
    total_acc  = 0.0
    total_n    = 0

    if train:
        opt.zero_grad(set_to_none=True)

    for x, y, _ in loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        # Track gradients only when training. Evaluation previously built an
        # autograd graph as well, retaining activations and raising peak GPU
        # memory for no benefit.
        with torch.set_grad_enabled(train):
            with torch.amp.autocast(device_type=device.type, enabled=AMP_ON):
                logits = model(x)
                # Use the configured criterion so its settings (e.g. label
                # smoothing) actually take effect.
                loss = criterion(logits, y)

        if train:
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
            opt.zero_grad(set_to_none=True)

        with torch.no_grad():
            bs = x.size(0)
            # Weight by batch size so a smaller final batch is not over-counted.
            total_loss += loss.item() * bs
            total_acc  += top1_acc(logits, y) * bs
            total_n    += bs

    return total_loss / total_n, total_acc / total_n


### Cell E — Training loop & checkpoint

In [5]:
# === Cell E — Training loop & checkpoint ===

# Reset the best score so re-running this cell starts a fresh comparison.
best_val_acc = -1.0
best_path = ckpt_dir / "best_r2plus1d_k400_fullroi_70_15_15_min7.pt"

for epoch in range(1, EPOCHS + 1):
    # One optimization pass over train, then one measurement pass over val.
    tr_loss, tr_acc = run_epoch(train_loader, train=True)
    va_loss, va_acc = run_epoch(val_loader,   train=False)

    print(f"Epoch {epoch:02d}/{EPOCHS} | "
          f"train loss {tr_loss:.4f} acc {tr_acc:.3f} | "
          f"val loss {va_loss:.4f} acc {va_acc:.3f}")

    # Keep only the weights with the highest validation accuracy so far.
    if va_acc > best_val_acc:
        best_val_acc = va_acc
        torch.save(model.state_dict(), best_path)
        # The arrow was mis-encoded in the original message; restored to "➜".
        print(f"  ➜ New best val acc={best_val_acc:.3f} (model saved to {best_path})")


terminate called without an active exception
terminate called without an active exception


OutOfMemoryError: CUDA out of memory. Tried to allocate 98.00 MiB. GPU 0 has a total capacity of 15.46 GiB of which 168.25 MiB is free. Process 3465458 has 1.32 GiB memory in use. Process 3815780 has 1.32 GiB memory in use. Process 1455768 has 2.21 GiB memory in use. Process 1813039 has 4.62 GiB memory in use. Process 2333701 has 1.02 GiB memory in use. Including non-PyTorch memory, this process has 4.73 GiB memory in use. Of the allocated memory 4.39 GiB is allocated by PyTorch, and 7.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Cell F — Test evaluation

In [None]:
# === Cell F — Test evaluation ===

assert best_path.exists(), f"Best checkpoint not found: {best_path}"

# The checkpoint is a plain state_dict, so weights_only=True is enough and
# avoids executing arbitrary pickled objects from the file.
best_state = torch.load(best_path, map_location=device, weights_only=True)
model.load_state_dict(best_state)
model.to(device)

# Single measurement pass over the held-out test split.
test_loss, test_acc = run_epoch(test_loader, train=False)
print(f"TEST — loss {test_loss:.4f} | acc {test_acc:.3f}")
