In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader, Subset
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import random

# --- Reproducibility ---
# Seed Python, NumPy and PyTorch (CPU and every CUDA device) from one value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # safe to call even when CUDA is unavailable
# Seeding alone does not pin down cuDNN: request deterministic kernels and keep
# the autotuner off, since it may select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# --- Settings ---
# Input arrays (images expected as (N, H, W, 3); labels as (N,), 0=red, 1=green).
images_path = "../output/intersectionlight_merged.npy"
labels_path = "../output/intersectionlight_labels_merged.npy"

# Data handling.
val_size = 0.2      # fraction of samples held out for validation
batch_size = 32
img_size = 224      # side length images are resized to (ResNet-18 default)
num_workers = 2     # DataLoader worker processes; tune per machine

# Optimisation.
epochs = 10
lr = 1e-3

# Checkpoint output directory, created up front so later saves cannot fail on it.
ckpt_dir = "checkpoints"
os.makedirs(ckpt_dir, exist_ok=True)

# --- Dataset ---
class NpyDataset(Dataset):
    """Map-style dataset backed by a pair of ``.npy`` files loaded into memory.

    Args:
        images_path: Path to the image array; the first axis indexes samples.
        labels_path: Path to the label array; cast to ``int64`` on load.
        transform: Optional callable applied to each image in ``__getitem__``.

    Raises:
        ValueError: If the two arrays disagree on the number of samples.
    """

    def __init__(self, images_path, labels_path, transform=None):
        self.images = np.load(images_path)       # expected (N, H, W, 3), uint8 or float
        self.labels = np.load(labels_path).astype(np.int64)  # expected (N,)
        # Raise instead of assert: asserts are stripped under ``python -O`` and
        # a silent length mismatch would pair images with the wrong labels.
        if len(self.images) != len(self.labels):
            raise ValueError(
                "Images and labels must have same length "
                f"(got {len(self.images)} images and {len(self.labels)} labels)."
            )
        self.transform = transform

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``, transforming the image if configured."""
        img = self.images[idx]
        lbl = self.labels[idx]
        # Compare against None explicitly so a callable that happens to be
        # falsy (e.g. an empty container-like transform) is still applied.
        if self.transform is not None:
            img = self.transform(img)  # transform receives the raw array slice
        return img, lbl

# --- Transform ---
# Per-channel normalisation statistics (the usual ImageNet values, matching the
# pretrained backbone used further down).
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

preprocess_steps = [
    transforms.ToPILImage(),                  # HWC numpy array -> PIL image
    transforms.Resize((img_size, img_size)),  # square resize to the network input size
    transforms.ToTensor(),                    # PIL image -> CHW float tensor
    transforms.Normalize(imagenet_mean, imagenet_std),
]
transform = transforms.Compose(preprocess_steps)

# --- Build base dataset and stratified split ---
base_ds = NpyDataset(images_path, labels_path, transform)

# Stratification only looks at the labels, so a zero-filled placeholder of the
# right length stands in for the feature matrix.
sss = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=seed)
placeholder_features = np.zeros(len(base_ds))
train_idx, val_idx = next(sss.split(placeholder_features, base_ds.labels))

# Both subsets are index views over the same underlying dataset object.
train_ds, val_ds = Subset(base_ds, train_idx), Subset(base_ds, val_idx)

# --- DataLoaders ---
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pinned host memory is only requested when batches will be copied to a GPU.
pin = device.type == "cuda"

# Options shared by both loaders; only the shuffling differs.
loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=pin)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

# --- GPU / environment printouts ---
# Collect the report lines first, then emit them in order.
env_report = [
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"Selected device: {device}",
]
if device.type != "cuda":
    env_report.append("Running on CPU.")
else:
    # Details of the first CUDA device and the cuDNN build in use.
    env_report += [
        f"GPU 0: {torch.cuda.get_device_name(0)}",
        f"CUDA capability: {torch.cuda.get_device_capability(0)}",
        f"cuDNN enabled: {torch.backends.cudnn.enabled}",
        f"cuDNN version: {torch.backends.cudnn.version()}",
    ]
for report_line in env_report:
    print(report_line)

# --- Model ---
# ImageNet-pretrained ResNet-18 whose classification head is swapped for a
# freshly initialised 2-way linear layer.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
head_in_features = model.fc.in_features
model.fc = nn.Linear(head_in_features, 2)  # 2 classes
model = model.to(device)

# Every parameter (backbone and new head) is handed to the optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr)

def save_checkpoint(path, epoch, model, optimizer, val_acc, train_loss):
    """Write a training checkpoint to ``path`` with ``torch.save``.

    The saved dict holds the epoch number, the model and optimizer state
    dicts, and the validation accuracy / training loss passed in.
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        val_acc=val_acc,
        train_loss=train_loss,
    )
    torch.save(checkpoint, path)

# Starts below any reachable accuracy so the first epoch always counts as "best".
best_val_acc = -1.0

# --- Training ---
for epoch in range(epochs):
    # ---- Optimisation pass over the training split ----
    model.train()
    running_loss = 0.0
    for step, (imgs, labels) in enumerate(train_loader, start=1):
        imgs = imgs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # The criterion yields a batch mean; weighting by the batch size makes
        # the epoch figure a per-sample mean even when the last batch is short.
        running_loss += labels.size(0) * loss.item()

        # One-off device sanity print on the very first batch of the run.
        if epoch == 0 and step == 1:
            print(f"[Sanity] Batch device: imgs={imgs.device}, labels={labels.device}")

    # ---- Accuracy on the validation split (no gradients tracked) ----
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            preds = torch.argmax(model(imgs), dim=1)
            correct += preds.eq(labels).sum().item()
            total += labels.size(0)

    epoch_loss = running_loss / len(train_ds)
    if total > 0:
        val_acc = correct / total
    else:
        val_acc = 0.0

    # ---- Epoch summary (with GPU memory stats when running on CUDA) ----
    if device.type != "cuda":
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {epoch_loss:.4f} | Val Acc: {val_acc:.4f}")
    else:
        bytes_per_mb = 1024**2
        allocated = torch.cuda.memory_allocated(0) / bytes_per_mb
        reserved = torch.cuda.memory_reserved(0) / bytes_per_mb
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {epoch_loss:.4f} | "
              f"Val Acc: {val_acc:.4f} | GPU mem (MB): allocated={allocated:.1f}, reserved={reserved:.1f}")

    # --- Checkpoints ---
    # Every epoch gets its own file, named after the epoch number and accuracy.
    ckpt_name = f"traffic_light_resnet18_epoch{epoch+1:03d}_acc{val_acc:.4f}.pth"
    ckpt_path = os.path.join(ckpt_dir, ckpt_name)
    save_checkpoint(ckpt_path, epoch+1, model, optimizer, val_acc, epoch_loss)

    # The "best" file is overwritten only on a strict improvement; ties keep
    # the earlier epoch.
    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_path = os.path.join(ckpt_dir, "traffic_light_resnet18_best.pth")
        save_checkpoint(best_path, epoch+1, model, optimizer, val_acc, epoch_loss)
        print(f"  ✔ Saved new BEST model to {best_path} (val_acc={val_acc:.4f})")

# Final weights: a bare state_dict from the last epoch (no optimizer state or
# metrics), written to the working directory rather than the checkpoint folder.
final_weights = model.state_dict()
torch.save(final_weights, "traffic_light_resnet18_last.pth")
print("Saved final state_dict to traffic_light_resnet18_last.pth")


PyTorch version: 2.7.1+cu128
CUDA available: True
Selected device: cuda
GPU 0: NVIDIA GeForce GTX 1060 6GB
CUDA capability: (6, 1)
cuDNN enabled: True
cuDNN version: 90701
