In [3]:
# Cell 1 — Imports + config + folder
import os
from pathlib import Path

import torch
import torch.nn as nn
import torchvision.models as models
import wandb

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_CLASSES = 3  # Health, Other, Rust

# Identifiers handed to Weights & Biases when the run is initialised.
PROJECT_NAME = "beyond-visible-spectrum"
RUN_NAME = "baseline_rgb_resnet18"

# Directory that holds this run's checkpoints (created up front, parents included).
CKPT_DIR = Path("checkpoints", "RGB", RUN_NAME)
CKPT_DIR.mkdir(parents=True, exist_ok=True)

# Target files for the best-so-far and the most recent checkpoint.
BEST_CKPT_PATH = CKPT_DIR.joinpath("best.pth")
LAST_CKPT_PATH = CKPT_DIR.joinpath("last.pth")

# Optimisation hyper-parameters: number of epochs and Adam learning rate.
EPOCHS, LR = 10, 1e-4


In [4]:
# Cell 2 — Init Weights & Biases
# Start the Weights & Biases run; the hyper-parameters below are stored as its config.
wandb.init(
    name=RUN_NAME,
    project=PROJECT_NAME,
    config=dict(
        model="resnet18",
        input="RGB",
        epochs=EPOCHS,
        batch_size=32,  # remember to keep this equal to the batch size of your train_loader
        lr=LR,
    ),
)



[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from C:\Users\ADMIN\_netrc.
[34m[1mwandb[0m: Currently logged in as: [33mphucga150625[0m ([33mphucga15062005[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Cell 3 — Build model, loss, optimizer
# ResNet-18 with ImageNet weights; the final fully-connected layer is swapped
# for a fresh one that outputs NUM_CLASSES logits.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=NUM_CLASSES)
model.to(DEVICE)  # nn.Module.to moves parameters in place and returns the same module

# Cross-entropy on the raw logits, optimised with Adam over every parameter.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)


In [6]:
# Cell 4 — 1 epoch train/val
def run_epoch(loader, train: bool):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer`` and
    ``DEVICE``.

    Args:
        loader: Iterable yielding ``(inputs, targets)`` batches.
        train: When True the model is put in training mode, gradients are
            tracked and the optimizer takes one step per batch. When False
            the model is put in eval mode and gradients are disabled.

    Returns:
        A tuple ``(loss, acc)``: the per-sample average of the batch losses
        (each batch loss weighted by its batch size) and the fraction of
        samples whose arg-max logit equals the target. Both are ``0.0`` if
        the loader yields no samples.
    """
    # train(True) == train(), train(False) == eval()
    model.train(train)

    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.set_grad_enabled(train):
        for inputs, targets in loader:
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)

            if train:
                optimizer.zero_grad(set_to_none=True)

            outputs = model(inputs)
            batch_loss = criterion(outputs, targets)

            if train:
                batch_loss.backward()
                optimizer.step()

            batch_size = inputs.size(0)
            n_seen += batch_size
            # Weight by batch size so a smaller final batch is not over-counted.
            loss_sum += batch_loss.item() * batch_size
            n_correct += (outputs.argmax(dim=1) == targets).sum().item()

    # Guard against division by zero when the loader is empty.
    denom = max(n_seen, 1)
    return loss_sum / denom, n_correct / denom


In [7]:
# Cell 5 — Training loop + log + save best/last
# Train for EPOCHS epochs. After every epoch: print + log the metrics, always
# overwrite the "last" checkpoint, and overwrite the "best" checkpoint whenever
# validation accuracy strictly improves.
#
# NOTE(review): the recorded execution of this cell stopped with
# "RuntimeError: DataLoader worker ... exited unexpectedly". The loaders are
# defined outside this cell; presumably they use num_workers > 0 on Windows
# inside a notebook — confirm and try num_workers=0 where they are created.
best_val_acc = -1.0  # below any reachable accuracy, so epoch 1 always becomes "best"

for epoch in range(1, EPOCHS + 1):
    tr_loss, tr_acc = run_epoch(train_loader, train=True)
    va_loss, va_acc = run_epoch(val_loader, train=False)

    print(
        f"Epoch {epoch:02d} | "
        f"train loss {tr_loss:.4f} acc {tr_acc:.4f} | "
        f"val loss {va_loss:.4f} acc {va_acc:.4f}"
    )

    # Log once per epoch, using the epoch number as the wandb step.
    wandb.log(
        {
            "train_loss": tr_loss,
            "train_acc": tr_acc,
            "val_loss": va_loss,
            "val_acc": va_acc,
            "epoch": epoch,
        },
        step=epoch,
    )

    # Update the running best BEFORE building the checkpoint, so that last.pth
    # stores the true best-so-far (it used to store the value from before this
    # epoch, e.g. -1.0 after the first epoch).
    is_best = va_acc > best_val_acc
    if is_best:
        best_val_acc = va_acc

    # One payload shared by both files; it is rebuilt every epoch.
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "best_val_acc": best_val_acc,
        "config": dict(wandb.config),
    }

    # Always save the most recent state.
    torch.save(checkpoint, LAST_CKPT_PATH)

    # Save (and hand to wandb) the best state according to val_acc.
    if is_best:
        torch.save(checkpoint, BEST_CKPT_PATH)
        wandb.save(str(BEST_CKPT_PATH))

# After training, hand the final "last" checkpoint to wandb as well.
wandb.save(str(LAST_CKPT_PATH))


RuntimeError: DataLoader worker (pid(s) 15616, 18572) exited unexpectedly

In [None]:
# Cell 6 — Finish
# Report where the checkpoints are written, then close the wandb run.
for line_parts in (
    ("Saved:",),
    (" - best:", BEST_CKPT_PATH),
    (" - last:", LAST_CKPT_PATH),
):
    print(*line_parts)
wandb.finish()
