In [None]:
# Install torchmetrics, which supplies the MeanAveragePrecision metric imported in the next cell.
!pip install torchmetrics -q

In [None]:
import random
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.rpn import AnchorGenerator
import torchvision.transforms as T
from tqdm.auto import tqdm
from torchmetrics.detection.mean_ap import MeanAveragePrecision

In [None]:

class NPZKilnDataset(Dataset):
    """Detection dataset backed by a single ``.npz`` archive.

    The archive must contain three arrays (shapes per the original author's
    notes): ``images`` (N, 3, H, W), ``bboxes`` (N, 4) and ``classes`` (N,).
    A sample whose class equals 1 contributes its single box with label 1;
    every other sample is treated as background and gets an empty target.

    Args:
        npz_path: Path handed to ``np.load``.
        transforms: Optional callable applied to the image tensor only
            (the target is never transformed).
    """

    def __init__(self, npz_path, transforms=None):
        # np.load on an .npz returns a lazy NpzFile that keeps the archive open.
        # Indexing inside the ``with`` block materializes each array in memory,
        # after which the file handle is released instead of leaking.
        with np.load(npz_path) as data:
            self.images  = data['images']   # (N,3,H,W)
            self.bboxes  = data['bboxes']   # (N,4)
            self.classes = data['classes']  # (N,)
        self.transforms = transforms

    def __len__(self):
        """Return the number of images in the archive."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        Returns:
            image: float32 tensor built from ``images[idx]`` (after
                ``transforms`` if one was given).
            target: dict with ``boxes`` (K, 4) float32, ``labels`` (K,) int64,
                ``image_id`` (1,), ``area`` (K,) and ``iscrowd`` (K,) int64,
                where K is 1 for a valid kiln sample and 0 otherwise.
        """
        # Cast explicitly so the model always receives float32, whatever dtype
        # the archive was saved with (no copy is made for float32 data).
        # Values are expected to already lie in [0, 1]; no rescaling is done.
        img = torch.as_tensor(self.images[idx], dtype=torch.float32)

        # build target
        if self.classes[idx] == 1:
            boxes  = torch.as_tensor(self.bboxes[idx], dtype=torch.float32).unsqueeze(0)
            labels = torch.tensor([1], dtype=torch.int64)
        else:
            boxes  = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)

        # Drop degenerate boxes (x2 <= x1 or y2 <= y1). The mask is simply
        # empty for an empty box tensor, so no special-casing is needed.
        keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
        boxes, labels = boxes[keep], labels[keep]

        target = {
            'boxes':    boxes,
            'labels':   labels,
            'image_id': torch.tensor([idx]),
            # Works for K == 0 too and yields an empty float32 tensor.
            'area':     (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            'iscrowd':  torch.zeros((boxes.shape[0],), dtype=torch.int64)
        }

        if self.transforms:
            img = self.transforms(img)
        return img, target

def collate_fn(batch):
    """Regroup a list of per-sample tuples into one tuple per field.

    For ``(image, target)`` samples this gives ``(images, targets)``, each a
    tuple with one entry per sample, so nothing is stacked into a single
    tensor. An empty batch yields an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)

def get_transform():
    """Build the per-channel normalization transform for image tensors.

    The mean/std values are the ones commonly used with ImageNet-pretrained
    torchvision models.

    NOTE(review): torchvision detection models already normalize their inputs
    internally with these same defaults, so pairing this transform with such a
    model normalizes twice unless the model's own normalization is overridden.

    Returns:
        A ``T.Normalize`` instance.
    """
    channel_mean = [0.485,0.456,0.406]
    channel_std = [0.229,0.224,0.225]
    return T.Normalize(mean=channel_mean, std=channel_std)

def train(
    npz_path='/content/kiln_patches_dataset.npz',
    num_epochs=20,
    batch_size=8,
    val_split=0.2,
    lr=0.001,               # lowered LR
    num_workers=2,
    score_threshold=0.01
):
    """Fine-tune a COCO-pretrained Faster R-CNN to detect a single "kiln" class.

    Pipeline: load the NPZ dataset, split it into train/validation with a fixed
    seed, down-sample each split to equal kiln/background counts, train with
    SGD + StepLR, report detection metrics on the validation split after every
    epoch, and finally save the weights to ``fasterrcnn_kiln.pth``.

    Args:
        npz_path: Path of the ``.npz`` archive read by ``NPZKilnDataset``.
        num_epochs: Number of passes over the balanced training split.
        batch_size: Images per batch for both loaders.
        val_split: Fraction of the full dataset reserved for validation
            (before balancing).
        lr: Initial SGD learning rate; multiplied by 0.1 every 10 epochs.
        num_workers: Worker processes per ``DataLoader``.
        score_threshold: Minimum detection score kept for evaluation. It is
            also installed as the model's ``box_score_thresh`` so that values
            below torchvision's default of 0.05 actually take effect.

    Raises:
        ValueError: If the balanced training split is empty (one of the two
            classes is missing from the training portion).
        RuntimeError: If the training loss becomes NaN or infinite.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 1) load & split
    # torchvision's Faster R-CNN normalizes every image internally (its default
    # image_mean/image_std are the same values get_transform() uses), so the
    # dataset must hand over un-normalized [0,1] images. Normalizing here as
    # well would apply the transform twice and shift inputs away from what the
    # pretrained backbone expects.
    full_ds = NPZKilnDataset(npz_path, transforms=None)
    total   = len(full_ds)
    vcount  = int(total * val_split)
    tcount  = total - vcount
    train_ds, val_ds = random_split(
        full_ds, [tcount, vcount],
        generator=torch.Generator().manual_seed(42)
    )

    # 2) balance classes inside each split
    def balance(subset):
        """Down-sample ``subset`` to equally many kiln (1) and background (0) items."""
        idxs = subset.indices
        pos = [i for i in idxs if full_ds.classes[i]==1]
        neg = [i for i in idxs if full_ds.classes[i]==0]
        n   = min(len(pos), len(neg))
        rng = random.Random(42)  # fixed seed keeps the selection repeatable
        return Subset(
            full_ds,
            rng.sample(pos, n) + rng.sample(neg, n)
        )

    train_ds = balance(train_ds)
    val_ds   = balance(val_ds)

    if len(train_ds) == 0:
        # Without this guard the first epoch would end in a bare
        # ZeroDivisionError when averaging the loss over zero batches.
        raise ValueError(
            "Balanced training split is empty: the training portion needs at "
            "least one kiln and one background sample."
        )

    # 3) show stats
    def stats(ds):
        """Return ``(total, kiln_count, background_count)`` for a Subset."""
        idxs = ds.indices
        kiln = sum(int(full_ds.classes[i]==1) for i in idxs)
        return len(idxs), kiln, len(idxs)-kiln

    train_total, train_kiln, train_bg = stats(train_ds)
    val_total, val_kiln, val_bg = stats(val_ds)
    print(f"Train set:      {train_total} images ({train_kiln} kiln, {train_bg} bg)")
    print(f"Validation set: {val_total} images ({val_kiln} kiln, {val_bg} bg)\n")

    # 4) loaders
    train_loader = DataLoader(train_ds, batch_size=batch_size,
                              shuffle=True, num_workers=num_workers,
                              collate_fn=collate_fn)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size,
                              shuffle=False,num_workers=num_workers,
                              collate_fn=collate_fn)

    # 5) model
    # One anchor size per FPN level, three aspect ratios each. That keeps three
    # anchors per location, matching the pretrained RPN head's shape.
    # NOTE(review): the model rescales inputs before anchors are applied
    # (default min_size=800), so these sizes are measured on the resized image.
    # Confirm they fit the patch and box sizes, and that bboxes are absolute
    # pixel (x1, y1, x2, y2) coordinates. A mismatch in either would be
    # consistent with the logged mAP of 0.
    anchor_sizes  = ((4,), (8,), (16,), (32,), (64,))
    aspect_ratios = ((0.5,1.0,2.0),) * len(anchor_sizes)
    anchor_gen    = AnchorGenerator(sizes=anchor_sizes,
                                    aspect_ratios=aspect_ratios)

    weights = FasterRCNN_ResNet50_FPN_Weights.COCO_V1
    model   = fasterrcnn_resnet50_fpn(
        weights=weights,
        rpn_anchor_generator=anchor_gen,
        # The model drops detections below box_score_thresh (default 0.05)
        # before returning them, so a lower score_threshold would otherwise
        # never be honored during evaluation.
        box_score_thresh=score_threshold
    )
    # Replace the COCO classification head with a 2-class one (background + kiln).
    in_f = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_f, num_classes=2)
    model.to(device)

    # 6) optimizer & schedule
    params    = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=lr, momentum=0.9,
                                weight_decay=0.0005)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=10,
                                                gamma=0.1)

    # 7) metric
    metric = MeanAveragePrecision(box_format="xyxy")

    # 8) train loop
    for epoch in range(1, num_epochs+1):
        model.train()
        total_loss = 0.0
        for imgs, tgts in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs} [Train]"):
            imgs    = [im.to(device) for im in imgs]
            targets = [{k:v.to(device) for k,v in t.items()} for t in tgts]
            # In train mode the model returns a dict of loss components.
            loss_dict = model(imgs, targets)
            loss      = sum(loss_dict.values())
            if not torch.isfinite(loss):
                # Stepping on a NaN/inf loss would silently corrupt the weights.
                raise RuntimeError(
                    f"Non-finite training loss at epoch {epoch}: "
                    f"{ {k: float(v) for k, v in loss_dict.items()} }"
                )
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        scheduler.step()
        avg_train = total_loss / len(train_loader)

        # 9) validation
        model.eval()
        metric.reset()
        with torch.no_grad():
            for imgs, tgts in tqdm(val_loader, desc=f"Epoch {epoch}/{num_epochs} [Val]"):
                imgs = [im.to(device) for im in imgs]
                preds= model(imgs)
                cpu_preds, cpu_tgts = [], []
                for p,t in zip(preds, tgts):
                    keep = p['scores'] > score_threshold
                    cpu_preds.append({
                        'boxes':  p['boxes'][keep].cpu(),
                        'scores': p['scores'][keep].cpu(),
                        'labels': p['labels'][keep].cpu()
                    })
                    cpu_tgts.append({
                        'boxes':  t['boxes'].cpu(),
                        'labels': t['labels'].cpu()
                    })
                metric.update(cpu_preds, cpu_tgts)
        m = metric.compute()
        print(
            f"Epoch {epoch}/{num_epochs} — "
            f"Train Loss: {avg_train:.4f}, "
            f"mAP: {m['map']:.4f}, "
            f"AP50: {m['map_50']:.4f}, "
            f"AR@100: {m['mar_100']:.4f}"
        )

    # 10) persist the final weights
    torch.save(model.state_dict(), 'fasterrcnn_kiln.pth')
    print("Training complete, model saved to fasterrcnn_kiln.pth")

# Start training with the default arguments when this cell or script is executed directly.
if __name__ == "__main__":
    train()


Train set:      3270 images (1635 kiln, 1635 bg)
Validation set: 778 images (389 kiln, 389 bg)



Epoch 1/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]

Epoch 1/20 [Val]:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 1/20 — Train Loss: 0.0236, mAP: 0.0000, AP50: 0.0000, AR@100: 0.0000


Epoch 2/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]

Epoch 2/20 [Val]:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 2/20 — Train Loss: 0.0206, mAP: 0.0000, AP50: 0.0000, AR@100: 0.0000


Epoch 3/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]

Epoch 3/20 [Val]:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 3/20 — Train Loss: 0.0182, mAP: 0.0000, AP50: 0.0000, AR@100: 0.0000


Epoch 4/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]

Epoch 4/20 [Val]:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 4/20 — Train Loss: 0.0174, mAP: 0.0000, AP50: 0.0000, AR@100: 0.0000


Epoch 5/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]

Epoch 5/20 [Val]:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 5/20 — Train Loss: 0.0172, mAP: 0.0000, AP50: 0.0000, AR@100: 0.0000


Epoch 6/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]

Epoch 6/20 [Val]:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 6/20 — Train Loss: 0.0174, mAP: 0.0000, AP50: 0.0000, AR@100: 0.0000


Epoch 7/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]

Epoch 7/20 [Val]:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 7/20 — Train Loss: 0.0184, mAP: 0.0000, AP50: 0.0000, AR@100: 0.0000


Epoch 8/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]

Epoch 8/20 [Val]:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 8/20 — Train Loss: 0.0196, mAP: 0.0000, AP50: 0.0000, AR@100: 0.0000


Epoch 9/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]

Epoch 9/20 [Val]:   0%|          | 0/98 [00:00<?, ?it/s]

Epoch 9/20 — Train Loss: 0.0197, mAP: 0.0000, AP50: 0.0000, AR@100: 0.0000


Epoch 10/20 [Train]:   0%|          | 0/409 [00:00<?, ?it/s]