In [43]:
import os, shutil, json, math, random, heapq
import unicodedata
import albumentations as A
import os, math, random, cv2, numpy as np, torch, torch.nn as nn, torch.nn.functional as F
import glob
from glob import glob
from pathlib import Path
from collections import defaultdict, Counter
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
from collections import defaultdict

In [10]:
root = "/content/data/Sample"
img_root = f"{root}/01.원천데이터"      # .jpg
lab_root = f"{root}/02.라벨링데이터"    # .json

dataset_root = "/content/dataset"
images_root = f"{dataset_root}/images"  # .jpg (전처리 후)
labels_root = f"{dataset_root}/labels"  # .txt

splits = ["train", "val", "test"]

### 데이터 준비

In [2]:
# 초기화 후 CP949로 해제
!rm -rf /content/data && mkdir -p /content/data
!unzip -qq -O CP949 "/content/drive/MyDrive/Colab Notebooks/Python_colab/Web Service/생성형 AI/data/art.zip" -d /content/data

In [7]:
random.seed(2025)

for split in splits:
    os.makedirs(f"{dataset_root}/images/{split}", exist_ok=True)
    os.makedirs(f"{dataset_root}/labels/{split}", exist_ok=True)

# 각 클래스 폴더 순회
classes = os.listdir(img_root)
for cls in classes:
    img_paths = sorted(glob.glob(f"{img_root}/{cls}/*.jpg"))
    lab_paths = sorted(glob.glob(f"{lab_root}/{cls}/*.json"))

    # 이름 기준 매칭
    pairs = [(i, l) for i, l in zip(img_paths, lab_paths) if Path(i).stem == Path(l).stem]
    random.shuffle(pairs)

    n = len(pairs)
    n_train = int(n*0.8)
    n_val   = int(n*0.1)
    n_test  = n - n_train - n_val

    split_data = {
        "train": pairs[:n_train],
        "val":   pairs[n_train:n_train+n_val],
        "test":  pairs[n_train+n_val:]
    }

    # 복사
    for split, items in split_data.items():
        for imgf, labf in items:
            shutil.copy(imgf, f"{dataset_root}/images/{split}/{Path(imgf).name}")
            shutil.copy(labf, f"{dataset_root}/labels/{split}/{Path(labf).name}")

print("데이터셋 분리 완료 ✅")

데이터셋 분리 완료 ✅


### 데이터 분석

In [8]:
label_counter = defaultdict(Counter)

# 모든 하위 폴더까지 뒤져서 json 읽기
for jp in Path(lab_root).rglob("*.json"):
    with open(jp, "r", encoding="utf-8") as f:
        data = json.load(f)

    anns = data.get("annotations") or {}
    top_cls = anns.get("class", "UNKNOWN")

    bbox_list = anns.get("bbox") or []
    for b in bbox_list:
        lbl = b.get("label")
        if lbl:
            label_counter[top_cls][lbl] += 1

# 출력
for cls, counter in label_counter.items():
    print(f"\n=== {cls} ===")
    for lbl, cnt in counter.most_common(15):
        print(f"{lbl:15s}  {cnt}")


=== 나무 ===
나뭇잎              509
별                485
열매               477
꽃                318
구름               275
가지               209
새                153
수관               151
다람쥐              144
그네               142
나무전체             140
기둥               140
뿌리               140
달                140

=== 여자사람 ===
눈                280
팔                280
다리               280
발                280
운동화              280
여자구두             280
손                279
귀                277
주머니              253
단추               143
사람전체             140
머리               140
얼굴               140
코                140
입                140

=== 남자사람 ===
눈                280
귀                280
팔                280
발                280
운동화              280
남자구두             280
다리               279
손                278
주머니              264
단추               147
사람전체             140
머리               140
얼굴               140
코                140
입                140

=== 집 ===
잔디               525
꽃   

### 데이터 전처리

In [20]:
def otsu_binarize_white_on_black(img_rgb, median_k=3, dilate_ksize=0):
    """
    흰 객체(선) + 검정 배경으로 이진화.
    - OTSU + THRESH_BINARY_INV 사용
    - 선택적으로 median blur, 얇은 선 보강을 위한 dilation 제공
    """
    g = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
    if median_k and median_k > 1:
        g = cv2.GaussianBlur(g, (3,3), 0)
    # OTSU + invert → 배경(밝음)은 0(검정), 선(어두움)은 255(흰색)
    _, th = cv2.threshold(g, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    if median_k and median_k > 1:
        th = cv2.medianBlur(th, median_k)
    if dilate_ksize and dilate_ksize > 0:
        k = cv2.getStructuringElement(cv2.MORPH_RECT, (dilate_ksize, dilate_ksize))
        th = cv2.dilate(th, k, iterations=1)
    # 1채널 → 3채널(RGB)로 반환 (데이터 로더 호환)
    return cv2.cvtColor(th, cv2.COLOR_GRAY2RGB)

# ==== train/val/test 각각 덮어쓰기 ====
total = 0
for split in ["train", "val", "test"]:
    img_paths = sorted(
        glob.glob(f"{images_root}/{split}/*.jpg")
    )
    for ip in tqdm(img_paths, desc=f"{split} Otsu invert", ncols=100):
        bgr = cv2.imread(ip)
        if bgr is None:
            continue
        rgb = bgr[:, :, ::-1]
        bin_rgb = otsu_binarize_white_on_black(
            rgb,
            median_k=3,      # 노이즈 있으면 3 유지, 더 선명하면 0도 가능
            dilate_ksize=1   # 선 살짝 두껍게(원치 않으면 0)
        )
        cv2.imwrite(ip, bin_rgb[:, :, ::-1])  # 다시 BGR로 저장(덮어쓰기)
        total += 1

print(f"오츠 이진화(흰 객체/검정 배경) 전처리 완료 ✅  처리수: {total}")

train Otsu invert: 100%|██████████████████████████████████████████| 448/448 [00:26<00:00, 16.83it/s]
val Otsu invert: 100%|██████████████████████████████████████████████| 56/56 [00:03<00:00, 16.96it/s]
test Otsu invert: 100%|█████████████████████████████████████████████| 56/56 [00:03<00:00, 16.82it/s]

오츠 이진화(흰 객체/검정 배경) 전처리 완료 ✅  처리수: 560





### 라벨 변환( JSON -> txt )

In [11]:
class_to_id = {"나무":0, "남자사람":1, "여자사람":2, "집":3}
id_to_class = {v:k for k,v in class_to_id.items()}

rep_map = {
    "나무": ["나무전체"],
    "남자사람": ["사람전체"],
    "여자사람": ["사람전체"],
    "집": ["집전체"],
}

In [12]:
def _norm_kor(s: str) -> str:
    return unicodedata.normalize("NFKC", s).strip().lower().replace(" ", "")

rep_map_norm = {k: [_norm_kor(x) for x in v] for k,v in rep_map.items()}

def _parse_resolution(meta_res):
    """meta.img_resolution like '1280x1280' → (W,H). 실패 시 None."""
    if isinstance(meta_res, str) and "x" in meta_res.lower():
        try:
            W, H = map(int, meta_res.lower().split("x"))
            if W>0 and H>0: return W, H
        except: pass
    return None

def _best_rep_box(anns, top_cls):
    """대표 라벨 1개만 선택."""
    wanted = rep_map_norm.get(top_cls, [])
    for b in (anns.get("bbox") or []):
        lbl = b.get("label")
        if isinstance(lbl, str) and _norm_kor(lbl) in wanted:
            # pascal_voc absolute: x,y,w,h → 반환은 (x,y,w,h)
            if all(isinstance(b.get(k),(int,float)) for k in ["x","y","w","h"]):
                return b
    return None

def convert_one_json(jpath: Path, out_dir: Path):
    with open(jpath, "r", encoding="utf-8") as f:
        data = json.load(f)

    anns = data.get("annotations") or {}
    top_cls = anns.get("class")
    if top_cls not in class_to_id:
        return False, "unknown_top_class"

    # 이미지 해상도
    meta = data.get("meta") or {}
    sz = _parse_resolution(meta.get("img_resolution"))
    if not sz:
        return False, "no_resolution"
    W, H = sz

    # 대표 박스 선택
    rep = _best_rep_box(anns, top_cls)
    if rep is None:
        return False, "no_rep_box"

    # YOLO 정규화
    x, y, w, h = float(rep["x"]), float(rep["y"]), float(rep["w"]), float(rep["h"])
    cx = (x + w/2) / W
    cy = (y + h/2) / H
    wn = w / W
    hn = h / H
    # 범위 체크
    if not (0<=cx<=1 and 0<=cy<=1 and wn>0 and hn>0):
        return False, "out_of_range"

    # 저장(.txt, 덮어쓰기)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_txt = out_dir / (jpath.stem + ".txt")
    with open(out_txt, "w", encoding="utf-8") as fw:
        fw.write(f"{class_to_id[top_cls]} {cx:.6f} {cy:.6f} {wn:.6f} {hn:.6f}\n")
    return True, "ok"

In [13]:
summary = {}
for split in splits:
    in_dir  = Path(labels_root) / split
    out_dir = Path(labels_root) / split
    cnt_all = 0
    cnt_ok  = 0
    reasons = {}
    for jp in sorted(glob.glob(str(in_dir / "*.json"))):
        jp = Path(jp)
        cnt_all += 1
        ok, why = convert_one_json(jp, out_dir)
        if ok: cnt_ok += 1
        else:  reasons[why] = reasons.get(why, 0) + 1
    summary[split] = (cnt_ok, cnt_all, reasons)
    print(f"[{split}] {cnt_ok}/{cnt_all} converted. fails: {reasons}")

print("✅ 변환 완료 (같은 폴더에 .txt 덮어쓰기 생성)")

[train] 448/448 converted. fails: {}
[val] 56/56 converted. fails: {}
[test] 56/56 converted. fails: {}
✅ 변환 완료 (같은 폴더에 .txt 덮어쓰기 생성)


### 앵커 산출

In [14]:
# === 단일 셀: 앵커(k=5) 산출 + YOLOv2 미니 학습(3epoch sanity) ===
!pip -q install albumentations tqdm

In [15]:
# ====== 1) 학습/검증 데이터로부터 앵커(k=5) 산출 ======
def load_wh_from_labels(split="train"):
    wh = []
    for lab in sorted(Path(labels_root, split).glob("*.txt")):
        with open(lab, "r", encoding="utf-8") as f:
            line = f.readline().strip().split()
        if len(line) < 5:
            continue
        wn, hn = float(line[3]), float(line[4])
        if wn > 0 and hn > 0:
            wh.append([wn, hn])
    return np.array(wh, dtype=np.float32)

def kmeans_iou(boxes, k=5, iters=50, seed=42):
    rng = np.random.default_rng(seed)
    centers = boxes[rng.choice(len(boxes), size=k, replace=False)]
    for _ in range(iters):
        ious = []
        for c in centers:
            inter = np.minimum(boxes[:,0], c[0]) * np.minimum(boxes[:,1], c[1])
            union = (boxes[:,0]*boxes[:,1] + c[0]*c[1] - inter + 1e-9)
            ious.append(inter/union)
        ious = np.stack(ious, axis=1)
        d = 1 - ious
        labels = np.argmin(d, axis=1)
        new_centers = np.array([boxes[labels==i].mean(axis=0) if np.any(labels==i) else centers[i] for i in range(k)])
        if np.allclose(new_centers, centers, atol=1e-5):
            break
        centers = new_centers
    return centers

wh_train = load_wh_from_labels("train")
assert len(wh_train) > 0, "라벨에서 w,h를 찾지 못했어요."
anchors = kmeans_iou(wh_train, k=5)
areas = anchors[:,0]*anchors[:,1]
anchors = anchors[np.argsort(areas)]  # 면적 오름차순

print("YOLOv2 Anchors (normalized):")
for w,h in anchors: print(f"{w:.4f},{h:.4f}")
#


print("\n@416px (참고):")
for w,h in anchors: print(f"{w*416:.1f} x {h*416:.1f}")
# 이때 grid 적합도(S=13)
# 셀크기 416/13 = 32px
# 만약 v2 단일 스케일에서 recall이 낮으면 입혁 해상도 512로 올리기

YOLOv2 Anchors (normalized):
0.2265,0.3618
0.4303,0.4964
0.3272,0.6632
0.4894,0.8136
0.7093,0.8537

@416px (참고):
94.2 x 150.5
179.0 x 206.5
136.1 x 275.9
203.6 x 338.4
295.1 x 355.1


### YOLOv2 모델 구현

In [16]:
# ====== 2) YOLOv2 최소 구현 (Darknet-19 + 1-scale head) ======
def conv_bn_lrelu(c1, c2, k=3, s=1, p=None):
    if p is None: p = k//2
    return nn.Sequential(
        nn.Conv2d(c1, c2, k, s, p, bias=False),
        nn.BatchNorm2d(c2),
        nn.LeakyReLU(0.1, inplace=True),
    )

class Darknet19(nn.Module):
    def __init__(self):
        super().__init__()
        self.body = nn.Sequential(
            conv_bn_lrelu(3, 32),
            nn.MaxPool2d(2,2),
            conv_bn_lrelu(32,64),
            nn.MaxPool2d(2,2),
            conv_bn_lrelu(64,128),
            conv_bn_lrelu(128,64,1,1,0),
            conv_bn_lrelu(64,128),
            nn.MaxPool2d(2,2),
            conv_bn_lrelu(128,256),
            conv_bn_lrelu(256,128,1,1,0),
            conv_bn_lrelu(128,256),
            nn.MaxPool2d(2,2),
            conv_bn_lrelu(256,512),
            conv_bn_lrelu(512,256,1,1,0),
            conv_bn_lrelu(256,512),
            conv_bn_lrelu(512,256,1,1,0),
            conv_bn_lrelu(256,512),
            nn.MaxPool2d(2,2),
            conv_bn_lrelu(512,1024),
            conv_bn_lrelu(1024,512,1,1,0),
            conv_bn_lrelu(512,1024),
            conv_bn_lrelu(1024,512,1,1,0),
            conv_bn_lrelu(512,1024)
        )
    def forward(self, x): return self.body(x)

class YOLOv2(nn.Module):
    def __init__(self, num_classes=4, anchors=None):
        super().__init__()
        self.backbone = Darknet19()
        self.anchors = torch.tensor(anchors, dtype=torch.float32)  # (A,2) normalized w,h
        A = self.anchors.size(0)
        self.num_classes = num_classes
        self.pred = nn.Conv2d(1024, A*(5+num_classes), 1, 1, 0)
    def forward(self, x):
        B = x.size(0)
        feat = self.backbone(x)               # (B,1024,13,13) if input=416
        out  = self.pred(feat)                # (B, A*(5+C), 13, 13)
        A = self.anchors.size(0)
        return out.view(B, A, 5+self.num_classes, 13, 13)

In [17]:
# ====== 3) Dataset / Dataloader ======
class YoloDataset(Dataset):
    def __init__(self, img_dir, lab_dir, img_size=416, aug=True):
            self.img_paths = sorted(glob(f"{img_dir}/*.jpg"))
            self.lab_dir = lab_dir
            self.size = img_size
            self.aug = aug
            self.t = A.Compose(
                [
                    A.HorizontalFlip(p=0.5),
                    A.Affine(scale=(0.9,1.1), translate_percent=(-0.05,0.05), rotate=(-5,5), p=0.7),
                    A.ColorJitter(p=0.3),
                    A.Resize(self.size, self.size),
                ],
                bbox_params=A.BboxParams(
                    format="pascal_voc",          # [x_min, y_min, x_max, y_max] (pixels)
                    label_fields=["class_labels"],
                    clip=True
                )
            ) if aug else A.Compose(
                [A.Resize(self.size, self.size)],
                bbox_params=A.BboxParams(
                    format="pascal_voc",
                    label_fields=["class_labels"],
                    clip=True
                )
            )

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        ip = self.img_paths[idx]
        img = cv2.imread(ip)[:, :, ::-1]
        h0, w0 = img.shape[:2]

        lab_path = f"{self.lab_dir}/{Path(ip).stem}.txt"
        with open(lab_path, "r") as f:
            line = f.readline().strip().split()
        cls = int(line[0]); cx, cy, wn, hn = map(float, line[1:5])

        # pascal_voc abs box
        x = (cx - wn/2) * w0; y = (cy - hn/2) * h0; w = wn*w0; h = hn*h0
        b = [x, y, x + w, y + h]

        aug = self.t(image=img, bboxes=[b], class_labels=[cls])
        # 박스가 증강으로 사라질 수 있음 → 가드
        if len(aug["bboxes"]) == 0:
            aug = A.Compose(
                [A.Resize(self.size, self.size)],
                bbox_params=A.BboxParams(format="pascal_voc", label_fields=["class_labels"], clip=True)
            )(image=img, bboxes=[b], class_labels=[cls])

        img = aug["image"]
        x1, y1, x2, y2 = aug["bboxes"][0]
        H, W = img.shape[:2]

        w = max(1.0, x2 - x1); h = max(1.0, y2 - y1)
        cx = (x1 + w/2) / W; cy = (y1 + h/2) / H; wn = w / W; hn = h / H

        img = (img / 255.0).astype(np.float32).transpose(2, 0, 1)
        target = np.array([cls, cx, cy, wn, hn], dtype=np.float32)
        return torch.from_numpy(img), torch.from_numpy(target)

def build_loaders(size=416, bs=16):
    tr = YoloDataset(f"{images_root}/train", f"{labels_root}/train", img_size=size, aug=True)
    va = YoloDataset(f"{images_root}/val",   f"{labels_root}/val",   img_size=size, aug=False)
    return DataLoader(tr, batch_size=bs, shuffle=True,  num_workers=2, pin_memory=True), \
           DataLoader(va, batch_size=bs, shuffle=False, num_workers=2, pin_memory=True)

In [18]:
# ====== 4) Loss (간단 YOLOv2 스타일: 1 GT/이미지 가정) ======
def yolo2_loss(pred, target, anchors, S=13, num_classes=4, lambda_coord=5.0, lambda_noobj=0.5):
    B,A,_,_,_ = pred.shape
    device = pred.device
    tx,ty,tw,th,to = pred[:, :, 0], pred[:, :, 1], pred[:, :, 2], pred[:, :, 3], pred[:, :, 4]
    tcls = pred[:, :, 5:]  # (B,A,C,S,S)

    grid_y, grid_x = torch.meshgrid(torch.arange(S, device=device), torch.arange(S, device=device), indexing='ij')
    cls = target[:,0].long()
    gx  = target[:,1]*S; gy=target[:,2]*S
    gi = gx.long().clamp(0,S-1); gj = gy.long().clamp(0,S-1)
    gw  = target[:,3]; gh=target[:,4]

    anchors = anchors.to(device)
    inter = torch.min(anchors[:,0][None], gw[:,None]) * torch.min(anchors[:,1][None], gh[:,None])
    union = anchors[:,0][None]*anchors[:,1][None] + gw[:,None]*gh[:,None] - inter + 1e-9
    ious = inter/union
    best_a = ious.argmax(dim=1)  # (B,)

    obj_mask   = torch.zeros((B,A,S,S), device=device)
    noobj_mask = torch.ones((B,A,S,S), device=device)
    tx_t = torch.zeros((B,A,S,S), device=device)
    ty_t = torch.zeros((B,A,S,S), device=device)
    tw_t = torch.zeros((B,A,S,S), device=device)
    th_t = torch.zeros((B,A,S,S), device=device)
    tcls_t = torch.zeros((B,A,num_classes,S,S), device=device)

    for b in range(B):
        a = best_a[b]
        i = gi[b].item(); j = gj[b].item()
        obj_mask[b,a,j,i] = 1.0
        noobj_mask[b,a,j,i] = 0.0
        tx_t[b,a,j,i] = gx[b] - i
        ty_t[b,a,j,i] = gy[b] - j
        pw, ph = anchors[a]
        tw_t[b,a,j,i] = torch.log(gw[b]/(pw+1e-9) + 1e-9)
        th_t[b,a,j,i] = torch.log(gh[b]/(ph+1e-9) + 1e-9)
        tcls_t[b,a,cls[b],j,i] = 1.0

    px = torch.sigmoid(tx); py = torch.sigmoid(ty); po = torch.sigmoid(to)

    loss_x  = lambda_coord * F.mse_loss(px*obj_mask, tx_t*obj_mask, reduction='sum')/B
    loss_y  = lambda_coord * F.mse_loss(py*obj_mask, ty_t*obj_mask, reduction='sum')/B
    loss_w  = lambda_coord * F.mse_loss(tw*obj_mask, tw_t*obj_mask, reduction='sum')/B
    loss_h  = lambda_coord * F.mse_loss(th*obj_mask, th_t*obj_mask, reduction='sum')/B
    loss_obj   = F.binary_cross_entropy(po*obj_mask, obj_mask, reduction='sum')/B
    loss_noobj = lambda_noobj * F.binary_cross_entropy(po*noobj_mask, 0.0*noobj_mask, reduction='sum')/B
    if obj_mask.sum()>0:
        logits = tcls.permute(0,1,3,4,2)[obj_mask.bool()]
        targets= torch.argmax(tcls_t.permute(0,1,3,4,2)[obj_mask.bool()], dim=-1)
        loss_cls = F.cross_entropy(logits, targets)
    else:
        loss_cls = torch.tensor(0., device=device)

    loss = loss_x+loss_y+loss_w+loss_h + loss_obj+loss_noobj + loss_cls
    return loss

In [25]:
def _sigmoid(x): return 1/(1+torch.exp(-x))

# (cx,cy,w,h)->(x1,y1,x2,y2) [0,1] 정규화
def _cxcywh_to_xyxy_norm(b):  # (...,4) in [0,1]
    cx,cy,w,h = b.unbind(-1)
    x1 = cx - w/2; y1 = cy - h/2
    x2 = cx + w/2; y2 = cy + h/2
    return torch.stack([x1,y1,x2,y2], dim=-1)

# IoU for xyxy in [0,1]
def _iou_xyxy(a, b):  # (N,4),(M,4) in [0,1]
    tl = torch.maximum(a[:,None,:2], b[None,:, :2])
    br = torch.minimum(a[:,None,2:], b[None,:, 2:])
    wh = (br - tl).clamp(min=0)
    inter = wh[...,0]*wh[...,1]
    area_a = (a[:,2]-a[:,0]).clamp(min=0) * (a[:,3]-a[:,1]).clamp(min=0)
    area_b = (b[:,2]-b[:,0]).clamp(min=0) * (b[:,3]-b[:,1]).clamp(min=0)
    return inter / (area_a[:,None] + area_b[None,:] - inter + 1e-9)

def nms_xyxy(boxes, scores, iou_thr=0.5, max_det=100):
    """boxes: (N,4) [0,1] xyxy, scores: (N,)"""
    keep = []
    idxs = scores.argsort(descending=True)
    while idxs.numel() > 0 and len(keep) < max_det:
        i = idxs[0].item()
        keep.append(i)
        if idxs.numel() == 1: break
        iou = _iou_xyxy(boxes[i:i+1], boxes[idxs[1:]]).squeeze(0)
        idxs = idxs[1:][ iou <= iou_thr ]
    return torch.tensor(keep, dtype=torch.long, device=boxes.device)

In [22]:
# YOLOv2 raw 출력 -> 전체 후보 디코딩
def decode_yolov2_all(out, anchors):
    """
    out: (B,A,5+C,S,S) raw, anchors: (A,2) normalized (w,h)
    return:
      boxes_xyxy: (B, A*S*S, 4) in [0,1]  (xyxy)
      scores:     (B, A*S*S)   (obj * best-class prob)
      cls_ids:    (B, A*S*S)   (argmax class)
      cls_scores: (B, A*S*S, C) optional full class scores (obj*pcls)
    """
    B,A,CH,S,_ = out.shape
    C = CH - 5
    device = out.device
    anchors = anchors.to(device)

    tx,ty,tw,th,to = out[:,:,0], out[:,:,1], out[:,:,2], out[:,:,3], out[:,:,4]   # (B,A,S,S)
    tcls = out[:,:,5:]                                                             # (B,A,C,S,S)

    obj = torch.sigmoid(to)                                                        # (B,A,S,S)
    cls_prob = F.softmax(tcls.permute(0,1,3,4,2), dim=-1)                          # (B,A,S,S,C)

    # grid
    gy, gx = torch.meshgrid(torch.arange(S, device=device), torch.arange(S, device=device), indexing="ij")
    gx = gx[None,None]; gy = gy[None,None]                                         # (1,1,S,S)

    cx = (torch.sigmoid(tx) + gx) / S
    cy = (torch.sigmoid(ty) + gy) / S
    pw = anchors[:,0][None,:,None,None]
    ph = anchors[:,1][None,:,None,None]
    w = pw * torch.exp(tw)
    h = ph * torch.exp(th)

    # (B,A,S,S,4)
    boxes_cxcywh = torch.stack([cx,cy,w,h], dim=-1)
    boxes_xyxy = _cxcywh_to_xyxy_norm(boxes_cxcywh).view(B, -1, 4)                 # (B,N,4) N=A*S*S

    # best-class only
    pcls, cls_id = cls_prob.max(dim=-1)                                            # (B,A,S,S)
    scores = (obj * pcls).view(B, -1)                                              # (B,N)
    cls_ids = cls_id.view(B, -1)                                                   # (B,N)

    # full class scores (optional, per-class NMS에 사용)
    cls_scores = (obj[...,None] * cls_prob).view(B, -1, C)                         # (B,N,C)
    return boxes_xyxy, scores, cls_ids, cls_scores

In [23]:
# 클래스별 NMS
def nms_per_class_per_image(boxes_xyxy, cls_scores, conf_thr=0.6, iou_thr=0.5, max_det=100):
    """
    boxes_xyxy: (N,4) [0,1], cls_scores: (N,C)
    returns:
      final_boxes: (M,4) [0,1], final_scores: (M,), final_clss: (M,)
    """
    N, C = cls_scores.shape
    final_boxes = []; final_scores = []; final_clss = []
    for c in range(C):
        sc = cls_scores[:, c]
        m = sc >= conf_thr
        if m.sum() == 0: continue
        b = boxes_xyxy[m]; s = sc[m]
        keep = nms_xyxy(b, s, iou_thr=iou_thr, max_det=max_det)
        if keep.numel() == 0: continue
        final_boxes.append(b[keep]); final_scores.append(s[keep])
        final_clss.append(torch.full((keep.numel(),), c, dtype=torch.long, device=b.device))
    if len(final_boxes)==0:
        return boxes_xyxy.new_zeros((0,4)), boxes_xyxy.new_zeros((0,)), torch.empty((0,), dtype=torch.long)
    return torch.cat(final_boxes,0), torch.cat(final_scores,0), torch.cat(final_clss,0)

In [24]:
# 시각화 도우미 (파란 박스)
def draw_xyxy_on_bgr(bgr, boxes_xyxy_norm, scores=None, clss=None, class_names=None, color=(255,0,0), thickness=2):
    H,W = bgr.shape[:2]
    img = bgr.copy()
    for i, (x1,y1,x2,y2) in enumerate(boxes_xyxy_norm):
        x1 = int(x1*W); y1=int(y1*H); x2=int(x2*W); y2=int(y2*H)
        cv2.rectangle(img, (x1,y1), (x2,y2), color, thickness)
        if scores is not None:
            label = f"{scores[i]:.2f}"
            if clss is not None:
                cname = f"{clss[i]}" if class_names is None else class_names[int(clss[i])]
                label = f"{cname}:{label}"
            cv2.putText(img, label, (x1, max(0,y1-6)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA)
    return img

### 학습 + 검증 + best weight

In [50]:
# ===== 클래스 매핑 (라벨명 표시용) =====
class_to_id = {"나무":0, "남자사람":1, "여자사람":2, "집":3}
id_to_class = {v:k for k,v in class_to_id.items()}

In [51]:
# ====== 하이퍼파라미터 ======
SIZE = 416
S = 13
NUM_CLASSES = len(class_to_id)
EPOCHS = 30
BS = 16
LR = 1e-3
WEIGHT_DECAY = 5e-4

# 평가/후처리 임계값(기본값)
CONF_THR_EVAL = 0.3     # FP↓ / FN↑
IOU_THR_EVAL  = 0.6     # 0.5~0.6 권장

# 손실 가중치
LAMBDA_COORD  = 5.0     # 3~7
LAMBDA_NOOBJ  = 0.3

In [52]:
# ====== 디바이스/AMP ======
device  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_amp = torch.cuda.is_available()
scaler  = GradScaler(enabled=use_amp)
torch.backends.cudnn.benchmark = True

  scaler  = GradScaler(enabled=use_amp)


In [53]:
# ===== 시드 고정(재현성) =====
def set_seed(seed=2025):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(2025)

In [54]:
# ===== 로더/모델/옵티마 =====
trL, vaL = build_loaders(size=SIZE, bs=BS)   # canonical
model = YOLOv2(num_classes=NUM_CLASSES, anchors=anchors).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

In [55]:
# ===== 기본 유틸 =====
def _sigmoid(x): return 1/(1+torch.exp(-x))

def _cxcywh_to_xyxy_norm(b):  # (...,4) [0,1]
    cx,cy,w,h = b.unbind(-1)
    x1 = cx - w/2; y1 = cy - h/2
    x2 = cx + w/2; y2 = cy + h/2
    return torch.stack([x1,y1,x2,y2], dim=-1)

def _iou_xyxy(a, b):  # a:(N,4), b:(M,4) in [0,1]
    tl = torch.maximum(a[:,None,:2], b[None,:, :2])
    br = torch.minimum(a[:,None,2:], b[None,:, 2:])
    wh = (br - tl).clamp(min=0)
    inter = wh[...,0]*wh[...,1]
    area_a = (a[:,2]-a[:,0]).clamp(min=0) * (a[:,3]-a[:,1]).clamp(min=0)
    area_b = (b[:,2]-b[:,0]).clamp(min=0) * (b[:,3]-b[:,1]).clamp(min=0)
    return inter / (area_a[:,None] + area_b[None,:] - inter + 1e-9)

def nms_xyxy(boxes, scores, iou_thr=0.5, max_det=100):
    keep = []
    idxs = scores.argsort(descending=True)
    while idxs.numel() > 0 and len(keep) < max_det:
        i = idxs[0].item()
        keep.append(i)
        if idxs.numel() == 1: break
        iou = _iou_xyxy(boxes[i:i+1], boxes[idxs[1:]]).squeeze(0)
        idxs = idxs[1:][ iou <= iou_thr ]
    return torch.tensor(keep, dtype=torch.long, device=boxes.device)


In [56]:
# ===== YOLOv2 raw -> 전체 후보 디코딩 (per-class score 포함) =====
def decode_yolov2_all(out, anchors):
    """
    out: (B,A,5+C,S,S), anchors: (A,2) normalized (w,h)
    return:
      boxes_xyxy: (B, A*S*S, 4) in [0,1]
      cls_scores: (B, A*S*S, C) = obj * p(class)   (클래스별 NMS에 사용)
    """
    B,A,CH,SZ,_ = out.shape
    C = CH - 5
    dev = out.device
    anchors = anchors.to(dev)

    tx,ty,tw,th,to = out[:,:,0], out[:,:,1], out[:,:,2], out[:,:,3], out[:,:,4]
    tcls = out[:,:,5:]

    obj = torch.sigmoid(to)                                    # (B,A,S,S)
    cls_prob = F.softmax(tcls.permute(0,1,3,4,2), dim=-1)      # (B,A,S,S,C)

    gy, gx = torch.meshgrid(torch.arange(SZ, device=dev),
                            torch.arange(SZ, device=dev), indexing="ij")
    gx = gx[None,None]; gy = gy[None,None]

    cx = (torch.sigmoid(tx) + gx) / SZ
    cy = (torch.sigmoid(ty) + gy) / SZ
    pw = anchors[:,0][None,:,None,None]
    ph = anchors[:,1][None,:,None,None]
    w = pw * torch.exp(tw)
    h = ph * torch.exp(th)

    boxes_cxcywh = torch.stack([cx,cy,w,h], dim=-1)            # (B,A,S,S,4)
    boxes_xyxy = _cxcywh_to_xyxy_norm(boxes_cxcywh).view(B,-1,4)
    cls_scores = (obj[...,None] * cls_prob).view(B, -1, C)     # (B,N,C)
    return boxes_xyxy, cls_scores

In [57]:
# ===== 클래스별 NMS =====
def nms_per_class_per_image(boxes_xyxy, cls_scores, conf_thr=0.6, iou_thr=0.5, max_det=100):
    N, C = cls_scores.shape
    fb, fs, fc = [], [], []
    for c in range(C):
        s = cls_scores[:, c]
        m = s >= conf_thr
        if m.any():
            b = boxes_xyxy[m]; sc = s[m]
            keep = nms_xyxy(b, sc, iou_thr=iou_thr, max_det=max_det)
            if keep.numel():
                fb.append(b[keep]); fs.append(sc[keep])
                fc.append(torch.full((keep.numel(),), c, dtype=torch.long, device=b.device))
    if not fb:
        z4 = boxes_xyxy.new_zeros((0,4)); z1 = boxes_xyxy.new_zeros((0,))
        return z4, z1, torch.empty((0,), dtype=torch.long, device=boxes_xyxy.device)
    return torch.cat(fb,0), torch.cat(fs,0), torch.cat(fc,0)

In [58]:
def _decode_best_per_image(out, anchors_tensor):
    # 학습 중 빠른 지표용 (이미지당 TOP-1)
    B,A,CH,SZ,_ = out.shape
    tx,ty,tw,th,to = out[:,:,0], out[:,:,1], out[:,:,2], out[:,:,3], out[:,:,4]
    tcls = out[:,:,5:]
    obj = _sigmoid(to)
    tcls_ = F.softmax(tcls.permute(0,1,3,4,2), dim=-1)
    pcls, cls_id = tcls_.max(dim=-1)
    score = obj * pcls
    score_flat = score.view(B, -1)
    best_idx = score_flat.argmax(dim=1)
    a_idx = best_idx // (SZ*SZ); cell = best_idx % (SZ*SZ)
    j_idx = cell // SZ; i_idx = cell % SZ
    sel_tx = tx[torch.arange(B), a_idx, j_idx, i_idx]
    sel_ty = ty[torch.arange(B), a_idx, j_idx, i_idx]
    sel_tw = tw[torch.arange(B), a_idx, j_idx, i_idx]
    sel_th = th[torch.arange(B), a_idx, j_idx, i_idx]
    sel_sc = score[torch.arange(B), a_idx, j_idx, i_idx]
    sel_cls= cls_id[torch.arange(B), a_idx, j_idx, i_idx]
    cx = (_sigmoid(sel_tx) + i_idx.float()) / SZ
    cy = (_sigmoid(sel_ty) + j_idx.float()) / SZ
    pwph = anchors_tensor[a_idx].to(out.device)
    w = pwph[:,0] * torch.exp(sel_tw)
    h = pwph[:,1] * torch.exp(sel_th)
    return torch.stack([cx,cy,w,h], dim=1), sel_sc, sel_cls

In [59]:
# ---------- 단일 GT/이미지용: PR/F1 스윕 ----------
def prf1_single_gt(all_preds, all_gts, iou_thr=0.5, conf_grid=None):
    if conf_grid is None:
        conf_grid = np.linspace(0.0, 0.9, 19)
    stats = []
    for th in conf_grid:
        TP = FP = FN = 0
        for pred, gt in zip(all_preds, all_gts):
            m = (pred["scores"].numpy() >= th)
            boxes = pred["boxes"][m]; clss = pred["clss"][m]; scores_m = pred["scores"][m]
            if boxes.numel() == 0:
                FN += 1; continue
            sel = (clss == gt["cls"])
            cand_boxes = boxes[sel] if sel.any() else boxes
            cand_scores = scores_m[sel] if sel.any() else scores_m
            top_idx = cand_scores.argmax().item()
            cand = cand_boxes[top_idx]
            iou = _iou_xyxy(cand.unsqueeze(0), gt["box"].unsqueeze(0)).item()
            if iou >= iou_thr and sel.any():
                TP += 1
            else:
                FP += 1; FN += 1
        prec = TP / (TP + FP + 1e-9)
        rec  = TP / (TP + FN + 1e-9)
        f1   = 2*prec*rec / (prec+rec + 1e-9)
        stats.append((th, prec, rec, f1))
    best = max(stats, key=lambda x: x[3]) if stats else (0.5, 0.0, 0.0, 0.0)
    return stats, best  # (th, P, R, F1)

In [60]:
# ---------- 시각화 ----------
def draw_xyxy_on_bgr(bgr, boxes_xyxy_norm, scores=None, clss=None, class_names=None,
                     color=(255,0,0), thickness=2):
    H,W = bgr.shape[:2]
    img = bgr.copy()
    for i,(x1,y1,x2,y2) in enumerate(boxes_xyxy_norm):
        x1 = int(x1*W); y1 = int(y1*H); x2 = int(x2*W); y2 = int(y2*H)
        cv2.rectangle(img, (x1,y1), (x2,y2), color, thickness)
        if scores is not None:
            label = f"{scores[i]:.2f}"
            if clss is not None:
                name = class_names[int(clss[i])] if class_names is not None else str(int(clss[i]))
                label = f"{name}:{label}"
            cv2.putText(img, label, (x1, max(0,y1-6)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA)
    return img

In [61]:
# =============================================================
# 체크포인트/얼리스톱 관리
# =============================================================
PRIMARY = "f1"     # 저장 기준: "f1" or "recall"
min_delta   = 0.005   # 최소 개선폭
patience    = 5       # 개선 없을 때 허용 에포크 수
top_k       = 3       # 상위 K개만 유지
snapshot_every = 5    # N 에포크마다 스냅샷

best_val_metric = -1e9
epochs_no_improve = 0
topk_heap = []  # (metric, path)
best_conf_global = 0.5

In [62]:
def _save_topk(metric_value, epoch, best_conf, tag="best"):
    os.makedirs("ckpts", exist_ok=True)
    ckpt_path = f"ckpts/yolov2_{tag}_ep{epoch:02d}_{metric_value:.3f}.pth"
    torch.save(model.state_dict(), ckpt_path)
    meta = {"epoch": epoch, "metric": float(metric_value), "best_conf": float(best_conf)}
    with open(ckpt_path.replace(".pth",".json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    heapq.heappush(topk_heap, (metric_value, ckpt_path))
    if len(topk_heap) > top_k:
        worst_metric, worst_path = heapq.heappop(topk_heap)
        if worst_path != ckpt_path:
            for p in [worst_path, worst_path.replace(".pth",".json")]:
                if os.path.exists(p):
                    try: os.remove(p)
                    except: pass

In [63]:
# ===== 검증 지표 (클래스별 NMS 기반 간이 리콜/IoU; 이미지당 1 GT 가정) =====
def eval_batch_with_nms(out, tgt, anchors, conf_thr=0.6, iou_thr=0.5):
    """
    tgt: (B,5) [cls,cx,cy,w,h], 이미지당 하나 GT 가정
    return: matched, total_gt, mean_iou_on_matched
    """
    boxes_xyxy, cls_scores = decode_yolov2_all(out, anchors)  # (B,N,4), (B,N,C)
    B = out.shape[0]
    matched = 0; total_gt = 0; iou_sum = 0.0
    for b in range(B):
        gt_cls = tgt[b,0].long().view(1)
        gt_box = tgt[b,1:5].view(1,4)
        total_gt += 1
        pb, ps, pc = nms_per_class_per_image(boxes_xyxy[b], cls_scores[b],
                                             conf_thr=conf_thr, iou_thr=iou_thr, max_det=100)
        if pb.numel() == 0: continue
        cm = (pc == gt_cls.item())
        if cm.any():
            cand = pb[cm]
            ious = _iou_xyxy(cand, _cxcywh_to_xyxy_norm(gt_box)).squeeze(1)
            iou_max, _ = ious.max(dim=0)
            if iou_max.item() >= iou_thr:
                matched += 1
                iou_sum += iou_max.item()
    mean_iou = (iou_sum / max(matched,1))
    return matched, total_gt, mean_iou

In [64]:
# ========================= 학습 루프 ==========================
# =============================================================
for ep in range(1, EPOCHS+1):
    # -------------------- Train --------------------
    model.train()
    pbar = tqdm(trL, desc=f"Train {ep}/{EPOCHS}", ncols=120)
    tot_loss=tot_iou=0.0; hit=cls_ok=tot=0

    for imgs, tgt in pbar:
        imgs = imgs.to(device); tgt = tgt.to(device)
        with torch.amp.autocast("cuda", enabled=use_amp):
            out = model(imgs)
            loss = yolo2_loss(out, tgt, model.anchors.to(device), S=S, num_classes=NUM_CLASSES,
                              lambda_coord=LAMBDA_COORD, lambda_noobj=LAMBDA_NOOBJ)
        opt.zero_grad(set_to_none=True)
        if use_amp:
            scaler.scale(loss).backward()
            scaler.step(opt); scaler.update()
        else:
            loss.backward(); opt.step()

        with torch.no_grad():
            best_box, best_score, best_cls = _decode_best_per_image(out, model.anchors.to(device))
            gt_box  = tgt[:,1:5]
            ious = _iou_xyxy(_cxcywh_to_xyxy_norm(best_box), _cxcywh_to_xyxy_norm(gt_box))
            if ious.ndim == 2:
                diag = ious.diag()
            else:
                diag = ious
            tot_iou += diag.sum().item()
            mask = (best_score >= CONF_THR_EVAL) & (diag >= IOU_THR_EVAL)
            hit += mask.sum().item()
            tot += tgt.size(0)
            cls_ok += (best_cls == tgt[:,0].long()).sum().item()

        pbar.set_postfix(loss=f"{loss.item():.3f}",
                         iou=f"{tot_iou/max(tot,1):.3f}",
                         recall=f"{hit/max(tot,1):.2f}",
                         clsacc=f"{cls_ok/max(tot,1):.2f}",
                         lr=f"{opt.param_groups[0]['lr']:.1e}")

    # -------------------- Validate --------------------
    model.eval()
    val_loss=0.0; vn=0
    v_matched=v_total=0; v_iou_sum=0.0

    # PR/F1 스윕 누적(에포크마다 초기화)
    all_preds = []
    all_gts   = []

    with torch.no_grad():
        for imgs, tgt in vaL:
            imgs = imgs.to(device); tgt = tgt.to(device)
            out = model(imgs)
            loss = yolo2_loss(out, tgt, model.anchors.to(device), S=S, num_classes=NUM_CLASSES,
                              lambda_coord=LAMBDA_COORD, lambda_noobj=LAMBDA_NOOBJ)
            val_loss += loss.item(); vn += 1

            # 간이 NMS 리콜/IoU (단일 GT 가정)
            boxes_xyxy_all, cls_scores_all = decode_yolov2_all(out, model.anchors.to(device))
            B = imgs.shape[0]
            for b in range(B):
                # 평가용(간이): 설정 임계값으로 리콜/IoU
                pb_eval, ps_eval, pc_eval = nms_per_class_per_image(
                    boxes_xyxy_all[b], cls_scores_all[b],
                    conf_thr=CONF_THR_EVAL, iou_thr=IOU_THR_EVAL, max_det=300
                )
                gt_xyxy = _cxcywh_to_xyxy_norm(tgt[b,1:5].unsqueeze(0)).squeeze(0)
                gt_cls  = int(tgt[b,0].item())
                # 리콜/IoU 계산
                if pb_eval.numel() > 0:
                    same_cls = (pc_eval == gt_cls)
                    cand = pb_eval[same_cls] if same_cls.any() else pb_eval
                    # 최고 점수 1개
                    if cand.shape[0] > 0:
                        ious = _iou_xyxy(cand, gt_xyxy.unsqueeze(0)).squeeze(1)
                        iou_max = ious.max().item()
                        if iou_max >= IOU_THR_EVAL and same_cls.any():
                            v_matched += 1; v_iou_sum += iou_max
                    v_total += 1
                else:
                    v_total += 1

                # PR/F1 스윕용(가능한 많이 모음: conf_thr=0.0, NMS IoU=0.5)
                pb, ps, pc = nms_per_class_per_image(
                    boxes_xyxy_all[b], cls_scores_all[b],
                    conf_thr=0.0, iou_thr=0.5, max_det=300
                )
                all_preds.append({"scores": ps.detach().cpu(),
                                  "boxes":  pb.detach().cpu(),
                                  "clss":   pc.detach().cpu()})
                all_gts.append({"cls": gt_cls, "box": gt_xyxy.detach().cpu()})

    val_loss /= max(vn,1)
    val_rec_nms  = v_matched / max(v_total,1)
    val_miou_nms = (v_iou_sum / max(v_matched,1)) if v_matched>0 else 0.0

    # PR/F1 스윕 (IoU=0.5 기준)
    _, (best_conf, best_P, best_R, best_F1) = prf1_single_gt(
        all_preds, all_gts, iou_thr=0.5, conf_grid=np.linspace(0.0, 0.9, 19)
    )
    best_conf_global = float(best_conf)

    print(f"[VAL {ep:02d}] loss={val_loss:.3f}  nms_recall@{CONF_THR_EVAL}/{IOU_THR_EVAL}={val_rec_nms:.3f}  "
          f"nms_meanIoU={val_miou_nms:.3f}  |  F1(best)@conf={best_conf:.2f}  P={best_P:.3f} R={best_R:.3f} F1={best_F1:.3f}")

    # ---------- 체크포인트 저장 정책 ----------
    if PRIMARY == "f1":
        current_metric = best_F1
    elif PRIMARY == "recall":
        current_metric = val_rec_nms
    else:
        current_metric = best_F1

    improved = (current_metric - best_val_metric) > min_delta
    if improved:
        best_val_metric = current_metric
        epochs_no_improve = 0
        _save_topk(current_metric, ep, best_conf_global, tag="best")
        print(f"  ↳ saved (improved by ≥{min_delta}) | metric={current_metric:.3f}, conf*={best_conf_global:.2f}")
    else:
        epochs_no_improve += 1

    # 주기 스냅샷(선택)
    if (ep % snapshot_every) == 0:
        _save_topk(current_metric, ep, best_conf_global, tag="snapshot")
        print(f"  ↳ snapshot saved at epoch {ep}")

    # 항상 마지막 모델 저장
    torch.save(model.state_dict(), "yolov2_last.pth")

    # 얼리스톱
    if epochs_no_improve >= patience:
        print(f"Early stopping at epoch {ep} (no improvement for {patience} epochs).")
        break

print("train done ✅")

Train 1/30: 100%|██████████| 28/28 [11:56<00:00, 25.58s/it, clsacc=0.24, iou=0.151, loss=9.491, lr=1.0e-03, recall=0.00]


[VAL 01] loss=92.704  nms_recall@0.3/0.6=0.000  nms_meanIoU=0.000  |  F1(best)@conf=0.00  P=0.036 R=0.036 F1=0.036
  ↳ saved (improved by ≥0.005) | metric=0.036, conf*=0.00


Train 2/30: 100%|██████████| 28/28 [11:40<00:00, 25.02s/it, clsacc=0.30, iou=0.368, loss=7.566, lr=1.0e-03, recall=0.00]


[VAL 02] loss=7.769  nms_recall@0.3/0.6=0.000  nms_meanIoU=0.000  |  F1(best)@conf=0.00  P=0.482 R=0.482 F1=0.482
  ↳ saved (improved by ≥0.005) | metric=0.482, conf*=0.00


Train 3/30: 100%|██████████| 28/28 [11:27<00:00, 24.54s/it, clsacc=0.35, iou=0.453, loss=6.685, lr=1.0e-03, recall=0.00]


[VAL 03] loss=6.932  nms_recall@0.3/0.6=0.000  nms_meanIoU=0.000  |  F1(best)@conf=0.00  P=0.536 R=0.536 F1=0.536
  ↳ saved (improved by ≥0.005) | metric=0.536, conf*=0.00


Train 4/30: 100%|██████████| 28/28 [11:50<00:00, 25.38s/it, clsacc=0.41, iou=0.475, loss=6.606, lr=1.0e-03, recall=0.00]


[VAL 04] loss=6.783  nms_recall@0.3/0.6=0.000  nms_meanIoU=0.000  |  F1(best)@conf=0.00  P=0.589 R=0.589 F1=0.589
  ↳ saved (improved by ≥0.005) | metric=0.589, conf*=0.00


Train 5/30: 100%|██████████| 28/28 [11:54<00:00, 25.52s/it, clsacc=0.47, iou=0.519, loss=5.931, lr=1.0e-03, recall=0.00]


[VAL 05] loss=6.530  nms_recall@0.3/0.6=0.000  nms_meanIoU=0.000  |  F1(best)@conf=0.00  P=0.607 R=0.607 F1=0.607
  ↳ saved (improved by ≥0.005) | metric=0.607, conf*=0.00
  ↳ snapshot saved at epoch 5


Train 6/30: 100%|██████████| 28/28 [11:56<00:00, 25.59s/it, clsacc=0.46, iou=0.535, loss=6.186, lr=1.0e-03, recall=0.00]


[VAL 06] loss=5.872  nms_recall@0.3/0.6=0.000  nms_meanIoU=0.000  |  F1(best)@conf=0.00  P=0.589 R=0.589 F1=0.589


Train 7/30: 100%|██████████| 28/28 [11:49<00:00, 25.34s/it, clsacc=0.47, iou=0.542, loss=5.172, lr=1.0e-03, recall=0.00]


[VAL 07] loss=5.711  nms_recall@0.3/0.6=0.000  nms_meanIoU=0.000  |  F1(best)@conf=0.00  P=0.714 R=0.714 F1=0.714
  ↳ saved (improved by ≥0.005) | metric=0.714, conf*=0.00


Train 8/30: 100%|██████████| 28/28 [11:38<00:00, 24.95s/it, clsacc=0.53, iou=0.576, loss=5.284, lr=1.0e-03, recall=0.02]


[VAL 08] loss=5.608  nms_recall@0.3/0.6=0.036  nms_meanIoU=0.651  |  F1(best)@conf=0.00  P=0.661 R=0.661 F1=0.661


Train 9/30: 100%|██████████| 28/28 [11:49<00:00, 25.32s/it, clsacc=0.55, iou=0.586, loss=4.404, lr=1.0e-03, recall=0.04]


[VAL 09] loss=6.244  nms_recall@0.3/0.6=0.000  nms_meanIoU=0.000  |  F1(best)@conf=0.00  P=0.643 R=0.643 F1=0.643


Train 10/30: 100%|█████████| 28/28 [11:49<00:00, 25.34s/it, clsacc=0.57, iou=0.615, loss=4.281, lr=1.0e-03, recall=0.06]


[VAL 10] loss=4.940  nms_recall@0.3/0.6=0.071  nms_meanIoU=0.757  |  F1(best)@conf=0.00  P=0.786 R=0.786 F1=0.786
  ↳ saved (improved by ≥0.005) | metric=0.786, conf*=0.00
  ↳ snapshot saved at epoch 10


Train 11/30: 100%|█████████| 28/28 [11:35<00:00, 24.83s/it, clsacc=0.65, iou=0.613, loss=3.976, lr=1.0e-03, recall=0.11]


[VAL 11] loss=6.116  nms_recall@0.3/0.6=0.232  nms_meanIoU=0.706  |  F1(best)@conf=0.00  P=0.786 R=0.786 F1=0.786


Train 12/30: 100%|█████████| 28/28 [12:02<00:00, 25.80s/it, clsacc=0.64, iou=0.608, loss=4.087, lr=1.0e-03, recall=0.15]


[VAL 12] loss=5.105  nms_recall@0.3/0.6=0.054  nms_meanIoU=0.749  |  F1(best)@conf=0.00  P=0.786 R=0.786 F1=0.786


Train 13/30: 100%|█████████| 28/28 [11:50<00:00, 25.39s/it, clsacc=0.62, iou=0.623, loss=4.519, lr=1.0e-03, recall=0.18]


[VAL 13] loss=4.796  nms_recall@0.3/0.6=0.161  nms_meanIoU=0.767  |  F1(best)@conf=0.00  P=0.821 R=0.821 F1=0.821
  ↳ saved (improved by ≥0.005) | metric=0.821, conf*=0.00


Train 14/30: 100%|█████████| 28/28 [11:43<00:00, 25.11s/it, clsacc=0.64, iou=0.610, loss=4.301, lr=1.0e-03, recall=0.23]


[VAL 14] loss=4.854  nms_recall@0.3/0.6=0.161  nms_meanIoU=0.736  |  F1(best)@conf=0.00  P=0.804 R=0.804 F1=0.804


Train 15/30: 100%|█████████| 28/28 [11:42<00:00, 25.08s/it, clsacc=0.65, iou=0.649, loss=3.793, lr=1.0e-03, recall=0.29]


[VAL 15] loss=5.205  nms_recall@0.3/0.6=0.214  nms_meanIoU=0.760  |  F1(best)@conf=0.00  P=0.732 R=0.732 F1=0.732
  ↳ snapshot saved at epoch 15


Train 16/30: 100%|█████████| 28/28 [11:47<00:00, 25.27s/it, clsacc=0.66, iou=0.640, loss=4.764, lr=1.0e-03, recall=0.30]


[VAL 16] loss=4.757  nms_recall@0.3/0.6=0.089  nms_meanIoU=0.746  |  F1(best)@conf=0.00  P=0.804 R=0.804 F1=0.804


Train 17/30: 100%|█████████| 28/28 [11:57<00:00, 25.61s/it, clsacc=0.69, iou=0.654, loss=4.321, lr=1.0e-03, recall=0.36]


[VAL 17] loss=4.828  nms_recall@0.3/0.6=0.339  nms_meanIoU=0.745  |  F1(best)@conf=0.00  P=0.804 R=0.804 F1=0.804


Train 18/30: 100%|█████████| 28/28 [11:55<00:00, 25.57s/it, clsacc=0.70, iou=0.643, loss=4.269, lr=1.0e-03, recall=0.35]


[VAL 18] loss=5.422  nms_recall@0.3/0.6=0.125  nms_meanIoU=0.686  |  F1(best)@conf=0.00  P=0.839 R=0.839 F1=0.839
  ↳ saved (improved by ≥0.005) | metric=0.839, conf*=0.00


Train 19/30: 100%|█████████| 28/28 [11:59<00:00, 25.68s/it, clsacc=0.70, iou=0.654, loss=3.537, lr=1.0e-03, recall=0.38]


[VAL 19] loss=4.747  nms_recall@0.3/0.6=0.143  nms_meanIoU=0.743  |  F1(best)@conf=0.00  P=0.857 R=0.857 F1=0.857
  ↳ saved (improved by ≥0.005) | metric=0.857, conf*=0.00


Train 20/30: 100%|█████████| 28/28 [12:23<00:00, 26.57s/it, clsacc=0.72, iou=0.661, loss=3.936, lr=1.0e-03, recall=0.42]


[VAL 20] loss=4.344  nms_recall@0.3/0.6=0.393  nms_meanIoU=0.751  |  F1(best)@conf=0.00  P=0.875 R=0.875 F1=0.875
  ↳ saved (improved by ≥0.005) | metric=0.875, conf*=0.00
  ↳ snapshot saved at epoch 20


Train 21/30: 100%|█████████| 28/28 [12:04<00:00, 25.88s/it, clsacc=0.70, iou=0.686, loss=3.670, lr=1.0e-03, recall=0.50]


[VAL 21] loss=4.535  nms_recall@0.3/0.6=0.375  nms_meanIoU=0.719  |  F1(best)@conf=0.00  P=0.929 R=0.929 F1=0.929
  ↳ saved (improved by ≥0.005) | metric=0.929, conf*=0.00


Train 22/30: 100%|█████████| 28/28 [11:44<00:00, 25.15s/it, clsacc=0.73, iou=0.695, loss=3.033, lr=1.0e-03, recall=0.54]


[VAL 22] loss=4.135  nms_recall@0.3/0.6=0.393  nms_meanIoU=0.759  |  F1(best)@conf=0.00  P=0.946 R=0.946 F1=0.946
  ↳ saved (improved by ≥0.005) | metric=0.946, conf*=0.00


Train 23/30: 100%|█████████| 28/28 [11:36<00:00, 24.89s/it, clsacc=0.73, iou=0.698, loss=3.456, lr=1.0e-03, recall=0.56]


[VAL 23] loss=3.945  nms_recall@0.3/0.6=0.482  nms_meanIoU=0.758  |  F1(best)@conf=0.00  P=0.911 R=0.911 F1=0.911


Train 24/30: 100%|█████████| 28/28 [12:04<00:00, 25.87s/it, clsacc=0.75, iou=0.687, loss=3.042, lr=1.0e-03, recall=0.59]


[VAL 24] loss=4.562  nms_recall@0.3/0.6=0.375  nms_meanIoU=0.759  |  F1(best)@conf=0.00  P=0.893 R=0.893 F1=0.893


Train 25/30: 100%|█████████| 28/28 [11:57<00:00, 25.64s/it, clsacc=0.77, iou=0.700, loss=3.019, lr=1.0e-03, recall=0.65]


[VAL 25] loss=4.030  nms_recall@0.3/0.6=0.411  nms_meanIoU=0.750  |  F1(best)@conf=0.00  P=0.857 R=0.857 F1=0.857
  ↳ snapshot saved at epoch 25


Train 26/30: 100%|█████████| 28/28 [11:36<00:00, 24.89s/it, clsacc=0.77, iou=0.707, loss=2.855, lr=1.0e-03, recall=0.64]


[VAL 26] loss=3.910  nms_recall@0.3/0.6=0.554  nms_meanIoU=0.758  |  F1(best)@conf=0.00  P=0.946 R=0.946 F1=0.946


Train 27/30: 100%|█████████| 28/28 [11:33<00:00, 24.76s/it, clsacc=0.79, iou=0.723, loss=3.030, lr=1.0e-03, recall=0.68]


[VAL 27] loss=4.263  nms_recall@0.3/0.6=0.304  nms_meanIoU=0.734  |  F1(best)@conf=0.00  P=0.875 R=0.875 F1=0.875
Early stopping at epoch 27 (no improvement for 5 epochs).
train done ✅


In [None]:
# 학습 후: val 5장 시각화 (best_conf 적용, 파란 박스 + 한글 라벨)
# =============================================================
def _prep(img_rgb, size=416):
    im = cv2.resize(img_rgb, (size,size), interpolation=cv2.INTER_LINEAR)
    ten = torch.from_numpy(im).float().permute(2,0,1).unsqueeze(0)/255.0
    return ten

# 가장 좋은 ckpt를 하나 로드(없으면 last)
best_ckpts = sorted(glob("ckpts/yolov2_best_ep*.pth"))
ckpt_to_load = best_ckpts[-1] if best_ckpts else "yolov2_last.pth"
print(f"Load for vis: {ckpt_to_load}")
model.load_state_dict(torch.load(ckpt_to_load, map_location=device))
model.eval()

# best_conf 읽기 시도
best_conf_for_vis = best_conf_global
if ckpt_to_load.startswith("ckpts/"):
    meta_path = ckpt_to_load.replace(".pth", ".json")
    if os.path.exists(meta_path):
        try:
            with open(meta_path, "r", encoding="utf-8") as f:
                meta = json.load(f)
                best_conf_for_vis = float(meta.get("best_conf", best_conf_for_vis))
        except:
            pass
print(f"Use conf_thr for vis: {best_conf_for_vis:.2f}")

val_imgs = sorted(glob(f"{images_root}/val/*.jpg") + glob(f"{images_root}/val/*.png"))
os.makedirs("/mnt/data", exist_ok=True)
picks = [val_imgs[i] for i in np.linspace(0, max(0,len(val_imgs)-1), num=min(5, len(val_imgs))).astype(int)] if len(val_imgs)>0 else []
saved = []
with torch.no_grad():
    for k, ip in enumerate(picks, 1):
        bgr = cv2.imread(ip); rgb = bgr[:,:,::-1]
        inp = _prep(rgb, SIZE).to(device)
        out = model(inp)
        boxes_xyxy, cls_scores = decode_yolov2_all(out, model.anchors.to(out.device))
        bboxes, scrs, clss = nms_per_class_per_image(
            boxes_xyxy[0], cls_scores[0],
            conf_thr=best_conf_for_vis, iou_thr=0.5, max_det=100
        )
        vis = draw_xyxy_on_bgr(bgr, bboxes.cpu().numpy(),
                               scrs.cpu().numpy() if scrs.numel()>0 else None,
                               clss.cpu().numpy() if clss.numel()>0 else None,
                               class_names=id_to_class, color=(255,0,0), thickness=2)
        op = f"/mnt/data/pred_vis_nms_{k}.jpg"
        cv2.imwrite(op, vis); saved.append(op)
print("Saved visualization:", saved)