In [1]:
from pathlib import Path
import json
import shutil
from glob import glob
from typing import Any, Dict, List, Optional, Set, Tuple

from PIL import Image

from tqdm import tqdm

In [2]:
# Training-set locations.
TRAIN_ROOT: Path = Path("dataset/traffic_data/training/bbox_tr")
TRAIN_LABEL_DIR: Path = TRAIN_ROOT / "label_bbox"
TRAIN_IMAGE_DIR: Path = TRAIN_ROOT / "image_bbox"

# Validation-set locations.
VAL_ROOT: Path = Path("dataset/traffic_data/validation/bbox_val")
VAL_LABEL_DIR: Path = VAL_ROOT / "label_bbox"
VAL_IMAGE_DIR: Path = VAL_ROOT / "image_bbox"

# Root directory that receives all generated output.
OUT_ROOT: Path = Path("yolo_transfer_cls_5")

# Three-stage transfer-learning split. The set members are compared against
# the metadata values as-is (no normalization of spelling or case); an image
# goes to the first stage whose "weathers" and "times" sets both contain its
# values.
STAGES: Dict[str, Dict[str, Set[str]]] = {
    "day_data": {
        "weathers": {"Sunny"},
        "times": {"낮", "오전"},
    },
    "night_data": {
        "weathers": {"Sunny"},
        "times": {"새벽", "오후"},
    },
    "bad_weather_data": {
        "weathers": {"Cloudy", "Foggy", "Rainy", "Snow"},
        "times": {"낮", "오전", "새벽", "오후"},
    },
}

# Run options.
OVERWRITE_LABELS: bool = True  # rewrite label files even when they already exist
COPY_IMAGES: bool = True       # copy source images into the output tree
DRY_RUN: bool = False          # True -> skip writing labels and copying images

In [3]:
# Category post-processing rules.
# Final target classes in a fixed order: a name's position in this list is the
# class index written to the label files.
TARGET_CLASS_ORDER: List[str] = ["승용차", "버스", "트럭", "오토바이(자전거)", "분류없음"]

# Merge rules (source category name -> unified name): both bus sizes become "버스".
MERGE_TO: Dict[str, str] = dict.fromkeys(("소형버스", "대형버스"), "버스")

# Category names whose annotations are discarded.
DROP_SET: Set[str] = set(
    (
        "대형 트레일러",  # spelling that contains a space
        "대형트레일러",   # possible spelling without the space
        "보행자",
    )
)

In [4]:
def ensure_dir(path: Path) -> None:
    """Create ``path`` as a directory, together with any missing parents.

    Calling it for a directory that already exists is a no-op.
    """
    path.mkdir(exist_ok=True, parents=True)

def build_camera_index(images_root: Path) -> Dict[str, List[Path]]:
    """Index the camera directories located three levels below ``images_root``.

    The traversed layout is ``images_root/<site1>/<site2>/<camera_id>/``.
    Non-directory entries at any of the three levels are ignored.

    Directories are visited in sorted order, so when the same camera id occurs
    under several sites the order of its directory list (and therefore which
    duplicate a later lookup finds first) does not depend on the filesystem's
    listing order.

    Args:
        images_root: Root of the image tree.

    Returns:
        Mapping from camera directory name to every directory carrying that
        name. Empty when ``images_root`` is missing or is not a directory.
    """
    cam_index: Dict[str, List[Path]] = {}
    if not images_root.is_dir():
        return cam_index

    def _subdirs(parent: Path) -> List[Path]:
        # Sorted so the resulting index is reproducible across runs/platforms.
        return sorted(child for child in parent.iterdir() if child.is_dir())

    for site1 in _subdirs(images_root):
        for site2 in _subdirs(site1):
            for cam_dir in _subdirs(site2):
                cam_index.setdefault(cam_dir.name, []).append(cam_dir)
    return cam_index

In [5]:
def find_image_path_from_index(cam_index: Dict[str, List[Path]], camera_id: str, filename: str) -> Optional[Path]:
    """Locate ``filename`` inside any directory indexed for ``camera_id``.

    The directories are probed in the order stored in ``cam_index`` and the
    first hit wins. Only regular files count as a hit: an empty ``filename``
    (which resolves to the camera directory itself) or a name that points at a
    sub-directory is treated as "not found" instead of being returned as if it
    were an image.

    Args:
        cam_index: Mapping from camera id to candidate directories.
        camera_id: Camera whose directories should be searched.
        filename: Bare file name to look for.

    Returns:
        Path of the first matching file, or ``None`` when the camera id is
        unknown or no directory contains the file.
    """
    for cam_dir in cam_index.get(camera_id, []):
        candidate: Path = cam_dir / filename
        if candidate.is_file():
            return candidate
    return None

In [6]:
def xyxy_to_yolo(x1: float, y1: float, x2: float, y2: float, w: int, h: int) -> Tuple[float, float, float, float]:
    """Turn a pixel-space corner box into normalized (cx, cy, width, height).

    The four coordinates are first clamped into ``[0, w]`` / ``[0, h]``; the
    two corners may be supplied in either order. Box width and height are
    floored at ``1e-6`` pixels before the center is computed, so a degenerate
    box still yields a strictly positive size.

    Args:
        x1, y1: One corner of the box, in pixels.
        x2, y2: The opposite corner, in pixels.
        w: Image width in pixels.
        h: Image height in pixels.

    Returns:
        ``(x_center / w, y_center / h, box_width / w, box_height / h)``.

    Raises:
        ZeroDivisionError: If ``w`` or ``h`` is 0.
    """
    img_w: float = float(w)
    img_h: float = float(h)

    def _clamp(value: float, upper: float) -> float:
        # Restrict a coordinate to the closed range [0, upper].
        return float(min(max(value, 0.0), upper))

    xa, xb = _clamp(x1, img_w), _clamp(x2, img_w)
    ya, yb = _clamp(y1, img_h), _clamp(y2, img_h)

    # Order the clamped corners so that left <= right and top <= bottom.
    left, right = float(min(xa, xb)), float(max(xa, xb))
    top, bottom = float(min(ya, yb)), float(max(ya, yb))

    box_w: float = float(max(right - left, 1e-6))
    box_h: float = float(max(bottom - top, 1e-6))

    center_x: float = float(left + box_w / 2.0)
    center_y: float = float(top + box_h / 2.0)

    return (
        float(center_x / img_w),
        float(center_y / img_h),
        float(box_w / img_w),
        float(box_h / img_h),
    )

def decide_stage(weather: str, time_space: str, stages: Dict[str, Dict[str, Set[str]]]) -> Optional[str]:
    """Pick the stage whose rule accepts both ``weather`` and ``time_space``.

    Stages are tested in the iteration order of ``stages``; the first rule
    whose ``"weathers"`` set contains ``weather`` and whose ``"times"`` set
    contains ``time_space`` wins. Values are compared exactly as given.

    Returns:
        The matching stage name, or ``None`` when no rule matches.
    """
    matching_names = (
        name
        for name, rule in stages.items()
        if weather in rule["weathers"] and time_space in rule["times"]
    )
    return next(matching_names, None)

def process_coco_json_fixed_subset(
    json_path: Path,
    cam_index: Dict[str, List[Path]],
    out_root: Path,
    stages: Dict[str, Dict[str, Set[str]]],
    id_to_idx: Dict[int, int],
    id_to_name: Dict[int, str],
    overwrite_labels: bool,
    copy_images: bool,
    dry_run: bool,
    subset_name: str
) -> Dict[str, int]:
    """Convert one annotation JSON into YOLO label files sorted into stage folders.

    For each entry of the JSON's ``images`` list the function reads the
    ``weather`` / ``time_space`` values of the image's ``meta`` record, picks a
    stage with ``decide_stage``, locates the source image through
    ``cam_index`` and writes ``<stem>.txt`` to
    ``out_root/<stage>/<subset_name>/labels/<camera_id>/``. The image itself is
    copied to the sibling ``images/<camera_id>/`` folder.

    Each annotation's ``bbox`` is read as a list of boxes and ``category_id``
    as a parallel list of ids; the first four numbers of a box are passed to
    ``xyxy_to_yolo`` as two corners.
    NOTE(review): this differs from the usual one-box-per-annotation COCO
    layout -- confirm against the dataset specification.

    Args:
        json_path: Annotation file to convert.
        cam_index: Camera id -> candidate image directories.
        out_root: Root of the generated dataset tree.
        stages: Stage rules handed to ``decide_stage``.
        id_to_idx: Source category id -> output class index. Boxes whose
            category id is absent from this mapping are dropped.
        id_to_name: Accepted for interface compatibility; not used here.
        overwrite_labels: Rewrite a label file even if it already exists.
        copy_images: Copy the source image unless the destination exists.
        dry_run: When true, no label file is written and no image is copied.
            Output directories are still created and every located image is
            still opened to read its size.
        subset_name: Sub-folder name below each stage (e.g. "train", "val").

    Returns:
        Counters for this file: ``labeled`` (images that reached the output
        step), ``copied`` (the same images, counted only when ``copy_images``
        is true -- including ones already present at the destination and
        dry-run passes), ``missing_image`` and ``skipped_stage``.
    """
    counts: Dict[str, int] = {"copied": 0, "labeled": 0, "missing_image": 0, "skipped_stage": 0}

    with open(json_path, "r", encoding="utf-8") as f:
        coco_any: Any = json.load(f)

    meta_by_id: Dict[int, Dict[str, Any]] = {
        int(m.get("id", -1)): m for m in coco_any.get("meta", [])
    }
    images: List[Dict[str, Any]] = list(coco_any.get("images", []))

    # Group annotations by the image they belong to.
    anns_by_image: Dict[int, List[Dict[str, Any]]] = {}
    for ann in coco_any.get("annotations", []):
        anns_by_image.setdefault(int(ann.get("image_id", -1)), []).append(ann)

    # Every stage gets its images/labels folders, even if no image lands there.
    for stage_key in stages:
        ensure_dir(out_root / stage_key / subset_name / "images")
        ensure_dir(out_root / stage_key / subset_name / "labels")

    for img in images:
        img_id: int = int(img.get("id", -1))
        meta_id: int = int(img.get("meta_id", -1))
        file_name: str = str(img.get("file_name", ""))  # e.g. "BC2000201/xxxx.jpg"

        # The leading path component is the camera id; without one it is "".
        parts: List[str] = file_name.split("/")
        camera_id: str = parts[0] if len(parts) >= 2 else ""
        fname: str = parts[-1]

        meta: Dict[str, Any] = meta_by_id.get(meta_id, {})
        weather: str = str(meta.get("weather", "")).strip()
        time_space: str = str(meta.get("time_space", "")).strip()

        stage: Optional[str] = decide_stage(weather, time_space, stages)
        if stage is None:
            counts["skipped_stage"] += 1
            continue

        src_path: Optional[Path] = find_image_path_from_index(cam_index, camera_id, fname)
        if src_path is None:
            counts["missing_image"] += 1
            continue

        dst_img_dir: Path = out_root / stage / subset_name / "images" / camera_id
        dst_lbl_dir: Path = out_root / stage / subset_name / "labels" / camera_id
        ensure_dir(dst_img_dir)
        ensure_dir(dst_lbl_dir)

        dst_img_path: Path = dst_img_dir / fname
        dst_lbl_path: Path = dst_lbl_dir / (Path(fname).stem + ".txt")

        # The real pixel size is needed to normalize the box coordinates.
        with Image.open(src_path) as im:
            w, h = (int(v) for v in im.size)

        lines: List[str] = []
        for ann_item in anns_by_image.get(img_id, []):
            # zip() stops at the shorter list, so unpaired boxes/ids are ignored.
            for raw_box, raw_cat in zip(ann_item.get("bbox", []), ann_item.get("category_id", [])):
                box: List[float] = [float(v) for v in raw_box]
                cat_id: int = int(raw_cat)
                if len(box) < 4:
                    continue

                # Filtering happens here: categories missing from id_to_idx are dropped.
                if cat_id not in id_to_idx:
                    continue

                cxn, cyn, wn, hn = xyxy_to_yolo(box[0], box[1], box[2], box[3], w, h)
                cls_idx: int = int(id_to_idx[cat_id])
                lines.append(f"{cls_idx} {cxn:.6f} {cyn:.6f} {wn:.6f} {hn:.6f}")

        # Honor the arguments passed by the caller rather than module globals.
        if not dry_run:
            if overwrite_labels or not dst_lbl_path.exists():
                with open(dst_lbl_path, "w", encoding="utf-8") as lf:
                    lf.write("\n".join(lines))
            if copy_images and not dst_img_path.exists():
                shutil.copy2(src_path, dst_img_path)

        counts["labeled"] += 1
        if copy_images:
            counts["copied"] += 1

    # data.yaml is written once by the caller, not per JSON file.
    return counts

In [7]:
def build_category_maps(first_json_path: Path) -> Tuple[Dict[int, int], Dict[int, str]]:
    """Build the category remapping used when writing labels and data.yaml.

    Reads the ``categories`` list of ``first_json_path`` and, for each entry,
    applies the module-level rules: names in ``DROP_SET`` are discarded, names
    in ``MERGE_TO`` are replaced by their unified name, and only final names
    present in ``TARGET_CLASS_ORDER`` are kept.

    The categories are mapped directly by id, so two source categories that
    share a name both receive a class index (an intermediate name -> id table
    would keep only the last one).

    Args:
        first_json_path: Annotation JSON whose ``categories`` list is used.

    Returns:
        ``(id_to_idx, id_to_name)`` where ``id_to_idx`` maps a source category
        id to its final class index and ``id_to_name`` maps every final class
        index to its name (covering all of ``TARGET_CLASS_ORDER``, whether or
        not a source category maps to it).
    """
    with open(first_json_path, "r", encoding="utf-8") as f0:
        first_any: Any = json.load(f0)
    categories: List[Dict[str, Any]] = list(first_any.get("categories", []))

    # A name's position in TARGET_CLASS_ORDER is its final class index.
    target_index_of: Dict[str, int] = {name: i for i, name in enumerate(TARGET_CLASS_ORDER)}

    id_to_idx: Dict[int, int] = {}
    for cat in categories:
        cid: int = int(cat.get("id", 0))
        cname: str = str(cat.get("name", ""))

        # Explicitly removed classes get no index.
        if cname in DROP_SET:
            continue

        # Apply the merge rule, then keep the class only if it is a target.
        final_name: str = MERGE_TO.get(cname, cname)
        if final_name not in target_index_of:
            continue

        id_to_idx[cid] = target_index_of[final_name]

    # "final index -> final name"; the data.yaml writer consumes this as-is.
    id_to_name: Dict[int, str] = {idx: name for name, idx in target_index_of.items()}

    return id_to_idx, id_to_name


def _count_images_in_json(json_path: Path) -> int:
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            coco_any: Any = json.load(f)
        images_any: Any = coco_any.get("images", [])
        images: List[Dict[str, Any]] = list(images_any) if isinstance(images_any, list) else []
        return len(images)
    except Exception:
        return 0

In [8]:
def _run_subset(
    subset_name: str,
    json_files: List[Path],
    cam_index: Dict[str, List[Path]],
    id_to_idx: Dict[int, int],
    id_to_name: Dict[int, str],
    total: Dict[str, int],
) -> None:
    """Convert every JSON of one subset and add its counters into ``total`` in place."""
    # Pre-count images per JSON so the image progress bar has a total.
    img_counts: Dict[Path, int] = {jp: _count_images_in_json(jp) for jp in json_files}

    with tqdm(total=len(json_files), desc=f"{subset_name} json files", unit="file") as pbar_files, \
         tqdm(total=sum(img_counts.values()), desc=f"{subset_name} images", unit="img") as pbar_imgs:
        for jp in json_files:
            file_counts: Dict[str, int] = process_coco_json_fixed_subset(
                json_path=jp,
                cam_index=cam_index,
                out_root=OUT_ROOT,
                stages=STAGES,
                id_to_idx=id_to_idx,
                id_to_name=id_to_name,
                overwrite_labels=OVERWRITE_LABELS,
                copy_images=COPY_IMAGES,
                dry_run=DRY_RUN,
                subset_name=subset_name
            )
            for key in total:
                total[key] += int(file_counts.get(key, 0))

            pbar_files.update(1)
            pbar_imgs.update(img_counts.get(jp, 0))
            # The postfix shows the running totals accumulated so far.
            pbar_files.set_postfix({"labeled": total["labeled"], "missing": total["missing_image"], "skipped": total["skipped_stage"]})


def run_pipeline() -> Dict[str, int]:
    """Convert the train and val sets and write one data.yaml per stage.

    All inputs (the four source directories and the presence of at least one
    JSON file per subset) are verified before any conversion starts, so a
    missing validation set is reported immediately rather than after the whole
    training set has been processed.

    Returns:
        Counters (``copied``, ``labeled``, ``missing_image``,
        ``skipped_stage``) summed over train and val.

    Raises:
        FileNotFoundError: If a source directory is missing or a label
            directory contains no ``*.json`` file.
    """
    # Validate inputs.
    if not TRAIN_LABEL_DIR.exists():
        raise FileNotFoundError(f"Missing train labels: {str(TRAIN_LABEL_DIR)}")
    if not TRAIN_IMAGE_DIR.exists():
        raise FileNotFoundError(f"Missing train images: {str(TRAIN_IMAGE_DIR)}")
    if not VAL_LABEL_DIR.exists():
        raise FileNotFoundError(f"Missing val labels: {str(VAL_LABEL_DIR)}")
    if not VAL_IMAGE_DIR.exists():
        raise FileNotFoundError(f"Missing val images: {str(VAL_IMAGE_DIR)}")

    # Discover both JSON lists up front so an empty subset fails fast.
    train_json_files: List[Path] = sorted(TRAIN_LABEL_DIR.rglob("*.json"))
    if len(train_json_files) == 0:
        raise FileNotFoundError(f"No train JSON files under {str(TRAIN_LABEL_DIR)}")
    val_json_files: List[Path] = sorted(VAL_LABEL_DIR.rglob("*.json"))
    if len(val_json_files) == 0:
        raise FileNotFoundError(f"No val JSON files under {str(VAL_LABEL_DIR)}")

    ensure_dir(OUT_ROOT)

    # Each subset has its own image tree, hence its own camera index.
    train_cam_index: Dict[str, List[Path]] = build_camera_index(TRAIN_IMAGE_DIR)
    val_cam_index: Dict[str, List[Path]] = build_camera_index(VAL_IMAGE_DIR)

    # The category maps are taken from the first train JSON only.
    id_to_idx: Dict[int, int]
    id_to_name: Dict[int, str]
    id_to_idx, id_to_name = build_category_maps(train_json_files[0])

    total: Dict[str, int] = {"copied": 0, "labeled": 0, "missing_image": 0, "skipped_stage": 0}

    _run_subset("train", train_json_files, train_cam_index, id_to_idx, id_to_name, total)
    _run_subset("val", val_json_files, val_cam_index, id_to_idx, id_to_name, total)

    # data.yaml: class names listed in ascending final-index order.
    idx_names: List[str] = [name for _, name in sorted(id_to_name.items(), key=lambda kv: kv[0])]
    for stage_out in STAGES.keys():
        yaml_path: Path = OUT_ROOT / stage_out / "data.yaml"
        ensure_dir(yaml_path.parent)
        yaml_text: str = (
            f"path: {OUT_ROOT / stage_out}\n"
            f"train: train/images\n"
            f"val: val/images\n"
            f"nc: {len(idx_names)}\n"
            f"names: {idx_names}\n"
        )
        with open(yaml_path, "w", encoding="utf-8") as yf:
            yf.write(yaml_text)

    return total

In [9]:
# Run the whole conversion and keep the aggregated counters around.
counts_summary: Dict[str, int] = run_pipeline()
summary: Dict[str, int] = counts_summary

# One "<counter> -> <value>" line per counter, in a fixed order.
print("Done")
print(
    "\n".join(
        f"{name} -> {summary.get(name, 0)}"
        for name in ("copied", "labeled", "missing_image", "skipped_stage")
    )
)

train json files:   0%|          | 0/72 [00:00<?, ?file/s]
train json files:   1%|▏         | 1/72 [00:20<24:06, 20.38s/file]
train json files:   3%|▎         | 2/72 [00:38<22:06, 18.95s/file, labeled=2404, missing=0, skipped=0]
train json files:   3%|▎         | 2/72 [00:38<22:06, 18.95s/file, labeled=4808, missing=0, skipped=0]
train json files:   4%|▍         | 3/72 [00:54<20:08, 17.52s/file, labeled=4808, missing=0, skipped=0]
train json files:   6%|▌         | 4/72 [01:07<17:51, 15.75s/file, labeled=6590, missing=0, skipped=0]
train json files:   7%|▋         | 5/72 [01:11<12:48, 11.47s/file, labeled=8372, missing=0, skipped=0]
train json files:   8%|▊         | 6/72 [01:25<13:36, 12.37s/file, labeled=8865, missing=0, skipped=0]
train json files:  10%|▉         | 7/72 [01:28<10:15,  9.47s/file, labeled=10391, missing=0, skipped=0]
train json files:  10%|▉         | 7/72 [01:28<10:15,  9.47s/file, labeled=10818, missing=0, skipped=0]
train json files:  11%|█         | 8/72 [02:34<2

Done
copied -> 168318
labeled -> 168318
missing_image -> 4
skipped_stage -> 0



