In [10]:
# ===========================================
# Count objects per class across YOLO label .txt files
# Src: DATA_DIR below (the merged dataset under for_study/20250910_merge_data)
# ===========================================

from pathlib import Path
from collections import Counter

# Root of the dataset whose label files are counted (searched recursively).
DATA_DIR = Path("/home/dw/ws_job_msislab/Golf_Project/data/for_study/20250910_merge_data")

# Class names; the list index must match the YOLO class id.
CLASSES = [
    "Divot",
    "Fixed_Divot",
    "Diseased_Grass",
    "Pole",
    "Obstacle",
    "Sprinkler",
    "Drain",
    "Golf ball",
]


def count_label_classes(label_paths):
    """Count objects per class id over an iterable of YOLO label files.

    Blank lines are ignored. A line whose first token is not numeric (for
    example a stray class-name list such as ``classes_2.txt``) is skipped and
    counted instead of aborting the whole run with ``ValueError``.

    Returns:
        (Counter mapping class id -> object count, number of skipped lines)
    """
    counts = Counter()
    skipped = 0
    for path in label_paths:
        with open(path, "r") as fh:
            for raw in fh:
                fields = raw.split()
                if not fields:  # blank line
                    continue
                try:
                    # int(float(...)) also accepts ids written as "3.0".
                    class_id = int(float(fields[0]))
                except (ValueError, OverflowError):
                    skipped += 1
                    continue
                counts[class_id] += 1
    return counts, skipped


# Collect every label file (definition files such as classes.txt are excluded).
label_files = [
    f for f in DATA_DIR.rglob("*.txt")
    if f.name.lower() != "classes.txt"
]

class_counter, skipped_lines = count_label_classes(label_files)

# Report
print("===== Class별 객체 수 =====")
for class_id, class_name in enumerate(CLASSES):
    print(f"{class_id}: {class_name:<15} → {class_counter[class_id]} 개")

print("\n총 객체 수:", sum(class_counter.values()))
print("총 라벨 파일 수:", len(label_files))
if skipped_lines:
    # Only shown when something was ignored, so clean datasets print as before.
    print("Skipped non-label lines:", skipped_lines)

===== Class별 객체 수 =====
0: Divot           → 5002 개
1: Fixed_Divot     → 5169 개
2: Diseased_Grass  → 750 개
3: Pole            → 163 개
4: Obstacle        → 0 개
5: Sprinkler       → 102 개
6: Drain           → 180 개
7: Golf ball       → 270 개

총 객체 수: 11636
총 라벨 파일 수: 4088


In [6]:
# === 두 경로의 YOLO 데이터 머지(복사) + 매칭/충돌 리포트 ==========================
from pathlib import Path
import shutil
from collections import defaultdict

# ===== Settings =====
# Two source datasets and the directory they are merged (copied) into.
SRC1 = Path("/home/dw/ws_job_msislab/Golf_Project/data/20250700/good_data_fix_label")
SRC2 = Path("/home/dw/ws_job_msislab/Golf_Project/data/20250904/20250904_good_data")
DST  = Path("/home/dw/ws_job_msislab/Golf_Project/data/for_study/20250910_merge_data")

# Recognised image extensions, most preferred first (used to break ties
# when several images share one stem).
IMG_EXTS_PRIORITY = [
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".webp",
    ".tif",
    ".tiff",
]
# Same extensions as a set for membership tests.
IMG_EXTS = {*IMG_EXTS_PRIORITY}
# Extension of YOLO label files.
LABEL_EXT = ".txt"

# ===== 유틸 =====
def is_image(p: Path) -> bool:
    """Return True when the path's extension (case-insensitive) is in IMG_EXTS."""
    ext = p.suffix.lower()
    return ext in IMG_EXTS

def is_label(p: Path) -> bool:
    """Return True when the path's extension (case-insensitive) equals LABEL_EXT."""
    ext = p.suffix.lower()
    return LABEL_EXT == ext

def best_image_choice(paths):
    """Pick one image out of several that share a stem.

    The image whose extension appears earliest in IMG_EXTS_PRIORITY wins;
    unknown extensions rank last. Ties keep the input order.
    """
    if len(paths) == 1:
        return paths[0]

    def rank(candidate: Path):
        ext = candidate.suffix.lower()
        if ext in IMG_EXTS_PRIORITY:
            return IMG_EXTS_PRIORITY.index(ext)
        return len(IMG_EXTS_PRIORITY)

    ordered = list(paths)
    ordered.sort(key=rank)  # stable sort, so equal ranks keep their input order
    return ordered[0]

def collect_items(src_dir: Path):
    """Recursively scan src_dir and group its images and labels by file stem.

    Returns a list (sorted by stem) of dicts with keys:
        "stem": the shared file stem
        "img":  the chosen image path, or None when the stem has no image
        "lbl":  the label path, or None when the stem has no label
        "src":  src_dir itself

    When several labels share a stem only the first one is kept, and when
    several images share a stem best_image_choice() decides. The directory
    walk is sorted so that "first" does not depend on the order in which the
    filesystem happens to list entries.
    """
    images_by_stem = defaultdict(list)
    labels_by_stem = {}
    for p in sorted(src_dir.rglob("*")):
        if not p.is_file():
            continue
        if is_image(p):
            images_by_stem[p.stem].append(p)
        elif is_label(p):
            # Keep only the first label seen for a stem.
            labels_by_stem.setdefault(p.stem, p)

    items = []
    for s in sorted(images_by_stem.keys() | labels_by_stem.keys()):
        img_paths = images_by_stem.get(s)
        items.append({
            "stem": s,
            "img": best_image_choice(img_paths) if img_paths else None,
            "lbl": labels_by_stem.get(s),
            "src": src_dir,
        })
    return items

def next_available_stem(dst_dir: Path, base_stem: str, need_img_ext: str|None, need_lbl: bool):
    """Return a stem that is free in dst_dir: base_stem, else base_stem_2, base_stem_3, ...

    A candidate is considered taken when dst_dir already holds *any* file with
    that stem and a tracked extension (every entry of IMG_EXTS, LABEL_EXT, and
    need_img_ext when given). Checking only the extensions about to be written
    is not enough: an image-only item would otherwise be copied next to an
    unrelated label with the same stem (or a second image with another
    extension) and the two would be silently paired.

    Args:
        dst_dir: destination directory to probe.
        base_stem: preferred stem.
        need_img_ext: extension of the image to be written, or None.
        need_lbl: whether a label will be written. Kept for interface
            compatibility; the label slot is now always checked.
    """
    tracked_exts = set(IMG_EXTS) | {LABEL_EXT}
    if need_img_ext:
        tracked_exts.add(need_img_ext)

    def taken(stem: str) -> bool:
        return any((dst_dir / f"{stem}{ext}").exists() for ext in tracked_exts)

    cand = base_stem
    i = 1
    while taken(cand):
        # The first alternative is "<stem>_2", matching the existing naming.
        i += 1
        cand = f"{base_stem}_{i}"
    return cand

def copy_safe(src: Path, dst: Path):
    """Copy src to dst with shutil.copy2, creating dst's parent directories first."""
    target_dir = dst.parent
    target_dir.mkdir(exist_ok=True, parents=True)
    shutil.copy2(str(src), str(dst))

def show_samples(title, arr, maxn=5):
    """Print a bracketed heading followed by at most maxn entries of arr, one per line."""
    print(f"\n[{title}] (최대 {maxn}개 예시)")
    shown = arr[:maxn]
    for entry in shown:
        print(" -", entry)

# ===== Collect =====
items1 = collect_items(SRC1)
items2 = collect_items(SRC2)
items = [*items1, *items2]

# ===== Copy =====
DST.mkdir(parents=True, exist_ok=True)

pairs = img_only = lbl_only = 0
copied_imgs = copied_lbls = 0
renamed = []            # (old_stem, new_stem, src) for every renamed item
name_map_samples = []   # up to 8 printable rename examples

for it in items:
    stem, img, lbl = it["stem"], it["img"], it["lbl"]
    need_img = img is not None
    need_lbl = lbl is not None

    # Tally what this stem provides: image+label pair, image only, or label only.
    if need_img:
        if need_lbl:
            pairs += 1
        else:
            img_only += 1
    elif need_lbl:
        lbl_only += 1

    img_ext = img.suffix.lower() if need_img else None

    # Choose a destination stem that collides with nothing already in DST.
    dest_stem = next_available_stem(DST, stem, img_ext, need_lbl)

    # Remember renames for the report.
    if dest_stem != stem:
        renamed.append((stem, dest_stem, it["src"]))
        if len(name_map_samples) < 8:
            name_map_samples.append(f"{it['src'].name}: {stem}  →  {dest_stem}")

    # Copy whichever of image / label exists, both under the same stem.
    if need_img:
        dst_img = DST / (dest_stem + img_ext)
        copy_safe(img, dst_img)
        copied_imgs += 1
    if need_lbl:
        dst_lbl = DST / (dest_stem + LABEL_EXT)
        copy_safe(lbl, dst_lbl)
        copied_lbls += 1

# ===== Report =====
# Build the whole summary first, then emit it one line per print call.
report_lines = [
    "====================================",
    "[MERGE RESULT]",
    f"- SRC1: {SRC1}",
    f"- SRC2: {SRC2}",
    f"- DST : {DST}",
    "------------------------------------",
    f"- 총 stem 개수           : {len(items)}",
    f"- 짝(이미지+라벨)         : {pairs}",
    f"- 이미지만 있음           : {img_only}",
    f"- 라벨만 있음             : {lbl_only}",
    "------------------------------------",
    f"- 복사된 이미지 수        : {copied_imgs}",
    f"- 복사된 라벨(.txt) 수    : {copied_lbls}",
    f"- 이름 충돌로 리네임된 건 : {len(renamed)}",
    "====================================",
]
for report_line in report_lines:
    print(report_line)

show_samples("리네임 매핑 예시 (old → new)", name_map_samples)

[MERGE RESULT]
- SRC1: /home/dw/ws_job_msislab/Golf_Project/data/20250700/good_data_fix_label
- SRC2: /home/dw/ws_job_msislab/Golf_Project/data/20250904/20250904_good_data
- DST : /home/dw/ws_job_msislab/Golf_Project/data/for_study/20250910_merge_data
------------------------------------
- 총 stem 개수           : 2509
- 짝(이미지+라벨)         : 2310
- 이미지만 있음           : 197
- 라벨만 있음             : 2
------------------------------------
- 복사된 이미지 수        : 2507
- 복사된 라벨(.txt) 수    : 2312
- 이름 충돌로 리네임된 건 : 1

[리네임 매핑 예시 (old → new)] (최대 5개 예시)
 - 20250904_good_data: classes  →  classes_2


In [9]:
# augment_balance_final_custom.py
# - 사용자가 원하는 TARGET(클래스별 최종 객체 수)을 직접 설정
# - 재사용 제한을 클래스별로 가산할 수 있도록 개선
# - 나머지 안전장치(yolo_sanitize, 빈/이상 라인 제거, 로그 등) 유지

from pathlib import Path
from collections import Counter, defaultdict
import random, csv
import cv2
import albumentations as A

# ========== User settings ==========
BASE = Path("/home/dw/ws_job_msislab/Golf_Project/data/for_study/20250910_merge_data")
OUT_DIR = BASE.joinpath("aug_balanced")     # single folder holding generated images and labels
IMG_EXTS = [".jpg", ".jpeg", ".png"]
SEED = 42
random.seed(SEED)

# (Required) Final number of objects wanted per class; edit to taste.
# Class mapping: 0 Divot, 1 Fixed_Divot, 2 Diseased_Grass, 3 Pole,
#                4 Obstacle, 5 Sprinkler, 6 Drain, 7 Golf ball
TARGET = {
    0: 5000,  # Divot
    1: 5000,  # Fixed_Divot
    2: 750,   # Diseased_Grass
    3: 160,   # Pole
    4: 0,     # Obstacle
    5: 100,   # Sprinkler
    6: 180,   # Drain
    7: 270,   # Golf ball
}
ALL_CLASS_IDS = [*range(8)]
ALLOW_IDS = {*ALL_CLASS_IDS}

# Reuse policy: how many times one source image may be augmented.
# - base allowance for every image
MAX_USES_BASE = 2
# - extra allowance per class; effective_cap_for_file() adds the largest
#   boost among the classes that the file contains and that are still short
#   of their target
MAX_USES_BOOST_PER_CLASS = {
    0: 3,  # Divot
    1: 3,  # Fixed_Divot
    2: 5,  # Diseased_Grass
    3: 5,  # Pole
    4: 5,  # Obstacle
    5: 5,  # Sprinkler
    6: 5,  # Drain
    7: 5,  # Golf ball
}

# Number of augmented copies written for each background (label-less) image.
# The original stays in BASE, so a value of 2 gives 1 original + 2 augmented.
BG_AUG_MULTIPLIER = 2

# Augmentation pipeline: a horizontal flip plus colour / lighting adjustments
# with small limits. Each transform's `p` is the probability passed to it.
# NOTE(review): random.seed(SEED) seeds Python's `random` module only; whether
# that makes this pipeline reproducible depends on the installed
# albumentations version — confirm.
aug = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(brightness_limit=(-0.12, 0.18),
                               contrast_limit=(-0.12, 0.18), p=0.9),
    A.HueSaturationValue(hue_shift_limit=2, sat_shift_limit=12, val_shift_limit=10, p=0.8),
    A.RGBShift(r_shift_limit=8, g_shift_limit=6, b_shift_limit=6, p=0.5),
    A.RandomGamma(gamma_limit=(90, 120), p=0.4),
    # Optional extra step (group p=0.4): CLAHE, ISO noise, or nothing.
    A.OneOf([
        A.CLAHE(clip_limit=2.0, tile_grid_size=(8, 8), p=0.5),
        A.ISONoise(color_shift=(0.01, 0.05), intensity=(0.08, 0.25), p=0.5),
        A.NoOp()
    ], p=0.4),
# Boxes are passed in YOLO format and their class ids travel in the
# `class_labels` field. min_visibility=0.25 presumably drops boxes that keep
# less than 25% of their area after a transform — see the BboxParams docs.
], bbox_params=A.BboxParams(format="yolo", label_fields=["class_labels"], min_visibility=0.25))

# ========== 유틸 ==========
def ensure_out_dir():
    """Create OUT_DIR together with any missing parents; do nothing if it already exists."""
    OUT_DIR.mkdir(exist_ok=True, parents=True)

def read_yolo_label(lbl: Path):
    """Parse a YOLO label file into parallel lists ``(boxes, cls)``.

    Each kept line contributes ``[x, y, w, h]`` (every value clamped to
    [0, 1]) to ``boxes`` and its integer class id to ``cls``. Skipped lines:
      * fewer than five whitespace-separated fields,
      * a class id or coordinate that cannot be parsed as a number,
      * a NaN coordinate,
      * width or height that is not positive after clamping.

    A missing file yields two empty lists.
    """
    boxes, cls = [], []
    if not lbl.exists():
        return boxes, cls
    with open(lbl, "r") as f:
        for line in f:
            fields = line.split()
            if len(fields) < 5:
                continue
            try:
                # int(float(...)) accepts ids written as "3.0"; "nan"/"inf"
                # raise ValueError/OverflowError and the line is skipped.
                class_id = int(float(fields[0]))
                x, y, w, h = (min(max(float(v), 0.0), 1.0) for v in fields[1:5])
            except (ValueError, OverflowError):
                continue
            # NaN passes through min/max unchanged and fails every comparison,
            # so phrasing the test positively rejects it along with empty boxes.
            if not (0.0 <= x <= 1.0 and 0.0 <= y <= 1.0 and w > 0.0 and h > 0.0):
                continue
            boxes.append([x, y, w, h])
            cls.append(class_id)
    return boxes, cls

def write_yolo_label(lbl_path: Path, boxes, cls, *, allow_ids=ALLOW_IDS):
    """Write boxes and class ids to lbl_path as YOLO lines ``c x y w h``.

    Parent directories are created as needed and the file is always
    (re)written. A row is left out when its class id cannot be converted to
    int, when allow_ids is not None and does not contain the id, or when the
    width or height is not positive after clamping to [0, 1]. Coordinates are
    written with six decimals.
    """
    def to_unit(value):
        # Limit to [0, 1] and coerce to a plain float.
        return float(max(0.0, min(1.0, value)))

    lbl_path.parent.mkdir(parents=True, exist_ok=True)
    with open(lbl_path, "w") as out:
        for c, (x, y, w, h) in zip(cls, boxes):
            try:
                class_id = int(float(c))
            except Exception:
                continue
            allowed = allow_ids is None or class_id in allow_ids
            if not allowed:
                continue
            x, y, w, h = to_unit(x), to_unit(y), to_unit(w), to_unit(h)
            if w <= 0.0 or h <= 0.0:
                continue
            out.write(f"{class_id} {x:.6f} {y:.6f} {w:.6f} {h:.6f}\n")

def yolo_sanitize(boxes):
    """Clip YOLO boxes (cx, cy, w, h) to the unit square and drop degenerate ones.

    Each box is converted to its edges, the edges are clipped to [0, 1], and
    the box is rebuilt from the clipped edges. Boxes whose clipped width or
    height is at most 1e-6 are discarded; the surviving values are finally
    limited to [1e-6, 1 - 1e-6]. Returns a new list of [cx, cy, w, h] lists.
    """
    eps = 1e-6
    upper = 1.0 - eps

    def clip_unit(v):
        return max(0.0, min(1.0, v))

    def clip_inner(v):
        return min(max(v, eps), upper)

    cleaned = []
    for cx, cy, bw, bh in boxes:
        left, right = clip_unit(cx - bw/2.0), clip_unit(cx + bw/2.0)
        top, bottom = clip_unit(cy - bh/2.0), clip_unit(cy + bh/2.0)
        span_x = right - left
        span_y = bottom - top
        if span_x <= eps or span_y <= eps:
            continue
        cleaned.append([
            clip_inner((left + right) / 2.0),
            clip_inner((top + bottom) / 2.0),
            clip_inner(span_x),
            clip_inner(span_y),
        ])
    return cleaned

def find_image_for_label(lbl: Path):
    """Locate the image that belongs to a label file, or return None.

    Search order (first hit wins, extensions tried in IMG_EXTS order):
      1. a sibling file with the same stem,
      2. an ``images`` directory next to the label or next to any ancestor,
      3. any file with that name anywhere under BASE.
    """
    import glob  # local import: only needed to escape the stem for rglob

    stem = lbl.stem

    # 1) Same directory, same stem.
    for ext in IMG_EXTS:
        p = lbl.with_suffix(ext)
        if p.exists():
            return p

    # 2) "images" folder beside the label or beside one of its ancestors.
    #    lbl.parents already starts with lbl.parent, so one pass covers both.
    for parent in lbl.parents:
        img_dir = parent / "images"
        if img_dir.exists():
            for ext in IMG_EXTS:
                p = img_dir / f"{stem}{ext}"
                if p.exists():
                    return p

    # 3) Last resort: recursive search under BASE. The stem is escaped so
    #    that characters such as "[" or "*" in a file name are matched
    #    literally, and hits are sorted so the choice does not depend on
    #    directory listing order.
    literal_stem = glob.escape(stem)
    for ext in IMG_EXTS:
        hits = sorted(BASE.rglob(f"{literal_stem}{ext}"))
        if hits:
            return hits[0]
    return None

def current_counts(root: Path):
    """Return a Counter of class ids over every ``*.txt`` file found recursively under root."""
    totals = Counter()
    for label_path in root.rglob("*.txt"):
        class_ids = read_yolo_label(label_path)[1]
        totals.update(class_ids)
    return totals

def collect_background_images(base: Path):
    """Return the sorted list of images under base whose stem has no ``.txt`` file anywhere under base."""
    labelled = set()
    for txt in base.rglob("*.txt"):
        labelled.add(txt.stem)

    unlabelled = [
        img
        for ext in IMG_EXTS
        for img in base.rglob(f"*{ext}")
        if img.stem not in labelled
    ]
    unlabelled.sort()
    return unlabelled

def load_image(path: Path):
    """Read an image with cv2.imread; raise RuntimeError when it returns None."""
    img = cv2.imread(str(path))
    if img is not None:
        return img
    raise RuntimeError(f"Failed to read image: {path}")

def next_unique_name(stem: str, used: set, tag: str):
    """Return the first ``{stem}_{tag}NNNN`` (NNNN = 0001, 0002, ...) not in used, and add it to used."""
    idx = 1
    candidate = f"{stem}_{tag}{idx:04d}"
    while candidate in used:
        idx += 1
        candidate = f"{stem}_{tag}{idx:04d}"
    used.add(candidate)
    return candidate

def effective_cap_for_file(file_cls_counter: Counter, deficit: Counter):
    """Return how many times a file may be reused given the current deficits.

    The result is MAX_USES_BASE plus the largest MAX_USES_BOOST_PER_CLASS
    value among classes that occur in the file (count > 0) and still have a
    positive deficit. With no such class, or only non-positive boosts, the
    result is MAX_USES_BASE.
    """
    boosts = [
        MAX_USES_BOOST_PER_CLASS.get(cls_id, 0)
        for cls_id, count in file_cls_counter.items()
        if count > 0 and deficit.get(cls_id, 0) > 0
    ]
    # The leading 0 keeps the boost from ever going negative.
    return MAX_USES_BASE + max([0, *boosts])

# ========== Run ==========
ensure_out_dir()

# 1) Background (label-less) images: write BG_AUG_MULTIPLIER augmented copies of each.
bg_list = collect_background_images(BASE)
print(f"Background images: {len(bg_list)}")

used_names = set()
bg_failed = 0
for bg_img in bg_list:
    try:
        img = load_image(bg_img)
    except Exception as exc:
        # Best effort: an unreadable background must not stop the run, but it
        # is reported so that missing samples do not go unnoticed.
        bg_failed += 1
        print(f"[warn] skipping unreadable background image {bg_img}: {exc}")
        continue
    for _ in range(BG_AUG_MULTIPLIER):
        out_stem = next_unique_name(bg_img.stem, used_names, tag="bg")
        out_img = OUT_DIR / f"{out_stem}{bg_img.suffix.lower()}"
        out_lbl = OUT_DIR / f"{out_stem}.txt"
        transformed = aug(image=img, bboxes=[], class_labels=[])
        aug_img = transformed["image"]
        cv2.imwrite(str(out_img), aug_img)
        # An empty label file is written alongside each background image.
        with open(out_lbl, "w"):
            pass
if bg_failed:
    print(f"Background images skipped (unreadable): {bg_failed}")
print("Background augmentation done.")

# 2) Candidate pairs: every non-empty label file that has a matching image.
label_files = sorted(BASE.rglob("*.txt"))
candidates = []  # tuples of (img_path, lbl_path, per-class counts, stem)
skipped = 0
for lbl in label_files:
    cls_list = read_yolo_label(lbl)[1]
    if not cls_list:
        # No usable objects in this label file.
        continue
    img = find_image_for_label(lbl)
    if img is not None and img.exists():
        candidates.append((img, lbl, Counter(cls_list), lbl.stem))
    else:
        skipped += 1
if not candidates:
    raise SystemExit("No labeled image-label pairs found.")
print(f"Labeled pairs: {len(candidates)}, skipped(no image match): {skipped}")

# 3) Initial per-class counts over everything under BASE.
base_cnt = current_counts(BASE)
print("Base counts:", dict(base_cnt))

# 4) Targets and deficits.
cur = Counter(base_cnt)
# A class already at or above its target gets a deficit of 0 (nothing is ever removed).
deficit = Counter()
for c in ALL_CLASS_IDS:
    deficit[c] = max(0, TARGET.get(c, 0) - cur.get(c, 0))

use_count = defaultdict(int)
log_path = OUT_DIR / "log.csv"
# Start a fresh log containing only the header row.
with open(log_path, "w", newline="") as f:
    csv.writer(f).writerow(
        ["src_img", "src_lbl", "out_img", "out_lbl", "add_per_class", "totals_after"]
    )

MAX_ITERS = 200000
iters = 0

def score(deficit: Counter, fcnt: Counter):
    """Return the sum of ``deficit[c] * fcnt[c]`` over classes that still have a positive deficit."""
    total = 0
    for c in ALL_CLASS_IDS:
        missing = deficit[c]
        if missing > 0:
            total += missing * fcnt.get(c, 0)
    return total

# Greedy balancing loop: while any class is short of its target, augment the
# source file that contributes most to the remaining deficits.
while any(deficit[c] > 0 for c in ALL_CLASS_IDS) and iters < MAX_ITERS:
    iters += 1
    best = None
    best_score = 0

    # Pick the highest-scoring candidate that has not reached its reuse cap.
    for img, lbl, fcnt, stem in candidates:
        cap = effective_cap_for_file(fcnt, deficit)
        if use_count[img] >= cap:
            continue
        sc = score(deficit, fcnt)
        if sc > best_score:
            best_score = sc
            best = (img, lbl, fcnt, stem)

    if not best or best_score <= 0:
        print("No more useful candidates for remaining deficits. Stopping.")
        break

    img_path, lbl_path, fcnt, stem = best
    out_stem = next_unique_name(stem, used_names, tag="dup")
    out_img = OUT_DIR / f"{out_stem}{img_path.suffix.lower()}"
    out_lbl = OUT_DIR / f"{out_stem}.txt"

    # Augment
    try:
        img = load_image(img_path)
    except Exception as exc:
        # Same best-effort policy as the background pass: report, charge one
        # use so the file eventually reaches its cap, and move on.
        print(f"[warn] cannot read {img_path}: {exc}")
        use_count[img_path] += 1
        continue

    # Sanitize box by box so that a box dropped by yolo_sanitize() also drops
    # its class id; sanitizing the whole list at once would leave the two
    # lists with different lengths.
    raw_boxes, raw_cls = read_yolo_label(lbl_path)
    boxes0, cls0 = [], []
    for box, c in zip(raw_boxes, raw_cls):
        cleaned = yolo_sanitize([box])
        if cleaned:
            boxes0.append(cleaned[0])
            cls0.append(c)

    transformed = aug(image=img, bboxes=boxes0, class_labels=cls0)
    aug_img = transformed["image"]
    aug_boxes = transformed["bboxes"]
    aug_cls = transformed["class_labels"]

    keep_boxes, keep_cls = [], []
    for (x, y, w, h), c in zip(aug_boxes, aug_cls):
        if w > 0 and h > 0:
            keep_boxes.append([float(x), float(y), float(w), float(h)])
            keep_cls.append(int(float(c)))

    # Skip results that add nothing to a class still in deficit; the attempt
    # is still charged against the file's reuse cap.
    test_cnt = Counter(keep_cls)
    if sum(test_cnt[c] for c in ALL_CLASS_IDS if deficit[c] > 0) <= 0 or not keep_cls:
        use_count[img_path] += 1
        continue

    cv2.imwrite(str(out_img), aug_img)
    write_yolo_label(out_lbl, keep_boxes, keep_cls, allow_ids=ALLOW_IDS)

    # Count what was actually written, since the writer may filter rows.
    _, saved_cls = read_yolo_label(out_lbl)
    add_cnt = Counter(saved_cls)

    cur.update(add_cnt)
    for c in ALL_CLASS_IDS:
        deficit[c] = max(0, TARGET.get(c, 0) - cur.get(c, 0))
    use_count[img_path] += 1

    add_str = "{" + ", ".join(f"{k}:{add_cnt.get(k,0)}" for k in ALL_CLASS_IDS) + "}"
    tot_str = "{" + ", ".join(f"{k}:{cur.get(k,0)}" for k in ALL_CLASS_IDS) + "}"
    with open(log_path, "a", newline="") as f:
        w = csv.writer(f)
        w.writerow([str(img_path), str(lbl_path), str(out_img), str(out_lbl), add_str, tot_str])

    if iters % 50 == 0 or all(deficit[c]==0 for c in ALL_CLASS_IDS):
        print(f"[{iters}] cur={dict(cur)} deficit={dict(deficit)} last={out_stem}")

print("\n=== Finished ===")
print("Final counts:", dict(cur))
print("Deficit:", dict(deficit))
print(f"Saved to: {OUT_DIR}")
print(f"Log: {log_path}")

Background images: 197
Background augmentation done.
Labeled pairs: 2310, skipped(no image match): 0
Base counts: {3: 80, 7: 92, 1: 2475, 0: 2150, 2: 289, 6: 62, 5: 31}
[50] cur={3: 80, 7: 92, 1: 2730, 0: 2355, 2: 289, 6: 62, 5: 31} deficit={0: 2645, 1: 2270, 2: 461, 3: 80, 4: 0, 5: 69, 6: 118, 7: 178} last=20250725_171914_0000_000028_f000810_1920x1536_dup0005
[100] cur={3: 80, 7: 92, 1: 2945, 0: 2515, 2: 289, 6: 62, 5: 31} deficit={0: 2485, 1: 2055, 2: 461, 3: 80, 4: 0, 5: 69, 6: 118, 7: 178} last=20250725_171420_0003_000015_f000420_1920x1536_dup0005
[150] cur={3: 80, 7: 92, 1: 3130, 0: 2660, 2: 289, 6: 62, 5: 31} deficit={0: 2340, 1: 1870, 2: 461, 3: 80, 4: 0, 5: 69, 6: 118, 7: 178} last=20250725_172530_0013_000034_f000990_1920x1536_dup0005
[200] cur={3: 80, 7: 92, 1: 3295, 0: 2810, 2: 289, 6: 62, 5: 31} deficit={0: 2190, 1: 1705, 2: 461, 3: 80, 4: 0, 5: 69, 6: 118, 7: 178} last=20250725_171914_0000_000007_f000180_1920x1536_dup0005
[250] cur={3: 80, 7: 92, 1: 3445, 0: 2960, 2: 289, 6