In [1]:
from pathlib import Path

# Path configuration
IMG_DIR = Path("/home/dw/ws_job_msislab/Golf_Project/data/for_study/20251010_merge_data/images")
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp",
            ".JPG", ".JPEG", ".PNG", ".BMP", ".WEBP"}


def is_excluded_name(name: str) -> bool:
    """Return True when a file name matches one of the exclusion rules.

    A name is excluded when, compared case-insensitively, it contains "dup"
    or contains "bg" two or more times.
    """
    lowered = name.lower()
    return "dup" in lowered or lowered.count("bg") >= 2


count = 0
bad_files = []  # excluded files, kept so they can be inspected afterwards

for img in IMG_DIR.rglob("*"):
    # Compare the lower-cased suffix: a plain membership test against IMG_EXTS
    # would silently skip mixed-case extensions such as ".Jpg".
    if not img.is_file() or img.suffix.lower() not in IMG_EXTS:
        continue
    if is_excluded_name(img.name):
        bad_files.append(img)
    else:
        count += 1

print("오리지널 이미지 개수:", count)
print("제외된 파일 수:", len(bad_files))


오리지널 이미지 개수: 5455
제외된 파일 수: 2954


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
from collections import Counter
from typing import List
import pandas as pd
from IPython.display import display

# -------------------- 기본 경로/설정 --------------------
BASE = Path("/home/dw/ws_job_msislab/Golf_Project/data/for_study/20251017_merge_data")
IMAGES_DIR = BASE.joinpath("images")
LABELS_DIR = BASE.joinpath("labels")

# Full date token (as it appears in file names) -> short column label.
DATE_TOKENS = {
    f"2025{mmdd}": mmdd
    for mmdd in ("0721", "0725", "0904", "0929", "0930")
}
# Column order: Total -> 0930 -> 0929 -> 0904 -> 0725 -> 0721 (newest date first)
COLS = ["Total", *sorted(DATE_TOKENS.values(), reverse=True)]

# Class names; the list position is the class id used in the label files.
CLASS_ORDER = [
    "Divot",            # 0
    "Fixed_Divot",      # 1
    "Diseased_Grass",   # 2
    "Confused_Object",  # 3
    "Pole",             # 4
    "Sprinkler",        # 5
    "Drain",            # 6
    "Golf ball",        # 7
]
ROWS = ["Image", "bg", *CLASS_ORDER]

# Accepted image suffixes, each in lower-case and upper-case form.
IMG_EXTS = {
    variant
    for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp")
    for variant in (ext, ext.upper())
}

# -------------------- 유틸 함수 --------------------
def detect_date(name: str):
    """Return the column label of the first DATE_TOKENS key found in `name`.

    Keys are tried in the dictionary's insertion order with a plain substring
    test; None is returned when no key occurs in `name`.
    """
    return next((col for token, col in DATE_TOKENS.items() if token in name), None)

def read_label_ids(txt_path: Path) -> List[int]:
    """Read a YOLO-style label file and return the class id of every line.

    Only the first whitespace-separated token of each non-blank line is used.
    It is parsed as a float and truncated to int, so "3" and "3.0" both give 3.

    Args:
        txt_path: Path of the label ``.txt`` file.

    Returns:
        Class ids in file order. An empty list is returned when the file is
        missing, unreadable or blank; lines whose first token is not a finite
        number are skipped.
    """
    try:
        # Reading directly (instead of exists() followed by read) avoids a
        # check-then-use race; a missing file surfaces as an OSError subclass.
        text = txt_path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        # Best effort: a missing or unreadable label file counts as "no labels".
        return []

    ids: List[int] = []
    for line in text.splitlines():
        fields = line.split()
        if not fields:
            continue
        try:
            # ValueError: non-numeric token or NaN; OverflowError: infinity.
            ids.append(int(float(fields[0])))
        except (ValueError, OverflowError):
            continue
    return ids

# -------------------- 메인 로직 --------------------
def build_table() -> pd.DataFrame:
    """Count images, background images and class instances per capture date.

    Walks IMAGES_DIR recursively. Files whose lower-cased name contains "dup"
    or contains "bg" two or more times are skipped, as are files whose name
    contains no known date token. For each remaining image the label file at
    the same relative path under LABELS_DIR (suffix replaced by ".txt") is read.

    Returns:
        Integer DataFrame indexed by ROWS with the columns COLS. "Image" counts
        files, "bg" counts images whose name contains "bg" exactly once and
        that have no label ids (a missing label file also yields no ids), and
        each class row counts label lines with that class id.
    """
    table = {row: Counter({c: 0 for c in COLS}) for row in ROWS}

    for img in IMAGES_DIR.rglob("*"):
        # Compare the lower-cased suffix so mixed-case extensions (".Jpg")
        # are not silently dropped.
        if not img.is_file() or img.suffix.lower() not in IMG_EXTS:
            continue

        name_lower = img.name.lower()

        # Exclusion rule: "dup" in the name, or "bg" appearing twice or more.
        if "dup" in name_lower or name_lower.count("bg") >= 2:
            continue

        col = detect_date(name_lower)
        if not col:
            continue

        # Image count
        table["Image"][col] += 1
        table["Image"]["Total"] += 1

        # Map the image path to its label path
        rel = img.relative_to(IMAGES_DIR)
        label_path = (LABELS_DIR / rel).with_suffix(".txt")
        label_ids = read_label_ids(label_path)

        # bg: the name contains "bg" exactly once and there are no label ids
        if name_lower.count("bg") == 1 and len(label_ids) == 0:
            table["bg"][col] += 1
            table["bg"]["Total"] += 1

        # Class counts; the valid id range follows CLASS_ORDER instead of a
        # hard-coded upper bound, ids outside it are ignored.
        for cid in label_ids:
            if 0 <= cid < len(CLASS_ORDER):
                row = CLASS_ORDER[cid]
                table[row][col] += 1
                table[row]["Total"] += 1

    # Build the DataFrame with a fixed row/column order
    df = pd.DataFrame(
        {c: [table[row][c] for row in ROWS] for c in COLS},
        index=ROWS
    ).astype(int)

    return df

# -------------------- 실행 & 출력 --------------------
df_counts = build_table()

# Centre the text of every data cell first, then of every header (th) cell.
styled = df_counts.style.set_properties(**{"text-align": "center"})
styled = styled.set_table_styles(
    [{"selector": "th", "props": [("text-align", "center")]}]
)

display(styled)





Unnamed: 0,Total,0930,0929,0904,0725,0721
Image,5367,792,357,3215,440,563
bg,132,28,0,29,45,30
Divot,5995,1545,394,3305,196,555
Fixed_Divot,8542,2179,1171,4093,542,557
Diseased_Grass,679,67,65,505,22,20
Confused_Object,0,0,0,0,0,0
Pole,126,19,0,95,12,0
Sprinkler,67,7,0,34,26,0
Drain,315,47,27,234,7,0
Golf ball,218,41,14,140,23,0


In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
from collections import Counter
from typing import List, Optional
import pandas as pd
from IPython.display import display

# -------------------- 기본 경로/설정 --------------------
BASE = Path("/home/dw/ws_job_msislab/Golf_Project/data/for_study/20251017_merge_data")
IMAGES_DIR = BASE.joinpath("images")
LABELS_DIR = BASE.joinpath("labels")

SPLITS = ["train", "val"]

# Full date token (as it appears in file names) -> short column label.
DATE_MAP = {
    f"2025{mmdd}": mmdd
    for mmdd in ("0721", "0725", "0904", "0930", "0929")
}
# Date columns, newest first: 0930, 0929, 0904, 0725, 0721
DATE_COLS = sorted(DATE_MAP.values(), reverse=True)
# Per-split column layout: Total followed by every date column
SUB_COLS = ["Total", *DATE_COLS]

# Class names; the list position is the class id used in the label files.
CLASS_ORDER = [
    "Divot",
    "Fixed_Divot",
    "Diseased_Grass",
    "Confused_Object",
    "Pole",
    "Sprinkler",
    "Drain",
    "Golf ball",
]
ROWS = ["Image", "bg", *CLASS_ORDER]

# Accepted image suffixes, each in lower-case and upper-case form.
IMG_EXTS = {
    variant
    for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp")
    for variant in (ext, ext.upper())
}

# -------------------- 유틸 함수 --------------------
def detect_split(rel_path: Path) -> Optional[str]:
    """Return the first component of `rel_path` that is listed in SPLITS.

    Components are examined from left to right; None is returned when no
    component matches.
    """
    return next((piece for piece in rel_path.parts if piece in SPLITS), None)

def detect_date(name: str) -> Optional[str]:
    """Return the short label of the first DATE_MAP key contained in `name`.

    Keys are tested in insertion order with a substring check; None is
    returned when no key occurs in `name`.
    """
    hits = [short for token, short in DATE_MAP.items() if token in name]
    return hits[0] if hits else None

def read_label_ids(txt_path: Path) -> List[int]:
    """Read a YOLO-style label file and return the class id of every line.

    Only the first whitespace-separated token of each non-blank line is used.
    It is parsed as a float and truncated to int, so "3" and "3.0" both give 3.

    Args:
        txt_path: Path of the label ``.txt`` file.

    Returns:
        Class ids in file order. An empty list is returned when the file is
        missing, unreadable or blank; lines whose first token is not a finite
        number are skipped.
    """
    try:
        # Reading directly (instead of exists() followed by read) avoids a
        # check-then-use race; a missing file surfaces as an OSError subclass.
        text = txt_path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        # Best effort: a missing or unreadable label file counts as "no labels".
        return []

    ids: List[int] = []
    for line in text.splitlines():
        fields = line.split()
        if not fields:
            continue
        try:
            # ValueError: non-numeric token or NaN; OverflowError: infinity.
            ids.append(int(float(fields[0])))
        except (ValueError, OverflowError):
            continue
    return ids

# -------------------- 메인 로직 --------------------
def build_split_by_date_with_totals() -> pd.DataFrame:
    """Count images, background images and class instances per (split, date).

    Walks IMAGES_DIR recursively. Files whose lower-cased name contains "dup"
    or contains "bg" two or more times are skipped, as are files that cannot
    be assigned to a split in SPLITS and a date in DATE_COLS. For each
    remaining image the label file at the same relative path under LABELS_DIR
    (suffix replaced by ".txt") is read.

    Returns:
        Integer DataFrame indexed by ROWS whose columns are the MultiIndex
        SPLITS x SUB_COLS. Each split's "Total" column is the sum of that
        split's date columns. "bg" counts images whose name contains "bg"
        exactly once and that have no label ids (a missing label file also
        yields no ids).
    """
    # One counter per row, keyed by (split, date)
    table = {row: Counter({(sp, dt): 0 for sp in SPLITS for dt in DATE_COLS}) for row in ROWS}

    for img in IMAGES_DIR.rglob("*"):
        # Compare the lower-cased suffix so mixed-case extensions (".Jpg")
        # are not silently dropped.
        if not img.is_file() or img.suffix.lower() not in IMG_EXTS:
            continue

        name_lower = img.name.lower()

        # Global exclusion rule: "dup" in the name, or "bg" twice or more.
        if "dup" in name_lower or name_lower.count("bg") >= 2:
            continue

        rel   = img.relative_to(IMAGES_DIR)
        split = detect_split(rel)
        date  = detect_date(img.name)

        if split not in SPLITS or date not in DATE_COLS:
            continue

        # --- Image row ---
        table["Image"][(split, date)] += 1

        # --- Labels ---
        label_path = (LABELS_DIR / rel).with_suffix(".txt")
        label_ids = read_label_ids(label_path)

        # bg: the name contains "bg" exactly once and there are no label ids
        if name_lower.count("bg") == 1 and len(label_ids) == 0:
            table["bg"][(split, date)] += 1

        # Class counts; the valid id range follows CLASS_ORDER instead of a
        # hard-coded upper bound, ids outside it are ignored.
        for cid in label_ids:
            if 0 <= cid < len(CLASS_ORDER):
                row = CLASS_ORDER[cid]
                table[row][(split, date)] += 1

    # ----- DataFrame -----
    columns = pd.MultiIndex.from_product([SPLITS, SUB_COLS])

    def cell(row: str, sp: str, sub: str) -> int:
        """Value of one table cell; "Total" is the sum over all dates."""
        if sub == "Total":
            return sum(table[row][(sp, dt)] for dt in DATE_COLS)
        return table[row][(sp, sub)]

    # Build the frame in one step from plain integers rather than allocating
    # an empty frame, filling it with zeros and writing each cell via .loc.
    records = [[cell(row, sp, sub) for sp, sub in columns] for row in ROWS]
    return pd.DataFrame(records, index=ROWS, columns=columns).astype(int)

# -------------------- 실행 & 출력 --------------------
df_sbdt = build_split_by_date_with_totals()

# Centre the text of every data cell first, then of every header (th) cell.
styled = df_sbdt.style.set_properties(**{"text-align": "center"})
styled = styled.set_table_styles(
    [{"selector": "th", "props": [("text-align", "center")]}]
)

display(styled)





Unnamed: 0_level_0,train,train,train,train,train,train,val,val,val,val,val,val
Unnamed: 0_level_1,Total,0930,0929,0904,0725,0721,Total,0930,0929,0904,0725,0721
Image,4701,698,320,2798,385,500,666,94,37,417,55,63
bg,119,26,0,25,38,30,13,2,0,4,7,0
Divot,5238,1367,369,2848,171,483,757,178,25,457,25,72
Fixed_Divot,7452,1932,1021,3530,485,484,1090,247,150,563,57,73
Diseased_Grass,585,53,46,448,18,20,94,14,19,57,4,0
Confused_Object,0,0,0,0,0,0,0,0,0,0,0,0
Pole,108,15,0,82,11,0,18,4,0,13,1,0
Sprinkler,60,7,0,30,23,0,7,0,0,4,3,0
Drain,274,41,23,204,6,0,41,6,4,30,1,0
Golf ball,187,34,12,122,19,0,31,7,2,18,4,0


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# TEST
from pathlib import Path
from collections import Counter
from typing import List, Optional
import pandas as pd
from IPython.display import display

# -------------------- 기본 경로/설정 --------------------
BASE = Path("/home/dw/ws_job_msislab/Golf_Project/data/for_test/test_20251013")
IMAGES_DIR = BASE.joinpath("images")
LABELS_DIR = BASE.joinpath("labels")

SPLITS = ["test"]

# Full date token (as it appears in file names) -> short column label.
DATE_MAP = {f"2025{mmdd}": mmdd for mmdd in ("0904", "0930")}
# Date columns, newest first: 0930, 0904
DATE_COLS = sorted(DATE_MAP.values(), reverse=True)
# Column layout: Total, 0930, 0904
SUB_COLS = ["Total", *DATE_COLS]

# Class names; the list position is the class id used in the label files.
CLASS_ORDER = [
    "Divot",
    "Fixed_Divot",
    "Diseased_Grass",
    "Confused_Object",
    "Pole",
    "Sprinkler",
    "Drain",
    "Golf ball",
]
ROWS = ["Image", "bg", *CLASS_ORDER]

# Accepted image suffixes, each in lower-case and upper-case form.
IMG_EXTS = {
    variant
    for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp")
    for variant in (ext, ext.upper())
}

# -------------------- 유틸 함수 --------------------
def detect_split(rel_path: Path) -> Optional[str]:
    """Return the first component of `rel_path` that is listed in SPLITS.

    Components are examined from left to right; None is returned when no
    component matches.
    """
    matching = [piece for piece in rel_path.parts if piece in SPLITS]
    if matching:
        return matching[0]
    return None

def detect_date(name: str) -> Optional[str]:
    """Return the short label of the first DATE_MAP key contained in `name`.

    Keys are tested in insertion order with a substring check; None is
    returned when no key occurs in `name`.
    """
    return next((short for token, short in DATE_MAP.items() if token in name), None)

def read_label_ids(txt_path: Path) -> List[int]:
    """Read a YOLO-style label file and return the class id of every line.

    Only the first whitespace-separated token of each non-blank line is used.
    It is parsed as a float and truncated to int, so "3" and "3.0" both give 3.

    Args:
        txt_path: Path of the label ``.txt`` file.

    Returns:
        Class ids in file order. An empty list is returned when the file is
        missing, unreadable or blank; lines whose first token is not a finite
        number are skipped.
    """
    try:
        # Reading directly (instead of exists() followed by read) avoids a
        # check-then-use race; a missing file surfaces as an OSError subclass.
        text = txt_path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        # Best effort: a missing or unreadable label file counts as "no labels".
        return []

    ids: List[int] = []
    for line in text.splitlines():
        fields = line.split()
        if not fields:
            continue
        try:
            # ValueError: non-numeric token or NaN; OverflowError: infinity.
            ids.append(int(float(fields[0])))
        except (ValueError, OverflowError):
            continue
    return ids

# -------------------- 메인 로직 --------------------
def build_split_by_date_with_totals() -> pd.DataFrame:
    """Count images, background images and class instances per (split, date).

    Walks IMAGES_DIR recursively. Files whose lower-cased name contains "dup"
    or contains "bg" two or more times are skipped, as are files that cannot
    be assigned to a split in SPLITS and a date in DATE_COLS. For each
    remaining image the label file at the same relative path under LABELS_DIR
    (suffix replaced by ".txt") is read.

    Returns:
        Integer DataFrame indexed by ROWS whose columns are the MultiIndex
        SPLITS x SUB_COLS. Each split's "Total" column is the sum of that
        split's date columns. "bg" counts images whose name contains "bg"
        exactly once and that have no label ids (a missing label file also
        yields no ids).
    """
    # One counter per row, keyed by (split, date)
    table = {row: Counter({(sp, dt): 0 for sp in SPLITS for dt in DATE_COLS}) for row in ROWS}

    for img in IMAGES_DIR.rglob("*"):
        # Compare the lower-cased suffix so mixed-case extensions (".Jpg")
        # are not silently dropped.
        if not img.is_file() or img.suffix.lower() not in IMG_EXTS:
            continue

        name_lower = img.name.lower()

        # Global exclusion rule: "dup" in the name, or "bg" twice or more.
        if "dup" in name_lower or name_lower.count("bg") >= 2:
            continue

        rel   = img.relative_to(IMAGES_DIR)
        split = detect_split(rel)
        date  = detect_date(img.name)

        if split not in SPLITS or date not in DATE_COLS:
            continue

        # --- Image row ---
        table["Image"][(split, date)] += 1

        # --- Labels ---
        label_path = (LABELS_DIR / rel).with_suffix(".txt")
        label_ids = read_label_ids(label_path)

        # bg: the name contains "bg" exactly once and there are no label ids
        if name_lower.count("bg") == 1 and len(label_ids) == 0:
            table["bg"][(split, date)] += 1

        # Class counts; the valid id range follows CLASS_ORDER instead of a
        # hard-coded upper bound, ids outside it are ignored.
        for cid in label_ids:
            if 0 <= cid < len(CLASS_ORDER):
                row = CLASS_ORDER[cid]
                table[row][(split, date)] += 1

    # ----- DataFrame -----
    columns = pd.MultiIndex.from_product([SPLITS, SUB_COLS])

    def cell(row: str, sp: str, sub: str) -> int:
        """Value of one table cell; "Total" is the sum over all dates."""
        if sub == "Total":
            return sum(table[row][(sp, dt)] for dt in DATE_COLS)
        return table[row][(sp, sub)]

    # Build the frame in one step from plain integers rather than allocating
    # an empty frame, filling it with zeros and writing each cell via .loc.
    records = [[cell(row, sp, sub) for sp, sub in columns] for row in ROWS]
    return pd.DataFrame(records, index=ROWS, columns=columns).astype(int)

# -------------------- 실행 & 출력 --------------------
df_sbdt = build_split_by_date_with_totals()

# Centre the text of every data cell first, then of every header (th) cell.
styled = df_sbdt.style.set_properties(**{"text-align": "center"})
styled = styled.set_table_styles(
    [{"selector": "th", "props": [("text-align", "center")]}]
)

display(styled)

Unnamed: 0_level_0,test,test,test
Unnamed: 0_level_1,Total,0930,0904
Image,426,100,326
bg,10,5,5
Divot,588,182,406
Fixed_Divot,809,296,513
Diseased_Grass,80,8,72
Confused_Object,0,0,0
Pole,11,0,11
Sprinkler,8,3,5
Drain,48,8,40
Golf ball,21,6,15


In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
from collections import Counter
from typing import List, Optional, Tuple
import pandas as pd
from IPython.display import display
from itertools import product

# -------------------- 기본 경로/설정 --------------------
BASE = Path("/home/dw/ws_job_msislab/Golf_Project/data/for_study/20251017_merge_data")
IMAGES_DIR = BASE.joinpath("images")
LABELS_DIR = BASE.joinpath("labels")

# Full date token (as it appears in file names) -> short column label.
DATE_MAP = {
    f"2025{mmdd}": mmdd
    for mmdd in ("0721", "0725", "0904", "0930", "0929")
}
# Date columns, newest first: 0930, 0929, 0904, 0725, 0721
DATE_COLS = sorted(DATE_MAP.values(), reverse=True)
SUB_COLS = ["Total", *DATE_COLS]
# Innermost column level: original images vs. augmented images
AUG_COLS = ["orig", "aug"]

# Class names; the list position is the class id used in the label files.
CLASS_ORDER = [
    "Divot",
    "Fixed_Divot",
    "Diseased_Grass",
    "Confused_Object",
    "Pole",
    "Sprinkler",
    "Drain",
    "Golf ball",
]
ROWS = ["Image", "bg", *CLASS_ORDER]

# Accepted image suffixes, each in lower-case and upper-case form.
IMG_EXTS = {
    variant
    for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp")
    for variant in (ext, ext.upper())
}

# -------------------- 유틸 함수 --------------------
def detect_date_from_name(name: str) -> Optional[str]:
    """Return the short label of the first DATE_MAP key found in `name`.

    The name is lower-cased before the substring test. Keys are tried in
    insertion order; None is returned when none of them occurs.
    """
    lowered = name.lower()
    return next((short for token, short in DATE_MAP.items() if token in lowered), None)

def read_label_ids(txt_path: Path) -> List[int]:
    """Read a YOLO-style label file and return the class id of every line.

    Only the first whitespace-separated token of each non-blank line is used.
    It is parsed as a float and truncated to int, so "3" and "3.0" both give 3.

    Args:
        txt_path: Path of the label ``.txt`` file.

    Returns:
        Class ids in file order. An empty list is returned when the file is
        missing, unreadable or blank; lines whose first token is not a finite
        number are skipped.
    """
    try:
        # Reading directly (instead of exists() followed by read) avoids a
        # check-then-use race; a missing file surfaces as an OSError subclass.
        text = txt_path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        # Best effort: a missing or unreadable label file counts as "no labels".
        return []

    ids: List[int] = []
    for line in text.splitlines():
        fields = line.split()
        if not fields:
            continue
        try:
            # ValueError: non-numeric token or NaN; OverflowError: infinity.
            ids.append(int(float(fields[0])))
        except (ValueError, OverflowError):
            continue
    return ids

def is_augmented(name_lower: str) -> bool:
    """Augmentation rule: the name contains "dup" OR contains "bg" at least twice."""
    if "dup" in name_lower:
        return True
    return name_lower.count("bg") >= 2

def is_original(name_lower: str) -> bool:
    """Original rule: the name has no "dup" AND contains "bg" at most once."""
    has_dup = "dup" in name_lower
    repeated_bg = name_lower.count("bg") > 1
    return not (has_dup or repeated_bg)

# -------------------- 메인 로직 --------------------
def build_train_only_table() -> pd.DataFrame:
    """Count train images, background images and class instances per date,
    split into original and augmented files.

    Only files under IMAGES_DIR / "train" are visited. A file is "aug" when
    is_augmented() accepts its lower-cased name and "orig" otherwise. Files
    whose name contains no known date token are skipped. Labels are read from
    the same relative path under LABELS_DIR / "train" with a ".txt" suffix.

    Returns:
        Integer DataFrame indexed by ROWS whose columns are the MultiIndex
        ["train"] x SUB_COLS x AUG_COLS. The "Total" columns hold the sum over
        all dates for the respective orig/aug flag. "bg" counts images whose
        name contains "bg" at least once and that have no label ids (a missing
        label file also yields no ids).
    """
    # One counter per row, keyed by (date, augflag)
    keys = list(product(DATE_COLS, AUG_COLS))
    table = {row: Counter({k: 0 for k in keys}) for row in ROWS}

    train_dir = IMAGES_DIR / "train"
    for img in train_dir.rglob("*"):   # train only
        # Compare the lower-cased suffix so mixed-case extensions (".Jpg")
        # are not silently dropped.
        if not img.is_file() or img.suffix.lower() not in IMG_EXTS:
            continue

        name = img.name
        low  = name.lower()

        date = detect_date_from_name(name)
        if date not in DATE_COLS:
            continue

        # is_augmented() and is_original() are exact logical complements, so
        # every file is one or the other; no third "ambiguous" case exists.
        augflag = "aug" if is_augmented(low) else "orig"

        # --- Image row ---
        table["Image"][(date, augflag)] += 1

        # --- Label path, relative to the train directory ---
        rel_from_train = img.relative_to(train_dir)                 # e.g. 0904/xxx.jpg
        label_path = (LABELS_DIR / "train" / rel_from_train).with_suffix(".txt")
        label_ids = read_label_ids(label_path)

        # bg: no label ids and "bg" occurs in the name (original and augmented alike)
        if len(label_ids) == 0 and ("bg" in low):
            table["bg"][(date, augflag)] += 1

        # Class counts; the valid id range follows CLASS_ORDER instead of a
        # hard-coded upper bound, ids outside it are ignored.
        for cid in label_ids:
            if 0 <= cid < len(CLASS_ORDER):
                row = CLASS_ORDER[cid]
                table[row][(date, augflag)] += 1

    # ----- DataFrame: multi-level columns ( train / Total + dates / orig + aug ) -----
    columns = pd.MultiIndex.from_product([["train"], SUB_COLS, AUG_COLS])

    def cell(row: str, sub: str, ag: str) -> int:
        """Value of one table cell; "Total" is the sum over all dates."""
        if sub == "Total":
            return sum(table[row][(d, ag)] for d in DATE_COLS)
        return table[row][(sub, ag)]

    # Build the frame in one step from plain integers rather than allocating
    # an empty frame, filling it with zeros and writing each cell via .loc.
    records = [[cell(row, sub, ag) for _, sub, ag in columns] for row in ROWS]
    return pd.DataFrame(records, index=ROWS, columns=columns).astype(int)

# -------------------- 실행 & 출력 --------------------
df_train = build_train_only_table()

# Centre the text of every data cell first, then of every header (th) cell.
styled = df_train.style.set_properties(**{"text-align": "center"})
styled = styled.set_table_styles(
    [{"selector": "th", "props": [("text-align", "center")]}]
)

display(styled)




Unnamed: 0_level_0,train,train,train,train,train,train,train,train,train,train,train,train
Unnamed: 0_level_1,Total,Total,0930,0930,0929,0929,0904,0904,0725,0725,0721,0721
Unnamed: 0_level_2,orig,aug,orig,aug,orig,aug,orig,aug,orig,aug,orig,aug
Image,4701,4252,698,774,320,342,2798,2405,385,297,500,434
bg,119,238,26,52,0,0,25,50,38,76,30,60
Divot,5238,5576,1367,1436,369,371,2848,3111,171,181,483,477
Fixed_Divot,7452,6393,1932,2118,1021,984,3530,2733,485,245,484,313
Diseased_Grass,585,609,53,61,46,44,448,466,18,18,20,20
Confused_Object,0,0,0,0,0,0,0,0,0,0,0,0
Pole,108,216,15,30,0,0,82,164,11,22,0,0
Sprinkler,60,121,7,14,0,0,30,60,23,47,0,0
Drain,274,548,41,82,23,46,204,408,6,12,0,0
Golf ball,187,374,34,68,12,24,122,244,19,38,0,0


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
from collections import Counter
from typing import List, Optional
import pandas as pd
from IPython.display import display

# -------------------- 기본 경로 --------------------
BASE = Path("/home/dw/ws_job_msislab/Golf_Project/data/for_study/20251107_merge_data")
IMAGES_DIR = BASE.joinpath("images")
LABELS_DIR = BASE.joinpath("labels")

SPLITS = ["train", "val"]
# Column layout: grand total first, then one column per split
COLS = ["Total", *SPLITS]

# Class names; the list position is the class id used in the label files.
CLASS_ORDER = [
    "Divot",
    "Fixed_Divot",
    "Diseased_Grass",
    "Confused_Object",
    "Pole",
    "Sprinkler",
    "Drain",
    "Golf ball",
]
ROWS = ["Image", "bg", *CLASS_ORDER]

# Accepted image suffixes, each in lower-case and upper-case form.
IMG_EXTS = {
    variant
    for ext in (".jpg", ".jpeg", ".png", ".bmp", ".webp")
    for variant in (ext, ext.upper())
}

# -------------------- 유틸 함수 --------------------
def detect_split(rel_path: Path) -> Optional[str]:
    """Return the first component of `rel_path` that is listed in SPLITS.

    Components are examined from left to right; None is returned when no
    component matches.
    """
    return next((piece for piece in rel_path.parts if piece in SPLITS), None)

def read_label_ids(txt_path: Path) -> List[int]:
    """Read a YOLO-style label file and return the class id of every line.

    Only the first whitespace-separated token of each non-blank line is used.
    It is parsed as a float and truncated to int, so "3" and "3.0" both give 3.

    Args:
        txt_path: Path of the label ``.txt`` file.

    Returns:
        Class ids in file order. An empty list is returned when the file is
        missing, unreadable or blank; lines whose first token is not a finite
        number are skipped.
    """
    try:
        # Reading directly (instead of exists() followed by read) avoids a
        # check-then-use race; a missing file surfaces as an OSError subclass.
        text = txt_path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        # Best effort: a missing or unreadable label file counts as "no labels".
        return []

    ids: List[int] = []
    for line in text.splitlines():
        fields = line.split()
        if not fields:
            continue
        try:
            # ValueError: non-numeric token or NaN; OverflowError: infinity.
            ids.append(int(float(fields[0])))
        except (ValueError, OverflowError):
            continue
    return ids

# -------------------- 메인 로직 --------------------
def build_split_totals() -> pd.DataFrame:
    """Count images, background images and class instances per split.

    Walks IMAGES_DIR recursively and counts every image file that sits under
    a directory named in SPLITS; no file-name based exclusion is applied here.
    Labels are read from the same relative path under LABELS_DIR with a
    ".txt" suffix.

    Returns:
        Integer DataFrame indexed by ROWS with the columns COLS, where "Total"
        accumulates the counts of all splits. "bg" counts images whose name
        contains "bg" and that have no label ids (a missing label file also
        yields no ids).
    """
    table = {row: Counter({c: 0 for c in COLS}) for row in ROWS}

    for img in IMAGES_DIR.rglob("*"):
        # Compare the lower-cased suffix so mixed-case extensions (".Jpg")
        # are not silently dropped.
        if not img.is_file() or img.suffix.lower() not in IMG_EXTS:
            continue

        rel   = img.relative_to(IMAGES_DIR)
        split = detect_split(rel)
        if split not in SPLITS:
            continue

        # === Image ===
        table["Image"][split] += 1
        table["Image"]["Total"] += 1

        # === Labels ===
        label_path = (LABELS_DIR / rel).with_suffix(".txt")
        label_ids = read_label_ids(label_path)

        # bg: no label ids and "bg" occurs in the file name
        if len(label_ids) == 0 and ("bg" in img.name.lower()):
            table["bg"][split] += 1
            table["bg"]["Total"] += 1

        # Class counts; the valid id range follows CLASS_ORDER instead of a
        # hard-coded upper bound, ids outside it are ignored.
        for cid in label_ids:
            if 0 <= cid < len(CLASS_ORDER):
                row = CLASS_ORDER[cid]
                table[row][split] += 1
                table[row]["Total"] += 1

    df = pd.DataFrame({c: [table[row][c] for row in ROWS] for c in COLS}, index=ROWS).astype(int)
    return df

# -------------------- 실행 & 출력 --------------------
df_counts = build_split_totals()

# Centre the text of every data cell first, then of every header (th) cell.
styled = df_counts.style.set_properties(**{"text-align": "center"})
styled = styled.set_table_styles(
    [{"selector": "th", "props": [("text-align", "center")]}]
)

display(styled)


Unnamed: 0,Total,train,val
Image,10573,9819,754
bg,557,536,21
Divot,12583,11809,774
Fixed_Divot,15462,14189,1273
Diseased_Grass,1725,1586,139
Confused_Object,0,0,0
Pole,323,300,23
Sprinkler,235,229,6
Drain,759,723,36
Golf ball,551,513,38
