In [16]:
import os
from pathlib import Path
from typing import Dict, List, Tuple

import cv2
import numpy as np
import xml.etree.ElementTree as ET


In [18]:
def imwrite_unicode(path: str | Path, img: np.ndarray) -> bool:
    """Encode ``img`` in memory and write the bytes with Python's own file API.

    The extension of ``path`` is handed to ``cv2.imencode`` to select the
    encoder; the resulting buffer is then written through ``open(..., "wb")``.

    NOTE(review): presumably this exists because ``cv2.imwrite`` has trouble
    with non-ASCII paths on some platforms -- confirm.

    Args:
        path: Destination file; its extension picks the image format.
        img: Image array to encode.

    Returns:
        True when the file was written, False when ``cv2.imencode`` reported
        failure (nothing is written in that case).
    """
    target = str(path)
    _, extension = os.path.splitext(target)
    success, encoded = cv2.imencode(extension, img)
    if success:
        with open(target, "wb") as fh:
            fh.write(encoded.tobytes())
        return True
    return False


def imread_unicode(path: str | Path, flags=cv2.IMREAD_GRAYSCALE) -> np.ndarray | None:
    """Read an image by loading the raw bytes in Python and decoding with OpenCV.

    Args:
        path: File to read.
        flags: Decode flags forwarded to ``cv2.imdecode`` (grayscale by default).

    Returns:
        Whatever ``cv2.imdecode`` returns for the file's bytes, or None when
        reading or decoding raised an exception (the error is printed, not
        re-raised).
    """
    location = str(path)
    try:
        with open(location, "rb") as fh:
            encoded = np.frombuffer(fh.read(), np.uint8)
        return cv2.imdecode(encoded, flags)
    except Exception as err:
        # Best-effort reader: report the problem and let the caller decide.
        print("[imread_unicode ERROR]", err, "->", location)
        return None


In [20]:
def decompose_hangul_syllable(ch: str) -> Tuple[int, int, int]:
    """Split a precomposed Hangul syllable into its three jamo indices.

    Args:
        ch: Exactly one character in the range U+AC00..U+D7A3.

    Returns:
        ``(cho, jung, jong)`` -- the initial, medial and final indices derived
        from the syllable's offset inside the block (21 medials x 28 finals per
        initial). ``jong`` is 0 for the first of the 28 final slots.

    Raises:
        ValueError: If ``ch`` is not a single character or lies outside the
            Hangul syllable range.
    """
    if len(ch) != 1:
        raise ValueError(f"Expected single char, got: {ch}")

    code = ord(ch)
    first, last = 0xAC00, 0xD7A3
    if code < first or code > last:
        raise ValueError(f"Not a Hangul syllable: {ch} (U+{code:04X})")

    # Two successive divisions peel off the initial, then the medial/final pair.
    cho, remainder = divmod(code - first, 21 * 28)
    jung, jong = divmod(remainder, 28)
    return cho, jung, jong


def index_for_output(folder_name: str, syllable_char: str) -> int:
    """Pick the jamo index that matches the folder's suffix.

    The syllable is decomposed first, so an invalid ``syllable_char`` raises
    before the folder name is inspected.

    Args:
        folder_name: Name ending in ``_chosung``, ``_jungsung`` or ``_jongsung``.
        syllable_char: Single Hangul syllable to decompose.

    Returns:
        The initial, medial or final index of ``syllable_char``, chosen by the
        suffix of ``folder_name``.

    Raises:
        ValueError: If the syllable is invalid or the folder name has none of
            the three suffixes.
    """
    parts = decompose_hangul_syllable(syllable_char)

    # Suffix order mirrors the (cho, jung, jong) tuple order.
    for position, suffix in enumerate(("_chosung", "_jungsung", "_jongsung")):
        if folder_name.endswith(suffix):
            return parts[position]

    raise ValueError(f"Folder name must end with _chosung/_jungsung/_jongsung: {folder_name}")


In [64]:
# Pixel value written into a label mask for each annotation label.
# Masks start as all zeros, so 0 is what unlabeled pixels keep.
LABEL_TO_ID = {"chosung": 1, "jungsung": 2, "jongsung": 3}

def parse_points(points_str: str) -> np.ndarray:
    """Parse a ``"x1,y1;x2,y2;..."`` string into an ``(N, 2)`` float32 array.

    Empty segments (for example from a trailing ``;``) are ignored.

    Args:
        points_str: Semicolon-separated ``x,y`` pairs.

    Returns:
        Array of the parsed vertices, one ``[x, y]`` row per pair.

    Raises:
        ValueError: If a pair is malformed or fewer than three vertices are
            present.
    """
    vertices = []
    for token in (chunk.strip() for chunk in points_str.strip().split(";")):
        if token:
            x_text, y_text = token.split(",")
            vertices.append([float(x_text), float(y_text)])

    if len(vertices) >= 3:
        return np.array(vertices, dtype=np.float32)
    raise ValueError(f"Polygon needs >=3 points: {points_str}")


def polygons_to_region_mask(h: int, w: int, polygons: List[np.ndarray]) -> np.ndarray:
    """Rasterize polygons into a single ``(h, w)`` uint8 mask.

    Args:
        h: Mask height in pixels.
        w: Mask width in pixels.
        polygons: Vertex arrays; each is rounded to integers and reshaped to
            ``(-1, 1, 2)`` before being passed to ``cv2.fillPoly``.

    Returns:
        Mask that is 1 where ``cv2.fillPoly`` painted and 0 elsewhere. An empty
        ``polygons`` list yields an all-zero mask.
    """
    canvas = np.zeros((h, w), dtype=np.uint8)
    if polygons:
        contours = [
            np.round(poly).astype(np.int32).reshape((-1, 1, 2))
            for poly in polygons
        ]
        cv2.fillPoly(canvas, contours, 1)
    return canvas


def _polygon_points_or_none(points_str: str | None) -> np.ndarray | None:
    """Parse ``"x1,y1;x2,y2;..."`` into an ``(N, 2)`` float32 array; None if no vertex is present."""
    if not points_str:  # guard against None or an empty string
        return None
    pts = []
    for pair in points_str.strip().split(";"):
        pair = pair.strip()
        if not pair:
            continue
        x, y = pair.split(",")
        pts.append([float(x), float(y)])
    if not pts:
        # A string made only of separators carries no vertices to rasterize.
        return None
    return np.array(pts, dtype=np.float32)


def _box_to_points(box_el: ET.Element) -> np.ndarray:
    """Turn a CVAT-style box element (xtl, ytl, xbr, ybr) into a 4-vertex polygon."""
    xtl = float(box_el.get("xtl"))
    ytl = float(box_el.get("ytl"))
    xbr = float(box_el.get("xbr"))
    ybr = float(box_el.get("ybr"))
    return np.array([[xtl, ytl], [xbr, ytl], [xbr, ybr], [xtl, ybr]], dtype=np.float32)


def build_label_mask_from_xml(
    xml_path: str | Path,
    images_dir: str | Path,
    white_thresh: int = 10,
    priority: Tuple[str, ...] = ("jongsung", "jungsung", "chosung"),
) -> Dict[str, Dict]:
    """Build a per-pixel label mask for every ``<image>`` in an annotation XML.

    For each image, the ``polygon`` and ``box`` children whose ``label`` is a
    key of ``LABEL_TO_ID`` are rasterized. A pixel receives a label only if it
    lies inside such a region AND its grayscale value is greater than
    ``white_thresh``; all other pixels stay 0.

    Args:
        xml_path: Annotation file with ``<image name=... width=... height=...>``
            elements.
        images_dir: Directory the image ``name`` attributes are relative to.
        white_thresh: Pixels with a grayscale value above this are eligible
            for labeling.
        priority: Label names from highest to lowest precedence. Where regions
            overlap, the label listed earliest wins. Names that are not in
            ``LABEL_TO_ID`` are ignored; known labels missing from ``priority``
            get the lowest precedence. The default reproduces the previous
            fixed order (jongsung over jungsung over chosung).

    Returns:
        ``{image name: {"orig": grayscale image, "label_mask": uint8 mask}}``
        for every image that could be read. Unreadable images are skipped with
        a warning.

    Raises:
        ValueError: If a polygon's ``points`` string contains a malformed pair.
        TypeError: If a ``box`` element lacks one of its corner attributes.
    """
    xml_path = Path(xml_path)
    images_dir = Path(images_dir)

    root = ET.parse(xml_path).getroot()

    # Later paints overwrite earlier ones, so paint from lowest to highest
    # precedence: labels not mentioned in `priority` first, then `priority`
    # in reverse.
    paint_order = [lbl for lbl in LABEL_TO_ID if lbl not in priority]
    paint_order += [lbl for lbl in reversed(priority) if lbl in LABEL_TO_ID]

    out: Dict[str, Dict] = {}

    for img_el in root.findall("image"):
        img_name = img_el.get("name")
        if not img_name:
            print("[WARN] <image> element without a name attribute; skipped")
            continue

        img_path = images_dir / img_name
        orig = imread_unicode(img_path, flags=cv2.IMREAD_GRAYSCALE)
        if orig is None:
            print("[WARN] Missing image:", img_path)
            continue

        # The mask always follows the real image size. The size declared in
        # the XML is only used to flag a mismatch, because the annotation
        # coordinates are then likely expressed in a different pixel grid.
        h, w = orig.shape[:2]
        try:
            declared = (int(img_el.get("height")), int(img_el.get("width")))
        except (TypeError, ValueError):
            declared = None
        if declared is not None and declared != (h, w):
            print(
                f"[WARN] Size mismatch for {img_name}: "
                f"XML says {declared[1]}x{declared[0]}, image is {w}x{h}"
            )

        label_polys: Dict[str, List[np.ndarray]] = {k: [] for k in LABEL_TO_ID}

        for el in img_el:
            if el.tag not in ("polygon", "box"):
                continue

            label = el.get("label")
            if label not in LABEL_TO_ID:
                continue

            if el.tag == "polygon":
                pts = _polygon_points_or_none(el.get("points"))
                if pts is None:
                    continue
            else:  # box
                pts = _box_to_points(el)

            label_polys[label].append(pts)

        # Only pixels brighter than the threshold may carry a label.
        ink = (orig > white_thresh).astype(np.uint8)

        label_mask = np.zeros((h, w), dtype=np.uint8)
        for lbl in paint_order:
            region = polygons_to_region_mask(h, w, label_polys[lbl])
            keep = (region == 1) & (ink == 1)
            label_mask[keep] = LABEL_TO_ID[lbl]

        out[img_name] = {"orig": orig, "label_mask": label_mask}

    return out


In [24]:
def uplus_filename_to_char(fname: str) -> str:
    """Convert a ``U+XXXX``-style file name into the character it names.

    Args:
        fname: File name or path whose stem is ``U+`` followed by a
            hexadecimal code point, e.g. ``"U+C624.png"``.

    Returns:
        The single character with that code point.

    Raises:
        ValueError: If the stem does not start with ``U+`` or the remainder is
            not a valid hexadecimal code point.
    """
    stem = Path(fname).stem  # "U+C624"
    if stem.startswith("U+"):
        return chr(int(stem[2:], 16))
    raise ValueError(f"Unexpected filename format (expected U+XXXX): {fname}")


def make_vis_rgb(label_mask: np.ndarray) -> np.ndarray:
    """Render a label mask as a 3-channel uint8 image for visual inspection.

    Label 1 sets channel 2 to 255, label 2 sets channel 1, and label 3 sets
    channel 0; every other pixel stays black.

    Args:
        label_mask: 2-D array of label ids.

    Returns:
        ``(h, w, 3)`` uint8 array with one saturated channel per labeled pixel.
    """
    rows, cols = label_mask.shape
    canvas = np.zeros((rows, cols, 3), dtype=np.uint8)
    # (label id, channel index) pairs.
    for label_id, channel in ((1, 2), (2, 1), (3, 0)):
        canvas[label_mask == label_id, channel] = 255
    return canvas


def process_one_folder(
    folder_dir: str | Path,
    out_dir: str | Path,
    xml_name: str = "annotation.xml",
    white_thresh: int = 10,
    save_vis: bool = True,
) -> None:
    folder_dir = Path(folder_dir)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    folder_name = folder_dir.name
    base_prefix = folder_name.rsplit("_", 1)[0]  

    xml_path = folder_dir / xml_name
    if not xml_path.exists():
        print("[SKIP] no annotation.xml:", xml_path)
        return

    parsed = build_label_mask_from_xml(
        xml_path=xml_path,
        images_dir=folder_dir,
        white_thresh=white_thresh,
    )

    out_folder = out_dir / folder_name
    out_folder.mkdir(parents=True, exist_ok=True)

    vis_folder = out_folder / "_vis"
    if save_vis:
        vis_folder.mkdir(parents=True, exist_ok=True)

    for img_name, pack in parsed.items():
        label_mask = pack["label_mask"]

        ch = uplus_filename_to_char(img_name)  
        idx = index_for_output(folder_name, ch)

        out_name = f"{idx}.png"
        out_path = out_folder / out_name

        imwrite_unicode(out_path, label_mask)

        if save_vis:
            vis = make_vis_rgb(label_mask)  # BGR
            vis_path = vis_folder / out_name.replace(".png", "_vis.png")
            imwrite_unicode(vis_path, vis)

    print(f"[DONE] {folder_name}: saved {len(parsed)} masks -> {out_folder}")


In [74]:
# (1) Root directory that contains the 15 annotated folders.
#     NOTE: machine-specific absolute path; adjust before running elsewhere.
dataset_root = Path(r"D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\korean_sample\korean_sample_옥비")

# (2) Output root.
out_root = Path(r"D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\korean_sample_jamo\korean_sample_jamo_옥비")

# (3) Settings.
WHITE_THRESH = 10
SAVE_VIS = True

# Visit sub-folders in sorted order; only those holding annotations.xml are processed.
for folder in sorted(dataset_root.iterdir()):
    if folder.is_dir():
        xml_path = folder / "annotations.xml"
        if xml_path.exists():
            process_one_folder(
                folder,
                out_root,
                xml_name="annotations.xml",
                white_thresh=WHITE_THRESH,
                save_vis=SAVE_VIS,
            )
print("ALL DONE")


[DONE] horiz_jong_chosung: saved 19 masks -> D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\korean_sample_jamo\korean_sample_jamo_옥비\horiz_jong_chosung
[DONE] horiz_jong_jongsung: saved 27 masks -> D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\korean_sample_jamo\korean_sample_jamo_옥비\horiz_jong_jongsung
[DONE] horiz_jong_jungsung: saved 9 masks -> D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\korean_sample_jamo\korean_sample_jamo_옥비\horiz_jong_jungsung
[DONE] horiz_nojong_chosung: saved 19 masks -> D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\korean_sample_jamo\korean_sample_jamo_옥비\horiz_nojong_chosung
[DONE] horiz_nojong_jungsung: saved 9 masks -> D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\korean_sample_jamo\korean_sample_jamo_옥비\horiz_nojong_jungsung
[DONE] mix_jong_chosung: saved 19 masks -> D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\korean_sampl