In [1]:
import os
import csv
from pathlib import Path
from typing import Dict, Optional, Tuple, List
import sys

import cv2
import numpy as np

def setup_project_path(marker: str = "craft", start: Optional[Path] = None) -> Path:
    """Return the nearest ancestor directory that contains ``marker``.

    The search begins at ``start`` (the current working directory when it is
    omitted) and moves upward one parent at a time.

    Args:
        marker: Name of the file or directory whose presence identifies the
            project root.
        start: Directory to start searching from. Defaults to ``Path.cwd()``.

    Returns:
        The first directory, going upward, for which ``directory / marker``
        exists.

    Raises:
        FileNotFoundError: If neither the start directory nor any of its
            ancestors contains ``marker``.
    """
    origin = Path.cwd() if start is None else Path(start).resolve()
    # ``Path.parents`` is finite (it ends at the filesystem root). Repeatedly
    # taking ``.parent`` is not: the root is its own parent, so a missing
    # marker would otherwise spin forever.
    for candidate in (origin, *origin.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(
        f"No directory containing '{marker}' found at or above {origin}"
    )
# Locate the project root and put it first on the import search path.
project_root = setup_project_path()
_project_root_entry = str(project_root)
# Re-running this cell must not keep prepending the same entry to sys.path.
if sys.path[:1] != [_project_root_entry]:
    sys.path.insert(0, _project_root_entry)

In [2]:
def imwrite_unicode(path: str | Path, img: np.ndarray) -> bool:
    """Encode ``img`` in memory and write the bytes with Python's file API.

    The image format is chosen from the file extension of ``path``.
    NOTE(review): presumably this exists to avoid ``cv2.imwrite`` problems with
    non-ASCII paths - confirm.

    Args:
        path: Destination file; its extension is passed to ``cv2.imencode``.
        img: Image array to encode.

    Returns:
        ``False`` when ``cv2.imencode`` reports failure (nothing is written),
        ``True`` after the encoded bytes have been written.
    """
    target = str(path)
    _, extension = os.path.splitext(target)
    encoded_ok, encoded = cv2.imencode(extension, img)
    if encoded_ok:
        with open(target, "wb") as sink:
            sink.write(encoded.tobytes())
        return True
    return False


def imread_unicode(path: str | Path, flags=cv2.IMREAD_GRAYSCALE) -> Optional[np.ndarray]:
    """Read a file's bytes with Python's file API and decode them with OpenCV.

    Args:
        path: Image file to read.
        flags: Decode flags forwarded to ``cv2.imdecode`` (grayscale by default).

    Returns:
        Whatever ``cv2.imdecode`` returns for the file's bytes, or ``None`` if
        reading or decoding raised an exception (the error is printed).
    """
    location = str(path)
    try:
        with open(location, "rb") as source:
            raw = np.frombuffer(source.read(), np.uint8)
        return cv2.imdecode(raw, flags)
    except Exception as err:
        # Best-effort loader: report the problem and let the caller handle None.
        print("[imread_unicode ERROR]", err, "->", location)
        return None


In [3]:
def decompose_hangul(ch: str) -> Tuple[int, int, int]:
    """Split a precomposed Hangul syllable into its three jamo indices.

    Args:
        ch: A single character in the range U+AC00..U+D7A3.

    Returns:
        ``(cho, jung, jong)``: the initial, medial and final indices, where the
        syllable offset equals ``(cho * 21 + jung) * 28 + jong``. ``jong`` is 0
        when the remainder modulo 28 is zero.

    Raises:
        ValueError: If ``ch`` lies outside U+AC00..U+D7A3.
    """
    first, last = 0xAC00, 0xD7A3
    codepoint = ord(ch)
    if codepoint < first or codepoint > last:
        raise ValueError("한글 음절만 지원")
    # 21 medials x 28 finals per initial; 28 finals per medial.
    cho, remainder = divmod(codepoint - first, 21 * 28)
    jung, jong = divmod(remainder, 28)
    return cho, jung, jong


# Medial (jung) indices grouped by the layout name that vowel_type() reports.
JUNG_HORIZ = {0, 1, 2, 3, 4, 5, 6, 7, 20}
JUNG_VERT = {8, 12, 13, 17, 18}
JUNG_MIX = {9, 10, 11, 14, 15, 16, 19}


def vowel_type(jung_idx: int) -> str:
    """Map a medial index to its layout name.

    Args:
        jung_idx: Medial index as produced by ``decompose_hangul``.

    Returns:
        ``"horizontal"``, ``"vertical"`` or ``"complex"`` depending on which of
        ``JUNG_HORIZ``, ``JUNG_VERT`` or ``JUNG_MIX`` contains the index.

    Raises:
        ValueError: If the index is in none of the three sets.
    """
    for members, layout in (
        (JUNG_HORIZ, "horizontal"),
        (JUNG_VERT, "vertical"),
        (JUNG_MIX, "complex"),
    ):
        if jung_idx in members:
            return layout
    raise ValueError("Unknown jung index")


In [4]:
def load_mask_255(path: Path) -> np.ndarray:
    """Load an image as grayscale and binarize it to a 0/255 ``uint8`` mask.

    Args:
        path: Image file to load via ``imread_unicode``.

    Returns:
        Array with 255 wherever the loaded pixel is greater than zero and 0
        elsewhere.

    Raises:
        FileNotFoundError: If ``imread_unicode`` returns ``None`` for ``path``.
    """
    img = imread_unicode(path, cv2.IMREAD_GRAYSCALE)
    if img is not None:
        return np.where(img > 0, 255, 0).astype(np.uint8)
    raise FileNotFoundError(path)

In [5]:
def load_parts(ch: str, jamo_root: Path) -> Dict[str, Optional[np.ndarray]]:
    """Load the jamo masks that make up one Hangul syllable.

    Files are read from
    ``jamo_root / ("jong" | "nojong") / <vowel layout> / <part folder> / <index>.png``
    where the part folders are ``chosung``, ``jungsung`` and ``jongsung``.

    Args:
        ch: Hangul syllable to decompose.
        jamo_root: Root directory of the jamo sample images.

    Returns:
        Dict with keys ``"cho"``, ``"jung"`` and ``"jong"``. ``"jong"`` is
        ``None`` when the syllable's final index is 0.
    """
    cho_idx, jung_idx, jong_idx = decompose_hangul(ch)
    layout = vowel_type(jung_idx)
    with_final = jong_idx != 0

    glyph_dir = jamo_root / ("jong" if with_final else "nojong") / layout

    cho_mask = load_mask_255(glyph_dir / "chosung" / f"{cho_idx}.png")
    jung_mask = load_mask_255(glyph_dir / "jungsung" / f"{jung_idx}.png")
    jong_mask = (
        load_mask_255(glyph_dir / "jongsung" / f"{jong_idx}.png")
        if with_final
        else None
    )
    return {"cho": cho_mask, "jung": jung_mask, "jong": jong_mask}


In [6]:
def make_affine(rng, h, w, scale_range, shift_frac):
    """Draw a random scale-about-center plus translation as a 2x3 matrix.

    Three values are drawn from ``rng.uniform`` in this order: the scale (from
    ``scale_range``), the x shift and the y shift (both from ``shift_frac``,
    then multiplied by ``w`` and ``h`` respectively).

    Args:
        rng: Object exposing ``uniform(low, high)``.
        h: Image height in pixels.
        w: Image width in pixels.
        scale_range: ``(low, high)`` bounds for the uniform scale factor.
        shift_frac: ``(low, high)`` bounds for the shift, as a fraction of the
            image width/height.

    Returns:
        ``float32`` array of shape (2, 3) that scales about the pixel center
        ``((w - 1) / 2, (h - 1) / 2)`` and then translates.
    """
    center_x = (w - 1) / 2
    center_y = (h - 1) / 2

    scale = rng.uniform(*scale_range)
    shift_x = rng.uniform(*shift_frac) * w
    shift_y = rng.uniform(*shift_frac) * h

    # (1 - scale) * center keeps the image center fixed under the scaling.
    offset_x = (1 - scale) * center_x + shift_x
    offset_y = (1 - scale) * center_y + shift_y

    top_row = [scale, 0, offset_x]
    bottom_row = [0, scale, offset_y]
    return np.array([top_row, bottom_row], dtype=np.float32)

def warp(mask, M):
    """Apply a 2x3 affine matrix to a mask and re-binarize the result.

    Args:
        mask: 2-D array (its ``shape`` is unpacked as height, width).
        M: 2x3 affine matrix passed to ``cv2.warpAffine``.

    Returns:
        ``uint8`` array of the same height and width, 255 where the warped
        mask is greater than zero and 0 elsewhere. Areas outside the source
        are filled with 0.
    """
    rows, cols = mask.shape
    # Nearest-neighbour sampling with a constant zero border.
    warped = cv2.warpAffine(
        mask,
        M,
        (cols, rows),
        flags=cv2.INTER_NEAREST,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=0,
    )
    return np.where(warped > 0, 255, 0).astype(np.uint8)

def has_overlap(cho, jung, jong):
    """Tell whether any two of the given masks share a non-zero pixel.

    Args:
        cho: Initial-consonant mask.
        jung: Vowel mask.
        jong: Final-consonant mask, or ``None`` when there is none.

    Returns:
        ``True`` if any pair among the provided masks is non-zero at the same
        position, otherwise ``False``.
    """
    cho_on = cho > 0
    jung_on = jung > 0
    if (cho_on & jung_on).any():
        return True
    if jong is None:
        return False
    jong_on = jong > 0
    return bool((cho_on & jong_on).any() or (jung_on & jong_on).any())

def augment_parts_with_retry(parts, rng, scale_range, shift_frac, max_tries=50):
    """Warp each jamo mask independently, retrying until the parts do not overlap.

    On every attempt a fresh affine is drawn for cho, then jung, then jong
    (when present), and each base mask is warped with its own matrix.

    Args:
        parts: Dict with ``"cho"``, ``"jung"`` and ``"jong"`` masks; ``"jong"``
            may be ``None``. The height/width are taken from ``parts["cho"]``.
        rng: Random generator forwarded to ``make_affine``.
        scale_range: ``(low, high)`` scale bounds forwarded to ``make_affine``.
        shift_frac: ``(low, high)`` shift bounds forwarded to ``make_affine``.
        max_tries: Maximum number of attempts; must be at least 1.

    Returns:
        Dict with the same keys holding the warped masks. This is the first
        attempt without overlap; if every attempt overlaps, the LAST attempt is
        returned as a best-effort fallback, so the result may still overlap.

    Raises:
        ValueError: If ``max_tries`` is smaller than 1. Without this check the
            loop would not run and ``None`` would be returned, which only
            fails later when the caller indexes the result.
    """
    if max_tries < 1:
        raise ValueError(f"max_tries must be at least 1, got {max_tries}")

    base_cho, base_jung, base_jong = parts["cho"], parts["jung"], parts["jong"]
    h, w = base_cho.shape

    candidate = None
    for _ in range(max_tries):
        # Draw order (cho, jung, jong) is kept fixed so a seeded rng reproduces
        # the same dataset.
        cho = warp(base_cho, make_affine(rng, h, w, scale_range, shift_frac))
        jung = warp(base_jung, make_affine(rng, h, w, scale_range, shift_frac))
        jong = (
            warp(base_jong, make_affine(rng, h, w, scale_range, shift_frac))
            if base_jong is not None
            else None
        )

        candidate = {"cho": cho, "jung": jung, "jong": jong}
        if not has_overlap(cho, jung, jong):
            break

    return candidate


In [7]:
def to_label(parts):
    """Merge the part masks into a single class-index map.

    Args:
        parts: Dict with ``"cho"``, ``"jung"`` and ``"jong"`` masks; ``"jong"``
            may be ``None``.

    Returns:
        ``uint8`` array shaped like ``parts["cho"]`` with 0 for background,
        1 for cho, 2 for jung and 3 for jong. Where masks overlap, the later
        class in that order wins.
    """
    canvas = np.zeros(parts["cho"].shape, np.uint8)

    layers = [(1, parts["cho"]), (2, parts["jung"])]
    if parts["jong"] is not None:
        layers.append((3, parts["jong"]))

    # Painted in ascending class order, so later classes overwrite earlier ones.
    for class_id, mask in layers:
        canvas[mask > 0] = class_id
    return canvas

def to_visual(parts):
    """Combine the part masks into one image by element-wise maximum.

    Args:
        parts: Dict with ``"cho"``, ``"jung"`` and ``"jong"`` masks; ``"jong"``
            may be ``None`` and is then left out.

    Returns:
        Array holding, at each position, the largest value among the provided
        masks.
    """
    layers = [parts["cho"], parts["jung"]]
    if parts["jong"] is not None:
        layers.append(parts["jong"])

    merged = np.maximum(layers[0], layers[1])
    for extra in layers[2:]:
        merged = np.maximum(merged, extra)
    return merged


In [8]:
def read_csv(csv_path: Path):
    """Read the ``char`` and ``group6`` columns of a CSV file.

    The file is opened as ``utf-8-sig``, so a leading byte-order mark is
    ignored. Any other columns are dropped.

    Args:
        csv_path: CSV file whose header contains ``char`` and ``group6``.

    Returns:
        List of ``{"char": ..., "group6": ...}`` dicts, one per data row, in
        file order.
    """
    with open(csv_path, "r", encoding="utf-8-sig") as handle:
        return [
            {"char": record["char"], "group6": record["group6"]}
            for record in csv.DictReader(handle)
        ]


In [36]:
def generate_dataset(
    csv_path: Path,
    jamo_root: Path,
    out_root: Path,
    variants_per_char=5,
    scale_range=(0.92, 1.08),
    shift_frac=(-0.03, 0.03),
    seed=42,
    max_tries=50,
    name_suffix="흰꼬리수리",
):
    """Generate augmented syllable images and matching label maps from jamo parts.

    For every row of the CSV the syllable's jamo masks are loaded once, then
    ``variants_per_char`` random augmentations are produced. Each variant is
    written twice under the same file name: the merged glyph to
    ``out_root / "images" / <group>`` and the class-index map to
    ``out_root / "labels" / <group>``.

    Args:
        csv_path: CSV with ``char`` and ``group6`` columns.
        jamo_root: Root directory of the jamo sample images (see ``load_parts``).
        out_root: Directory under which ``images`` and ``labels`` are created.
        variants_per_char: Number of augmented variants written per character.
        scale_range: ``(low, high)`` bounds of the random scale factor.
        shift_frac: ``(low, high)`` bounds of the random shift, as a fraction
            of the image size.
        seed: Seed for ``np.random.default_rng``; one generator is shared by
            all characters and variants.
        max_tries: Attempts per variant to find a non-overlapping placement.
        name_suffix: Text appended to each file name as ``_<suffix>`` before
            ``.png``. An empty string omits it. The default reproduces the
            previously hard-coded file names.

    Raises:
        ValueError: If the CSV has no rows, a ``char`` value is not exactly one
            character, or a ``group6`` value is not a known group name.
        OSError: If an output image could not be encoded.
    """
    # Accepts both the abbreviated and the full spelling of each group name.
    GROUP6_MAP = {
        "vert_no_jong": "vertical_no_jong",
        "vert_jong": "vertical_jong",
        "horiz_no_jong": "horizontal_no_jong",
        "horiz_jong": "horizontal_jong",
        "mix_no_jong": "complex_no_jong",
        "mix_jong": "complex_jong",

        "vertical_no_jong": "vertical_no_jong",
        "vertical_jong": "vertical_jong",
        "horizontal_no_jong": "horizontal_no_jong",
        "horizontal_jong": "horizontal_jong",
        "complex_no_jong": "complex_no_jong",
        "complex_jong": "complex_jong",
    }

    # Read and validate the input first so a bad CSV leaves no empty output tree.
    rows = read_csv(csv_path)
    if len(rows) == 0:
        raise ValueError("CSV is empty")

    rng = np.random.default_rng(seed)

    img_root = out_root / "images"
    label_root = out_root / "labels"

    # Output folders are exactly the canonical names the map can produce.
    for g in sorted(set(GROUP6_MAP.values())):
        (img_root / g).mkdir(parents=True, exist_ok=True)
        (label_root / g).mkdir(parents=True, exist_ok=True)

    tail = f"_{name_suffix}" if name_suffix else ""

    for r in rows:
        ch = r.get("char")
        if not ch or len(ch) != 1:
            raise ValueError(f"Invalid char in CSV row: {r}")

        raw_group = r.get("group6")
        if raw_group not in GROUP6_MAP:
            raise ValueError(f"Unknown group6 value: {raw_group}")

        group = GROUP6_MAP[raw_group]

        parts_base = load_parts(ch, jamo_root)

        stem = f"{ch}_0x{ord(ch):04x}"

        for k in range(variants_per_char):
            aug = augment_parts_with_retry(
                parts_base,
                rng=rng,
                scale_range=scale_range,
                shift_frac=shift_frac,
                max_tries=max_tries
            )

            file_name = f"{stem}_{k:02d}{tail}.png"
            outputs = (
                (img_root / group / file_name, to_visual(aug)),
                (label_root / group / file_name, to_label(aug)),
            )
            for target, image in outputs:
                # imwrite_unicode signals an encode failure by returning False;
                # ignoring it would report success with files missing.
                if not imwrite_unicode(target, image):
                    raise OSError(f"Failed to encode image for {target}")

    print("Dataset generation complete.")


In [38]:
# Name of the handwriting sample set; used in the input and output folder names.
SAMPLE_NAME = "흰꼬리수리"

# NOTE(review): absolute, machine-specific path. Consider deriving it from
# project_root once the relation between the two locations is confirmed.
DATA_ROOT = Path(r"D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\korean_sample_jamo")

csv_path  = project_root / "hangul_dataset"/"korean_char_frequence_analysis"/"hangul_syllable_frequency.csv"
out_root  = project_root / "hangul_dataset" / "korean_generated"

# Use the variables defined above so that editing them actually changes the run.
generate_dataset(
    csv_path=csv_path,
    jamo_root=DATA_ROOT / f"korean_sample_jamo_{SAMPLE_NAME}",
    out_root=out_root / f"korean_generated_{SAMPLE_NAME}",
    variants_per_char=3
)


Dataset generation complete.
