In [25]:
import json
import re
from pathlib import Path
from collections import Counter
from typing import Iterable, Tuple, Optional,  Dict, List
import csv
from dataclasses import dataclass

In [3]:
# Matches exactly one precomposed Hangul syllable, i.e. any character in the
# range '가' (U+AC00) through '힣' (U+D7A3). Standalone jamo, Latin letters,
# digits and punctuation fall outside this range and are never matched.
HANGUL_SYLLABLE_RE = re.compile(r"[가-힣]")

In [9]:
def iter_content_sentences_from_file(json_path: Path) -> Iterable[str]:
    """Yield every non-empty sentence string found in one labelled JSON file.

    The file is expected to be a JSON object shaped like
    ``{"named_entity": [{"content": [{"sentence": "..."}, ...]}, ...]}``.
    Anything that does not fit that shape (wrong container type, missing key,
    non-string or empty sentence) is skipped silently instead of raising.

    Args:
        json_path: Path of the JSON file to read.

    Yields:
        Each ``sentence`` value that is a non-empty ``str``, in file order.

    Notes:
        Reading is best-effort: if the file cannot be opened or parsed, a
        ``[WARN]`` line is printed and the generator ends without yielding.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 unchanged and additionally strips a
        # leading BOM, which json.load would otherwise reject.
        with open(json_path, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a scan over
        # thousands of files.
        print(f"[WARN] failed to read {json_path}: {e}")
        return

    # A valid JSON document may have a list/string/number at its root; only an
    # object can carry "named_entity". Without this guard `data.get` raises
    # AttributeError outside the try block above.
    if not isinstance(data, dict):
        return

    named_entity = data.get("named_entity")
    if not isinstance(named_entity, list):
        return

    for ne in named_entity:
        if not isinstance(ne, dict):
            continue
        content = ne.get("content")
        if not isinstance(content, list):
            continue
        for item in content:
            if not isinstance(item, dict):
                continue
            s = item.get("sentence")
            if isinstance(s, str) and s:
                yield s


In [11]:
def iter_json_files_recursive(root_dir: Path) -> Iterable[Path]:
    """Lazily yield every path below ``root_dir``, at any depth, matching ``*.json``.

    Paths are produced in whatever order ``Path.rglob`` returns them.
    """
    for json_path in root_dir.rglob("*.json"):
        yield json_path


In [15]:
def count_hangul_syllables_from_root(root_dir: Path) -> Tuple[Counter, int, int]:
    """Tally Hangul syllables over every JSON file found below ``root_dir``.

    Args:
        root_dir: Directory handed to ``iter_json_files_recursive``.

    Returns:
        ``(counter, total_chars, processed)`` where ``counter`` maps each
        syllable to its number of occurrences, ``total_chars`` is the sum of
        all occurrences, and ``processed`` is the number of paths visited.
        A path counts as processed even if no sentence was read from it.

    A progress line is printed after every 200th visited path.
    """
    syllable_counts = Counter()
    n_syllables = 0
    n_files = 0

    for n_files, json_path in enumerate(iter_json_files_recursive(root_dir), start=1):
        for sentence in iter_content_sentences_from_file(json_path):
            matches = HANGUL_SYLLABLE_RE.findall(sentence)
            n_syllables += len(matches)
            syllable_counts.update(matches)

        if n_files % 200 == 0:
            print(f"...processed {n_files} files, total_hangul_chars={n_syllables:,}, unique={len(syllable_counts):,}")

    return syllable_counts, n_syllables, n_files


In [17]:
def print_top(counter: Counter, k: int = 50):
    """Print the ``k`` most common entries of ``counter``, one per line.

    Each line is the key and its count separated by a tab character.
    """
    ranked = counter.most_common(k)
    for syllable, freq in ranked:
        print(f"{syllable}\t{freq}")

In [21]:
# Run the full corpus scan and print a summary followed by the most frequent syllables.
# NOTE(review): `root_dir` is an absolute path on one specific Windows machine;
# it must be edited before this cell can run anywhere else.
root_dir = Path(r"D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\030.웹데이터 기반 한국어 말뭉치 데이터\01.데이터\1.Training\라벨링데이터\TL1")
# Number of most-frequent syllables listed at the end of this cell.
top_k = 100

# `counter` is reused by the later CSV-export cells, so this cell must run first.
counter, total_chars, n_files = count_hangul_syllables_from_root(root_dir)

print("\n===== SUMMARY =====")
print("Root:", root_dir)
print("JSON files processed:", n_files)
print("Total Hangul syllables:", f"{total_chars:,}")
print("Unique Hangul syllables:", f"{len(counter):,}")

print(f"\n===== TOP {top_k} =====")
print_top(counter, top_k)

...processed 200 files, total_hangul_chars=13,202,615, unique=1,674
...processed 400 files, total_hangul_chars=26,680,904, unique=1,779
...processed 600 files, total_hangul_chars=41,687,496, unique=1,862
...processed 800 files, total_hangul_chars=62,098,610, unique=2,023
...processed 1000 files, total_hangul_chars=83,916,818, unique=2,146
...processed 1200 files, total_hangul_chars=101,193,942, unique=2,211
...processed 1400 files, total_hangul_chars=112,820,723, unique=2,229
...processed 1600 files, total_hangul_chars=125,048,484, unique=2,258
...processed 1800 files, total_hangul_chars=133,398,476, unique=2,265
...processed 2000 files, total_hangul_chars=143,718,479, unique=2,268
...processed 2200 files, total_hangul_chars=154,830,040, unique=2,275
...processed 2400 files, total_hangul_chars=164,775,948, unique=2,278
...processed 2600 files, total_hangul_chars=175,092,235, unique=2,283
...processed 2800 files, total_hangul_chars=187,753,025, unique=2,289
...processed 3000 files, tota

In [33]:
# Inclusive code-point bounds used by decompose_syllable to decide whether a
# character is a precomposed Hangul syllable.
HANGUL_BASE = 0xAC00
HANGUL_END  = 0xD7A3
# Number of initial-consonant, medial-vowel and final-consonant slots.
# decompose_syllable uses N_JUNG and N_JONG as the radices when splitting a
# syllable offset into indices; final index 0 is treated as "no final
# consonant" by classify_6group. N_CHO is not referenced by the functions in
# this cell.
N_CHO  = 19
N_JUNG = 21
N_JONG = 28

# Medial vowels, positioned so that JUNG_LIST[jung] is the vowel for the
# `jung` index returned by decompose_syllable.
JUNG_LIST = [
    "ㅏ","ㅐ","ㅑ","ㅒ","ㅓ","ㅔ","ㅕ","ㅖ","ㅗ","ㅘ","ㅙ","ㅚ",
    "ㅛ","ㅜ","ㅝ","ㅞ","ㅟ","ㅠ","ㅡ","ㅢ","ㅣ"
]

# The three sets below split the 21 vowels of JUNG_LIST into disjoint groups;
# vowel_shape maps them to the labels "vertical", "horizontal" and "mix".
# NOTE(review): presumably the grouping reflects the orientation of the vowel
# relative to the initial consonant in handwriting - confirm with the
# analysis that consumes these labels.
VERTICAL_VOWELS = {"ㅏ","ㅐ","ㅑ","ㅒ","ㅓ","ㅔ","ㅕ","ㅖ","ㅣ"}
HORIZONTAL_VOWELS = {"ㅗ","ㅛ","ㅜ","ㅠ","ㅡ"}
MIX_VOWELS = {"ㅘ","ㅙ","ㅚ","ㅝ","ㅞ","ㅟ","ㅢ"}

def decompose_syllable(ch: str) -> Optional[Tuple[int, int, int]]:
    """Split one precomposed Hangul syllable into its jamo indices.

    Args:
        ch: The character to decompose.

    Returns:
        ``(cho, jung, jong)``, the initial-consonant, medial-vowel and
        final-consonant indices, or ``None`` when ``ch`` is falsy, is not
        exactly one character long, or lies outside
        ``HANGUL_BASE..HANGUL_END``.
    """
    if not (ch and len(ch) == 1):
        return None

    codepoint = ord(ch)
    if not (HANGUL_BASE <= codepoint <= HANGUL_END):
        return None

    # The offset is a mixed-radix number: cho * (N_JUNG * N_JONG) + jung * N_JONG + jong.
    cho, within_cho = divmod(codepoint - HANGUL_BASE, N_JUNG * N_JONG)
    jung, jong = divmod(within_cho, N_JONG)
    return cho, jung, jong

def vowel_shape(jung_idx: int) -> str:
    """Return the shape label of the medial vowel at ``JUNG_LIST[jung_idx]``.

    The result is ``"vertical"``, ``"horizontal"`` or ``"mix"`` depending on
    which vowel set contains the vowel, and ``"unknown"`` if none does.
    """
    vowel = JUNG_LIST[jung_idx]
    shape_sets = (
        ("vertical", VERTICAL_VOWELS),
        ("horizontal", HORIZONTAL_VOWELS),
        ("mix", MIX_VOWELS),
    )
    for label, members in shape_sets:
        if vowel in members:
            return label
    return "unknown"

def classify_6group(ch: str) -> Optional[str]:
    """Label a Hangul syllable as ``"<vowel shape>_jong"`` or ``"<vowel shape>_no_jong"``.

    The vowel shape comes from ``vowel_shape``; the suffix is ``"jong"`` when
    the final-consonant index is non-zero and ``"no_jong"`` otherwise.
    Returns ``None`` when ``decompose_syllable`` rejects ``ch``.
    """
    parts = decompose_syllable(ch)
    if parts is None:
        return None

    jung, jong = parts[1], parts[2]
    shape = vowel_shape(jung)
    suffix = "no_jong" if jong == 0 else "jong"
    return f"{shape}_{suffix}"

In [27]:
def _write_rows_csv(path: Path, fieldnames, rows) -> None:
    """Write ``rows`` (dicts keyed by ``fieldnames``) to ``path`` as a UTF-8 CSV with a header."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows)


def save_counter_with_6group_csv(
    counter,
    out_csv: Path,
    top_n: Optional[int] = 1000,
    save_all_groups_csv: bool = True
):
    """Export syllable frequencies, annotated with their 6-group label, to CSV.

    Each written row has the columns ``char``, ``count``, ``group6``,
    ``vowel_shape``, ``has_jong`` (0/1) and ``jung_vowel``. Rows are ordered
    from most to least frequent.

    Args:
        counter: Mapping with a ``most_common()`` method (e.g. a
            ``collections.Counter``) from character to occurrence count.
        out_csv: Destination of the combined CSV. Missing parent directories
            are created.
        top_n: Maximum number of rows to write. ``None`` or ``0`` writes every
            classifiable syllable. The limit is applied after characters
            rejected by ``classify_6group`` have been dropped, so such
            characters never use up one of the ``top_n`` slots.
        save_all_groups_csv: When true, additionally write one CSV per group
            next to ``out_csv``, named ``<stem>__<group><suffix>``, containing
            only that group's rows.

    Returns:
        None. The path and row count of every written file are printed.
    """
    out_csv = Path(out_csv)
    out_csv.parent.mkdir(parents=True, exist_ok=True)

    fieldnames = ["char", "count", "group6", "vowel_shape", "has_jong", "jung_vowel"]

    rows = []
    for ch, cnt in counter.most_common():
        # Checked before classifying so the cap counts written rows only.
        if top_n and len(rows) >= top_n:
            break
        grp = classify_6group(ch)
        if grp is None:
            # Not a decomposable Hangul syllable: leave it out of the export.
            continue
        _, jung, jong = decompose_syllable(ch)
        rows.append({
            "char": ch,
            "count": cnt,
            "group6": grp,
            "vowel_shape": vowel_shape(jung),
            "has_jong": int(jong != 0),
            "jung_vowel": JUNG_LIST[jung],
        })

    _write_rows_csv(out_csv, fieldnames, rows)
    print("Saved:", out_csv.resolve(), f"(rows={len(rows)})")

    if save_all_groups_csv:
        # Groups appear in order of their most frequent member.
        by_group = {}
        for r in rows:
            by_group.setdefault(r["group6"], []).append(r)

        for g, g_rows in by_group.items():
            g_path = out_csv.with_name(out_csv.stem + f"__{g}" + out_csv.suffix)
            _write_rows_csv(g_path, fieldnames, g_rows)
            print("Saved group:", g_path.resolve(), f"(rows={len(g_rows)})")


In [35]:
# Destination of the combined frequency CSV; the per-group CSVs are written next to it.
# NOTE(review): the file name says "top1000", but the call below passes
# top_n=None, which writes every syllable (not just the top 1000).
# NOTE(review): machine-specific absolute Windows path.
out_csv = Path(r"D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\hangul_syllable_frequency_top1000.csv")

save_counter_with_6group_csv(
    counter=counter,
    out_csv=out_csv,
    top_n=None,                 # None saves all syllables; pass e.g. 1000 to keep only the top 1000
    save_all_groups_csv=True    # also write a separate CSV for each group
)


Saved: D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\hangul_syllable_frequency_top1000.csv (rows=2841)
Saved group: D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\hangul_syllable_frequency_top1000__vertical_no_jong.csv (rows=160)
Saved group: D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\hangul_syllable_frequency_top1000__horizontal_jong.csv (rows=714)
Saved group: D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\hangul_syllable_frequency_top1000__mix_no_jong.csv (rows=112)
Saved group: D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\hangul_syllable_frequency_top1000__horizontal_no_jong.csv (rows=94)
Saved group: D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\hangul_syllable_frequency_top1000__vertical_jong.csv (rows=1303)
Saved group: D:\Study\학교강의\4학년2학기\캡스톤\Baram_Handwritting_Analysis\hangul_dataset\hangul_syllable_frequency_top1000__mix_jong.csv (rows=458)


In [None]:
# Plain two-column (char, count) export of the syllable frequency table.
save_top = None  # None or 0 -> every syllable; an int N -> only the N most frequent

rows = counter.most_common(save_top) if save_top else counter.most_common()

# Write to a sibling of `out_csv` instead of `out_csv` itself: the cell that
# calls save_counter_with_6group_csv already stored the 6-column table at
# `out_csv`, and reopening that path in "w" mode here would replace it with
# this smaller table whenever the notebook is run top to bottom.
plain_csv = out_csv.with_name(out_csv.stem + "__char_count" + out_csv.suffix)

with open(plain_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["char", "count"])
    # `rows` is a list of (char, count) tuples, one CSV row each.
    w.writerows(rows)

print("Saved:", plain_csv.resolve())