In [7]:
#!/usr/bin/env python3
"""
Translate Excel SHEET CONTENTS (cell text) using a user-provided map.json.

- Does NOT rename sheets
- Does NOT modify formulas (skips them by default)
- Does NOT overwrite map.json (read-only)
- Uses strict Hangul-with-spaces phrases (Option A) + remaining Hangul runs
- Writes:
  * translated workbook
  * missing_items.csv (Hangul detected but not present in map.json)

"""

import re
import json
import collections
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
from openpyxl import load_workbook

# -----------------------------
# CONFIG
# -----------------------------
INPUT_XLSX  = r"C:\Users\r103317\Projects\Monthly Report\.venv\25.12-36WM_full.xlsx"
OUTPUT_XLSX = "2025.12-36WM-translated.xlsx"

# Use your cleaned map file here (or rename it to map.json)
MAP_JSON = "map.json"

MISSING_CSV  = "missing_items.csv"

SKIP_FORMULAS = True
APPLY_SPACE_NORMALIZATION = True

# -----------------------------
# HANGUL DETECTION / EXTRACTION
# -----------------------------
HANGUL_CHAR_RE = re.compile(r"[\uac00-\ud7a3]")
HANGUL_PHRASE_RE = re.compile(r"[\uac00-\ud7a3]+(?:\s+[\uac00-\ud7a3]+)+")
HANGUL_RUN_RE = re.compile(r"[\uac00-\ud7a3]+")

def has_hangul(s: str) -> bool:
    return isinstance(s, str) and bool(HANGUL_CHAR_RE.search(s))

def extract_phrases_and_runs_strict(text: str) -> Tuple[List[str], List[str]]:
    phrases = HANGUL_PHRASE_RE.findall(text)
    remainder = text
    for p in phrases:
        remainder = remainder.replace(p, " ")
    runs = HANGUL_RUN_RE.findall(remainder)
    return phrases, runs

# -----------------------------
# OPTIONAL SPACE NORMALIZATION
# -----------------------------
def normalize_spaces(text: str) -> str:
    if not isinstance(text, str):
        return text
    text = re.sub(r"\)(?=[A-Za-z0-9])", ") ", text)
    text = re.sub(r"([A-Za-z])([\uac00-\ud7a3])", r"\1 \2", text)
    text = re.sub(r"([\uac00-\ud7a3])([A-Za-z])", r"\1 \2", text)
    text = re.sub(r"([\uac00-\ud7a3])(\d*Plant)", r"\1 \2", text)
    text = re.sub(r"\s{2,}", " ", text).strip()
    return text

# -----------------------------
# TOKEN PROTECTION
# -----------------------------
# Protect codes/ids/numbers; protect parentheses only if NO Hangul inside.
TOKEN_PATTERNS = [
    r"\[[^\]]+\]",
    r"\b[A-Z]{1,5}\d{0,5}\b",
    r"\b\d+(?:\.\d+)?(?:E[-+]\d+)?\b",
    r"\((?!.*[\uac00-\ud7a3])[^\)]*\)",
]
TOKEN_RE = re.compile("|".join(f"({p})" for p in TOKEN_PATTERNS))

def protect_tokens(text: str) -> Tuple[str, List[str]]:
    tokens: List[str] = []
    def repl(m):
        tokens.append(m.group(0))
        return f"__TOK{len(tokens)-1}__"
    return TOKEN_RE.sub(repl, text), tokens

def restore_tokens(text: str, tokens: List[str]) -> str:
    for i, tok in enumerate(tokens):
        text = text.replace(f"__TOK{i}__", tok)
    return text

# -----------------------------
# MAP LOADING (READ-ONLY)
# -----------------------------
def load_fragments_map(path: str) -> Dict[str, str]:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Map file not found: {p.resolve()}")
    with p.open("r", encoding="utf-8") as f:
        data = json.load(f)
    fragments = data.get("fragments", {})
    if not isinstance(fragments, dict):
        raise ValueError("Map must look like: {\"fragments\": { ... }}")
    return fragments

def build_pairs(fragments: Dict[str, str]) -> List[Tuple[str, str]]:
    keys = sorted(fragments.keys(), key=len, reverse=True)
    return [(k, fragments[k]) for k in keys]

def translate_text_with_map(text: str, pairs: List[Tuple[str, str]]) -> str:
    masked, toks = protect_tokens(text)
    out = masked
    for k, v in pairs:
        out = out.replace(k, v)
    out = restore_tokens(out, toks)
    return out

# -----------------------------
# MAIN
# -----------------------------
def main():
    fragments = load_fragments_map(MAP_JSON)
    pairs = build_pairs(fragments)

    wb = load_workbook(INPUT_XLSX, data_only=False)

    missing_counts = collections.Counter()
    missing_example = {}

    scanned = 0
    changed = 0

    for ws in wb.worksheets:
        for row in ws.iter_rows():
            for cell in row:
                v = cell.value
                if not isinstance(v, str):
                    continue
                if SKIP_FORMULAS and v.startswith("="):
                    continue

                scanned += 1
                text = v.strip()
                if APPLY_SPACE_NORMALIZATION:
                    text = normalize_spaces(text)

                if not has_hangul(text):
                    if APPLY_SPACE_NORMALIZATION and text != v:
                        cell.value = text
                    continue

                # report missing (if any)
                phrases, runs = extract_phrases_and_runs_strict(text)
                for item in phrases + runs:
                    if item and item not in fragments:
                        missing_counts[item] += 1
                        missing_example.setdefault(item, f"{ws.title}!{cell.coordinate}")

                out = translate_text_with_map(text, pairs)
                if APPLY_SPACE_NORMALIZATION:
                    out = normalize_spaces(out)

                if out != v:
                    cell.value = out
                    changed += 1

    wb.save(OUTPUT_XLSX)

    df_missing = pd.DataFrame(
        [{"hangul": k, "count": int(c), "example_cell": missing_example.get(k, "")}
         for k, c in missing_counts.most_common()]
    )
    df_missing.to_csv(MISSING_CSV, index=False, encoding="utf-8-sig")

    print("Done.")
    print(f"Output workbook: {OUTPUT_XLSX}")
    print(f"Scanned text cells (non-formula): {scanned}")
    print(f"Changed cells: {changed}")
    print(f"Missing unique Hangul items: {len(missing_counts)} -> {MISSING_CSV}")

if __name__ == "__main__":
    main()

Done.
Output workbook: 2025.12-36WM-translated.xlsx
Scanned text cells (non-formula): 43896
Changed cells: 31970
Missing unique Hangul items: 0 -> missing_items.csv
