In [1]:
from pathlib import Path

import pandas as pd

from mandarin_tamer.mandarin_tamer import convert_mandarin_script

In [2]:
def convert_csv_sentences(input_file: str, output_dir: str, target_script: str = "zh_cn") -> None:
    """Round-trip the first column of a CSV through ``convert_mandarin_script``.

    Every value in the first column of ``input_file`` is converted to
    ``target_script``, and that result is then converted to the opposite
    script. Three columns are appended to the table -- ``converted_sentence``,
    ``reconverted_sentence`` and ``same_as_original`` -- and the whole table is
    written to ``<output_dir>/tested_<input file name>``.

    Args:
        input_file: Path of the CSV to read; sentences are taken from its
            first column.
        output_dir: Directory for the result file. Created (including
            parents) if it does not exist.
        target_script: Script code for the first conversion. ``"zh_cn"`` is
            reconverted with ``"zh_tw"``; any other value is reconverted with
            ``"zh_cn"``.

    Returns:
        None. The result is written to disk and the output path is printed.
    """
    df = pd.read_csv(input_file)
    # Capture the source column once, before any columns are appended.
    original = df.iloc[:, 0]

    # Create the output location before the per-row conversion so an unusable
    # directory fails now rather than after all the conversion work is done.
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)
    output_file = destination / f"tested_{Path(input_file).name}"

    # Script used for the return leg of the round trip.
    opposite_script = "zh_tw" if target_script == "zh_cn" else "zh_cn"

    # pandas reads empty cells as NaN, which is not text; na_action="ignore"
    # keeps those cells missing instead of handing them to the converter.
    df["converted_sentence"] = original.map(
        lambda text: convert_mandarin_script(text, target_script=target_script),
        na_action="ignore",
    )
    df["reconverted_sentence"] = df["converted_sentence"].map(
        lambda text: convert_mandarin_script(text, target_script=opposite_script),
        na_action="ignore",
    )

    # NaN never compares equal to NaN, so rows that had no sentence to begin
    # with are matched explicitly rather than reported as round-trip failures.
    reconverted = df["reconverted_sentence"]
    df["same_as_original"] = (reconverted == original) | (reconverted.isna() & original.isna())

    df.to_csv(output_file, index=False)
    print(f"Conversion complete. Output saved to: {output_file}")

In [3]:
# Run the round-trip check on the sample CSV, converting to "zh_cn" first.
# The result is written to output/tested_<input file name>.
input_csv = "sentence_csvs/trad_tatoeba_sentences_sample.csv"
convert_csv_sentences(
    input_file=input_csv,
    output_dir="output",
    target_script="zh_cn",
)


Phrase conversion timing breakdown [modern_trad]:
  - [modern_trad] Trie building took: 0.000s
  - [modern_trad] Finding matches took: 0.000s
  - [modern_trad] Number of matches found: 0
  - [modern_trad] Applying replacements took: 0.000s
Total [modern_trad] phrase conversion took: 0.000s

Phrase conversion timing breakdown [norm_trad]:
  - [norm_trad] Trie building took: 0.001s
  - [norm_trad] Finding matches took: 0.000s
  - [norm_trad] Number of matches found: 0
  - [norm_trad] Applying replacements took: 0.000s
Total [norm_trad] phrase conversion took: 0.001s

One-to-many conversion timing breakdown [norm_trad]:
[norm_trad] Using OpenCC mode:
  - [norm_trad] OpenCC conversion took: 0.276s
  - [norm_trad] Found 0 characters to convert
  - [norm_trad] Character replacement took: 0.000s
  - [norm_trad] Protecting indexes took: 0.000s
Total [norm_trad] one-to-many conversion took: 0.276s

Phrase conversion timing breakdown [tw2t]:
  - [tw2t] Trie building took: 0.001s
  - [tw2t] Find