In [1]:
import time
from pathlib import Path

import pandas as pd

from mandarin_tamer.mandarin_tamer import convert_mandarin_script

In [2]:
def convert_csv_sentences(input_file: str, output_dir: str, target_script: str = "zh_cn") -> None:
    """Round-trip the first CSV column through ``convert_mandarin_script`` and report timings.

    Every value in the first column of ``input_file`` is converted to
    ``target_script`` and the result is then converted to the opposite script
    ("zh_tw" when the target is "zh_cn", otherwise "zh_cn"). Three columns are
    added to the data before it is written out:

    * ``converted_sentence`` - the value converted to ``target_script``
    * ``reconverted_sentence`` - ``converted_sentence`` converted to the opposite script
    * ``same_as_original`` - whether ``reconverted_sentence`` equals the original value

    The output is saved as ``<output_dir>/tested_<input file name>``; the
    directory is created if it does not exist. A short summary (output path,
    row count, total time, average forward-conversion time) is printed.

    Args:
        input_file: Path of the CSV to read. Only its first column is converted.
        output_dir: Directory that receives the ``tested_*`` CSV.
        target_script: Script code passed to ``convert_mandarin_script`` for the
            first conversion.

    Returns:
        None. The results are the written CSV file and the printed summary.

    Note:
        The round-trip comparison presumably expects the input to be written in
        the opposite script of ``target_script``; this is not checked here.
    """
    # perf_counter() is monotonic and high-resolution. time.time() is the wall
    # clock: it can jump when the system clock is adjusted and may be too coarse
    # to resolve a single fast conversion.
    started = time.perf_counter()

    df = pd.read_csv(input_file)
    row_count = len(df)

    # Hold on to the source column once, so the final comparison uses the values
    # as they were read regardless of the columns added below.
    original = df.iloc[:, 0]

    # Script used for the way back.
    opposite_script = "zh_tw" if target_script == "zh_cn" else "zh_cn"

    # Durations (seconds) of each forward conversion; the reverse pass is not timed.
    row_times = []

    def timed_convert(text, target):
        """Convert ``text`` to ``target`` and record how long the call took."""
        row_started = time.perf_counter()
        result = convert_mandarin_script(text, target_script=target)
        row_times.append(time.perf_counter() - row_started)
        return result

    # Forward pass (timed per row), then the reverse pass on the converted text.
    df["converted_sentence"] = original.apply(lambda text: timed_convert(text, target_script))
    df["reconverted_sentence"] = df["converted_sentence"].apply(
        lambda text: convert_mandarin_script(text, target_script=opposite_script)
    )

    # Element-wise equality: True where the round trip reproduced the input.
    df["same_as_original"] = df["reconverted_sentence"] == original

    # Write next to any previous results, creating the directory on first use.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    output_file = out_dir / f"tested_{Path(input_file).name}"
    df.to_csv(output_file, index=False)

    # Total covers reading, both conversion passes and writing; the average
    # covers only the timed forward conversions.
    total_time = time.perf_counter() - started
    avg_time_per_row = sum(row_times) / len(row_times) if row_times else 0

    print(f"Conversion complete. Output saved to: {output_file}")
    print(f"Processed {row_count} rows in {total_time:.2f} seconds")
    print(f"Average conversion time per row: {avg_time_per_row * 1000:.2f} ms")

In [3]:
# Run the round-trip check on the sample CSV: convert to zh_cn first, then back.
# The annotated copy is written to the "output" directory.
input_csv = "sentence_csvs/trad_tatoeba_sentences_sample.csv"
convert_csv_sentences(
    input_file=input_csv,
    output_dir="output",
    target_script="zh_cn",
)

Conversion complete. Output saved to: output\tested_trad_tatoeba_sentences_sample.csv
