# Combine the generated data

In [1]:
import os
import pandas as pd

# Folder holding the per-run CSV files, and the two combined outputs:
# one with the "ๆ"/"ฯ" filter applied and one with every row kept.
input_folder = "generated"
output_file = "combined_sentences.csv"
output_file_no_filter = "combined_sentences_no_filter.csv"

# Per-file dataframes, accumulated in parallel (filtered / unfiltered)
dataframes = []
dataframes_no_filter = []

# os.listdir returns names in arbitrary order; sorting makes the row order of
# the combined files identical on every run and every machine.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith(".csv"):
        continue

    file_path = os.path.join(input_folder, filename)
    try:
        # header=None: the first line is data, so the two columns are named explicitly
        df_no_filter = pd.read_csv(
            file_path, header=None, names=["thai sentence", "english sentence"]
        )

        # Drop rows whose Thai sentence contains "ๆ" or "ฯ".
        # na=False makes a missing Thai sentence count as "no match", so such rows are kept.
        has_mark = df_no_filter["thai sentence"].str.contains(r"[ๆฯ]", na=False)
        df = df_no_filter[~has_mark]

        # Append both views only after the whole file was processed successfully,
        # so a failing file is skipped from both outputs.
        dataframes.append(df)
        dataframes_no_filter.append(df_no_filter)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest
        print(f"Error reading {filename}: {e}")

# Filtered output: stack the per-file frames into one table and write it out,
# unless nothing was loaded.
if not dataframes:
    print("No valid CSV files found in the folder.")
else:
    combined_df = pd.concat(dataframes, ignore_index=True)
    combined_df.to_csv(output_file, index=False)
    print(f"Combined file saved as {output_file}")

# Unfiltered output: same procedure for the frames that kept every row.
if not dataframes_no_filter:
    print("No valid CSV files found in the folder.")
else:
    combined_df_no_filter = pd.concat(dataframes_no_filter, ignore_index=True)
    combined_df_no_filter.to_csv(output_file_no_filter, index=False)
    print(f"Combined file without filter saved as {output_file_no_filter}")

Combined file saved as combined_sentences.csv
Combined file without filter saved as combined_sentences_no_filter.csv


# Phonemize the combined file

In [1]:
import os
import pandas as pd
from Transliterate.transliterate.tokenizer import EnglishTokenizer, ThaiTokenizer, setup_espeak

# Path to the eSpeak shared library handed to setup_espeak.
# The default is the Homebrew install this notebook was written against; set the
# ESPEAK_LIBRARY environment variable to use a different install without editing
# the notebook.
_ESPEAK_LIBRARY = os.environ.get(
    "ESPEAK_LIBRARY",
    '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib',
)
setup_espeak(_ESPEAK_LIBRARY)

# Initialize tokenizers.
# NOTE(review): they are created after setup_espeak, presumably because they
# depend on it — keep this order unless confirmed otherwise.
english_tokenizer = EnglishTokenizer()
thai_tokenizer = ThaiTokenizer()

# Source CSV to read, and the destination for the copy with phoneme columns added
input_file = "combined_sentences.csv"
output_file = "combined_sentences_with_phoneme.csv"


def _thai_phonemes(sentence):
    """Space-join the Thai tokenizer's phonemes; a missing sentence yields a placeholder."""
    if pd.notnull(sentence):
        return " ".join(thai_tokenizer.phonemize(sentence))
    return "Error in Thai sentence"


def _english_phonemes(sentence):
    """Space-join the English tokenizer's phonemes; a missing sentence yields a placeholder."""
    if pd.notnull(sentence):
        return " ".join(english_tokenizer.phonemize(sentence))
    return "Error in English sentence"


try:
    # Load the sentence pairs (this file has a header row)
    df = pd.read_csv(input_file)

    # Add one phoneme column per language, Thai first
    df["thai phoneme"] = df["thai sentence"].apply(_thai_phonemes)
    df["english phoneme"] = df["english sentence"].apply(_english_phonemes)

    # Write the augmented table
    df.to_csv(output_file, index=False)
    print(f"File with phonemes saved as {output_file}")

except Exception as err:
    # Any failure (read, phonemize, write) is reported instead of raised
    print(f"Error processing file: {err}")


File with phonemes saved as combined_sentences_with_phoneme.csv
