# Combine the tsync2 sentence files into CSV files (filtered and unfiltered)

In [1]:
import os
import csv
from tqdm import tqdm

input_folder = "tsync2/wrd_ph"
output_file = "tsync2/processed/combined_tsync2_thai_sentences.csv"
output_file_no_filter = "tsync2/processed/combined_tsync2_thai_sentences_no_filter.csv"

# Sentences containing any of these characters are kept out of the filtered CSV.
EXCLUDED_CHARS = ('ๆ', 'ฯ')


def _read_thai_sentence(path):
    """Return the first line of *path* with the '|' word separators removed.

    Only the first line is read; the rest of the file is never loaded.
    An empty string is returned when the file is empty or the first line
    holds nothing but whitespace and separators.
    """
    with open(path, 'r', encoding='utf-8') as f:
        first_line = f.readline()
    # Dropping every '|' is equivalent to splitting on it, discarding the
    # empty pieces and joining the remainder back together.
    return first_line.strip().replace('|', '')


def _write_sentences(path, rows):
    """Write *rows* (one single-column row per sentence) to a CSV at *path*."""
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['thai sentence'])  # Header
        writer.writerows(rows)


# Each entry is a one-element list so it can be passed straight to csv.writer
thai_sentences_filtered = []
thai_sentences_unfiltered = []

# Sorted so the row order of the output does not depend on directory order
files = sorted(f for f in os.listdir(input_folder) if f.endswith('.txt'))

# Process each .txt file in the input directory with progress bar
for filename in tqdm(files, desc="Processing files"):
    thai_sentence = _read_thai_sentence(os.path.join(input_folder, filename))

    # Files that yield no text contribute to neither list
    if not thai_sentence:
        continue

    # Always add to unfiltered list
    thai_sentences_unfiltered.append([thai_sentence])

    # Add to filtered list only if it contains none of the excluded characters
    if not any(char in thai_sentence for char in EXCLUDED_CHARS):
        thai_sentences_filtered.append([thai_sentence])

# Create output directory if it doesn't exist (both CSVs share this directory)
os.makedirs(os.path.dirname(output_file), exist_ok=True)

print("\nWriting filtered sentences...")
_write_sentences(output_file, thai_sentences_filtered)

print("Writing unfiltered sentences...")
_write_sentences(output_file_no_filter, thai_sentences_unfiltered)

print("\nCreated CSV files:")
print(f"1. Filtered sentences: {output_file} ({len(thai_sentences_filtered)} sentences)")
print(f"2. Unfiltered sentences: {output_file_no_filter} ({len(thai_sentences_unfiltered)} sentences)")
print(f"Filtered out sentences: {len(thai_sentences_unfiltered) - len(thai_sentences_filtered)}")

Processing files:   0%|          | 0/2710 [00:00<?, ?it/s]

Processing files: 100%|██████████| 2710/2710 [00:00<00:00, 11430.64it/s]


Writing filtered sentences...
Writing unfiltered sentences...

Created CSV files:
1. Filtered sentences: tsync2/processed/combined_tsync2_thai_sentences.csv (2709 sentences)
2. Unfiltered sentences: tsync2/processed/combined_tsync2_thai_sentences_no_filter.csv (2710 sentences)
Filtered out sentences: 1





# Phonemize the combined (filtered) sentence file

In [2]:
import pandas as pd
from Transliterate.transliterate.phonemizer import ThaiPhonemizer, setup_espeak

# Hand the path of the local eSpeak shared library to setup_espeak
# (machine-specific Homebrew location; adjust for other installations)
_ESPEAK_LIBRARY = '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib'
setup_espeak(_ESPEAK_LIBRARY)

# One phonemizer instance is reused for every sentence
thai_phonemizer = ThaiPhonemizer()

# Source CSV (filtered sentences) and destination CSV (sentences + phonemes)
input_file = "tsync2/processed/combined_tsync2_thai_sentences.csv"
output_file = "tsync2/processed/combined_tsync2_thai_sentences_phoneme.csv"


def _phonemes_or_placeholder(sentence):
    """Return the space-joined phonemes of *sentence*.

    Missing values (NaN/None) are not phonemized; a fixed placeholder
    string is returned for them instead.
    """
    if pd.isnull(sentence):
        return "Error in Thai sentence"
    return " ".join(thai_phonemizer.phonemize(sentence))


try:
    # Load the sentences produced by the combine step
    df = pd.read_csv(input_file)

    # Add one phoneme string per sentence as a new column
    df["thai phoneme"] = df["thai sentence"].apply(_phonemes_or_placeholder)

    # Persist both columns without the DataFrame index
    df.to_csv(output_file, index=False)
    print(f"File with phonemes saved as {output_file}")

except Exception as e:
    # Any failure is reported as a one-line message rather than a traceback
    print(f"Error processing file: {e}")

File with phonemes saved as tsync2/processed/combined_tsync2_thai_sentences_phoneme.csv
