# Combine and Phonemize

In [1]:
import os
import csv
import ast
from Transliterate.transliterate.tokenizer import ThaiTokenizer

# Module-level tokenizer instance; combine_thai_texts calls its
# phonemize() method on each word.
thai_tokenizer = ThaiTokenizer()

def format_phoneme(phoneme_str):
    """Turn the textual form of a phoneme list into one spaced string.

    Args:
        phoneme_str (str): Python-literal text of a sequence of strings,
            e.g. "['k', 'a']". It is parsed with ast.literal_eval, so only
            literals are accepted (no arbitrary expressions).

    Returns:
        str: The parsed items joined with a single space.
    """
    return ' '.join(ast.literal_eval(phoneme_str))

def _phonemize_words(words, source_name):
    """Phonemize each word with the module-level tokenizer.

    Args:
        words (list[str]): Non-empty word strings from one sentence.
        source_name (str): File name used only in the error message.

    Returns:
        tuple[list[str], bool]: The space-joined phoneme string of every word
        that converted, and a flag that is True only if no word failed.
    """
    phoneme_parts = []
    all_converted = True
    for word in words:
        try:
            phoneme = thai_tokenizer.phonemize(word)
            phoneme_parts.append(' '.join(phoneme))
        except Exception as e:
            # Best-effort: report the failing word and keep going so that
            # every problematic word in the sentence gets logged.
            print(f"Error processing word '{word}' in file {source_name}: {str(e)}")
            all_converted = False
    return phoneme_parts, all_converted


def combine_thai_texts(input_dir, output_dir):
    """
    Process all Thai text files in a directory and create two CSV files:
    1. Thai sentences only (all sentences)
    2. Thai sentences with phonemes (only sentences that successfully converted to phonemes)

    Only the first line of each ``.txt`` file is used. That line is split on
    '|' into words; the words are concatenated (without separator) to form the
    sentence, and each word is phonemized separately. Word phonemes are joined
    with ' _ ' in the phoneme CSV.

    Args:
        input_dir (str): Path to input directory containing text files
        output_dir (str): Path to output directory for CSV files
    """
    thai_sentences = []
    sentence_phoneme_pairs = []

    # Sorted so the row order of the CSVs is deterministic.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.txt'):
            continue

        input_path = os.path.join(input_dir, filename)

        # Only the first line carries the Thai text, so read just that line
        # instead of loading the whole file.
        with open(input_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()

        # Drop empty pieces produced by leading/trailing/double '|'.
        words = [word for word in first_line.strip().split('|') if word]

        if words:
            thai_sentence = ''.join(words)

            # Every non-empty sentence goes into the sentences-only CSV.
            thai_sentences.append([thai_sentence])

            # A sentence is paired with phonemes only if all words converted.
            phoneme_parts, all_converted = _phonemize_words(words, filename)
            if all_converted:
                full_phoneme = ' _ '.join(phoneme_parts)
                sentence_phoneme_pairs.append([thai_sentence, full_phoneme])

        print(f"Processed {filename}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    sentences_file = os.path.join(output_dir, 'combined_tsync2_thai_sentences.csv')
    phonemes_file = os.path.join(output_dir, 'combined_tsync2_thai_sentences_phoneme.csv')

    # newline='' lets the csv module control line endings itself.
    with open(sentences_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['thai sentence'])  # Header
        writer.writerows(thai_sentences)

    with open(phonemes_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['thai sentence', 'thai phoneme'])  # Header
        writer.writerows(sentence_phoneme_pairs)

    print("\nCreated CSV files:")
    print(f"1. Thai sentences only: {sentences_file} ({len(thai_sentences)} sentences)")
    print(f"2. Thai sentences with phonemes: {phonemes_file} ({len(sentence_phoneme_pairs)} sentences)")
    print(f"Sentences with failed phoneme conversion: {len(thai_sentences) - len(sentence_phoneme_pairs)}")

if __name__ == "__main__":
    # Source directory holding the .txt files.
    input_dir = "tsync2/wrd_ph"
    # Destination directory for the generated CSV files.
    output_dir = "tsync2/processed"

    combine_thai_texts(input_dir=input_dir, output_dir=output_dir)

Processed tsync2_noon_0_1228.txt
Processed tsync2_noon_0_1866.txt
Processed tsync2_noon_0_1943.txt
Processed tsync2_noon_0_250.txt
Processed tsync2_noon_0_3012_1.txt
Processed tsync2_noon_0_3012_2.txt
Processed tsync2_noon_0_3064.txt
Processed tsync2_noon_0_3685.txt
Processed tsync2_noon_0_4151.txt
Processed tsync2_noon_0_4177.txt
Processed tsync2_noon_0_4412.txt
Processed tsync2_noon_0_4453_1.txt
Processed tsync2_noon_0_4453_2.txt
Processed tsync2_noon_0_4453_3.txt
Processed tsync2_noon_0_4567.txt
Processed tsync2_noon_0_5158.txt
Processed tsync2_noon_0_6063.txt
Processed tsync2_noon_0_6448.txt
Processed tsync2_noon_0_6450_1.txt
Processed tsync2_noon_0_6450_2.txt
Processed tsync2_noon_0_6594.txt
Processed tsync2_noon_0_7124.txt
Processed tsync2_noon_0_7137.txt
Processed tsync2_noon_0_728_1.txt
Processed tsync2_noon_0_728_2.txt
Processed tsync2_noon_0_9087.txt
Processed tsync2_noon_0_9168.txt
Processed tsync2_noon_0_9241.txt
Processed tsync2_noon_10_11088.txt
Processed tsync2_noon_10_1