# Import libraries

In [1]:
import os
from pathlib import Path
import shutil
from pydub import AudioSegment
from tqdm import tqdm
import torch

from TTS.bin.resample import resample_files
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

import json

from transliterate.phonemizer import ThaiPhonemizer

# Moving files to the new directory

In [2]:
# Build the phonemizer once; the conversion loop further down reuses this
# single instance for every transcript instead of re-creating it per file.
thai_phonemizer = ThaiPhonemizer()

In [None]:
# Path configuration for the TSync2 -> VCTK-style conversion.
# The dataset name appears in every location, so it is spelled out once here.
_DATASET = "TSync2"

# Raw inputs: recordings and their word/phoneme transcripts.
SRC_AUDIO_PATH = f"../data/raw/{_DATASET}/wav"
SRC_TEXT_PATH = f"../data/raw/{_DATASET}/wrd_ph"

# Converted outputs: one root folder with text and audio sub-trees.
DEST_DIR = f"../data/converted/{_DATASET}-to-vctk-ph"
DEST_TEXT_PATH = os.path.join(DEST_DIR, f"txt/{_DATASET}")
DEST_AUDIO_PATH = os.path.join(DEST_DIR, f"wav44/{_DATASET}")

def convert_wav_to_flac(src_path: str, dst_path: str) -> bool:
    """Re-encode a single WAV file as FLAC through pydub.

    The export is invoked with the extra encoder flags ``-ac 1`` (mono),
    ``-ar 32000`` (32 kHz sample rate) and ``-compression_level 8``
    (highest compression).

    Args:
        src_path: Location of the WAV file to read.
        dst_path: Location the FLAC file is written to.

    Returns:
        True if loading and exporting completed without raising. Any
        exception is printed (not re-raised) and reported as False.
    """
    export_flags = [
        "-ac", "1",
        "-ar", "32000",
        "-compression_level", "8",
    ]
    try:
        segment = AudioSegment.from_file(src_path, format="wav")
        segment.export(dst_path, format="flac", parameters=export_flags)
    except Exception as e:
        print(f"Error converting {src_path}: {str(e)}")
        return False
    else:
        return True

# Clean and create directories.
# NOTE: this deletes everything under DEST_DIR (including any derived folders
# created there by later cells) before rebuilding the txt/ and audio trees.
if os.path.exists(DEST_DIR):
    print("Clearing destination folder")
    shutil.rmtree(DEST_DIR)
os.makedirs(DEST_TEXT_PATH, exist_ok=True)
os.makedirs(DEST_AUDIO_PATH, exist_ok=True)

all_chars = set()   # every character seen in successfully written transcripts
skip_files = []     # 1-based indices of pairs that could not be processed

# Get sorted lists of files
audio_files = sorted(Path(SRC_AUDIO_PATH).glob("*.wav"))
text_files = sorted(Path(SRC_TEXT_PATH).glob("*.txt"))

# Audio and text are paired purely by sorted position, so a missing or extra
# file on either side silently shifts every following pair. Surface that.
if len(audio_files) != len(text_files):
    print(
        f"Warning: found {len(audio_files)} audio files but {len(text_files)} text files; "
        "pairs are matched by sorted position and may be misaligned"
    )

# zip() stops at the shorter list; materialise it so the progress bar total
# and the final summary reflect the number of pairs actually attempted.
file_pairs = list(zip(audio_files, text_files))

# Assumes an audio file and its transcript share a base name -- TODO confirm
# against the raw dataset layout. This only warns; it never drops a pair.
stem_mismatches = [(a.name, t.name) for a, t in file_pairs if a.stem != t.stem]
if stem_mismatches:
    print(
        f"Warning: {len(stem_mismatches)} pairs have different base names, "
        f"first mismatch: {stem_mismatches[0]}"
    )

# Process files with progress bar
for i, (audio_file, text_file) in enumerate(tqdm(file_pairs, desc="Processing files"), 1):
    # NOTE(review): the index is zero-padded to 3 digits, so indices >= 1000
    # produce longer names than the rest. Left unchanged because altering the
    # width would rename every output file.
    dest_audio = os.path.join(DEST_AUDIO_PATH, f"TSync2_{i:03d}_mic1.flac")
    dest_text = os.path.join(DEST_TEXT_PATH, f"TSync2_{i:03d}.txt")

    try:
        # Read and phonemize the transcript first: it is the cheap step, and a
        # failure here then never leaves an audio file without its text.
        # Only the first line is used; "|" separators are removed before
        # phonemizing and the resulting tokens are joined with spaces.
        with text_file.open('r', encoding='utf-8') as f:
            raw_line = f.readline().strip()
        clean_text = " ".join(thai_phonemizer.phonemize("".join(raw_line.split("|"))))

        # Process audio
        if not convert_wav_to_flac(str(audio_file), dest_audio):
            raise RuntimeError("Failed to convert audio")

        with open(dest_text, 'w', encoding='utf-8') as f:
            f.write(clean_text)

        # Record characters only for pairs that were fully written.
        all_chars.update(clean_text)

    except Exception as e:
        print(f"Error processing pair {i}: {e}")
        skip_files.append(i)
        # Remove partial output so a skipped pair leaves nothing behind for
        # the resampling / trimming cells to pick up.
        for leftover in (dest_audio, dest_text):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Could not remove partial output {leftover}: {cleanup_error}")

print(f"Processed {len(file_pairs) - len(skip_files)} file pairs")
print(f"Skipped {len(skip_files)} pairs")
print(f"Unique characters found: {''.join(sorted(all_chars))}")

Clearing destination folder


Processing files:  13%|█▎        | 341/2710 [03:18<24:00,  1.64it/s]

# Resample and trim audio

In [3]:
# Mirror the converted audio tree into a separate working folder so the
# following cells operate on a copy rather than on the converted originals.
# NOTE(review): the source folder is named "wav44" although the FLAC export
# above requests "-ar 32000" -- confirm which name/rate is intended.
src_dir = "../data/converted/TSync2-to-vctk-ph/wav44"
dst_dir = "../data/converted/TSync2-to-vctk-ph/wav16_silence_trimmed"

# Make sure the destination root exists before anything is copied into it.
os.makedirs(dst_dir, exist_ok=True)

# Walk the source tree; every entry is re-rooted under dst_dir using its path
# relative to src_dir, which preserves the directory layout.
for current_dir, subdir_names, file_names in os.walk(src_dir):
    # Recreate sub-directories first (also covers empty ones).
    for entry in subdir_names:
        source_entry = os.path.join(current_dir, entry)
        os.makedirs(
            os.path.join(dst_dir, os.path.relpath(source_entry, src_dir)),
            exist_ok=True,
        )

    # copy2 copies the file contents together with its metadata.
    for entry in file_names:
        source_entry = os.path.join(current_dir, entry)
        shutil.copy2(
            source_entry,
            os.path.join(dst_dir, os.path.relpath(source_entry, src_dir)),
        )

In [4]:
# Resampling settings: target rate in Hz and number of parallel jobs.
SAMPLE_RATE = 16000
NUM_RESAMPLE_THREADS = 4

# Resample every .flac file found under the working copy.
# No output directory is passed, so the files are presumably rewritten in
# place -- confirm against TTS.bin.resample.resample_files.
resample_files(
    "../data/converted/TSync2-to-vctk-ph/wav16_silence_trimmed",
    SAMPLE_RATE,
    file_ext="flac",
    n_jobs=NUM_RESAMPLE_THREADS,
)

Resampling the audio files...
Found 2710 files...


100%|██████████| 2710/2710 [00:21<00:00, 124.46it/s]

Done !





In [5]:
input_folder = Path("../data/converted/TSync2-to-vctk-ph/wav16_silence_trimmed")

# Query CUDA availability once instead of on every loop iteration.
use_cuda = torch.cuda.is_available()

# Get VAD model once
model_and_utils = get_vad_model_and_utils(use_cuda=use_cuda, use_onnx=False)

# Get all .flac files (recursively)
flac_files = list(input_folder.glob('**/*.flac'))
total_files = len(flac_files)
print(f"Found {total_files} .flac files to process")

# Track files with no speech detected, and files that raised while processing
no_speech_files = []
failed_files = []

for input_path in tqdm(flac_files, desc="Processing files"):
    # Path relative to the working folder, used for readable error messages.
    relative_path = input_path.relative_to(input_folder)

    try:
        # Input and output are the same path: each file is trimmed in place.
        # (The files were globbed from input_folder, so the parent directory
        # already exists and nothing needs to be created.)
        output_path, is_speech = remove_silence(
            model_and_utils,
            str(input_path),
            str(input_path),
            trim_just_beginning_and_end=True,
            use_cuda=use_cuda,
        )
        # If no speech detected, add to list
        if not is_speech:
            no_speech_files.append(str(output_path))
    except Exception as e:
        print(f"Error processing {relative_path}: {str(e)}")
        # Keep going with the remaining files, but remember the failure so it
        # is not lost among the progress-bar output.
        failed_files.append(str(input_path))

print("\nProcessing complete")

if failed_files:
    print(f"\n{len(failed_files)} files raised an error and were left untouched:")
    for failed in failed_files:
        print(f"  {failed}")

# Write list of files with no speech detected (next to the working folder)
if no_speech_files:
    log_path = input_folder.parent / "no_speech_files.txt"
    with open(log_path, "w", encoding="utf-8") as f:
        for file in no_speech_files:
            f.write(f"{file}\n")
    print(f"\nFound {len(no_speech_files)} files with no speech. List saved to {log_path}")

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip


Found 2710 .flac files to process


Processing files:  51%|█████     | 1377/2710 [08:38<07:17,  3.05it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2161_mic1.flac probably does not have speech please check it !!


Processing files:  51%|█████     | 1380/2710 [08:39<05:48,  3.81it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2163_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2164_mic1.flac probably does not have speech please check it !!


Processing files:  51%|█████     | 1383/2710 [08:40<06:48,  3.25it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2168_mic1.flac probably does not have speech please check it !!


Processing files:  51%|█████     | 1388/2710 [08:41<06:21,  3.47it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2171_mic1.flac probably does not have speech please check it !!


Processing files:  52%|█████▏    | 1415/2710 [08:50<06:00,  3.59it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2196_mic1.flac probably does not have speech please check it !!


Processing files:  53%|█████▎    | 1439/2710 [08:58<07:49,  2.71it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2219_mic1.flac probably does not have speech please check it !!


Processing files:  53%|█████▎    | 1442/2710 [08:59<05:55,  3.57it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2220_mic1.flac probably does not have speech please check it !!


Processing files:  54%|█████▍    | 1457/2710 [09:04<05:42,  3.66it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2234_mic1.flac probably does not have speech please check it !!


Processing files:  55%|█████▍    | 1477/2710 [09:09<04:18,  4.78it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2252_mic1.flac probably does not have speech please check it !!


Processing files:  55%|█████▍    | 1483/2710 [09:11<04:45,  4.29it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2259_mic1.flac probably does not have speech please check it !!


Processing files:  55%|█████▌    | 1499/2710 [09:16<04:41,  4.30it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2271_mic1.flac probably does not have speech please check it !!


Processing files:  56%|█████▌    | 1514/2710 [09:20<05:54,  3.37it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2287_mic1.flac probably does not have speech please check it !!


Processing files:  56%|█████▌    | 1517/2710 [09:21<04:06,  4.84it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2289_mic1.flac probably does not have speech please check it !!


Processing files:  56%|█████▌    | 1519/2710 [09:22<06:15,  3.18it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2291_mic1.flac probably does not have speech please check it !!


Processing files:  57%|█████▋    | 1535/2710 [09:26<04:42,  4.16it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2305_mic1.flac probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1561/2710 [09:35<06:11,  3.09it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2329_mic1.flac probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1564/2710 [09:36<07:00,  2.72it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2332_mic1.flac probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1579/2710 [09:41<05:13,  3.61it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2346_mic1.flac probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1585/2710 [09:43<05:05,  3.69it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2350_mic1.flac probably does not have speech please check it !!


Processing files:  59%|█████▉    | 1595/2710 [09:46<05:38,  3.29it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2360_mic1.flac probably does not have speech please check it !!


Processing files:  59%|█████▉    | 1602/2710 [09:48<05:05,  3.63it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2366_mic1.flac probably does not have speech please check it !!


Processing files:  59%|█████▉    | 1608/2710 [09:50<06:31,  2.81it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2372_mic1.flac probably does not have speech please check it !!


Processing files:  60%|█████▉    | 1616/2710 [09:53<04:58,  3.67it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2379_mic1.flac probably does not have speech please check it !!


Processing files:  60%|█████▉    | 1618/2710 [09:53<05:19,  3.42it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2380_mic1.flac probably does not have speech please check it !!


Processing files:  60%|█████▉    | 1620/2710 [09:54<04:44,  3.83it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2382_mic1.flac probably does not have speech please check it !!


Processing files:  60%|██████    | 1630/2710 [09:57<04:45,  3.78it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2391_mic1.flac probably does not have speech please check it !!


Processing files:  60%|██████    | 1635/2710 [09:59<07:07,  2.52it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2397_mic1.flac probably does not have speech please check it !!


Processing files:  61%|██████    | 1651/2710 [10:05<05:18,  3.33it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2410_mic1.flac probably does not have speech please check it !!


Processing files:  61%|██████▏   | 1663/2710 [10:09<05:11,  3.36it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2420_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2421_mic1.flac probably does not have speech please check it !!


Processing files:  61%|██████▏   | 1665/2710 [10:09<03:30,  4.97it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2422_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2423_mic1.flac probably does not have speech please check it !!


Processing files:  62%|██████▏   | 1681/2710 [10:16<05:06,  3.36it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2438_mic1.flac probably does not have speech please check it !!


Processing files:  62%|██████▏   | 1690/2710 [10:19<05:44,  2.96it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2447_mic1.flac probably does not have speech please check it !!


Processing files:  63%|██████▎   | 1714/2710 [10:28<04:54,  3.38it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2468_mic1.flac probably does not have speech please check it !!


Processing files:  64%|██████▎   | 1723/2710 [10:32<06:53,  2.39it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2477_mic1.flac probably does not have speech please check it !!


Processing files:  64%|██████▍   | 1743/2710 [10:38<04:43,  3.41it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2493_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2494_mic1.flac probably does not have speech please check it !!


Processing files:  66%|██████▌   | 1778/2710 [10:49<03:48,  4.09it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2526_mic1.flac probably does not have speech please check it !!


Processing files:  66%|██████▌   | 1782/2710 [10:51<04:15,  3.63it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2530_mic1.flac probably does not have speech please check it !!


Processing files:  66%|██████▌   | 1785/2710 [10:51<03:43,  4.14it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2533_mic1.flac probably does not have speech please check it !!


Processing files:  66%|██████▋   | 1797/2710 [10:54<03:59,  3.81it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2543_mic1.flac probably does not have speech please check it !!


Processing files:  67%|██████▋   | 1818/2710 [11:02<04:53,  3.04it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2563_mic1.flac probably does not have speech please check it !!


Processing files:  67%|██████▋   | 1823/2710 [11:03<04:23,  3.36it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2567_mic1.flac probably does not have speech please check it !!


Processing files:  67%|██████▋   | 1825/2710 [11:04<04:24,  3.35it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2569_mic1.flac probably does not have speech please check it !!


Processing files:  68%|██████▊   | 1832/2710 [11:06<03:40,  3.97it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2575_mic1.flac probably does not have speech please check it !!


Processing files:  68%|██████▊   | 1853/2710 [11:14<02:46,  5.14it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2593_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2594_mic1.flac probably does not have speech please check it !!


Processing files:  69%|██████▉   | 1866/2710 [11:18<04:04,  3.46it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2606_mic1.flac probably does not have speech please check it !!


Processing files:  69%|██████▉   | 1877/2710 [11:22<04:21,  3.19it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2617_mic1.flac probably does not have speech please check it !!


Processing files:  69%|██████▉   | 1883/2710 [11:24<05:40,  2.43it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2622_mic1.flac probably does not have speech please check it !!


Processing files:  70%|██████▉   | 1896/2710 [11:27<02:57,  4.59it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2633_mic1.flac probably does not have speech please check it !!


Processing files:  70%|███████   | 1898/2710 [11:28<02:48,  4.81it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2635_mic1.flac probably does not have speech please check it !!


Processing files:  70%|███████   | 1908/2710 [11:31<03:06,  4.29it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2643_mic1.flac probably does not have speech please check it !!


Processing files:  71%|███████   | 1925/2710 [11:37<04:44,  2.76it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2660_mic1.flac probably does not have speech please check it !!


Processing files:  72%|███████▏  | 1941/2710 [11:43<03:48,  3.36it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2674_mic1.flac probably does not have speech please check it !!


Processing files:  72%|███████▏  | 1954/2710 [11:46<02:46,  4.55it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2685_mic1.flac probably does not have speech please check it !!


Processing files: 100%|██████████| 2710/2710 [17:09<00:00,  2.63it/s]


Processing complete

Found 58 files with no speech. List saved to ..\data\converted\TSync2-to-vctk\no_speech_files.txt





# Create metadata

In [6]:
# DEST_DIR is a plain string in the conversion cell; wrap it in a Path so the
# "/" operator can be used below. Path(Path(...)) is a no-op, so re-running
# this cell is safe.
DEST_DIR = Path(DEST_DIR)

# Write character files
sorted_chars = sorted(all_chars)

# Escaped form: every non-ASCII character is written as a \uXXXX-style escape,
# so the file content itself is pure ASCII.
with open(DEST_DIR / 'all_chars_unicode.txt', 'w', encoding='utf-8') as f:
    f.write(''.join(c.encode('unicode_escape').decode('ascii') for c in sorted_chars))

# Raw form: the characters themselves. An explicit UTF-8 encoding is required
# here -- without it open() uses the platform's locale encoding, which can
# fail with UnicodeEncodeError (or write a non-UTF-8 file) for characters
# outside that code page.
with open(DEST_DIR / 'all_chars.txt', 'w', encoding='utf-8') as f:
    f.write(''.join(sorted_chars))