# Import libraries

In [1]:
# !pip install git+https://github.com/dubbing-ai/Transliterate.git

In [2]:
import os
from pathlib import Path
import shutil
from tqdm import tqdm
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor
import threading

import sys
sys.path.append('..')
from utils.audio_util import convert_wav_to_flac, resample_audios, trim_silence_with_vad
from utils.file_util import recursive_copy

from transliterate.phonemizer import ThaiPhonemizer

# Moving files to the new directory

In [3]:
thai_phonemizer = ThaiPhonemizer()

In [4]:
!find "../data/raw/TSync2/wav" -type f -name "*.wav" -exec sh -c 'ffmpeg -i "$1" -c:a pcm_mulaw "${1%.wav}.tmp.wav" && mv "${1%.wav}.tmp.wav" "$1"' _ {} \;

File not found - "*.wav"


In [5]:
# Define paths
DEST_DIR = "../data/converted/TSync2-to-vctk-ph"
DEST_TEXT_PATH = os.path.join(DEST_DIR, "txt/TSync2")
DEST_AUDIO_PATH = os.path.join(DEST_DIR, "wav44/TSync2")
SRC_AUDIO_PATH = "../data/raw/TSync2/wav"
SRC_TEXT_PATH = "../data/raw/TSync2/wrd_ph"

# Clean and create directories
if os.path.exists(DEST_DIR):
    print("Clearing destination folder")
    shutil.rmtree(DEST_DIR)
os.makedirs(DEST_TEXT_PATH, exist_ok=True)
os.makedirs(DEST_AUDIO_PATH, exist_ok=True)

# Thread-safe set for character collection
all_chars = set()
chars_lock = threading.Lock()

# Thread-safe list for tracking skipped files
skip_files = []
skip_lock = threading.Lock()

# Thread-safe for phonemizer
phonemizer_lock = threading.Lock()

def process_file_pair(args: Tuple[int, Path, Path]) -> None:
    """Process a single pair of audio and text files"""
    i, audio_file, text_file = args
    try:
        # Process audio
        src_audio = str(audio_file)
        dest_audio = os.path.join(DEST_AUDIO_PATH, f"TSync2_{i:03d}_mic1.flac")
        
        if not convert_wav_to_flac(src_audio, dest_audio):
            raise Exception("Failed to convert audio")
        
        # Process text
        with text_file.open('r', encoding='utf-8') as f:
            with phonemizer_lock:
                clean_text = " ".join(thai_phonemizer.phonemize("".join(f.readline().strip().split("|"))))
            # Thread-safe update of all_chars
            with chars_lock:
                all_chars.update(clean_text)
        
        dest_text = os.path.join(DEST_TEXT_PATH, f"TSync2_{i:03d}.txt")
        with open(dest_text, 'w', encoding='utf-8') as f:
            f.write(clean_text)
            
    except Exception as e:
        print(f"Error processing pair {i}: {e}")
        with skip_lock:
            skip_files.append(i)

# Get sorted lists of files
audio_files = sorted(Path(SRC_AUDIO_PATH).glob("*.wav"))
text_files = sorted(Path(SRC_TEXT_PATH).glob("*.txt"))

# Create processing arguments
process_args = [
    (i, audio_file, text_file) 
    for i, (audio_file, text_file) 
    in enumerate(zip(audio_files, text_files), 1)
]

max_workers = os.cpu_count()

# Process files in parallel with progress bar
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    list(tqdm(
        executor.map(process_file_pair, process_args),
        total=len(process_args),
        desc=f"Processing files (using {max_workers} workers)"
    ))

# Print results
print(f"Processed {len(audio_files) - len(skip_files)} file pairs")
print(f"Skipped {len(skip_files)} pairs")
print(f"Unique characters found: {''.join(sorted(all_chars))}")

Clearing destination folder


Processing files (using 16 workers): 100%|██████████| 2710/2710 [14:20<00:00,  3.15it/s]

Processed 2710 file pairs
Skipped 0 pairs
Unique characters found:  2345_abcdefhijklmnoprstuwŋɛɤɯʔʰːᴐ





# Resample, trim, and normalize audio

In [6]:
# Create destination directory if it doesn't exist
os.makedirs("../data/converted/TSync2-to-vctk-ph/wav16_silence_trimmed", exist_ok=True)

# Copy all files from wav32 to wav16_silence_trimmed
src_dir = "../data/converted/TSync2-to-vctk-ph/wav44"
dst_dir = "../data/converted/TSync2-to-vctk-ph/wav16_silence_trimmed"

recursive_copy(src_dir, dst_dir)

In [None]:
# Resample all files in wav16_silence_trimmed to 16kHz
SAMPLE_RATE = 16000
NUM_RESAMPLE_THREADS = 4

resample_audios(
  input_folders="../data/converted/TSync2-to-vctk-ph/wav16_silence_trimmed",
  file_ext="flac",
  sample_rate=SAMPLE_RATE,
  n_jobs=NUM_RESAMPLE_THREADS
)

Resampling the audio files...
Found 2710 files...


100%|██████████| 2710/2710 [00:17<00:00, 154.76it/s]

Done !





In [8]:
# Trim silence at the beginning and end of each audio file
trim_silence_with_vad(
  input_folder="../data/converted/TSync2-to-vctk-ph/wav16_silence_trimmed",
  file_extension="flac",
)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip


Found 2710 .flac files to process


Processing files:  51%|█████     | 1376/2710 [06:19<06:18,  3.53it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2161_mic1.flac probably does not have speech please check it !!


Processing files:  51%|█████     | 1379/2710 [06:19<04:49,  4.60it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2163_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2164_mic1.flac probably does not have speech please check it !!


Processing files:  51%|█████     | 1385/2710 [06:21<04:09,  5.30it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2168_mic1.flac probably does not have speech please check it !!


Processing files:  51%|█████▏    | 1389/2710 [06:21<04:18,  5.12it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2171_mic1.flac probably does not have speech please check it !!


Processing files:  52%|█████▏    | 1415/2710 [06:28<05:19,  4.06it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2196_mic1.flac probably does not have speech please check it !!


Processing files:  53%|█████▎    | 1439/2710 [06:35<05:55,  3.57it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2219_mic1.flac probably does not have speech please check it !!


Processing files:  53%|█████▎    | 1441/2710 [06:35<05:27,  3.88it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2220_mic1.flac probably does not have speech please check it !!


Processing files:  54%|█████▎    | 1456/2710 [06:39<05:19,  3.93it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2234_mic1.flac probably does not have speech please check it !!


Processing files:  54%|█████▍    | 1476/2710 [06:43<04:21,  4.72it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2252_mic1.flac probably does not have speech please check it !!


Processing files:  55%|█████▍    | 1483/2710 [06:44<04:15,  4.81it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2259_mic1.flac probably does not have speech please check it !!


Processing files:  55%|█████▌    | 1499/2710 [06:49<03:53,  5.18it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2271_mic1.flac probably does not have speech please check it !!


Processing files:  56%|█████▌    | 1516/2710 [06:52<03:39,  5.44it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2287_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2289_mic1.flac probably does not have speech please check it !!


Processing files:  56%|█████▌    | 1519/2710 [06:53<03:48,  5.20it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2291_mic1.flac probably does not have speech please check it !!


Processing files:  57%|█████▋    | 1534/2710 [06:56<03:59,  4.91it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2305_mic1.flac probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1560/2710 [07:02<05:47,  3.30it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2329_mic1.flac probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1564/2710 [07:03<05:16,  3.62it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2332_mic1.flac probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1581/2710 [07:08<03:18,  5.68it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2346_mic1.flac probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1585/2710 [07:08<03:34,  5.24it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2350_mic1.flac probably does not have speech please check it !!


Processing files:  59%|█████▉    | 1595/2710 [07:11<04:14,  4.38it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2360_mic1.flac probably does not have speech please check it !!


Processing files:  59%|█████▉    | 1602/2710 [07:12<03:54,  4.73it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2366_mic1.flac probably does not have speech please check it !!


Processing files:  59%|█████▉    | 1609/2710 [07:14<04:12,  4.36it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2372_mic1.flac probably does not have speech please check it !!


Processing files:  60%|█████▉    | 1615/2710 [07:16<04:47,  3.81it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2379_mic1.flac probably does not have speech please check it !!


Processing files:  60%|█████▉    | 1618/2710 [07:17<04:27,  4.08it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2380_mic1.flac probably does not have speech please check it !!


Processing files:  60%|█████▉    | 1619/2710 [07:17<04:45,  3.82it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2382_mic1.flac probably does not have speech please check it !!


Processing files:  60%|██████    | 1630/2710 [07:20<03:47,  4.75it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2391_mic1.flac probably does not have speech please check it !!


Processing files:  60%|██████    | 1635/2710 [07:21<05:36,  3.19it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2397_mic1.flac probably does not have speech please check it !!


Processing files:  61%|██████    | 1652/2710 [07:26<04:09,  4.23it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2410_mic1.flac probably does not have speech please check it !!


Processing files:  61%|██████▏   | 1663/2710 [07:29<04:07,  4.22it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2420_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2421_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2422_mic1.flac probably does not have speech please check it !!


Processing files:  61%|██████▏   | 1665/2710 [07:29<03:05,  5.62it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2423_mic1.flac probably does not have speech please check it !!


Processing files:  62%|██████▏   | 1680/2710 [07:34<05:03,  3.39it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2438_mic1.flac probably does not have speech please check it !!


Processing files:  62%|██████▏   | 1690/2710 [07:37<04:51,  3.49it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2447_mic1.flac probably does not have speech please check it !!


Processing files:  63%|██████▎   | 1714/2710 [07:44<03:50,  4.32it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2468_mic1.flac probably does not have speech please check it !!


Processing files:  64%|██████▎   | 1723/2710 [07:47<05:03,  3.26it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2477_mic1.flac probably does not have speech please check it !!


Processing files:  64%|██████▍   | 1743/2710 [07:52<03:56,  4.08it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2493_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2494_mic1.flac probably does not have speech please check it !!


Processing files:  66%|██████▌   | 1777/2710 [08:01<03:53,  3.99it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2526_mic1.flac probably does not have speech please check it !!


Processing files:  66%|██████▌   | 1782/2710 [08:02<03:22,  4.58it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2530_mic1.flac probably does not have speech please check it !!


Processing files:  66%|██████▌   | 1785/2710 [08:03<03:23,  4.54it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2533_mic1.flac probably does not have speech please check it !!


Processing files:  66%|██████▋   | 1796/2710 [08:05<03:46,  4.04it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2543_mic1.flac probably does not have speech please check it !!


Processing files:  67%|██████▋   | 1818/2710 [08:11<03:48,  3.91it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2563_mic1.flac probably does not have speech please check it !!


Processing files:  67%|██████▋   | 1822/2710 [08:12<04:16,  3.46it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2567_mic1.flac probably does not have speech please check it !!


Processing files:  67%|██████▋   | 1826/2710 [08:13<03:09,  4.67it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2569_mic1.flac probably does not have speech please check it !!


Processing files:  68%|██████▊   | 1831/2710 [08:14<03:51,  3.80it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2575_mic1.flac probably does not have speech please check it !!


Processing files:  68%|██████▊   | 1853/2710 [08:20<01:58,  7.26it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2593_mic1.flac probably does not have speech please check it !!
> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2594_mic1.flac probably does not have speech please check it !!


Processing files:  69%|██████▉   | 1866/2710 [08:24<03:41,  3.80it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2606_mic1.flac probably does not have speech please check it !!


Processing files:  69%|██████▉   | 1877/2710 [08:27<03:17,  4.21it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2617_mic1.flac probably does not have speech please check it !!


Processing files:  69%|██████▉   | 1883/2710 [08:29<04:06,  3.36it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2622_mic1.flac probably does not have speech please check it !!


Processing files:  70%|██████▉   | 1895/2710 [08:31<02:46,  4.88it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2633_mic1.flac probably does not have speech please check it !!


Processing files:  70%|███████   | 1897/2710 [08:31<02:29,  5.44it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2635_mic1.flac probably does not have speech please check it !!


Processing files:  70%|███████   | 1906/2710 [08:34<03:10,  4.22it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2643_mic1.flac probably does not have speech please check it !!


Processing files:  71%|███████   | 1927/2710 [08:39<02:48,  4.65it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2660_mic1.flac probably does not have speech please check it !!


Processing files:  72%|███████▏  | 1940/2710 [08:43<03:42,  3.46it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2674_mic1.flac probably does not have speech please check it !!


Processing files:  72%|███████▏  | 1954/2710 [08:46<01:58,  6.37it/s]

> The file ..\data\converted\TSync2-to-vctk\wav16_silence_trimmed\TSync2\TSync2_2685_mic1.flac probably does not have speech please check it !!


Processing files: 100%|██████████| 2710/2710 [12:48<00:00,  3.52it/s]


Processing complete

Found 58 files with no speech. List saved to ..\data\converted\TSync2-to-vctk\no_speech_files.txt





In [9]:
# Normalize the volume of all audio files to -27dB
!find "../data/converted/TSync2-to-vctk-ph/wav16_silence_trimmed" -type f -name "*.flac" -exec sh -c 'ffmpeg-normalize "$1" -nt rms -t=-27 -o "$1" -ar 16000 -f -ext flac -c:a flac' _ {} \;

File not found - "*.flac"


# Create metadata

In [10]:
DEST_DIR = Path(DEST_DIR)

# Write character files
sorted_chars = sorted(all_chars)
with open(DEST_DIR / 'all_chars_unicode.txt', 'w') as f:
   f.write(''.join(c.encode('unicode_escape').decode('ascii') for c in sorted_chars))
   
with open(DEST_DIR / 'all_chars.txt', 'w') as f:
   f.write(''.join(sorted_chars))