# Import libraries

In [1]:
import os
from pathlib import Path
import shutil
from pydub import AudioSegment
from tqdm import tqdm
import torch

from TTS.bin.resample import resample_files
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

import json

# Moving files to the new directory

In [1]:
# Re-encode every *.wav file under ../data/raw/TSync2/wav in place: for each file,
# ffmpeg writes a pcm_mulaw copy to "<name>.tmp.wav", and only if ffmpeg succeeds
# (&&) is that copy moved over the original.
# WARNING: this overwrites the raw corpus; there is no guard, so running the cell
# again re-encodes files that were already converted.
# NOTE(review): pcm_mulaw is an 8-bit companded (lossy) codec — confirm that this
# is the intended format for the raw audio before the FLAC conversion below.
!find ../data/raw/TSync2/wav -type f -name "*.wav"  -exec sh -c 'ffmpeg -i "$1" -c:a pcm_mulaw "${1%.wav}.tmp.wav" && mv "${1%.wav}.tmp.wav" "$1"' _ {} \;

ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

In [None]:
# --- Paths -----------------------------------------------------------------
# Inputs: the raw TSync2 recordings and their transcript files.
SRC_AUDIO_PATH = "../data/raw/TSync2/wav"
SRC_TEXT_PATH = "../data/raw/TSync2/wrd_ph"

# Outputs: everything is written below DEST_DIR, with audio and text kept in
# parallel "<kind>/TSync2" sub-folders.
# NOTE(review): the audio folder is called "wav44" although the FLAC export in
# this notebook uses a 32000 Hz sample rate — confirm which one is intended.
DEST_DIR = "../data/converted/TSync2-to-vctk"
DEST_AUDIO_PATH = os.path.join(DEST_DIR, "wav44/TSync2")
DEST_TEXT_PATH = os.path.join(DEST_DIR, "txt/TSync2")

def convert_wav_to_flac(
    src_path: str,
    dst_path: str,
    sample_rate: int = 32000,
    channels: int = 1,
    compression_level: int = 8,
) -> bool:
    """Convert a WAV file to FLAC using pydub.

    Args:
        src_path: Path of the WAV file to read.
        dst_path: Path of the FLAC file to write.
        sample_rate: Output sample rate in Hz, passed to the encoder as ``-ar``.
        channels: Number of output channels, passed as ``-ac`` (1 = mono).
        compression_level: FLAC ``-compression_level`` passed to the encoder.

    Returns:
        True when the file was exported, False when loading or exporting
        raised. The error is printed rather than propagated so that a caller
        looping over many files can skip the bad one and carry on.
    """
    try:
        # Load WAV file
        audio = AudioSegment.from_file(src_path, format="wav")

        # Export as FLAC; the extra parameters are forwarded to the encoder
        # command line, so every value has to be a string.
        audio.export(
            dst_path,
            format="flac",
            parameters=[
                "-ac", str(channels),
                "-ar", str(sample_rate),
                "-compression_level", str(compression_level),
            ],
        )
    except Exception as e:
        print(f"Error converting {src_path}: {e}")
        return False
    return True

# Get sorted lists of files
audio_files = sorted(Path(SRC_AUDIO_PATH).glob("*.wav"))
text_files = sorted(Path(SRC_TEXT_PATH).glob("*.txt"))

# Audio and transcripts are paired purely by their position in the sorted
# lists. If the counts differ, zip() would silently drop the surplus and could
# attach transcripts to the wrong recordings, so stop before touching any output.
if len(audio_files) != len(text_files):
    raise ValueError(
        f"Found {len(audio_files)} audio files but {len(text_files)} text files; "
        "refusing to pair them by position"
    )

# Clean and create directories (only after the inputs have been validated, so a
# bad input set does not wipe the previous conversion result).
if os.path.exists(DEST_DIR):
    print("Clearing destination folder")
    shutil.rmtree(DEST_DIR)
os.makedirs(DEST_TEXT_PATH, exist_ok=True)
os.makedirs(DEST_AUDIO_PATH, exist_ok=True)

all_chars = set()       # every character seen in successfully converted transcripts
skip_files = []         # 1-based indices of the pairs that failed
processed_count = 0     # pairs for which both the audio and the text were written
stem_mismatches = 0     # pairs whose audio/text file names (without suffix) differ

# Process files with progress bar
for i, (audio_file, text_file) in enumerate(tqdm(zip(audio_files, text_files), total=len(audio_files), desc="Processing files"), 1):
    if audio_file.stem != text_file.stem:
        stem_mismatches += 1

    dest_audio = os.path.join(DEST_AUDIO_PATH, f"TSync2_{i:03d}_mic1.flac")
    dest_text = os.path.join(DEST_TEXT_PATH, f"TSync2_{i:03d}.txt")

    try:
        # Read the transcript first: it is cheap, and failing here avoids
        # converting audio that would end up without a transcript.
        # Only the first line is used; the "|" separators are stripped out.
        with text_file.open('r', encoding='utf-8') as f:
            clean_text = f.readline().strip().replace("|", "")

        # Process audio
        if not convert_wav_to_flac(str(audio_file), dest_audio):
            raise RuntimeError("Failed to convert audio")

        # Process text
        with open(dest_text, 'w', encoding='utf-8') as f:
            f.write(clean_text)

    except Exception as e:
        print(f"Error processing pair {i}: {e}")
        skip_files.append(i)
        # Remove partial output so the audio and text folders stay in step.
        for leftover in (dest_audio, dest_text):
            if os.path.exists(leftover):
                os.remove(leftover)
        continue

    # Only pairs that were written completely contribute to the character set.
    all_chars.update(clean_text)
    processed_count += 1

print(f"Processed {processed_count} file pairs")
print(f"Skipped {len(skip_files)} pairs")
if stem_mismatches:
    print(
        f"Warning: {stem_mismatches} pairs have different audio/text file stems; "
        "verify that the positional pairing is correct"
    )
print(f"Unique characters found: {''.join(sorted(all_chars))}")

Clearing destination folder


Processing files:   4%|▎         | 100/2710 [00:21<08:52,  4.90it/s]

# Resample and trim audio

In [3]:
# Copy the converted FLAC tree (wav44) into wav16_silence_trimmed so that the
# following cells can work on the copy and the wav44 export stays untouched.
src_dir = "../data/converted/TSync2-to-vctk/wav44"
dst_dir = "../data/converted/TSync2-to-vctk/wav16_silence_trimmed"

# copytree recreates the directory structure and copies files with their
# metadata; dirs_exist_ok=True lets the cell be re-run over an existing copy
# (existing files are overwritten). Unlike a manual os.walk loop, it raises if
# src_dir is missing instead of silently copying nothing.
shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True)

In [4]:
# Resampling settings
SAMPLE_RATE = 16000        # target sample rate in Hz
NUM_RESAMPLE_THREADS = 4   # number of parallel resampling jobs

# Resample every .flac file found under the working copy.
# NOTE(review): no separate output directory is passed, so the files are
# presumably rewritten in place — confirm against TTS.bin.resample.
resample_files(
    "../data/converted/TSync2-to-vctk/wav16_silence_trimmed",
    SAMPLE_RATE,
    file_ext="flac",
    n_jobs=NUM_RESAMPLE_THREADS,
)

Resampling the audio files...
Found 2710 files...


100%|██████████| 2710/2710 [00:06<00:00, 397.51it/s]

Done !





In [5]:
input_folder = Path("../data/converted/TSync2-to-vctk/wav16_silence_trimmed")

# Query CUDA availability once instead of on every loop iteration.
use_cuda = torch.cuda.is_available()

# Get VAD model once
model_and_utils = get_vad_model_and_utils(use_cuda=use_cuda, use_onnx=False)

# Get all .flac files (sorted so that the processing order and the logs are
# reproducible between runs)
flac_files = sorted(input_folder.glob('**/*.flac'))
total_files = len(flac_files)
print(f"Found {total_files} .flac files to process")

# Track files with no speech detected, and files that raised while processing
no_speech_files = []
failed_files = []

for input_path in tqdm(flac_files, desc="Processing files"):
    # Only used to keep the error message short
    relative_path = input_path.relative_to(input_folder)

    try:
        # The file is trimmed in place: the same path is passed as both the
        # input and the output, so no directories need to be created.
        output_path, is_speech = remove_silence(
            model_and_utils,
            str(input_path),
            str(input_path),
            trim_just_beginning_and_end=True,
            use_cuda=use_cuda,
        )
        # If no speech detected, add to list
        if not is_speech:
            no_speech_files.append(str(output_path))
    except Exception as e:
        # Keep going so that one bad file does not abort the whole run, but
        # remember it so that the failure shows up in the summary below.
        print(f"Error processing {relative_path}: {e}")
        failed_files.append(str(input_path))

print("\nProcessing complete")
if failed_files:
    print(f"\n{len(failed_files)} files could not be processed")

# Write list of files with no speech detected
if no_speech_files:
    log_path = input_folder.parent / "no_speech_files.txt"
    with open(log_path, "w", encoding="utf-8") as f:
        for file in no_speech_files:
            f.write(f"{file}\n")
    print(f"\nFound {len(no_speech_files)} files with no speech. List saved to {log_path}")

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/titor/.cache/torch/hub/master.zip


Found 2710 .flac files to process


Processing files:   1%|          | 28/2710 [00:04<06:20,  7.05it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2447_mic1.flac probably does not have speech please check it !!


Processing files:   3%|▎         | 84/2710 [00:10<05:34,  7.85it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2421_mic1.flac probably does not have speech please check it !!


Processing files:   3%|▎         | 93/2710 [00:11<03:42, 11.76it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2164_mic1.flac probably does not have speech please check it !!


Processing files:   5%|▌         | 147/2710 [00:18<05:19,  8.03it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2219_mic1.flac probably does not have speech please check it !!


Processing files:   6%|▌         | 154/2710 [00:19<05:01,  8.47it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2196_mic1.flac probably does not have speech please check it !!


Processing files:   6%|▌         | 159/2710 [00:20<04:12, 10.09it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2350_mic1.flac probably does not have speech please check it !!


Processing files:   6%|▌         | 167/2710 [00:20<04:05, 10.37it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2234_mic1.flac probably does not have speech please check it !!


Processing files:   7%|▋         | 199/2710 [00:24<04:48,  8.71it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2674_mic1.flac probably does not have speech please check it !!


Processing files:   9%|▉         | 239/2710 [00:29<05:52,  7.01it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2622_mic1.flac probably does not have speech please check it !!


Processing files:  10%|█         | 271/2710 [00:33<03:57, 10.28it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2346_mic1.flac probably does not have speech please check it !!


Processing files:  12%|█▏        | 312/2710 [00:38<03:57, 10.09it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2161_mic1.flac probably does not have speech please check it !!


Processing files:  12%|█▏        | 315/2710 [00:38<03:28, 11.49it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2391_mic1.flac probably does not have speech please check it !!


Processing files:  12%|█▏        | 335/2710 [00:41<07:49,  5.06it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2635_mic1.flac probably does not have speech please check it !!


Processing files:  13%|█▎        | 360/2710 [00:45<04:03,  9.65it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2289_mic1.flac probably does not have speech please check it !!


Processing files:  14%|█▍        | 386/2710 [00:49<03:39, 10.61it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2685_mic1.flac probably does not have speech please check it !!


Processing files:  19%|█▉        | 512/2710 [01:05<03:32, 10.36it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2493_mic1.flac probably does not have speech please check it !!


Processing files:  20%|█▉        | 531/2710 [01:08<04:51,  7.47it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2271_mic1.flac probably does not have speech please check it !!


Processing files:  21%|██        | 566/2710 [01:14<06:50,  5.22it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2372_mic1.flac probably does not have speech please check it !!


Processing files:  22%|██▏       | 601/2710 [01:20<05:25,  6.48it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2594_mic1.flac probably does not have speech please check it !!


Processing files:  22%|██▏       | 607/2710 [01:20<03:30,  9.99it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2163_mic1.flac probably does not have speech please check it !!


Processing files:  24%|██▍       | 652/2710 [01:25<03:00, 11.39it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2305_mic1.flac probably does not have speech please check it !!


Processing files:  25%|██▍       | 671/2710 [01:28<03:08, 10.80it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2606_mic1.flac probably does not have speech please check it !!
> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2569_mic1.flac probably does not have speech please check it !!


Processing files:  26%|██▌       | 706/2710 [01:32<02:48, 11.89it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2660_mic1.flac probably does not have speech please check it !!


Processing files:  31%|███       | 827/2710 [01:48<03:09,  9.95it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2530_mic1.flac probably does not have speech please check it !!


Processing files:  34%|███▎      | 908/2710 [01:58<03:22,  8.88it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2332_mic1.flac probably does not have speech please check it !!


Processing files:  41%|████      | 1099/2710 [02:24<02:43,  9.85it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2259_mic1.flac probably does not have speech please check it !!


Processing files:  41%|████      | 1101/2710 [02:24<02:27, 10.89it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2477_mic1.flac probably does not have speech please check it !!


Processing files:  45%|████▌     | 1229/2710 [02:39<02:11, 11.24it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2171_mic1.flac probably does not have speech please check it !!


Processing files:  50%|████▉     | 1344/2710 [02:52<02:15, 10.07it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2291_mic1.flac probably does not have speech please check it !!


Processing files:  53%|█████▎    | 1428/2710 [03:04<01:59, 10.70it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2220_mic1.flac probably does not have speech please check it !!


Processing files:  55%|█████▌    | 1497/2710 [03:12<02:03,  9.82it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2633_mic1.flac probably does not have speech please check it !!


Processing files:  56%|█████▋    | 1530/2710 [03:16<02:09,  9.10it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2410_mic1.flac probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1563/2710 [03:20<02:14,  8.53it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2366_mic1.flac probably does not have speech please check it !!


Processing files:  60%|██████    | 1638/2710 [03:30<01:59,  8.99it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2168_mic1.flac probably does not have speech please check it !!


Processing files:  61%|██████    | 1658/2710 [03:32<02:03,  8.50it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2468_mic1.flac probably does not have speech please check it !!


Processing files:  65%|██████▍   | 1758/2710 [03:45<01:23, 11.37it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2397_mic1.flac probably does not have speech please check it !!


Processing files:  67%|██████▋   | 1826/2710 [03:54<01:33,  9.49it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2617_mic1.flac probably does not have speech please check it !!


Processing files:  68%|██████▊   | 1831/2710 [03:54<01:48,  8.08it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2382_mic1.flac probably does not have speech please check it !!


Processing files:  68%|██████▊   | 1852/2710 [03:57<01:31,  9.41it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2329_mic1.flac probably does not have speech please check it !!


Processing files:  70%|███████   | 1906/2710 [04:04<01:42,  7.86it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2252_mic1.flac probably does not have speech please check it !!


Processing files:  71%|███████   | 1921/2710 [04:06<01:40,  7.88it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2533_mic1.flac probably does not have speech please check it !!


Processing files:  71%|███████   | 1927/2710 [04:06<01:23,  9.39it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2494_mic1.flac probably does not have speech please check it !!


Processing files:  75%|███████▌  | 2045/2710 [04:20<01:06,  9.95it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2422_mic1.flac probably does not have speech please check it !!


Processing files:  76%|███████▌  | 2056/2710 [04:22<01:31,  7.12it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2567_mic1.flac probably does not have speech please check it !!


Processing files:  77%|███████▋  | 2085/2710 [04:26<01:33,  6.72it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2287_mic1.flac probably does not have speech please check it !!


Processing files:  80%|████████  | 2176/2710 [04:37<00:50, 10.51it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2593_mic1.flac probably does not have speech please check it !!
> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2543_mic1.flac probably does not have speech please check it !!


Processing files:  81%|████████  | 2185/2710 [04:38<00:57,  9.19it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2423_mic1.flac probably does not have speech please check it !!


Processing files:  81%|████████  | 2195/2710 [04:40<00:47, 10.91it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2563_mic1.flac probably does not have speech please check it !!


Processing files:  81%|████████▏ | 2206/2710 [04:41<00:48, 10.42it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2379_mic1.flac probably does not have speech please check it !!


Processing files:  83%|████████▎ | 2237/2710 [04:45<01:16,  6.18it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2420_mic1.flac probably does not have speech please check it !!


Processing files:  83%|████████▎ | 2245/2710 [04:46<01:00,  7.72it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2438_mic1.flac probably does not have speech please check it !!


Processing files:  85%|████████▍ | 2296/2710 [04:53<00:41, 10.10it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2360_mic1.flac probably does not have speech please check it !!


Processing files:  88%|████████▊ | 2394/2710 [05:04<00:33,  9.35it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2643_mic1.flac probably does not have speech please check it !!


Processing files:  92%|█████████▏| 2484/2710 [05:16<00:24,  9.41it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2526_mic1.flac probably does not have speech please check it !!


Processing files:  94%|█████████▎| 2538/2710 [05:23<00:27,  6.22it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2575_mic1.flac probably does not have speech please check it !!


Processing files:  94%|█████████▍| 2545/2710 [05:24<00:18,  8.69it/s]

> The file ../data/converted/TSync2-to-vctk/wav16_silence_trimmed/Tsync2/Tsync2_2380_mic1.flac probably does not have speech please check it !!


Processing files: 100%|██████████| 2710/2710 [05:47<00:00,  7.80it/s]


Processing complete

Found 58 files with no speech. List saved to ../data/converted/TSync2-to-vctk/no_speech_files.txt





# Create metadata

In [None]:
# Requires DEST_DIR and all_chars from the conversion cell above.
DEST_DIR = Path(DEST_DIR)

# Write character files
sorted_chars = sorted(all_chars)

# Escaped form: every non-ASCII character is written as a \uXXXX-style escape,
# so the inventory can be inspected without font or encoding support.
escaped_chars = ''.join(c.encode('unicode_escape').decode('ascii') for c in sorted_chars)
with open(DEST_DIR / 'all_chars_unicode.txt', 'w', encoding='utf-8') as f:
    f.write(escaped_chars)

# Literal form. The transcripts were read as UTF-8 and may contain non-ASCII
# characters, so the encoding is set explicitly; with the platform default
# encoding the write can fail or produce a mis-encoded file.
with open(DEST_DIR / 'all_chars.txt', 'w', encoding='utf-8') as f:
    f.write(''.join(sorted_chars))