# Import libraries

In [1]:
import pandas as pd
import os
from pathlib import Path
import shutil
from typing import List
import json 
from tqdm import tqdm

import concurrent.futures
from typing import List, Tuple, Dict
import multiprocessing

import sys
sys.path.append('..')
from utils.audio_util import convert_mp3_to_flac, resample_audios, trim_silence_with_vad
from utils.file_util import recursive_copy

# Read the validated tsv

In [2]:
validated_data = pd.read_csv('../data/raw/cv-corpus-20.0-2024-12-06/th/validated.tsv', sep='\t')

  validated_data = pd.read_csv('../data/raw/cv-corpus-20.0-2024-12-06/th/validated.tsv', sep='\t')


# Define word replacement function

In [3]:
def replace_words(dataframe: pd.DataFrame, column_name: str, replacing_pairs: List[List[str]]) -> pd.DataFrame:
    for old_word, new_word in replacing_pairs:
        dataframe[column_name] = dataframe[column_name].apply(lambda x: x.replace(old_word, new_word))
    return dataframe

replacing_word = [
    ['เพฃร', 'เพชร'],
]

# Filter and group client_id that have over 100 records

In [4]:
filtered_data = validated_data[
    validated_data['client_id'].map(
        validated_data['client_id'].value_counts() >= 100
    )
]
filtered_data = replace_words(filtered_data, 'sentence', replacing_word)
grouped = filtered_data.groupby('client_id').agg(list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column_name] = dataframe[column_name].apply(lambda x: x.replace(old_word, new_word))


# Define new id instead of client_id

In [5]:
id_mapper = {id_: f'cv{str(i+1).zfill(3)}' for i, id_ in enumerate(filtered_data['client_id'].unique())}

grouped_data = {
    id_mapper[client_id]: list(zip(sentences, paths))
    for (client_id, (sentences, paths)) in grouped[['sentence', 'path']].iterrows()
}

# Moving files to the new folder

In [6]:
# Define paths
DEST_DIR = "../data/converted/commonvoice-to-vctk"
DEST_TEXT_PATH = os.path.join(DEST_DIR, "txt")
DEST_AUDIO_PATH = os.path.join(DEST_DIR, "wav32")
SRC_AUDIO_PATH = "../data/raw/cv-corpus-20.0-2024-12-06/th/clips"

# Number of worker threads (adjust based on your CPU)
NUM_WORKERS = multiprocessing.cpu_count()

def process_client_data(client_data: Tuple[str, List]) -> Dict:
    """
    Process all data for a single client
    
    Args:
        client_data: Tuple of (client_id, data)
    
    Returns:
        Dict with processing results
    """
    client_id, data = client_data
    results = {
        'client_id': client_id,
        'processed': 0,
        'failed': 0,
        'chars': set()
    }
    
    # Create client directories
    client_text_dir = os.path.join(DEST_TEXT_PATH, client_id)
    client_audio_dir = os.path.join(DEST_AUDIO_PATH, client_id)
    os.makedirs(client_text_dir, exist_ok=True)
    os.makedirs(client_audio_dir, exist_ok=True)
    
    for i, d in enumerate(data):
        # Write text file
        text_path = os.path.join(client_text_dir, f"{client_id}_{(i + 1):03d}.txt")
        with open(text_path, 'w') as f:
            f.write(d[0])
            results['chars'].update(d[0])
        
        # Convert audio file
        src_audio_path = os.path.join(SRC_AUDIO_PATH, d[1])
        dst_audio_path = os.path.join(
            client_audio_dir,
            f"{client_id}_{(i + 1):03d}_mic1.flac"
        )
        
        if convert_mp3_to_flac(src_audio_path, dst_audio_path):
            results['processed'] += 1
        else:
            results['failed'] += 1
    
    return results

# Clean and create directories
if os.path.exists(DEST_DIR):
    print("Clearing destination folder")
    shutil.rmtree(DEST_DIR)
os.makedirs(DEST_TEXT_PATH, exist_ok=True)
os.makedirs(DEST_AUDIO_PATH, exist_ok=True)

print(f"Starting parallel processing with {NUM_WORKERS} workers")

# Create progress bar for overall processing
with tqdm(total=len(grouped_data), desc="Processing clients") as pbar:
    all_chars = set()
    total_processed = 0
    total_failed = 0
    
    # Process clients in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        # Submit all client processing tasks
        future_to_client = {
            executor.submit(process_client_data, (client_id, data)): client_id 
            for client_id, data in grouped_data.items()
        }
        
        # Process completed tasks
        for future in concurrent.futures.as_completed(future_to_client):
            client_id = future_to_client[future]
            try:
                result = future.result()
                all_chars.update(result['chars'])
                total_processed += result['processed']
                total_failed += result['failed']
            except Exception as e:
                print(f"Client {client_id} generated an exception: {str(e)}")
                total_failed += len(grouped_data[client_id])
            pbar.update(1)

print("\nConversion Summary:")
print(f"Total files processed successfully: {total_processed}")
print(f"Total files failed: {total_failed}")
print(f"Total unique characters: {len(all_chars)}")
print("Restructuring and conversion complete")

Starting parallel processing with 8 workers


Processing clients: 100%|██████████| 134/134 [1:32:25<00:00, 41.39s/it] 


Conversion Summary:
Total files processed successfully: 92956
Total files failed: 0
Total unique characters: 115
Restructuring and conversion complete





# Resample and trim audio

In [7]:
# Create destination directory if it doesn't exist
os.makedirs("../data/converted/commonvoice-to-vctk/wav16_silence_trimmed", exist_ok=True)

# Copy all files from wav32 to wav16_silence_trimmed
src_dir = "../data/converted/commonvoice-to-vctk/wav32"
dst_dir = "../data/converted/commonvoice-to-vctk/wav16_silence_trimmed"

recursive_copy(src_dir, dst_dir)

In [8]:
# Resample all files in wav16_silence_trimmed to 16kHz
SAMPLE_RATE = 16000
NUM_RESAMPLE_THREADS = 4

resample_audios(
  input_folders="../data/converted/commonvoice-to-vctk/wav16_silence_trimmed",
  file_ext="flac",
  sample_rate=SAMPLE_RATE,
  n_jobs=NUM_RESAMPLE_THREADS
)

Resampling the audio files...
Found 92956 files...


100%|██████████| 92956/92956 [02:17<00:00, 678.40it/s]

Done !





In [9]:
# Trim silence at the beginning and end of each audio file
trim_silence_with_vad(
  input_folder="../data/converted/commonvoice-to-vctk/wav16_silence_trimmed",
  file_extension="flac",
)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /Users/titor/.cache/torch/hub/master.zip


Found 92956 .flac files to process


Processing files:   3%|▎         | 2491/92956 [00:56<28:33, 52.80it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv050/cv050_088_mic1.flac probably does not have speech please check it !!


Processing files:   4%|▍         | 3919/92956 [01:30<33:23, 44.43it/s]  

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv032/cv032_010_mic1.flac probably does not have speech please check it !!


Processing files:   6%|▌         | 5326/92956 [02:04<28:34, 51.12it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv114/cv114_105_mic1.flac probably does not have speech please check it !!


Processing files:   8%|▊         | 7430/92956 [02:56<30:41, 46.45it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv125/cv125_946_mic1.flac probably does not have speech please check it !!


Processing files:  11%|█         | 9863/92956 [03:47<35:06, 39.45it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv123/cv123_251_mic1.flac probably does not have speech please check it !!


Processing files:  17%|█▋        | 15379/92956 [05:59<30:08, 42.90it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv101/cv101_116_mic1.flac probably does not have speech please check it !!


Processing files:  18%|█▊        | 16385/92956 [06:22<26:35, 47.99it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv107/cv107_384_mic1.flac probably does not have speech please check it !!
> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv107/cv107_451_mic1.flac probably does not have speech please check it !!


Processing files:  19%|█▉        | 18119/92956 [07:01<24:25, 51.06it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv131/cv131_2196_mic1.flac probably does not have speech please check it !!


Processing files:  27%|██▋       | 25493/92956 [09:41<23:45, 47.31it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv039/cv039_039_mic1.flac probably does not have speech please check it !!


Processing files:  29%|██▊       | 26564/92956 [10:07<27:47, 39.81it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv063/cv063_111_mic1.flac probably does not have speech please check it !!


Processing files:  30%|██▉       | 27748/92956 [10:34<26:21, 41.24it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv031/cv031_067_mic1.flac probably does not have speech please check it !!


Processing files:  37%|███▋      | 33998/92956 [12:39<19:24, 50.63it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv126/cv126_853_mic1.flac probably does not have speech please check it !!


Processing files:  47%|████▋     | 43852/92956 [16:12<18:09, 45.06it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv129/cv129_1729_mic1.flac probably does not have speech please check it !!


Processing files:  48%|████▊     | 44175/92956 [16:18<17:23, 46.75it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv129/cv129_1648_mic1.flac probably does not have speech please check it !!


Processing files:  50%|█████     | 46909/92956 [17:22<16:58, 45.22it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv134/cv134_15607_mic1.flac probably does not have speech please check it !!


Processing files:  54%|█████▎    | 49763/92956 [18:28<17:08, 42.01it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv134/cv134_8662_mic1.flac probably does not have speech please check it !!


Processing files:  64%|██████▎   | 59246/92956 [22:13<11:07, 50.50it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv134/cv134_10987_mic1.flac probably does not have speech please check it !!


Processing files:  66%|██████▌   | 61264/92956 [23:00<13:57, 37.85it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv134/cv134_13435_mic1.flac probably does not have speech please check it !!


Processing files:  68%|██████▊   | 63562/92956 [23:54<11:02, 44.37it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv134/cv134_18451_mic1.flac probably does not have speech please check it !!


Processing files:  72%|███████▏  | 66839/92956 [25:10<10:30, 41.45it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv134/cv134_18844_mic1.flac probably does not have speech please check it !!


Processing files:  83%|████████▎ | 76836/92956 [29:00<05:39, 47.51it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv133/cv133_202_mic1.flac probably does not have speech please check it !!


Processing files:  90%|████████▉ | 83549/92956 [31:24<02:22, 66.07it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv132/cv132_116_mic1.flac probably does not have speech please check it !!


Processing files:  91%|█████████ | 84492/92956 [31:39<02:19, 60.70it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv132/cv132_5889_mic1.flac probably does not have speech please check it !!


Processing files:  92%|█████████▏| 85953/92956 [32:05<01:57, 59.48it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv132/cv132_4774_mic1.flac probably does not have speech please check it !!


Processing files:  94%|█████████▍| 87342/92956 [32:28<01:31, 61.63it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv132/cv132_923_mic1.flac probably does not have speech please check it !!


Processing files:  96%|█████████▌| 88992/92956 [32:56<01:03, 62.37it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv132/cv132_4025_mic1.flac probably does not have speech please check it !!


Processing files:  98%|█████████▊| 90937/92956 [33:36<00:47, 42.57it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv013/cv013_103_mic1.flac probably does not have speech please check it !!


Processing files:  99%|█████████▉| 92249/92956 [34:06<00:15, 45.31it/s]

> The file ../data/converted/commonvoice-to-vctk/wav16_silence_trimmed/cv046/cv046_030_mic1.flac probably does not have speech please check it !!


Processing files: 100%|██████████| 92956/92956 [34:24<00:00, 45.03it/s]


Processing complete

Found 29 files with no speech. List saved to ../data/converted/commonvoice-to-vctk/no_speech_files.txt





In [10]:
# Normalize the volume of all audio files to -27dB
!find "../data/converted/TSync2-to-vctk/wav16_silence_trimmed" -type f -name "*.flac" -exec sh -c 'ffmpeg-normalize "$1" -nt rms -t=-27 -o "$1" -ar 16000 -f -ext flac -c:a flac' _ {} \;

# Create metadata

In [11]:
DEST_DIR = Path(DEST_DIR)

json_files = {
   'grouped_data.json': [
       {"client_id": cid, "data": [{"path": d[1], "sentence": d[0]} for d in data]}
       for cid, data in grouped_data.items()
   ],
   'language_ids.json': {'th': 1},
   'speakers_ids.json': {cid: i for i, cid in enumerate(grouped_data)},
   'id_mapper.json': id_mapper
}

# Write JSON files
for filename, data in json_files.items():
   with open(DEST_DIR / filename, 'w') as f:
       json.dump(data, f, indent=2)

# Write character files
sorted_chars = sorted(all_chars)
with open(DEST_DIR / 'all_chars_unicode.txt', 'w') as f:
   f.write(''.join(c.encode('unicode_escape').decode('ascii') for c in sorted_chars))
   
with open(DEST_DIR / 'all_chars.txt', 'w') as f:
   f.write(''.join(sorted_chars))