# Import libraries

In [20]:
import pandas as pd
import os
from pathlib import Path
import shutil
from typing import List
import json 
from tqdm import tqdm
import random

import concurrent.futures
from typing import List, Tuple, Dict
import multiprocessing

import soundfile as sf
import io
from huggingface_hub import login
from dotenv import load_dotenv

from datasets import Dataset, DatasetDict, Audio, load_dataset

import sys
sys.path.append('..')
from utils.audio_util import convert_mp3_to_wav, resample_audios, trim_silence_with_vad
from utils.file_util import recursive_copy
from utils.text_util import clean_text_cv

In [None]:
load_dotenv()
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

login(token=HF_TOKEN)

# Read the validated tsv

In [2]:
validated_data = pd.read_csv('../data/raw/cv-corpus-20.0-2024-12-06/th/validated.tsv', sep='\t')

  validated_data = pd.read_csv('../data/raw/cv-corpus-20.0-2024-12-06/th/validated.tsv', sep='\t')


# Define word replacement function

In [None]:
def preprocess_words(dataframe: pd.DataFrame, column_name: str, replacing_pairs: List[List[str]]) -> pd.DataFrame:
    for old_word, new_word in replacing_pairs:
        dataframe[column_name] = dataframe[column_name].apply(lambda x: x.replace(old_word, new_word))
    dataframe[column_name] = dataframe[column_name].apply(clean_text_cv)
    return dataframe

replacing_word = [
    ['เพฃร', 'เพชร'],
]

# Filter and group client_id that have over 100 records

In [4]:
filtered_data = validated_data[
    validated_data['client_id'].map(
        validated_data['client_id'].value_counts() >= 100
    )
]
filtered_data = preprocess_words(filtered_data, 'sentence', replacing_word)
grouped = filtered_data.groupby('client_id').agg(list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column_name] = dataframe[column_name].apply(lambda x: x.replace(old_word, new_word))


# Define new id instead of client_id

In [5]:
id_mapper = {id_: f'cv{str(i+1).zfill(3)}' for i, id_ in enumerate(filtered_data['client_id'].unique())}

grouped_data = {
    id_mapper[client_id]: list(zip(sentences, paths))
    for (client_id, (sentences, paths)) in grouped[['sentence', 'path']].iterrows()
}

# Moving files to the new folder

In [6]:
# Define paths
DEST_DIR = "../data/converted/commonvoice-to-gigaspeech"
DEST_TEXT_PATH = os.path.join(DEST_DIR, "txt")
DEST_AUDIO_PATH = os.path.join(DEST_DIR, "wav32")
SRC_AUDIO_PATH = "../data/raw/cv-corpus-20.0-2024-12-06/th/clips"

# Number of worker threads (adjust based on your CPU)
NUM_WORKERS = multiprocessing.cpu_count()

def process_client_data(client_data: Tuple[str, List]) -> Dict:
    """
    Process all data for a single client
    
    Args:
        client_data: Tuple of (client_id, data)
    
    Returns:
        Dict with processing results
    """
    client_id, data = client_data
    results = {
        'client_id': client_id,
        'processed': 0,
        'failed': 0,
        'chars': set()
    }
    
    # Create client directories
    client_text_dir = os.path.join(DEST_TEXT_PATH, client_id)
    client_audio_dir = os.path.join(DEST_AUDIO_PATH, client_id)
    os.makedirs(client_text_dir, exist_ok=True)
    os.makedirs(client_audio_dir, exist_ok=True)
    
    for i, d in enumerate(data):
        # Write text file
        text_path = os.path.join(client_text_dir, f"{client_id}_{(i + 1):03d}.txt")
        with open(text_path, 'w') as f:
            f.write(d[0])
            results['chars'].update(d[0])
        
        # Convert audio file
        src_audio_path = os.path.join(SRC_AUDIO_PATH, d[1])
        dst_audio_path = os.path.join(
            client_audio_dir,
            f"{client_id}_{(i + 1):03d}_mic1.wav"
        )
        
        if convert_mp3_to_wav(src_audio_path, dst_audio_path):
            results['processed'] += 1
        else:
            results['failed'] += 1
    
    return results

# Clean and create directories
if os.path.exists(DEST_DIR):
    print("Clearing destination folder")
    shutil.rmtree(DEST_DIR)
os.makedirs(DEST_TEXT_PATH, exist_ok=True)
os.makedirs(DEST_AUDIO_PATH, exist_ok=True)

print(f"Starting parallel processing with {NUM_WORKERS} workers")

# Create progress bar for overall processing
with tqdm(total=len(grouped_data), desc="Processing clients") as pbar:
    all_chars = set()
    total_processed = 0
    total_failed = 0
    
    # Process clients in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        # Submit all client processing tasks
        future_to_client = {
            executor.submit(process_client_data, (client_id, data)): client_id 
            for client_id, data in grouped_data.items()
        }
        
        # Process completed tasks
        for future in concurrent.futures.as_completed(future_to_client):
            client_id = future_to_client[future]
            try:
                result = future.result()
                all_chars.update(result['chars'])
                total_processed += result['processed']
                total_failed += result['failed']
            except Exception as e:
                print(f"Client {client_id} generated an exception: {str(e)}")
                total_failed += len(grouped_data[client_id])
            pbar.update(1)

print("\nConversion Summary:")
print(f"Total files processed successfully: {total_processed}")
print(f"Total files failed: {total_failed}")
print(f"Total unique characters: {len(all_chars)}")
print("Restructuring and conversion complete")

Clearing destination folder
Starting parallel processing with 16 workers


Processing clients: 100%|██████████| 134/134 [37:46<00:00, 16.91s/it] 


Conversion Summary:
Total files processed successfully: 92956
Total files failed: 0
Total unique characters: 115
Restructuring and conversion complete





# Resample and trim audio

In [7]:
# Create destination directory if it doesn't exist
os.makedirs("../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed", exist_ok=True)

# Copy all files from wav32 to wav16_silence_trimmed
src_dir = "../data/converted/commonvoice-to-gigaspeech/wav32"
dst_dir = "../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed"

recursive_copy(src_dir, dst_dir)

In [8]:
# Resample all files in wav16_silence_trimmed to 16kHz
SAMPLE_RATE = 16000
NUM_RESAMPLE_THREADS = 4

resample_audios(
  input_folders="../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed",
  file_ext="wav",
  sample_rate=SAMPLE_RATE,
  n_jobs=NUM_RESAMPLE_THREADS
)

Resampling the audio files...
Found 92956 files...


100%|██████████| 92956/92956 [04:34<00:00, 338.41it/s]

Done !





In [2]:
# Trim silence at the beginning and end of each audio file
for folder in os.listdir("../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed"):
    if folder in [f'cv{id}' for id in [129, 130, 131, 132, 133, 134]]:
      trim_silence_with_vad(
          input_folder=os.path.join("../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed", folder),
          file_extension="wav",
      )
      print(f"Finished trimming silence for {folder}")
# trim_silence_with_vad(
#   input_folder="../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed",
#   file_extension="wav",
# )

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip


Found 2208 .wav files to process


Processing files:  34%|███▍      | 753/2208 [01:06<02:02, 11.85it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1593_mic1.wav probably does not have speech please check it !!


Processing files:  37%|███▋      | 813/2208 [01:12<02:26,  9.49it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1648_mic1.wav probably does not have speech please check it !!


Processing files:  37%|███▋      | 820/2208 [01:13<01:53, 12.22it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1652_mic1.wav probably does not have speech please check it !!


Processing files:  38%|███▊      | 847/2208 [01:15<02:44,  8.29it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1679_mic1.wav probably does not have speech please check it !!


Processing files:  41%|████      | 902/2208 [01:20<01:41, 12.82it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1729_mic1.wav probably does not have speech please check it !!


Processing files:  43%|████▎     | 954/2208 [01:24<01:57, 10.64it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1775_mic1.wav probably does not have speech please check it !!


Processing files:  47%|████▋     | 1028/2208 [01:31<01:41, 11.64it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1842_mic1.wav probably does not have speech please check it !!


Processing files:  47%|████▋     | 1046/2208 [01:33<01:31, 12.64it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1860_mic1.wav probably does not have speech please check it !!


Processing files:  49%|████▉     | 1077/2208 [01:36<01:46, 10.57it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1889_mic1.wav probably does not have speech please check it !!


Processing files:  49%|████▉     | 1090/2208 [01:37<01:23, 13.43it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_1899_mic1.wav probably does not have speech please check it !!


Processing files:  84%|████████▎ | 1847/2208 [02:54<00:28, 12.68it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv129\cv129_638_mic1.wav probably does not have speech please check it !!


Processing files: 100%|██████████| 2208/2208 [03:28<00:00, 10.58it/s]



Processing complete

Found 11 files with no speech. List saved to ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\no_speech_files.txt
Finished trimming silence for cv129


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip


Found 2741 .wav files to process


Processing files: 100%|██████████| 2741/2741 [06:37<00:00,  6.89it/s]



Processing complete
Finished trimming silence for cv130


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip


Found 3422 .wav files to process


Processing files:  41%|████▏     | 1416/3422 [03:02<03:59,  8.36it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv131\cv131_2196_mic1.wav probably does not have speech please check it !!


Processing files: 100%|██████████| 3422/3422 [07:47<00:00,  7.32it/s]



Processing complete

Found 1 files with no speech. List saved to ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\no_speech_files.txt
Finished trimming silence for cv131


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip


Found 6081 .wav files to process


Processing files:   5%|▍         | 285/6081 [00:36<11:50,  8.16it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv132\cv132_116_mic1.wav probably does not have speech please check it !!


Processing files:  56%|█████▋    | 3428/6081 [06:53<04:15, 10.38it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv132\cv132_4025_mic1.wav probably does not have speech please check it !!


Processing files:  70%|██████▉   | 4252/6081 [08:29<03:15,  9.37it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv132\cv132_4774_mic1.wav probably does not have speech please check it !!


Processing files:  90%|█████████ | 5477/6081 [10:48<00:58, 10.30it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv132\cv132_5889_mic1.wav probably does not have speech please check it !!


Processing files:  99%|█████████▉| 6007/6081 [11:53<00:07,  9.30it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv132\cv132_923_mic1.wav probably does not have speech please check it !!


Processing files: 100%|██████████| 6081/6081 [12:03<00:00,  8.41it/s]



Processing complete

Found 5 files with no speech. List saved to ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\no_speech_files.txt
Finished trimming silence for cv132


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip


Found 9669 .wav files to process


Processing files:  13%|█▎        | 1233/9669 [03:31<21:49,  6.44it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv133\cv133_202_mic1.wav probably does not have speech please check it !!


Processing files: 100%|██████████| 9669/9669 [25:07<00:00,  6.41it/s]



Processing complete

Found 1 files with no speech. List saved to ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\no_speech_files.txt
Finished trimming silence for cv133


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip


Found 26698 .wav files to process


Processing files:   4%|▍         | 1195/26698 [03:11<1:09:26,  6.12it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv134\cv134_10987_mic1.wav probably does not have speech please check it !!


Processing files:  15%|█▍        | 3911/26698 [11:15<1:06:06,  5.74it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv134\cv134_13435_mic1.wav probably does not have speech please check it !!


Processing files:  24%|██▎       | 6322/26698 [18:30<1:01:32,  5.52it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv134\cv134_15607_mic1.wav probably does not have speech please check it !!


Processing files:  36%|███▌      | 9481/26698 [28:19<55:57,  5.13it/s]  

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv134\cv134_18451_mic1.wav probably does not have speech please check it !!


Processing files:  37%|███▋      | 9917/26698 [29:43<44:42,  6.26it/s]  

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv134\cv134_18844_mic1.wav probably does not have speech please check it !!


Processing files:  94%|█████████▍| 25228/26698 [1:08:34<03:44,  6.54it/s]

> The file ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\cv134\cv134_8662_mic1.wav probably does not have speech please check it !!


Processing files: 100%|██████████| 26698/26698 [1:12:36<00:00,  6.13it/s]


Processing complete

Found 6 files with no speech. List saved to ..\data\converted\commonvoice-to-gigaspeech\wav16_silence_trimmed\no_speech_files.txt
Finished trimming silence for cv134





In [None]:
"""TODO: Normalize wav files' volume to -27dB"""

# # Normalize the volume of all audio files to -27dB
# !find "../data/converted/TSync2-to-vctk/wav16_silence_trimmed" -type f -name "*.flac" -exec sh -c 'ffmpeg-normalize "$1" -nt rms -t=-27 -o "$1" -ar 16000 -f -ext flac -c:a flac' _ {} \;

# Create Dataset

In [23]:
SENTENCE_PATH = "../data/converted/commonvoice-to-gigaspeech/txt"
AUDIO_PATH = "../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed"

file_pairs = []
for speaker in tqdm(os.listdir(SENTENCE_PATH)):
    speaker_sentence_path = os.path.join(SENTENCE_PATH, speaker)
    speaker_audio_path = os.path.join(AUDIO_PATH, speaker)
    for file in os.listdir(speaker_sentence_path):
        file_name = file.split('.')[0]
        sentence_file = os.path.join(speaker_sentence_path, file)
        audio_file = os.path.join(speaker_audio_path, file_name + '_mic1.wav')
        file_pairs.append((sentence_file, audio_file, file.split('.')[0]))

print(f"Total file pairs: {len(file_pairs)}")
print(file_pairs[:5])

100%|██████████| 134/134 [00:00<00:00, 247.31it/s]

Total file pairs: 92956
[('../data/converted/commonvoice-to-gigaspeech/txt\\cv001\\cv001_001.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed\\cv001\\cv001_001_mic1.wav', 'cv001_001'), ('../data/converted/commonvoice-to-gigaspeech/txt\\cv001\\cv001_002.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed\\cv001\\cv001_002_mic1.wav', 'cv001_002'), ('../data/converted/commonvoice-to-gigaspeech/txt\\cv001\\cv001_003.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed\\cv001\\cv001_003_mic1.wav', 'cv001_003'), ('../data/converted/commonvoice-to-gigaspeech/txt\\cv001\\cv001_004.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed\\cv001\\cv001_004_mic1.wav', 'cv001_004'), ('../data/converted/commonvoice-to-gigaspeech/txt\\cv001\\cv001_005.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed\\cv001\\cv001_005_mic1.wav', 'cv001_005')]





# Train-Val-Test Split

In [None]:
random.seed(42)
random.shuffle(file_pairs)

train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1
train_pairs = file_pairs[:int(len(file_pairs) * train_ratio)]
val_pairs = file_pairs[int(len(file_pairs) * train_ratio): int(len(file_pairs) * (train_ratio + val_ratio))]
test_pairs = file_pairs[int(len(file_pairs) * (train_ratio + val_ratio)):]

print(f"Train pairs: {len(train_pairs)}")
print(f"Val pairs: {len(val_pairs)}")
print(f"Test pairs: {len(test_pairs)}")

Train pairs: 74364
Val pairs: 930
Test pairs: 17662


In [30]:
def load_data(filepairs):
    features = ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path']
    data = {f: [] for f in features}

    for txt_file, wav_file, file_name in tqdm(filepairs):
        
        segment_id = file_name
        speaker = file_name.split('_')[0]

        with open(txt_file, 'r') as f:
            text = f.readline().strip()
        with open(wav_file, 'rb') as f: 
            audio_bytes = f.read()
            
        file_like_object = io.BytesIO(audio_bytes)
        audio_array, sr = sf.read(file_like_object)

        begin_time = 0.0
        end_time = audio_array.shape[0] / sr
        audio_id = file_name
        title = file_name
        url = 'N/A'
        source = 'cv-corpus-20.0-2024-12-06'
        category = 10
        original_full_path = ''

        data['segment_id'].append(segment_id)
        data['speaker'].append(speaker)
        data['text'].append(text)
        data['audio'].append(wav_file)
        data['begin_time'].append(begin_time)
        data['end_time'].append(end_time)
        data['audio_id'].append(audio_id)
        data['title'].append(title)
        data['url'].append(url)
        data['source'].append(source)
        data['category'].append(category)
        data['original_full_path'].append(original_full_path)

    return Dataset.from_dict(data).cast_column("audio", Audio(sampling_rate=16000))

In [None]:
train_data = load_data(train_pairs)
val_data = load_data(val_pairs)
test_data = load_data(test_pairs)

In [32]:
print(val_data[0])

{'segment_id': 'cv129_2145', 'speaker': 'cv129', 'text': 'ผมก็มีในสต็อกนะ แต่เดี๋ยวต้องออกไปซื้ออีก', 'audio': {'path': '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed\\cv129\\cv129_2145_mic1.wav', 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       9.15527344e-05, 6.10351562e-05, 9.15527344e-05]), 'sampling_rate': 16000}, 'begin_time': 0.0, 'end_time': 3.29, 'audio_id': 'cv129_2145', 'title': 'cv129_2145', 'url': 'N/A', 'source': 'cv-corpus-20.0-2024-12-06', 'category': 10, 'original_full_path': ''}


In [None]:
dataset_dict = DatasetDict({'train': train_data, 'validation': val_data, 'test': test_data})
print(dataset_dict)

# Push to Huggingface Hub

In [None]:
dataset_dict.push_to_hub('dubbing-ai/commonvoice-to-gigaspeech')