# Import libraries

In [1]:
import pandas as pd
import os
import shutil
from typing import List
from tqdm import tqdm
import random

import concurrent.futures
from typing import List, Tuple, Dict
import multiprocessing

import soundfile as sf
import io
from huggingface_hub import login
from dotenv import load_dotenv

from datasets import Dataset, DatasetDict, Audio

import sys
sys.path.append('..')
from utils.audio_util import convert_mp3_to_wav, resample_audios, trim_silence_with_vad
from utils.file_util import recursive_copy
from utils.text_util import clean_text_cv

# Huggingface login

In [2]:
load_dotenv()
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

login(token=HF_TOKEN)

# Read the validated tsv

In [3]:
validated_data = pd.read_csv('../data/raw/cv-corpus-20.0-2024-12-06/th/validated.tsv', sep='\t')

  validated_data = pd.read_csv('../data/raw/cv-corpus-20.0-2024-12-06/th/validated.tsv', sep='\t')


# Define word replacement function

In [4]:
replace_dict = {
    "Facebook": "เฟซบุ๊ก",
    "softmax": "ซอฟต์แม็กซ์",
    "Astroturf": "แอสโตรเทิร์ฟ",
    "Burke": "เบิร์ก",
    "whilst": "ไวล์สท์",
    "Kenny": "เคนนี",
    "Flickr": "ฟลิกเกอร์",
    "Asperger": "แอสเพอร์เกอร์",
    "Johanna": "โจแอนนา",
    "C" : "ซี",
    "section": "เซคชัน",
    "Mr Lincoln": "มิสเตอร์ ลินคอล์น",
    "Brexiteers": "เบร็กซิทเทียร์ส",
    "Brexit": "เบร็กซิท"
}

def preprocess_words(dataframe: pd.DataFrame, column_name: str, replacing_pairs: List[List[str]]) -> pd.DataFrame:
    for old_word, new_word in replacing_pairs:
        dataframe[column_name] = dataframe[column_name].apply(lambda x: x.replace(old_word, new_word))
    dataframe[column_name] = dataframe[column_name].apply(clean_text_cv, replace_dict=replace_dict)
    return dataframe

replacing_word = [
    ['เพฃร', 'เพชร'],
]

# Filter and group client_id that have over 100 records

In [5]:
filtered_data = validated_data[
    validated_data['client_id'].map(
        validated_data['client_id'].value_counts() >= 100
    )
]
filtered_data = preprocess_words(filtered_data, 'sentence', replacing_word)
grouped = filtered_data.groupby('client_id').agg(list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column_name] = dataframe[column_name].apply(lambda x: x.replace(old_word, new_word))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column_name] = dataframe[column_name].apply(clean_text_cv)


# Define new id instead of client_id

In [6]:
id_mapper = {id_: f'cv{str(i+1).zfill(3)}' for i, id_ in enumerate(filtered_data['client_id'].unique())}

grouped_data = {
    id_mapper[client_id]: list(zip(sentences, paths))
    for (client_id, (sentences, paths)) in grouped[['sentence', 'path']].iterrows()
}

# Moving files to the new folder

In [7]:
# Define paths
DEST_DIR = "../data/converted/commonvoice-to-gigaspeech"
DEST_TEXT_PATH = os.path.join(DEST_DIR, "txt")
DEST_AUDIO_PATH = os.path.join(DEST_DIR, "wav32")
SRC_AUDIO_PATH = "../data/raw/cv-corpus-20.0-2024-12-06/th/clips"

# Number of worker threads (adjust based on your CPU)
NUM_WORKERS = multiprocessing.cpu_count()

def process_client_data(client_data: Tuple[str, List]) -> Dict:
    """
    Process all data for a single client
    
    Args:
        client_data: Tuple of (client_id, data)
    
    Returns:
        Dict with processing results
    """
    client_id, data = client_data
    results = {
        'client_id': client_id,
        'processed': 0,
        'failed': 0,
        'chars': set()
    }
    
    # Create client directories
    client_text_dir = os.path.join(DEST_TEXT_PATH, client_id)
    client_audio_dir = os.path.join(DEST_AUDIO_PATH, client_id)
    os.makedirs(client_text_dir, exist_ok=True)
    os.makedirs(client_audio_dir, exist_ok=True)
    
    for i, d in enumerate(data):
        # Write text file
        text_path = os.path.join(client_text_dir, f"{client_id}_{(i + 1):03d}.txt")
        with open(text_path, 'w') as f:
            f.write(d[0])
            results['chars'].update(d[0])
        
        # Convert audio file
        src_audio_path = os.path.join(SRC_AUDIO_PATH, d[1])
        dst_audio_path = os.path.join(
            client_audio_dir,
            f"{client_id}_{(i + 1):03d}_mic1.wav"
        )
        
        if convert_mp3_to_wav(src_audio_path, dst_audio_path):
            results['processed'] += 1
        else:
            results['failed'] += 1
    
    return results

# Clean and create directories
if os.path.exists(DEST_DIR):
    print("Clearing destination folder")
    shutil.rmtree(DEST_DIR)
os.makedirs(DEST_TEXT_PATH, exist_ok=True)
os.makedirs(DEST_AUDIO_PATH, exist_ok=True)

print(f"Starting parallel processing with {NUM_WORKERS} workers")

# Create progress bar for overall processing
with tqdm(total=len(grouped_data), desc="Processing clients") as pbar:
    all_chars = set()
    total_processed = 0
    total_failed = 0
    
    # Process clients in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        # Submit all client processing tasks
        future_to_client = {
            executor.submit(process_client_data, (client_id, data)): client_id 
            for client_id, data in grouped_data.items()
        }
        
        # Process completed tasks
        for future in concurrent.futures.as_completed(future_to_client):
            client_id = future_to_client[future]
            try:
                result = future.result()
                all_chars.update(result['chars'])
                total_processed += result['processed']
                total_failed += result['failed']
            except Exception as e:
                print(f"Client {client_id} generated an exception: {str(e)}")
                total_failed += len(grouped_data[client_id])
            pbar.update(1)

print("\nConversion Summary:")
print(f"Total files processed successfully: {total_processed}")
print(f"Total files failed: {total_failed}")
print(f"Total unique characters: {len(all_chars)}")
print("Restructuring and conversion complete")

Starting parallel processing with 6 workers


Processing clients: 100%|██████████| 134/134 [22:54<00:00, 10.26s/it] 


Conversion Summary:
Total files processed successfully: 92956
Total files failed: 0
Total unique characters: 98
Restructuring and conversion complete





# Resample, trim, normalize audio

In [9]:
# Create destination directory if it doesn't exist
os.makedirs("../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed", exist_ok=True)

# Copy all files from wav32 to wav16_silence_trimmed
src_dir = "../data/converted/commonvoice-to-gigaspeech/wav32"
dst_dir = "../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed"

recursive_copy(src_dir, dst_dir)

In [10]:
# Resample all files in wav16_silence_trimmed to 16kHz
SAMPLE_RATE = 16000
NUM_RESAMPLE_THREADS = 4

resample_audios(
  input_folders="../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed",
  file_ext="wav",
  sample_rate=SAMPLE_RATE,
  n_jobs=NUM_RESAMPLE_THREADS
)

Resampling the audio files...
Found 92956 files...


100%|██████████| 92956/92956 [01:32<00:00, 1000.72it/s]


Done !


In [11]:
trim_silence_with_vad(
  input_folder="../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed",
  file_extension="wav",
)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/titor/.cache/torch/hub/master.zip


Found 92956 .wav files to process


Processing files:   1%|▏         | 1391/92956 [01:38<1:45:32, 14.46it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv114/cv114_105_mic1.wav probably does not have speech please check it !!


Processing files:   3%|▎         | 3223/92956 [04:03<2:03:27, 12.11it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv123/cv123_251_mic1.wav probably does not have speech please check it !!


Processing files:   7%|▋         | 6760/92956 [08:17<1:27:05, 16.50it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv126/cv126_853_mic1.wav probably does not have speech please check it !!


Processing files:   8%|▊         | 7588/92956 [09:17<1:30:21, 15.75it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv063/cv063_111_mic1.wav probably does not have speech please check it !!


Processing files:  10%|█         | 9651/92956 [11:36<1:31:25, 15.19it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv032/cv032_010_mic1.wav probably does not have speech please check it !!


Processing files:  11%|█         | 10321/92956 [12:26<1:22:30, 16.69it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv031/cv031_067_mic1.wav probably does not have speech please check it !!


Processing files:  26%|██▌       | 23959/92956 [29:02<1:38:21, 11.69it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv125/cv125_946_mic1.wav probably does not have speech please check it !!


Processing files:  32%|███▏      | 29742/92956 [36:30<1:11:04, 14.82it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv133/cv133_202_mic1.wav probably does not have speech please check it !!


Processing files:  42%|████▏     | 38817/92956 [47:59<1:13:00, 12.36it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv046/cv046_030_mic1.wav probably does not have speech please check it !!


Processing files:  44%|████▎     | 40453/92956 [50:00<1:00:11, 14.54it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv039/cv039_039_mic1.wav probably does not have speech please check it !!


Processing files:  45%|████▌     | 41891/92956 [51:43<59:28, 14.31it/s]  

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv129/cv129_1648_mic1.wav probably does not have speech please check it !!


Processing files:  45%|████▌     | 42245/92956 [52:08<56:51, 14.86it/s]  

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv129/cv129_1729_mic1.wav probably does not have speech please check it !!


Processing files:  49%|████▉     | 45662/92956 [56:40<56:33, 13.93it/s]  

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv134/cv134_18451_mic1.wav probably does not have speech please check it !!


Processing files:  56%|█████▌    | 51801/92956 [1:04:48<54:58, 12.48it/s]  

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv134/cv134_13435_mic1.wav probably does not have speech please check it !!


Processing files:  59%|█████▉    | 54980/92956 [1:08:58<39:38, 15.97it/s]  

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv134/cv134_18844_mic1.wav probably does not have speech please check it !!


Processing files:  70%|██████▉   | 64763/92956 [1:21:53<32:03, 14.66it/s]  

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv134/cv134_15607_mic1.wav probably does not have speech please check it !!


Processing files:  74%|███████▍  | 68834/92956 [1:27:15<28:40, 14.02it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv134/cv134_8662_mic1.wav probably does not have speech please check it !!


Processing files:  74%|███████▍  | 69006/92956 [1:27:29<33:27, 11.93it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv134/cv134_10987_mic1.wav probably does not have speech please check it !!


Processing files:  78%|███████▊  | 72587/92956 [1:31:31<22:07, 15.34it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv131/cv131_2196_mic1.wav probably does not have speech please check it !!


Processing files:  80%|████████  | 74380/92956 [1:33:31<18:26, 16.78it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv132/cv132_4025_mic1.wav probably does not have speech please check it !!


Processing files:  82%|████████▏ | 76074/92956 [1:35:08<14:28, 19.43it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv132/cv132_116_mic1.wav probably does not have speech please check it !!


Processing files:  82%|████████▏ | 76142/92956 [1:35:12<14:47, 18.94it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv132/cv132_5889_mic1.wav probably does not have speech please check it !!


Processing files:  82%|████████▏ | 76220/92956 [1:35:17<17:24, 16.03it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv132/cv132_923_mic1.wav probably does not have speech please check it !!


Processing files:  84%|████████▍ | 77998/92956 [1:36:58<12:52, 19.36it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv132/cv132_4774_mic1.wav probably does not have speech please check it !!


Processing files:  95%|█████████▍| 88073/92956 [1:49:27<06:56, 11.73it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv101/cv101_116_mic1.wav probably does not have speech please check it !!


Processing files:  98%|█████████▊| 90995/92956 [1:53:08<02:01, 16.18it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv107/cv107_384_mic1.wav probably does not have speech please check it !!


Processing files:  98%|█████████▊| 91285/92956 [1:53:30<02:02, 13.66it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv107/cv107_451_mic1.wav probably does not have speech please check it !!


Processing files:  99%|█████████▉| 92001/92956 [1:54:18<01:09, 13.74it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv013/cv013_103_mic1.wav probably does not have speech please check it !!


Processing files: 100%|█████████▉| 92682/92956 [1:55:06<00:19, 13.70it/s]

> The file ../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv050/cv050_088_mic1.wav probably does not have speech please check it !!


Processing files: 100%|██████████| 92956/92956 [1:55:29<00:00, 13.41it/s]


Processing complete

Found 29 files with no speech. List saved to ../data/converted/commonvoice-to-gigaspeech/no_speech_files.txt





In [13]:
# Normalize the volume of all audio files to -27dB
!find "../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed" -type f -name "*.wav" -exec sh -c 'ffmpeg-normalize "$1" -nt rms -t=-27 -o "$1" -ar 16000 -f' _ {} \;



# Create Dataset

In [14]:
SENTENCE_PATH = "../data/converted/commonvoice-to-gigaspeech/txt"
AUDIO_PATH = "../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed"

file_pairs = []
for speaker in tqdm(os.listdir(SENTENCE_PATH)):
    speaker_sentence_path = os.path.join(SENTENCE_PATH, speaker)
    speaker_audio_path = os.path.join(AUDIO_PATH, speaker)
    for file in os.listdir(speaker_sentence_path):
        file_name = file.split('.')[0]
        sentence_file = os.path.join(speaker_sentence_path, file)
        audio_file = os.path.join(speaker_audio_path, file_name + '_mic1.wav')
        file_pairs.append((sentence_file, audio_file, file.split('.')[0]))

print(f"Total file pairs: {len(file_pairs)}")
print(file_pairs[:5])

100%|██████████| 134/134 [00:00<00:00, 779.84it/s]

Total file pairs: 92956
[('../data/converted/commonvoice-to-gigaspeech/txt/cv119/cv119_514.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv119/cv119_514_mic1.wav', 'cv119_514'), ('../data/converted/commonvoice-to-gigaspeech/txt/cv119/cv119_364.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv119/cv119_364_mic1.wav', 'cv119_364'), ('../data/converted/commonvoice-to-gigaspeech/txt/cv119/cv119_465.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv119/cv119_465_mic1.wav', 'cv119_465'), ('../data/converted/commonvoice-to-gigaspeech/txt/cv119/cv119_856.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv119/cv119_856_mic1.wav', 'cv119_856'), ('../data/converted/commonvoice-to-gigaspeech/txt/cv119/cv119_147.txt', '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv119/cv119_147_mic1.wav', 'cv119_147')]





# Train-Val-Test Split

In [15]:
random.seed(42)
random.shuffle(file_pairs)

train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1
train_pairs = file_pairs[:int(len(file_pairs) * train_ratio)]
val_pairs = file_pairs[int(len(file_pairs) * train_ratio): int(len(file_pairs) * (train_ratio + val_ratio))]
test_pairs = file_pairs[int(len(file_pairs) * (train_ratio + val_ratio)):]

print(f"Train pairs: {len(train_pairs)}")
print(f"Val pairs: {len(val_pairs)}")
print(f"Test pairs: {len(test_pairs)}")

Train pairs: 74364
Val pairs: 9296
Test pairs: 9296


In [18]:
def load_data(filepairs):
    features = ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path']
    data = {f: [] for f in features}

    for txt_file, wav_file, file_name in tqdm(filepairs):
        
        segment_id = file_name
        speaker = file_name.split('_')[0]

        with open(txt_file, 'r', encoding='utf-8') as f:
            text = f.readline().strip()
        with open(wav_file, 'rb') as f: 
            audio_bytes = f.read()
            
        file_like_object = io.BytesIO(audio_bytes)
        audio_array, sr = sf.read(file_like_object)

        begin_time = 0.0
        end_time = audio_array.shape[0] / sr
        audio_id = file_name
        title = file_name
        url = 'N/A'
        source = 'cv-corpus-20.0-2024-12-06'
        category = 10
        original_full_path = ''

        data['segment_id'].append(segment_id)
        data['speaker'].append(speaker)
        data['text'].append(text)
        data['audio'].append(wav_file)
        data['begin_time'].append(begin_time)
        data['end_time'].append(end_time)
        data['audio_id'].append(audio_id)
        data['title'].append(title)
        data['url'].append(url)
        data['source'].append(source)
        data['category'].append(category)
        data['original_full_path'].append(original_full_path)

    return Dataset.from_dict(data).cast_column("audio", Audio(sampling_rate=16000))

In [19]:
train_data = load_data(train_pairs)
val_data = load_data(val_pairs)
test_data = load_data(test_pairs)

100%|██████████| 74364/74364 [00:24<00:00, 2998.68it/s]
100%|██████████| 9296/9296 [00:03<00:00, 3005.24it/s]
100%|██████████| 9296/9296 [00:03<00:00, 3087.88it/s]


In [20]:
print(val_data[0])

{'segment_id': 'cv014_004', 'speaker': 'cv014', 'text': 'การฆาตกรรมเป็นการกระทำที่หยาบช้า', 'audio': {'path': '../data/converted/commonvoice-to-gigaspeech/wav16_silence_trimmed/cv014/cv014_004_mic1.wav', 'array': array([-0.00076294, -0.0005188 , -0.00039673, ...,  0.01580811,
        0.01452637,  0.01071167]), 'sampling_rate': 16000}, 'begin_time': 0.0, 'end_time': 2.428, 'audio_id': 'cv014_004', 'title': 'cv014_004', 'url': 'N/A', 'source': 'cv-corpus-20.0-2024-12-06', 'category': 10, 'original_full_path': ''}


In [21]:
dataset_dict = DatasetDict({'train': train_data, 'validation': val_data, 'test': test_data})
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 74364
    })
    validation: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 9296
    })
    test: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 9296
    })
})


# Push to Huggingface Hub

In [22]:
dataset_dict.push_to_hub('dubbing-ai/commonvoice-to-gigaspeech')

Uploading the dataset shards:   0%|          | 0/13 [00:00<?, ?it/s]

Map:   0%|          | 0/5721 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5721 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5721 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5721 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/58 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/4648 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Map:   0%|          | 0/4648 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/4648 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Map:   0%|          | 0/4648 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dubbing-ai/commonvoice-to-gigaspeech/commit/2bd2c5e77744c769cdf9be5a9fb441016ec40fe6', commit_message='Upload dataset', commit_description='', oid='2bd2c5e77744c769cdf9be5a9fb441016ec40fe6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dubbing-ai/commonvoice-to-gigaspeech', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dubbing-ai/commonvoice-to-gigaspeech'), pr_revision=None, pr_num=None)