# Import libraries

In [None]:
from dotenv import load_dotenv
import os
import shutil
from pathlib import Path
import random
from tqdm import tqdm
import torch
import io
import soundfile as sf

from huggingface_hub import login

from datasets import Dataset, DatasetDict, Audio, load_dataset

from TTS.bin.resample import resample_files
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

# Huggingface Login

In [2]:
load_dotenv()
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

login(token=HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\Ming\.cache\huggingface\token
Login successful


# Load Gigaspeech

In [3]:
dataset_path = 'speechcolab/gigaspeech'
cached_path = '../data/raw/gigaspeech'

gigaspeech = load_dataset(path=dataset_path, name='xs', cache_dir=cached_path)
print(gigaspeech)

Using the latest cached version of the module from C:\Users\Ming\.cache\huggingface\modules\datasets_modules\datasets\speechcolab--gigaspeech\0db31224ad43470c71b459deb2f2b40956b3a4edfde5fb313aaec69ec7b50d3c (last modified on Tue Feb  4 14:03:48 2025) since it couldn't be found locally at speechcolab/gigaspeech, or remotely on the Hugging Face Hub.


DatasetDict({
    train: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 9389
    })
    validation: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 6750
    })
    test: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 25619
    })
})


In [4]:
# print samples
print(gigaspeech['train'][0])

{'segment_id': 'YOU0000000315_S0000660', 'speaker': 'N/A', 'text': "AS THEY'RE LEAVING <COMMA> CAN KASH PULL ZAHRA ASIDE REALLY QUICKLY <QUESTIONMARK>", 'audio': {'path': '../data/raw/gigaspeech/downloads\\extracted\\38fe4c94b1b1d9e64fdcb1171e0c934e20120f08c7da8e780122cf1a55df4cc7\\xs_chunks_0000/YOU0000000315_S0000660.wav', 'array': array([0.0005188 , 0.00085449, 0.00012207, ..., 0.00125122, 0.00076294,
       0.00036621]), 'sampling_rate': 16000}, 'begin_time': 2941.889892578125, 'end_time': 2945.070068359375, 'audio_id': 'YOU0000000315', 'title': 'Return to Vasselheim | Critical Role: VOX MACHINA | Episode 43', 'url': 'https://www.youtube.com/watch?v=zr2n1fLVasU', 'source': 2, 'category': 24, 'original_full_path': 'audio/youtube/P0004/YOU0000000315.opus'}


# Load TSync2

In [5]:
tsync2_path = '../data/raw/TSync2'

wav_files = sorted(os.listdir(f'{tsync2_path}/wav'))
wrd_ph_files = sorted(os.listdir(f'{tsync2_path}/wrd_ph'))

assert len(wav_files) == len(wrd_ph_files)

len_tsync2 = len(wav_files)
print(f'TSync2 has {len_tsync2} audio files')

TSync2 has 2710 audio files


In [6]:
# shuffle pairs
random.seed(42)
combined = list(zip(wav_files, wrd_ph_files))
random.shuffle(combined)

In [7]:
# split train test val
train_val_test = [0.8, 0.1, 0.1] # adjust these values to change the split
train_size = int(len_tsync2 * train_val_test[0])
val_size = int(len_tsync2 * train_val_test[1])
test_size = len_tsync2 - train_size - val_size

train_files = combined[:train_size]
val_files = combined[train_size:train_size + val_size]
test_files = combined[train_size + val_size:]

print(f'Train: {len(train_files)} Val: {len(val_files)} Test: {len(test_files)}')

Train: 2168 Val: 271 Test: 271


# Preprocess the audio files

In [8]:
# Create destination directory if it doesn't exist
os.makedirs("../data/converted/tsync2-to-gigaspeech/wav16_silence_trimmed", exist_ok=True)

# Copy all files from wav32 to wav16_silence_trimmed
src_dir = "../data/raw/TSync2/wav"
dst_dir = "../data/converted/tsync2-to-gigaspeech/wav16_silence_trimmed"

# Walk through the source directory and copy files while preserving directory structure
for root, dirs, files in os.walk(src_dir):
  for dir_name in dirs:
    src_path = os.path.join(root, dir_name)
    dst_path = os.path.join(dst_dir, os.path.relpath(src_path, src_dir))
    os.makedirs(dst_path, exist_ok=True)
  
  for file_name in files:
    src_path = os.path.join(root, file_name)
    dst_path = os.path.join(dst_dir, os.path.relpath(src_path, src_dir))
    shutil.copy2(src_path, dst_path)

In [9]:
# Resample all files in the destination directory to 16kHz
resample_files(dst_dir, 16000, file_ext='wav', n_jobs=4)

Resampling the audio files...
Found 2710 files...


100%|██████████| 2710/2710 [00:12<00:00, 213.52it/s]

Done !





# Trim start and end with VAD

In [10]:
input_folder = Path(dst_dir)

# Get VAD model once
model_and_utils = get_vad_model_and_utils(use_cuda=torch.cuda.is_available(), use_onnx=False)

# Get all .flac files
wav_files = list(input_folder.glob('*.wav'))
total_files = len(wav_files)
print(f"Found {total_files} .wav files to process")

# Track files with no speech detected
no_speech_files = []

for input_path in tqdm(wav_files, desc="Processing files"):
   # Preserve directory structure
   relative_path = input_path.relative_to(input_folder)
   output_path = input_folder / relative_path
   
   # Create subdirectories
   output_path.parent.mkdir(parents=True, exist_ok=True)
   
   try:
       output_path, is_speech = remove_silence(
           model_and_utils,
           str(input_path),
           str(output_path),
           trim_just_beginning_and_end=True,
           use_cuda=torch.cuda.is_available()
       )
       # If no speech detected, add to list
       if not is_speech:
           no_speech_files.append(str(output_path))
   except Exception as e:
       print(f"Error processing {relative_path}: {str(e)}")

print("\nProcessing complete")

# Write list of files with no speech detected
if no_speech_files:
   log_path = input_folder.parent / "no_speech_files.txt"
   with open(log_path, "w", encoding="utf-8") as f:
       for file in no_speech_files:
           f.write(f"{file}\n")
   print(f"\nFound {len(no_speech_files)} files with no speech. List saved to {log_path}")

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip


Found 2710 .wav files to process


Processing files:  80%|███████▉  | 2160/2710 [11:05<02:34,  3.56it/s] 

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_001.wav probably does not have speech please check it !!


Processing files:  80%|███████▉  | 2164/2710 [11:06<01:38,  5.56it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_003.wav probably does not have speech please check it !!
> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_004.wav probably does not have speech please check it !!


Processing files:  80%|████████  | 2169/2710 [11:07<01:33,  5.78it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_008.wav probably does not have speech please check it !!


Processing files:  80%|████████  | 2170/2710 [11:07<01:45,  5.10it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_011.wav probably does not have speech please check it !!


Processing files:  81%|████████  | 2196/2710 [11:14<01:57,  4.38it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_036.wav probably does not have speech please check it !!


Processing files:  82%|████████▏ | 2220/2710 [11:19<01:28,  5.56it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_059.wav probably does not have speech please check it !!
> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_060.wav probably does not have speech please check it !!


Processing files:  82%|████████▏ | 2233/2710 [11:23<02:04,  3.85it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_074.wav probably does not have speech please check it !!


Processing files:  83%|████████▎ | 2253/2710 [11:27<01:18,  5.85it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_092.wav probably does not have speech please check it !!


Processing files:  83%|████████▎ | 2258/2710 [11:28<01:26,  5.25it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_099.wav probably does not have speech please check it !!


Processing files:  84%|████████▍ | 2272/2710 [11:31<01:12,  6.00it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_111.wav probably does not have speech please check it !!


Processing files:  84%|████████▍ | 2288/2710 [11:34<01:19,  5.32it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_127.wav probably does not have speech please check it !!


Processing files:  85%|████████▍ | 2290/2710 [11:34<01:07,  6.25it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_129.wav probably does not have speech please check it !!
> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_131.wav probably does not have speech please check it !!


Processing files:  85%|████████▌ | 2304/2710 [11:37<01:16,  5.32it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_145.wav probably does not have speech please check it !!


Processing files:  86%|████████▌ | 2328/2710 [11:43<01:48,  3.51it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_169.wav probably does not have speech please check it !!


Processing files:  86%|████████▌ | 2331/2710 [11:44<01:42,  3.70it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_172.wav probably does not have speech please check it !!


Processing files:  87%|████████▋ | 2347/2710 [11:47<01:00,  6.00it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_186.wav probably does not have speech please check it !!


Processing files:  87%|████████▋ | 2349/2710 [11:48<01:16,  4.71it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_190.wav probably does not have speech please check it !!


Processing files:  87%|████████▋ | 2359/2710 [11:50<01:27,  4.03it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_200.wav probably does not have speech please check it !!


Processing files:  87%|████████▋ | 2366/2710 [11:52<01:14,  4.63it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_206.wav probably does not have speech please check it !!


Processing files:  87%|████████▋ | 2371/2710 [11:53<01:44,  3.25it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_212.wav probably does not have speech please check it !!


Processing files:  88%|████████▊ | 2380/2710 [11:56<01:06,  4.98it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_219.wav probably does not have speech please check it !!
> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_220.wav probably does not have speech please check it !!


Processing files:  88%|████████▊ | 2382/2710 [11:56<01:03,  5.15it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_222.wav probably does not have speech please check it !!


Processing files:  88%|████████▊ | 2391/2710 [11:58<01:17,  4.12it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_231.wav probably does not have speech please check it !!


Processing files:  88%|████████▊ | 2396/2710 [12:00<01:43,  3.04it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_237.wav probably does not have speech please check it !!


Processing files:  89%|████████▉ | 2411/2710 [12:04<01:01,  4.83it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_250.wav probably does not have speech please check it !!


Processing files:  89%|████████▉ | 2421/2710 [12:07<01:05,  4.44it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_260.wav probably does not have speech please check it !!
> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_261.wav probably does not have speech please check it !!
> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_262.wav probably does not have speech please check it !!


Processing files:  89%|████████▉ | 2423/2710 [12:07<00:52,  5.49it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_263.wav probably does not have speech please check it !!


Processing files:  90%|████████▉ | 2438/2710 [12:11<01:06,  4.12it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_278.wav probably does not have speech please check it !!


Processing files:  90%|█████████ | 2447/2710 [12:14<01:03,  4.11it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_287.wav probably does not have speech please check it !!


Processing files:  91%|█████████ | 2468/2710 [12:21<01:03,  3.82it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_308.wav probably does not have speech please check it !!


Processing files:  91%|█████████▏| 2476/2710 [12:23<01:15,  3.10it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_317.wav probably does not have speech please check it !!


Processing files:  92%|█████████▏| 2494/2710 [12:28<00:47,  4.54it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_333.wav probably does not have speech please check it !!
> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_334.wav probably does not have speech please check it !!


Processing files:  93%|█████████▎| 2525/2710 [12:36<00:46,  4.01it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_366.wav probably does not have speech please check it !!


Processing files:  93%|█████████▎| 2529/2710 [12:37<00:42,  4.29it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_370.wav probably does not have speech please check it !!


Processing files:  93%|█████████▎| 2532/2710 [12:38<00:36,  4.81it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_373.wav probably does not have speech please check it !!


Processing files:  94%|█████████▍| 2542/2710 [12:40<00:37,  4.46it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_383.wav probably does not have speech please check it !!


Processing files:  95%|█████████▍| 2562/2710 [12:45<00:41,  3.60it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_403.wav probably does not have speech please check it !!


Processing files:  95%|█████████▍| 2566/2710 [12:46<00:43,  3.30it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_407.wav probably does not have speech please check it !!


Processing files:  95%|█████████▍| 2570/2710 [12:47<00:31,  4.51it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_409.wav probably does not have speech please check it !!


Processing files:  95%|█████████▌| 2575/2710 [12:48<00:30,  4.39it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_415.wav probably does not have speech please check it !!


Processing files:  96%|█████████▌| 2594/2710 [12:53<00:15,  7.41it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_433.wav probably does not have speech please check it !!
> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_434.wav probably does not have speech please check it !!


Processing files:  96%|█████████▌| 2605/2710 [12:56<00:30,  3.41it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_446.wav probably does not have speech please check it !!


Processing files:  97%|█████████▋| 2616/2710 [12:59<00:23,  4.08it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_457.wav probably does not have speech please check it !!


Processing files:  97%|█████████▋| 2621/2710 [13:01<00:25,  3.55it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_462.wav probably does not have speech please check it !!


Processing files:  97%|█████████▋| 2633/2710 [13:03<00:12,  5.95it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_473.wav probably does not have speech please check it !!


Processing files:  97%|█████████▋| 2634/2710 [13:03<00:14,  5.23it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_475.wav probably does not have speech please check it !!


Processing files:  98%|█████████▊| 2644/2710 [13:06<00:12,  5.39it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_483.wav probably does not have speech please check it !!


Processing files:  98%|█████████▊| 2661/2710 [13:10<00:10,  4.77it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_500.wav probably does not have speech please check it !!


Processing files:  99%|█████████▊| 2673/2710 [13:14<00:10,  3.39it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_514.wav probably does not have speech please check it !!


Processing files:  99%|█████████▉| 2686/2710 [13:16<00:03,  7.16it/s]

> The file ..\data\converted\tsync2-to-gigaspeech\wav16_silence_trimmed\tsync2_noon_99_525.wav probably does not have speech please check it !!


Processing files: 100%|██████████| 2710/2710 [13:21<00:00,  3.38it/s]


Processing complete

Found 58 files with no speech. List saved to ..\data\converted\tsync2-to-gigaspeech\no_speech_files.txt





# Format Dataset

In [11]:
def load_data(filepairs):
    features = ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path']
    data = {f: [] for f in features}

    for wav_file, wrd_ph_file in tqdm(filepairs):

        file_name = wav_file.split('.')[0]
        
        segment_id = file_name
        speaker = 'Noon'

        with open(f'{tsync2_path}/wrd_ph/{wrd_ph_file}') as f:
            line = f.readline().strip()
            text = ''.join(line.split('|'))
        with open(f'{dst_dir}/{wav_file}', 'rb') as f:
            audio_bytes = f.read()
            
        file_like_object = io.BytesIO(audio_bytes)
        audio_array, sr = sf.read(file_like_object)

        begin_time = 0.0
        end_time = audio_array.shape[0] / sr
        audio_id = file_name
        title = file_name
        url = 'N/A'
        source = 'TSync2'
        category = 10
        original_full_path = ''

        data['segment_id'].append(segment_id)
        data['speaker'].append(speaker)
        data['text'].append(text)
        data['audio'].append(f'{tsync2_path}/wav/{wav_file}')
        data['begin_time'].append(begin_time)
        data['end_time'].append(end_time)
        data['audio_id'].append(audio_id)
        data['title'].append(title)
        data['url'].append(url)
        data['source'].append(source)
        data['category'].append(category)
        data['original_full_path'].append(original_full_path)

    return Dataset.from_dict(data).cast_column("audio", Audio(sampling_rate=16000))


In [12]:
train_data = load_data(train_files)
val_data = load_data(val_files)
test_data = load_data(test_files)

100%|██████████| 2168/2168 [00:16<00:00, 130.52it/s]
100%|██████████| 271/271 [00:02<00:00, 132.72it/s]
100%|██████████| 271/271 [00:02<00:00, 134.72it/s]


In [13]:
# print sample data
print(train_data[0])

{'segment_id': 'tsync2_noon_1_2757', 'speaker': 'Noon', 'text': 'ได้ออกเดินทางจากฐานทัพฟอร์ตดรัมในนิวยอร์กแล้ว', 'audio': {'path': '../data/raw/TSync2/wav/tsync2_noon_1_2757.wav', 'array': array([5.06200595e-04, 8.83036875e-04, 7.95761822e-04, ...,
       1.54371464e-05, 4.19901116e-05, 0.00000000e+00]), 'sampling_rate': 16000}, 'begin_time': 0.0, 'end_time': 3.868, 'audio_id': 'tsync2_noon_1_2757', 'title': 'tsync2_noon_1_2757', 'url': 'N/A', 'source': 'TSync2', 'category': 10, 'original_full_path': ''}


In [14]:
dataset_dict = DatasetDict({'train': train_data, 'validation': val_data, 'test': test_data})
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 2168
    })
    validation: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 271
    })
    test: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 271
    })
})


# Push to Huggingface Hub

In [None]:
dataset_dict.push_to_hub('ming-korawut/tsync2-to-gigaspeech')

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/723 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/723 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/722 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/36.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/ming-korawut/tsync2-to-gigaspeech/commit/4715f4861a2a50d59a3a19545f97842d5b04b7cd', commit_message='Upload dataset', commit_description='', oid='4715f4861a2a50d59a3a19545f97842d5b04b7cd', pr_url=None, pr_revision=None, pr_num=None)