# Import libraries

In [1]:
from dotenv import load_dotenv
import os
import random
from tqdm import tqdm
import io
import soundfile as sf

from huggingface_hub import login

from datasets import Dataset, DatasetDict, Audio, load_dataset

import sys
sys.path.append('..')
from utils.audio_util import resample_audios, trim_silence_with_vad
from utils.file_util import recursive_copy

# Huggingface Login

In [2]:
load_dotenv()
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

login(token=HF_TOKEN)

# Load Gigaspeech

In [3]:
dataset_path = 'speechcolab/gigaspeech'
cached_path = '../data/raw/gigaspeech'

gigaspeech = load_dataset(path=dataset_path, name='xs', cache_dir=cached_path)
print(gigaspeech)

data/xs_n_archives.txt:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

data/test_n_archives.txt:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

xs_chunks_0000.tar.gz:   0%|          | 0.00/972M [00:00<?, ?B/s]

dev_chunks_0000.tar.gz:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

test_chunks_0000.tar.gz:   0%|          | 0.00/1.55G [00:00<?, ?B/s]

test_chunks_0001.tar.gz:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

test_chunks_0002.tar.gz:   0%|          | 0.00/861M [00:00<?, ?B/s]

xs_chunks_0000_metadata.csv:   0%|          | 0.00/3.28M [00:00<?, ?B/s]

dev_chunks_0000_metadata.csv:   0%|          | 0.00/1.97M [00:00<?, ?B/s]

test_chunks_0000_metadata.csv:   0%|          | 0.00/2.66M [00:00<?, ?B/s]

test_chunks_0001_metadata.csv:   0%|          | 0.00/2.66M [00:00<?, ?B/s]

test_chunks_0002_metadata.csv:   0%|          | 0.00/1.49M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 9389
    })
    validation: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 6750
    })
    test: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 25619
    })
})


In [4]:
# print samples
print(gigaspeech['train'][0])

{'segment_id': 'YOU0000000315_S0000660', 'speaker': 'N/A', 'text': "AS THEY'RE LEAVING <COMMA> CAN KASH PULL ZAHRA ASIDE REALLY QUICKLY <QUESTIONMARK>", 'audio': {'path': '/Users/titor/Chula/4-1/capstone/Restructure/data/raw/gigaspeech/downloads/extracted/655ba9120b25038fb7ccb93ce3fcb18e2a521d130978684577a0f67084ff2382/xs_chunks_0000/YOU0000000315_S0000660.wav', 'array': array([0.0005188 , 0.00085449, 0.00012207, ..., 0.00125122, 0.00076294,
       0.00036621]), 'sampling_rate': 16000}, 'begin_time': 2941.889892578125, 'end_time': 2945.070068359375, 'audio_id': 'YOU0000000315', 'title': 'Return to Vasselheim | Critical Role: VOX MACHINA | Episode 43', 'url': 'https://www.youtube.com/watch?v=zr2n1fLVasU', 'source': 2, 'category': 24, 'original_full_path': 'audio/youtube/P0004/YOU0000000315.opus'}


# Load TSync2

In [5]:
tsync2_path = '../data/raw/TSync2'

wav_files = sorted(os.listdir(f'{tsync2_path}/wav'))
wrd_ph_files = sorted(os.listdir(f'{tsync2_path}/wrd_ph'))

assert len(wav_files) == len(wrd_ph_files)

len_tsync2 = len(wav_files)
print(f'TSync2 has {len_tsync2} audio files')

TSync2 has 2710 audio files


In [6]:
# shuffle pairs
random.seed(42)
combined = list(zip(wav_files, wrd_ph_files))
random.shuffle(combined)

In [7]:
# split train test val
train_val_test = [0.8, 0.1, 0.1] # adjust these values to change the split
train_size = int(len_tsync2 * train_val_test[0])
val_size = int(len_tsync2 * train_val_test[1])
test_size = len_tsync2 - train_size - val_size

train_files = combined[:train_size]
val_files = combined[train_size:train_size + val_size]
test_files = combined[train_size + val_size:]

print(f'Train: {len(train_files)} Val: {len(val_files)} Test: {len(test_files)}')

Train: 2168 Val: 271 Test: 271


# Preprocess the audio files

In [8]:
!find "../data/raw/TSync2/wav" -type f -name "*.wav" -exec sh -c 'ffmpeg -i "$1" -c:a pcm_mulaw "${1%.wav}.tmp.wav" && mv "${1%.wav}.tmp.wav" "$1"' _ {} \;

ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

In [10]:
# Create destination directory if it doesn't exist
os.makedirs("../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed", exist_ok=True)

# Copy all files from wav32 to wav16_silence_trimmed
src_dir = "../data/raw/TSync2/wav"
dst_dir = "../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed"

recursive_copy(src_dir, dst_dir)

In [11]:
# Resample all files in wav16_silence_trimmed to 16kHz
SAMPLE_RATE = 16000
NUM_RESAMPLE_THREADS = 4

resample_audios(
  input_folders=dst_dir,
  file_ext="wav",
  sample_rate=SAMPLE_RATE,
  n_jobs=NUM_RESAMPLE_THREADS
)

Resampling the audio files...
Found 2710 files...


100%|██████████| 2710/2710 [00:07<00:00, 382.73it/s]

Done !





In [12]:
# Trim silence at the beginning and end of each audio file
trim_silence_with_vad(
  input_folder=dst_dir,
  file_extension="wav",
)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /Users/titor/.cache/torch/hub/master.zip


Found 2710 .wav files to process


Processing files:   1%|          | 22/2710 [00:00<01:35, 28.17it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_004.wav probably does not have speech please check it !!


Processing files:   2%|▏         | 49/2710 [00:01<01:27, 30.26it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_212.wav probably does not have speech please check it !!
> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_206.wav probably does not have speech please check it !!


Processing files:   2%|▏         | 65/2710 [00:02<01:39, 26.48it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_366.wav probably does not have speech please check it !!


Processing files:   4%|▍         | 116/2710 [00:04<01:26, 29.90it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_373.wav probably does not have speech please check it !!
> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_415.wav probably does not have speech please check it !!


Processing files:   5%|▌         | 147/2710 [00:05<01:33, 27.33it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_011.wav probably does not have speech please check it !!


Processing files:   8%|▊         | 222/2710 [00:09<01:47, 23.15it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_403.wav probably does not have speech please check it !!


Processing files:   9%|▉         | 244/2710 [00:10<01:21, 30.20it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_172.wav probably does not have speech please check it !!


Processing files:  10%|▉         | 268/2710 [00:11<01:29, 27.24it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_370.wav probably does not have speech please check it !!


Processing files:  13%|█▎        | 354/2710 [00:15<01:35, 24.71it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_200.wav probably does not have speech please check it !!


Processing files:  16%|█▌        | 425/2710 [00:17<01:15, 30.31it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_407.wav probably does not have speech please check it !!


Processing files:  17%|█▋        | 473/2710 [00:19<01:28, 25.17it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_003.wav probably does not have speech please check it !!


Processing files:  18%|█▊        | 497/2710 [00:20<01:17, 28.47it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_001.wav probably does not have speech please check it !!


Processing files:  27%|██▋       | 731/2710 [00:30<01:13, 26.80it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_462.wav probably does not have speech please check it !!


Processing files:  29%|██▉       | 781/2710 [00:32<01:10, 27.20it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_099.wav probably does not have speech please check it !!


Processing files:  30%|███       | 821/2710 [00:34<01:15, 24.90it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_500.wav probably does not have speech please check it !!
> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_514.wav probably does not have speech please check it !!


Processing files:  32%|███▏      | 872/2710 [00:36<01:24, 21.75it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_111.wav probably does not have speech please check it !!


Processing files:  33%|███▎      | 896/2710 [00:37<01:18, 23.01it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_475.wav probably does not have speech please check it !!


Processing files:  35%|███▍      | 940/2710 [00:39<01:28, 19.96it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_059.wav probably does not have speech please check it !!


Processing files:  36%|███▌      | 979/2710 [00:41<01:16, 22.71it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_263.wav probably does not have speech please check it !!


Processing files:  37%|███▋      | 1006/2710 [00:43<01:14, 22.98it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_317.wav probably does not have speech please check it !!


Processing files:  38%|███▊      | 1033/2710 [00:44<00:54, 30.63it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_129.wav probably does not have speech please check it !!


Processing files:  40%|███▉      | 1080/2710 [00:46<01:08, 23.92it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_262.wav probably does not have speech please check it !!


Processing files:  41%|████      | 1101/2710 [00:47<00:58, 27.44it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_074.wav probably does not have speech please check it !!
> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_060.wav probably does not have speech please check it !!


Processing files:  42%|████▏     | 1150/2710 [00:49<01:15, 20.79it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_260.wav probably does not have speech please check it !!


Processing files:  45%|████▌     | 1231/2710 [00:53<01:01, 23.97it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_473.wav probably does not have speech please check it !!


Processing files:  46%|████▌     | 1251/2710 [00:54<00:55, 26.35it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_261.wav probably does not have speech please check it !!


Processing files:  49%|████▊     | 1317/2710 [00:57<00:59, 23.47it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_287.wav probably does not have speech please check it !!


Processing files:  49%|████▉     | 1330/2710 [00:57<00:49, 27.67it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_278.wav probably does not have speech please check it !!
> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_250.wav probably does not have speech please check it !!


Processing files:  51%|█████     | 1373/2710 [00:59<00:48, 27.35it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_127.wav probably does not have speech please check it !!


Processing files:  51%|█████     | 1387/2710 [00:59<00:49, 26.71it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_457.wav probably does not have speech please check it !!


Processing files:  54%|█████▎    | 1456/2710 [01:02<00:42, 29.29it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_092.wav probably does not have speech please check it !!


Processing files:  55%|█████▌    | 1503/2710 [01:04<00:39, 30.67it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_333.wav probably does not have speech please check it !!


Processing files:  57%|█████▋    | 1537/2710 [01:05<00:39, 29.33it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_131.wav probably does not have speech please check it !!


Processing files:  58%|█████▊    | 1572/2710 [01:07<00:53, 21.15it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_483.wav probably does not have speech please check it !!


Processing files:  66%|██████▌   | 1789/2710 [01:16<00:46, 19.72it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_525.wav probably does not have speech please check it !!


Processing files:  72%|███████▏  | 1941/2710 [01:23<00:34, 22.33it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_308.wav probably does not have speech please check it !!
> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_334.wav probably does not have speech please check it !!


Processing files:  72%|███████▏  | 1953/2710 [01:23<00:26, 28.64it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_446.wav probably does not have speech please check it !!


Processing files:  76%|███████▌  | 2054/2710 [01:28<00:36, 17.76it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_409.wav probably does not have speech please check it !!


Processing files:  76%|███████▋  | 2072/2710 [01:28<00:25, 25.49it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_186.wav probably does not have speech please check it !!


Processing files:  77%|███████▋  | 2086/2710 [01:29<00:23, 27.04it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_145.wav probably does not have speech please check it !!


Processing files:  78%|███████▊  | 2117/2710 [01:30<00:22, 26.63it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_434.wav probably does not have speech please check it !!


Processing files:  85%|████████▍ | 2297/2710 [01:38<00:15, 27.21it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_190.wav probably does not have speech please check it !!


Processing files:  86%|████████▌ | 2331/2710 [01:39<00:11, 33.43it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_231.wav probably does not have speech please check it !!


Processing files:  86%|████████▋ | 2343/2710 [01:40<00:16, 22.45it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_219.wav probably does not have speech please check it !!


Processing files:  89%|████████▉ | 2421/2710 [01:43<00:11, 24.43it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_433.wav probably does not have speech please check it !!


Processing files:  92%|█████████▏| 2488/2710 [01:45<00:09, 24.29it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_383.wav probably does not have speech please check it !!


Processing files:  92%|█████████▏| 2503/2710 [01:46<00:06, 29.81it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_220.wav probably does not have speech please check it !!


Processing files:  93%|█████████▎| 2517/2710 [01:46<00:07, 27.22it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_036.wav probably does not have speech please check it !!


Processing files:  94%|█████████▍| 2544/2710 [01:48<00:06, 24.89it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_008.wav probably does not have speech please check it !!


Processing files:  95%|█████████▍| 2562/2710 [01:48<00:05, 25.60it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_222.wav probably does not have speech please check it !!


Processing files:  97%|█████████▋| 2621/2710 [01:50<00:02, 31.41it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_169.wav probably does not have speech please check it !!


Processing files:  99%|█████████▉| 2680/2710 [01:52<00:01, 29.59it/s]

> The file ../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed/tsync2_noon_99_237.wav probably does not have speech please check it !!


Processing files: 100%|██████████| 2710/2710 [01:53<00:00, 23.78it/s]


Processing complete

Found 58 files with no speech. List saved to ../data/converted/TSync2-to-gigaspeech/no_speech_files.txt





In [13]:
# Normalize the volume of all audio files to -27dB
!find "../data/converted/TSync2-to-gigaspeech/wav16_silence_trimmed" -type f -name "*.wav" -exec sh -c 'ffmpeg-normalize "$1" -nt rms -t=-27 -o "$1" -ar 16000 -f' _ {} \;

# Format Dataset

In [14]:
def load_data(filepairs):
    features = ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path']
    data = {f: [] for f in features}

    for wav_file, wrd_ph_file in tqdm(filepairs):

        file_name = wav_file.split('.')[0]
        
        segment_id = file_name
        speaker = 'Noon'

        with open(f'{tsync2_path}/wrd_ph/{wrd_ph_file}') as f:
            line = f.readline().strip()
            text = ''.join(line.split('|'))
        with open(f'{dst_dir}/{wav_file}', 'rb') as f:
            audio_bytes = f.read()
            
        file_like_object = io.BytesIO(audio_bytes)
        audio_array, sr = sf.read(file_like_object)

        begin_time = 0.0
        end_time = audio_array.shape[0] / sr
        audio_id = file_name
        title = file_name
        url = 'N/A'
        source = 'TSync2'
        category = 10
        original_full_path = ''

        data['segment_id'].append(segment_id)
        data['speaker'].append(speaker)
        data['text'].append(text)
        data['audio'].append(f'{tsync2_path}/wav/{wav_file}')
        data['begin_time'].append(begin_time)
        data['end_time'].append(end_time)
        data['audio_id'].append(audio_id)
        data['title'].append(title)
        data['url'].append(url)
        data['source'].append(source)
        data['category'].append(category)
        data['original_full_path'].append(original_full_path)

    return Dataset.from_dict(data).cast_column("audio", Audio(sampling_rate=16000))


In [15]:
train_data = load_data(train_files)
val_data = load_data(val_files)
test_data = load_data(test_files)

100%|██████████| 2168/2168 [00:01<00:00, 1089.08it/s]
100%|██████████| 271/271 [00:00<00:00, 1030.28it/s]
100%|██████████| 271/271 [00:00<00:00, 1109.82it/s]


In [16]:
# print sample data
print(train_data[0])

{'segment_id': 'tsync2_noon_1_2757', 'speaker': 'Noon', 'text': 'ได้ออกเดินทางจากฐานทัพฟอร์ตดรัมในนิวยอร์กแล้ว', 'audio': {'path': '../data/raw/TSync2/wav/tsync2_noon_1_2757.wav', 'array': array([ 5.01572737e-04,  8.84395908e-04,  7.93166109e-04, ...,
       -1.11445050e-04,  7.06672836e-06,  0.00000000e+00]), 'sampling_rate': 16000}, 'begin_time': 0.0, 'end_time': 3.868, 'audio_id': 'tsync2_noon_1_2757', 'title': 'tsync2_noon_1_2757', 'url': 'N/A', 'source': 'TSync2', 'category': 10, 'original_full_path': ''}


In [17]:
dataset_dict = DatasetDict({'train': train_data, 'validation': val_data, 'test': test_data})
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 2168
    })
    validation: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 271
    })
    test: Dataset({
        features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
        num_rows: 271
    })
})


# Push to Huggingface Hub

In [19]:
dataset_dict.push_to_hub('dubbing-ai/tsync2-to-gigaspeech')

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/1084 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Map:   0%|          | 0/1084 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dubbing-ai/tsync2-to-gigaspeech/commit/6057218339dd68cda81873adb4c0facc613eb75c', commit_message='Upload dataset', commit_description='', oid='6057218339dd68cda81873adb4c0facc613eb75c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dubbing-ai/tsync2-to-gigaspeech', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dubbing-ai/tsync2-to-gigaspeech'), pr_revision=None, pr_num=None)