In [1]:
%conda install torchaudio

3 channel Terms of Service accepted
Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: c:\Users\PC\miniconda3\envs\dataDownload

  added / updated specs:
    - torchaudio


The following NEW packages will be INSTALLED:

  aom                pkgs/main/win-64::aom-3.12.1-h00a0c3c_0 
  blas               pkgs/main/win-64::blas-1.0-mkl 
  cairo              pkgs/main/win-64::cairo-1.18.4-he9e932c_0 
  dav1d              pkgs/main/win-64::dav1d-1.2.1-h2bbff1b_0 
  ffmpeg             pkgs/main/win-64::ffmpeg-6.1.1-haa678a1_5 
  filelock           pkgs/main/win-64::filelock-3.20.0-py310haa95532_0 
  fontconfig         pkgs/main/win-64::fontconfig-2.15.0-hd211d86_0 
  freetype           pkgs/main/win-64::freetype-2.13.3-h0620614_0 
  fsspec             pkgs/main/win-64::fsspec-2025.10.0-py310h4442805_0 
  gmp                pkgs/main/win-64::gmp-6.3.0-h537511b_0 
  gmpy2              pk

In [2]:
import pathlib, zipfile, urllib.request

# Fetch the Kokoro Speech Dataset v1.3 release archive into `kokoro_data/`
# and unpack it in place.
root = pathlib.Path("kokoro_data")
root.mkdir(exist_ok=True)
zip_path = root / "kokoro-speech-v1_3.zip"

url = "https://github.com/kaiidams/Kokoro-Speech-Dataset/releases/download/1.3/kokoro-speech-v1_3.zip"
# Re-running this cell (e.g. Restart & Run All) should not fetch the archive
# again. `is_zipfile` is False for a missing file and for one without a valid
# zip end record (such as an interrupted download), so those are re-fetched.
if zipfile.is_zipfile(zip_path):
    print(f"Using existing {zip_path}")
else:
    print(f"Downloading {url} …")
    urllib.request.urlretrieve(url, zip_path)
print("Extracting …")
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(root)

print("Done.")

Downloading https://github.com/kaiidams/Kokoro-Speech-Dataset/releases/download/1.3/kokoro-speech-v1_3.zip …
Extracting …
Done.


In [2]:
import os
import json
import torchaudio
import torch

# Utility print for notebook
def err_print(*args, **kwargs):
    """Forward all positional and keyword arguments to the built-in ``print``.

    Used for status messages; despite the name, output goes wherever
    ``print`` sends it (stdout unless a ``file=`` keyword is passed).
    """
    return print(*args, **kwargs)

# --- Functions from original script ---
def read_params_list(data_dir, size):
    """Load ``index.json`` from ``data_dir`` and keep the entries matching ``size``.

    Args:
        data_dir: Directory that contains ``index.json``.
        size: Name of the subset to select. ``'xlarge'`` selects every entry;
            any other value selects the entries whose whitespace-separated
            ``'sizes'`` field contains it.

    Returns:
        list: The selected entries, in the order they appear in the file.

    Raises:
        FileNotFoundError: If ``index.json`` does not exist inside ``data_dir``.
    """
    index_file = os.path.join(data_dir, 'index.json')
    if not os.path.exists(index_file):
        # Report the path that was actually checked: `data_dir` is a parameter
        # and is not necessarily the literal directory `data`.
        raise FileNotFoundError(
            f"`{index_file}` not found. Please download it from the project page."
        )

    with open(index_file, encoding="utf-8") as f:
        params_list = json.load(f)

    if size == 'xlarge':
        # 'xlarge' means "everything"; the 'sizes' field is not consulted.
        return list(params_list)
    return [params for params in params_list if size in params['sizes'].split()]


def check_data_directory(data_dir, params_list):
    """Report whether every entry's audio directory exists under ``data_dir``.

    For each entry the directory ``<data_dir>/<id>`` is checked; each missing
    one is reported through ``err_print``. A confirmation line is printed when
    none are missing.

    Returns:
        bool: ``True`` when all directories exist, ``False`` otherwise.
    """
    all_present = True
    for entry in params_list:
        candidate = os.path.join(data_dir, f"{entry['id']}")
        if os.path.isdir(candidate):
            continue
        err_print(f"Missing: `{candidate}`")
        all_present = False

    if not all_present:
        return False
    err_print("✅ All audio directories exist.")
    return True


def dump_script(data_dir, params_list):
    """Print shell commands that fetch and unpack every archive in ``params_list``.

    The output is a ``cd`` into ``data_dir``, then one ``curl -LO`` line per
    entry's ``'archive_url'``, then one ``unzip`` line per entry that unpacks
    the downloaded file into a directory named after the entry's ``'id'``.
    Nothing is executed; the commands are only printed.
    """
    err_print('\n⚙️ Run these commands to download missing archives manually:\n')
    print(f'cd {data_dir}')

    # All downloads are listed first, then all extractions.
    for entry in params_list:
        url = entry["archive_url"]
        print(f'curl -LO {url}')

    for entry in params_list:
        zip_name = os.path.basename(entry['archive_url'])
        print(f"unzip {zip_name} -d {entry['id']}")


def extract_wav_files(data_dir, params_list, clip_format, sample_rate, output_dir):
    """Cut every utterance listed in the metadata files into its own audio clip.

    For each entry of ``params_list`` the file ``<data_dir>/<id>.metadata.txt``
    is read line by line. Every non-empty line holds six ``|``-separated
    fields: clip id, source audio file name, start index, end index, and two
    trailing fields that are ignored here. The source audio is loaded from
    ``<data_dir>/<id>/``, scaled so its peak maps to the int16 maximum, sliced
    to ``[start:end]`` along the sample axis and saved as
    ``<output_dir>/wavs/<clip id>.<clip_format>``.

    Args:
        data_dir: Directory holding the metadata files and audio directories.
        params_list: Entries (each with an ``'id'`` key) to process.
        clip_format: File extension used for the written clips.
        sample_rate: Sample rate every source file must have; also used when
            saving the clips.
        output_dir: Directory under which ``wavs/`` is created.

    Raises:
        AssertionError: If a source file is not mono float32 audio at
            ``sample_rate``.
        ValueError: If a non-empty metadata line does not have exactly six
            fields, or its start/end fields are not integers.
    """
    clip_dir = 'wavs'
    clip_ext = clip_format
    os.makedirs(os.path.join(output_dir, clip_dir), exist_ok=True)
    max_int16 = torch.iinfo(torch.int16).max

    for params in params_list:
        source_id = params['id']
        metadata_file = os.path.join(data_dir, f'{source_id}.metadata.txt')
        audio_dir = os.path.join(data_dir, f'{source_id}')
        with open(metadata_file, 'rt', encoding="utf-8") as metadata_f:
            current_file = None
            current_audio = None
            for line in metadata_f:
                line = line.rstrip('\r\n')
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                # The clip id gets its own name so it cannot be confused with
                # the per-source id used for the paths above.
                clip_id, audio_file, audio_start, audio_end, _, _ = line.split('|')
                audio_start, audio_end = int(audio_start), int(audio_end)
                if current_file != audio_file:
                    # Consecutive lines that share a source file reuse the
                    # already decoded audio instead of loading it again.
                    file = os.path.join(audio_dir, audio_file)
                    err_print(f'Reading {file}...')
                    y, sr = torchaudio.load(file)
                    assert len(y.shape) == 2 and y.shape[0] == 1, \
                        f'{file}: expected mono audio, got shape {tuple(y.shape)}'
                    assert y.dtype == torch.float32, \
                        f'{file}: expected float32 samples, got {y.dtype}'
                    assert sr == sample_rate, \
                        f'{file}: expected sample rate {sample_rate}, got {sr}'
                    peak = torch.max(torch.abs(y))
                    if peak > 0:
                        y = y * max_int16 / peak
                    # An all-zero file has peak 0; dividing by it would give
                    # NaNs, so silent audio is left as zeros.
                    y = y.to(torch.int16)
                    current_file = audio_file
                    current_audio = y
                output_file = os.path.join(output_dir, clip_dir, f'{clip_id}.{clip_ext}')
                clip = current_audio[:, audio_start:audio_end]
                torchaudio.save(output_file, clip, sample_rate)


def write_metafile(data_dir, params_list, output_dir):
    """Merge the per-source transcripts into ``<output_dir>/metadata.csv``.

    For each entry of ``params_list`` the file ``<data_dir>/<id>.metadata.txt``
    is read. Every non-empty line holds six ``|``-separated fields; the first
    (clip id) and the last two are written out as one ``id|field5|field6``
    line, and the middle three are dropped. Fields are written verbatim, with
    no CSV quoting or escaping.

    Args:
        data_dir: Directory holding the ``<id>.metadata.txt`` files.
        params_list: Entries (each with an ``'id'`` key) to process, in order.
        output_dir: Existing directory that receives ``metadata.csv``.

    Raises:
        ValueError: If a non-empty line does not have exactly six fields.
    """
    metadata_outfile = os.path.join(output_dir, 'metadata.csv')
    with open(metadata_outfile, 'wt', encoding="utf-8") as metadata_f:
        for params in params_list:
            source_id = params['id']
            metadata_file = os.path.join(data_dir, f'{source_id}.metadata.txt')
            with open(metadata_file, 'rt', encoding="utf-8") as transcript_f:
                for line in transcript_f:
                    line = line.rstrip('\r\n')
                    if not line:
                        # Tolerate blank lines (e.g. a trailing newline at EOF).
                        continue
                    clip_id, _, _, _, text, voca = line.split('|')
                    metadata_f.write(f'{clip_id}|{text}|{voca}\n')
    err_print(f"✅ Metadata written to {metadata_outfile}")

# --- Notebook main logic (instead of argparse) ---
# Settings a reader may want to change before running the cell.
data_dir = 'kokoro_data'           # @param ["data"] {allow-input: true}
output_dir = 'output'       # @param ["output"] {allow-input: true}
size = 'large'               # @param ["tiny", "small", "large", "xlarge"]
clip_format = 'wav'         # @param ["wav", "flac", "mp3", "ogg"]
sample_rate = 22050         # @param {type:"integer"}

# --- Run processing ---
# Select the index entries for the requested subset; an empty selection is
# treated as a configuration error.
params_list = read_params_list(data_dir, size)
if not params_list:
    raise ValueError(f"No matching params found for size '{size}'.")

# Clips and metadata are produced only when every audio directory is present;
# otherwise the manual download commands are printed instead.
if check_data_directory(data_dir, params_list):
    extract_wav_files(data_dir, params_list, clip_format, sample_rate, output_dir)
    write_metafile(data_dir, params_list, output_dir)
else:
    dump_script(data_dir, params_list)


✅ All audio directories exist.
Reading kokoro_data\meian-by-soseki-natsume\meian_001_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_002_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_003_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_004_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_005_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_006_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_007_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_008_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_009_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_010_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_011_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_012_natsume_64kb.mp3...
Reading kokoro_data\meian-by-soseki-natsume\meian_013_natsume_64kb.mp3...
Reading

# To download any missing audio archives, add each archive URL and its target folder name to `sources` below

In [10]:
import pathlib, zipfile, urllib.request

root = pathlib.Path("kokoro_data")
root.mkdir(exist_ok=True)

# (archive URL, folder under `kokoro_data/` that the archive is unpacked into)
sources = [
    ("http://www.archive.org/download/kusamakura_1311_librivox/kusamakura_1311_librivox_64kb_mp3.zip",
     "kusamakura-by-soseki-natsume"),
    ("http://www.archive.org/download/gan_1311_librivox/gan_1311_librivox_64kb_mp3.zip",
     "gan-by-ogai-mori"),
    ("http://www.archive.org/download/umareizuru_nayami_ez_1302/umareizuru_nayami_ez_1302_64kb_mp3.zip",
     "umareizuru-nayami-by-takeo-arishima"),
    ("http://www.archive.org/download/futon_ek_1303_librivox/futon_ek_1303_librivox_64kb_mp3.zip",
     "futon-by-katai-tayama"),
    ("http://www.archive.org/download/meian_1403_librivox/meian_1403_librivox_64kb_mp3.zip",
     "meian-by-soseki-natsume"),
]

for url, folder_name in sources:
    zip_path = root / pathlib.Path(url).name
    target_dir = root / folder_name
    # Only fetch archives that are not already on disk. `is_zipfile` is False
    # for a missing file and for one without a valid zip end record (such as
    # an interrupted download), so those are downloaded again.
    if zipfile.is_zipfile(zip_path):
        print(f"Using existing {zip_path}")
    else:
        print(f"Downloading {url} …")
        urllib.request.urlretrieve(url, zip_path)
    print(f"Extracting to {target_dir} …")
    target_dir.mkdir(exist_ok=True)
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(target_dir)
    print("Done.\n")

Downloading http://www.archive.org/download/meian_1403_librivox/meian_1403_librivox_64kb_mp3.zip …
Extracting to kokoro_data\meian-by-soseki-natsume …
Done.



# Process into needed files

We write one `.lab` transcript file next to each extracted clip in `output/wavs`, for use in later preprocessing.

In [3]:
from pathlib import Path
import csv

out_dir = Path("./output")
wav_dir = out_dir / "wavs"

# metadata.csv is written as raw `id|text|...` lines with no CSV quoting, so
# quote handling is turned off: otherwise a transcript that starts with `"`
# would be parsed as a quoted field and could swallow delimiters or newlines.
# `newline=""` is what the csv module expects for files it reads.
with (out_dir / "metadata.csv").open(encoding="utf8", newline="") as f:
    reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
    for row in reader:
        if len(row) < 2:
            # Blank or malformed line: there is no transcript to write.
            continue
        utt_id, orth = row[0], row[1]
        (wav_dir / f"{utt_id}.lab").write_text(orth.strip(), encoding="utf8")