In [2]:
# Step 1: Install dependencies
# Run this command in a notebook cell (in a terminal, drop the leading "!")
# !pip install onnxruntime phonemizer torch numpy scipy munch

# Step 2: Import required libraries
import numpy as np
import torch
from onnxruntime import InferenceSession
from phonemizer import phonemize
from pathlib import Path

# Step 3: Define helper functions (from kokoro.py)
def normalize_text(text):
    """Return *text* with curly quotes replaced by their ASCII equivalents.

    This is a simplified stand-in; replace it with the full normalizer
    from kokoro.py (which also handles numbers, special characters, etc.).
    """
    # One translation table covers both single- and double-quote variants.
    quote_table = str.maketrans({
        "’": "'",
        "‘": "'",
        "“": '"',
        "”": '"',
    })
    return text.translate(quote_table)

def get_vocab():
    """Build the symbol -> integer id table used to tokenize phonemes.

    Ids are assigned in order: the pad symbol, punctuation, ASCII letters,
    then IPA symbols. A symbol that occurs more than once keeps the id of
    its last occurrence. (Replace with the full function from kokoro.py.)
    """
    pad = "$"
    punctuation = ';:,.!?¡¿—…"«»“” '
    ascii_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    ipa_letters = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    # Every symbol is a single character, so the concatenated string
    # enumerates exactly the same sequence as a list of symbols would.
    ordered_symbols = pad + punctuation + ascii_letters + ipa_letters

    vocab = {}
    for index, symbol in enumerate(ordered_symbols):
        vocab[symbol] = index
    return vocab

# Symbol -> id table, built once at module load and read by tokenize().
VOCAB = get_vocab()

def tokenize(phonemes):
    """Map each phoneme symbol to its id in VOCAB.

    Symbols that are not present in VOCAB are silently skipped, so the
    result may be shorter than the input.
    """
    token_ids = []
    for symbol in phonemes:
        token_id = VOCAB.get(symbol)
        if token_id is None:
            # Unknown symbol: drop it rather than fail.
            continue
        token_ids.append(token_id)
    return token_ids

# Step 4: Load the ONNX model
# The relative path is resolved against the current working directory.
# The session is created once, at module load, and is reused by
# generate_speech() through the module-level name `sess`.
onnx_model_path = "../SynthVox/models/kokoro-v0_19.onnx"
sess = InferenceSession(onnx_model_path)

# Step 5: Phonemize and tokenize the input text
def generate_speech(text, voicepack_path, lang="en-us", speed=1.0):
    """Synthesize speech for *text* using the module-level ONNX session.

    Args:
        text: Input text to speak.
        voicepack_path: Path to a voicepack file readable by ``torch.load``.
            It is indexed by the phoneme-token count to select a style
            vector.
        lang: Language code handed to the espeak phonemizer backend.
        speed: Value fed to the model's ``"speed"`` input. Defaults to
            1.0, the value that was previously hard-coded.

    Returns:
        The first output of ``sess.run`` for the given inputs (the
        generated audio array).

    Raises:
        FileNotFoundError: If ``voicepack_path`` does not exist (raised by
            ``torch.load``).
    """
    # The model context is 512 ids; two are reserved for the pad ids
    # wrapped around the sequence below.
    max_tokens = 510

    # Normalize -> phonemize -> map phoneme symbols to vocabulary ids.
    phonemes = phonemize(
        normalize_text(text),
        language=lang,
        backend="espeak",
        with_stress=True,
        preserve_punctuation=True,
    )
    tokens = tokenize(phonemes)

    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        print("Warning: Input text was truncated to 510 tokens.")

    # Select the style vector using the UNPADDED token count, as the
    # upstream Kokoro ONNX example does. Indexing after the pad ids are
    # added shifts the selected style by two and can index past the end of
    # the voicepack for inputs near the limit.
    # NOTE(review): assumes the voicepack holds one entry per unpadded
    # length (0..510) -- confirm against the voicepack files in use.
    #
    # weights_only=True restricts unpickling to tensor data, since
    # torch.load otherwise executes arbitrary pickled code from the file.
    # map_location="cpu" keeps the tensor on the CPU so .numpy() is valid.
    voicepack = torch.load(voicepack_path, map_location="cpu", weights_only=True)
    ref_s = voicepack[len(tokens)].numpy()

    inputs = {
        # Pad id 0 at both ends; the outer list adds the batch dimension.
        "tokens": np.array([[0, *tokens, 0]], dtype=np.int64),
        "style": ref_s,
        "speed": np.array([speed], dtype=np.float32),
    }

    # The first model output is the audio; saving or playback is left to
    # the caller.
    return sess.run(None, inputs)[0]

# Step 9: Example usage
if __name__ == "__main__":
    # Imported up front so a missing scipy install fails before the
    # (comparatively slow) inference step rather than after it.
    import scipy.io.wavfile as wav

    # Input text
    text = "Hello, this is a demonstration of the Kokoro-82M text-to-speech model."

    # Voicepack path (replace with the desired voicepack)
    voicepack_path = "../SynthVox/models/voices/af.pt"  # Default voice (50-50 mix of Bella & Sarah)

    # Generate speech
    audio = generate_speech(text, voicepack_path)

    # Save the audio to a file. Create the output directory first:
    # wav.write does not create missing parent directories and would
    # otherwise raise FileNotFoundError after inference has already run.
    output_file = "../data/gen/hello.wav"
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    wav.write(output_file, rate=24000, data=audio.squeeze())

    print(f"Speech saved to {output_file}")

  voicepack = torch.load(voicepack_path)


FileNotFoundError: [Errno 2] No such file or directory: 'voices/af.pt'