# Kokoro-82M TTS - Fast High-Quality Text-to-Speech

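This notebook demonstrates Kokoro-82M, a text-to-speech (TTS) model that balances speed and quality with only 82M parameters. It first tries to load the model through the Hugging Face `transformers` pipeline; if that fails, generation is skipped and the notebook prints instructions for installing the `kokoro` package instead.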

**Features:**
- Fast generation (~5-10 seconds)
- High quality despite small size (82M parameters)
- Apache licensed and cost-efficient
- Apple Silicon GPU optimization

In [1]:
# Load Kokoro TTS model
import time
import torch
import os
from transformers import pipeline

# Report whether PyTorch's MPS backend is available, then announce the
# model load that the following cell attempts.
mps_ready = torch.backends.mps.is_available()
print(f"MPS available: {mps_ready}")
print("Loading Kokoro-82M TTS model via transformers...")

MPS available: True
Loading Kokoro-82M TTS model via transformers...


In [2]:
# Best-effort load of Kokoro-82M through the Hugging Face transformers
# text-to-speech pipeline.
#
# Outcome:
#   kokoro_pipe -- the pipeline object on success, or None when loading fails
#                  (later cells check for None and skip generation).
#   start_load / end_load -- wall-clock timestamps bracketing the attempt.
model_id = "hexgrad/Kokoro-82M"

print("Loading Kokoro-82M TTS model...")
start_load = time.time()

try:
    kokoro_pipe = pipeline("text-to-speech", model=model_id)
except Exception as load_error:
    # Any failure is reported rather than raised so the rest of the notebook
    # can still run and print installation guidance.
    print(f"Transformers pipeline not available: {load_error}")
    print("Kokoro-82M requires specific installation - see model card for details")
    kokoro_pipe = None
else:
    print("Loaded via transformers pipeline")

end_load = time.time()
elapsed_load = end_load - start_load
print(f"Load attempt completed in {elapsed_load:.2f} seconds")
print("Model size: 82M parameters (lightweight but high quality)")

Loading Kokoro-82M TTS model...
Transformers pipeline not available: Unrecognized model in hexgrad/Kokoro-82M. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, f

In [3]:
# Generate speech with Kokoro-82M (if available)
#
# Reads:  kokoro_pipe (pipeline object, or None when loading failed).
# Writes: text, audio_data (pipeline result or None), sample_rate, and --
#         only when generation runs -- start_time / end_time.
text = "Hello, this is Kokoro TTS. I provide fast, high-quality speech generation with only 82 million parameters."

# Fallback rate used when the pipeline result carries no 'sampling_rate'.
default_sample_rate = 24000  # Default for Kokoro

if kokoro_pipe is not None:
    print("Generating audio with Kokoro-82M...")

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    audio_data = kokoro_pipe(text)
    end_time = time.perf_counter()

    print(f"Audio generated in {end_time - start_time:.2f} seconds")
    print(f"Audio type: {type(audio_data)}")

    # The result may be the waveform itself or a dict holding it under 'audio'.
    if hasattr(audio_data, 'shape'):
        print(f"Audio shape: {audio_data.shape}")
    elif isinstance(audio_data, dict) and 'audio' in audio_data:
        waveform = audio_data['audio']
        # Only report a shape when the payload actually exposes one, instead
        # of failing with AttributeError on e.g. a plain list.
        if hasattr(waveform, 'shape'):
            print(f"Audio shape: {waveform.shape}")

    # Get sample rate
    if isinstance(audio_data, dict) and 'sampling_rate' in audio_data:
        sample_rate = audio_data['sampling_rate']
    else:
        sample_rate = default_sample_rate
    print(f"Sample rate: {sample_rate}")
else:
    print("Kokoro model not available - skipping generation")
    # The requirement is quoted: unquoted, a shell treats '>' as an output
    # redirection and would run `pip install kokoro` writing to a file
    # named '=0.9.2'.
    print('To use Kokoro-82M, install: pip install "kokoro>=0.9.2"')
    audio_data = None
    sample_rate = default_sample_rate

Kokoro model not available - skipping generation
To use Kokoro-82M, install: pip install kokoro>=0.9.2


In [4]:
# Save Kokoro audio as MP3 (if available)
import soundfile as sf
from pydub import AudioSegment
import numpy as np

# Write the generated waveform to output/kokoro_output.wav, then transcode it
# to output/kokoro_output.mp3. When no audio was generated, print install
# guidance instead.
#
# Reads: audio_data, sample_rate, start_time, end_time (set by the generation
# cell), plus os / torch / np / sf / AudioSegment imported earlier.
if audio_data is not None:
    # Create output directory
    os.makedirs("output", exist_ok=True)

    # The WAV file is an intermediate that pydub reads back for MP3 export.
    wav_file = "output/kokoro_output.wav"
    mp3_file = "output/kokoro_output.mp3"

    # Extract audio array: the result may be a dict holding the waveform
    # under 'audio', or the waveform itself.
    if isinstance(audio_data, dict) and 'audio' in audio_data:
        audio_array = audio_data['audio']
    else:
        audio_array = audio_data

    # Convert to numpy array if needed
    if torch.is_tensor(audio_array):
        # detach() first: .numpy() raises on tensors that still track gradients.
        audio_array = audio_array.detach().cpu().numpy()
    else:
        audio_array = np.asarray(audio_array)

    # Ensure 1D array
    if audio_array.ndim > 1:
        audio_array = audio_array.squeeze()

    # Peak-normalize to prevent clipping. An empty array would make np.max
    # raise, and an all-zero (silent) array would divide by zero and fill the
    # output with NaN, so both cases skip the division.
    peak = float(np.max(np.abs(audio_array))) if audio_array.size else 0.0
    if peak > 0:
        audio_normalized = audio_array / peak
    else:
        print("Warning: generated audio is empty or silent - skipping normalization")
        audio_normalized = audio_array

    # Save as WAV
    sf.write(wav_file, audio_normalized, sample_rate)

    # Convert to MP3
    audio_segment = AudioSegment.from_wav(wav_file)
    audio_segment.export(mp3_file, format="mp3")

    print(f"Kokoro audio saved as {mp3_file}")
    # len() of a pydub AudioSegment is its duration in milliseconds.
    print(f"Duration: {len(audio_segment)/1000:.2f} seconds")
    print(f"Generation time: {end_time - start_time:.2f} seconds")
    print("\n⚡ Kokoro-82M TTS: Perfect balance of speed and quality!")
else:
    print("No audio generated - Kokoro model not available")
    print("\nTo install Kokoro-82M:")
    # Quoted so a shell does not treat '>' as an output redirection.
    print('1. pip install "kokoro>=0.9.2"')
    print("2. Restart notebook and try again")

No audio generated - Kokoro model not available

To install Kokoro-82M:
1. pip install kokoro>=0.9.2
2. Restart notebook and try again
