<a href="https://colab.research.google.com/github/devendergarg/pingurls/blob/main/Chatterbox_tts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Uninstall existing libraries to prevent conflicts
# (removes the preinstalled numpy, pandas, transformers, torch and torchaudio
# so the install below can pull in whatever versions chatterbox-tts requires).
!pip uninstall -y numpy pandas transformers torch torchaudio

# Install the necessary libraries from scratch
# This ensures that compatible versions are downloaded and installed together.
# NOTE(review): the app cells ask for this to be run in a fresh runtime first;
# presumably a runtime restart is needed after the reinstall - confirm.
!pip install chatterbox-tts gradio

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
  Successfully uninstalled pandas-2.2.2
Found existing installation: transformers 4.52.4
Uninstalling transformers-4.52.4:
  Successfully uninstalled transformers-4.52.4
Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Collecting chatterbox-tts
  Downloading chatterbox_tts-0.1.1-py3-none-any.whl.metadata (5.9 kB)
Collecting numpy==1.26.0 (from chatterbox-tts)
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting resampy==

In [1]:
# Make sure you have run this cell in a fresh runtime:
# !pip install chatterbox-tts gradio

import gradio as gr
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS
import os
import logging
import random
import numpy as np
import torch

# Configure root logging so the INFO-level progress messages below are shown.
logging.basicConfig(level=logging.INFO)

# Pick the compute device: use CUDA when torch reports it, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
logging.info(f"Running on device: {DEVICE}")

# Load the model once at module level; text_to_speech() reuses this instance.
logging.info("Loading Chatterbox-TTS model...")
model = ChatterboxTTS.from_pretrained(device=DEVICE)
logging.info("Model loaded successfully.")


def set_seed(seed: int) -> int:
    """Seed torch, numpy, and random for reproducibility.

    Args:
        seed: The seed to apply. A value of ``0`` means "choose one for me":
            a fresh seed in ``[1, 1_000_000]`` is drawn from the operating
            system's entropy source.

    Returns:
        The seed that was actually applied (useful when ``0`` was passed).
    """
    if seed == 0:  # A seed of 0 will be treated as random
        # Do not draw from the module-level `random` generator here: this
        # function re-seeds it on every call, so after one fixed-seed call the
        # next "random" seed would be fully determined by that earlier seed.
        seed = random.SystemRandom().randint(1, 1_000_000)
    logging.info(f"Using random seed: {seed}")
    torch.manual_seed(seed)
    # Ask torch directly instead of relying on a module-level device constant;
    # manual_seed_all covers every CUDA device, including the current one.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    return seed

# --- Main TTS Function ---
def text_to_speech(text, audio_prompt, exaggeration, cfg_weight, temperature, seed, max_chars):
    """
    Synthesize `text` with the global model and return the saved WAV path.

    The text is cut to at most `max_chars` characters (the slider value), the
    random generators are seeded via set_seed(), and the model is called once.
    `audio_prompt` is forwarded as `audio_prompt_path`; it is either a file
    path or None, in which case the model's default voice is used.
    `exaggeration`, `cfg_weight` and `temperature` are passed to
    model.generate() unchanged.

    Any failure is logged and surfaced to the UI as a gr.Error.
    """
    try:
        # Refuse empty or whitespace-only input.
        if not (text and text.strip()):
            raise gr.Error("Please provide some text to synthesize.")

        # Enforce the character limit selected on the slider.
        max_chars = int(max_chars)
        if len(text) > max_chars:
            logging.warning(f"Text length ({len(text)}) exceeds limit ({max_chars}). Truncating.")
            text = text[:max_chars]

        # Seed before generating so a fixed seed reproduces the output.
        set_seed(int(seed))

        # Gradio hands over either a filepath string or None.
        prompt_path = audio_prompt
        logging.info(f"Received audio prompt path from Gradio: {prompt_path} (Type: {type(prompt_path)})")

        # Report which voice is used; a supplied path must exist on disk.
        if not prompt_path:
            logging.info("No audio prompt provided. Generating speech with default voice.")
        elif os.path.exists(prompt_path):
            logging.info(f"Generating speech with audio prompt: {prompt_path}")
        else:
            raise gr.Error(f"Audio prompt file not found at temporary path: {prompt_path}. Please try re-uploading.")

        # One call to the model; a None prompt path is passed through as-is.
        wav = model.generate(
            text=text,
            audio_prompt_path=prompt_path,
            exaggeration=exaggeration,
            temperature=temperature,
            cfg_weight=cfg_weight,
        )

        # Write the result to a fixed file name in the working directory.
        output_path = "generated_speech.wav"
        ta.save(output_path, wav.cpu(), model.sr)
        logging.info(f"Speech saved to {output_path}")
        return output_path

    except gr.Error as e:
        # Already a user-facing error: log it and let Gradio show it as-is.
        logging.error(f"An error occurred: {e}", exc_info=True)
        raise
    except Exception as e:
        # Anything else is wrapped so the UI shows a readable message.
        logging.error(f"An error occurred: {e}", exc_info=True)
        raise gr.Error(f"An error occurred during speech generation: {e}")


# --- Gradio UI with default theme ---
# Layout: a wide left column with the inputs and a narrow right column with
# the generated audio. The button wires every input into text_to_speech().
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Chatterbox-TTS Gradio Demo
        Generate speech from text with reference audio styling.
        **Note**: Text longer than the selected max characters will be automatically shortened.
        """
    )
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                lines=4,
                label="Text to Synthesize",
                placeholder="Enter your text here..."
            )
            # type="filepath" means the callback receives a path string, or
            # None when no reference audio was supplied.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio File (Optional)"
            )

            exaggeration_slider = gr.Slider(
                minimum=0.25, maximum=2.0, step=0.05,
                label="Exaggeration",
                info="Neutral = 0.5. Extreme values can be unstable.",
                value=0.5
            )
            cfg_slider = gr.Slider(
                minimum=0.2, maximum=1.0, step=0.05,
                label="CFG/Pace",
                info="Controls how strictly the model follows the text/prompt.",
                value=0.5
            )

            with gr.Accordion("More options", open=False):
                temp_slider = gr.Slider(
                    minimum=0.05, maximum=5.0, step=0.05,
                    label="Temperature",
                    info="Controls randomness. Higher values are more diverse.",
                    value=0.8
                )
                seed_input = gr.Number(
                    value=0,
                    label="Random Seed",
                    info="Set to 0 for a random seed."
                )
                # step=100 keeps every stop of the old 200-step grid
                # (300, 500, ...) and also lands exactly on the 3000 maximum,
                # which a 200-step grid starting at 300 never reaches.
                max_chars_slider = gr.Slider(
                    minimum=300, maximum=3000, step=100,
                    label="Max Characters",
                    info="Sets the character limit for the input text. Longer text requires more processing time and memory.",
                    value=300
                )

            submit_button = gr.Button("Generate Speech", variant="primary")

        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Speech", interactive=False)

    # The input order must match the parameter order of text_to_speech().
    submit_button.click(
        fn=text_to_speech,
        inputs=[
            text_input,
            audio_input,
            exaggeration_slider,
            cfg_slider,
            temp_slider,
            seed_input,
            max_chars_slider
        ],
        outputs=audio_output,
        api_name="tts"
    )

# Launch the Gradio app
demo.launch(share=True, debug=True)

ModuleNotFoundError: No module named 'chatterbox'

In [None]:
# Make sure you have run this cell in a fresh runtime:
# !pip install chatterbox-tts gradio

import gradio as gr
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS
import os
import logging
import random
import numpy as np
import torch

# Configure root logging so the INFO-level progress messages below are shown.
logging.basicConfig(level=logging.INFO)

# Pick the compute device: use CUDA when torch reports it, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
logging.info(f"Running on device: {DEVICE}")

# Load the model once at module level; text_to_speech() reuses this instance.
logging.info("Loading Chatterbox-TTS model...")
model = ChatterboxTTS.from_pretrained(device=DEVICE)
logging.info("Model loaded successfully.")


def set_seed(seed: int) -> int:
    """Seed torch, numpy, and random for reproducibility.

    Args:
        seed: The seed to apply. A value of ``0`` means "choose one for me":
            a fresh seed in ``[1, 1_000_000]`` is drawn from the operating
            system's entropy source.

    Returns:
        The seed that was actually applied (useful when ``0`` was passed).
    """
    if seed == 0:  # A seed of 0 will be treated as random
        # Do not draw from the module-level `random` generator here: this
        # function re-seeds it on every call, so after one fixed-seed call the
        # next "random" seed would be fully determined by that earlier seed.
        seed = random.SystemRandom().randint(1, 1_000_000)
    logging.info(f"Using random seed: {seed}")
    torch.manual_seed(seed)
    # Ask torch directly instead of relying on a module-level device constant;
    # manual_seed_all covers every CUDA device, including the current one.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    return seed

# --- Main TTS Function ---
def _split_text_into_chunks(text, chunk_size):
    """Split text into stripped, non-empty pieces of at most chunk_size characters.

    Each cut is placed at the last sentence end inside the window when there
    is one, otherwise at the last space, and only as a last resort in the
    middle of a word.
    """
    chunks = []
    remaining = text.strip()
    while len(remaining) > chunk_size:
        window = remaining[:chunk_size]
        # Latest sentence boundary inside the window, if any.
        cut = max(window.rfind(mark) for mark in (". ", "! ", "? ", "\n"))
        if cut != -1:
            cut += 1  # keep the punctuation mark with its sentence
        else:
            cut = window.rfind(" ")
        if cut <= 0:
            cut = chunk_size  # no usable boundary: hard cut
        piece = remaining[:cut].strip()
        if piece:
            chunks.append(piece)
        remaining = remaining[cut:].lstrip()
    if remaining:
        chunks.append(remaining)
    return chunks


def text_to_speech(text, audio_prompt, exaggeration, cfg_weight, temperature, seed, max_chars):
    """
    Generates speech from text and returns the path of the saved WAV file.

    The total text is first capped at the max_chars slider value. The capped
    text is then split into chunks of up to 500 characters (cut at sentence or
    word boundaries where possible), each chunk is generated in turn, and the
    audio parts are concatenated.

    Args:
        text: The text to synthesize; must contain non-whitespace characters.
        audio_prompt: Path to a reference audio file, or None for the
            model's default voice.
        exaggeration: Forwarded unchanged to model.generate().
        cfg_weight: Forwarded unchanged to model.generate().
        temperature: Forwarded unchanged to model.generate().
        seed: Seed handed to set_seed(); 0 requests a random seed.
        max_chars: Upper bound on the number of characters synthesized.

    Returns:
        The path of the written file, "generated_speech.wav".

    Raises:
        gr.Error: On invalid input or any failure during generation.
    """
    try:
        if not text or not text.strip():
            raise gr.Error("Please provide some text to synthesize.")

        # First, honor the Max Characters slider as an overall cap on the text length.
        # The cap is the slider value itself, so the limit reported in the log
        # and promised in the UI is the one actually applied.
        max_chars = int(max_chars)
        if len(text) > max_chars:
            logging.warning(f"Total text length ({len(text)}) exceeds slider limit ({max_chars}). Truncating total text.")
            text = text[:max_chars]

        # Fail early with a clear message if the uploaded reference audio is gone.
        prompt_path = audio_prompt
        if prompt_path and not os.path.exists(prompt_path):
            raise gr.Error(f"Audio prompt file not found at temporary path: {prompt_path}. Please try re-uploading.")

        # Set the seed for reproducibility
        set_seed(int(seed))

        # Arguments shared by every chunk's generate() call.
        generate_args = {
            'audio_prompt_path': prompt_path,
            'exaggeration': exaggeration,
            'temperature': temperature,
            'cfg_weight': cfg_weight
        }

        CHUNK_SIZE = 500
        text_chunks = _split_text_into_chunks(text, CHUNK_SIZE)
        if not text_chunks:
            # Truncation can leave only whitespace; there is nothing to speak.
            raise gr.Error("Please provide some text to synthesize.")

        if len(text_chunks) > 1:
            logging.info(f"Text is long. Splitting into {len(text_chunks)} chunks of up to {CHUNK_SIZE} characters each.")
        else:
            logging.info(f"Generating speech for text: '{text[:50]}...'")

        wav_parts = []
        for i, chunk in enumerate(text_chunks, start=1):
            if len(text_chunks) > 1:
                logging.info(f"Generating audio for chunk {i}/{len(text_chunks)}...")
            wav_parts.append(model.generate(text=chunk, **generate_args))

        if len(wav_parts) == 1:
            wav = wav_parts[0]
        else:
            # Stitch the audio parts together along dim 1.
            # NOTE(review): assumes each part is shaped (channels, samples) - confirm.
            wav = torch.cat(wav_parts, dim=1)
            logging.info("All chunks generated and concatenated successfully.")

        output_path = "generated_speech.wav"
        ta.save(output_path, wav.cpu(), model.sr)
        logging.info(f"Speech saved to {output_path}")

        return output_path

    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)
        if isinstance(e, gr.Error):
            # Already user-facing; re-raise without altering the traceback.
            raise
        raise gr.Error(f"An error occurred during speech generation: {e}") from e


# --- Gradio UI with default theme ---
# Layout: a wide left column with the inputs and a narrow right column with
# the generated audio. The button wires every input into text_to_speech().
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Chatterbox-TTS Gradio Demo
        Generate speech from text with reference audio styling.
        **Note**: Text longer than the selected max characters will be automatically shortened.
        Text is processed in chunks of 500 characters to handle long inputs.
        """
    )
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                lines=4,
                label="Text to Synthesize",
                placeholder="Enter your text here..."
            )
            # type="filepath" means the callback receives a path string, or
            # None when no reference audio was supplied.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio File (Optional)"
            )

            exaggeration_slider = gr.Slider(
                minimum=0.25, maximum=2.0, step=0.05,
                label="Exaggeration",
                info="Neutral = 0.5. Extreme values can be unstable.",
                value=0.5
            )
            cfg_slider = gr.Slider(
                minimum=0.2, maximum=1.0, step=0.05,
                label="CFG/Pace",
                info="Controls how strictly the model follows the text/prompt.",
                value=0.5
            )

            with gr.Accordion("More options", open=False):
                temp_slider = gr.Slider(
                    minimum=0.05, maximum=5.0, step=0.05,
                    label="Temperature",
                    info="Controls randomness. Higher values are more diverse.",
                    value=0.8
                )
                seed_input = gr.Number(
                    value=0,
                    label="Random Seed",
                    info="Set to 0 for a random seed."
                )
                # step=100 keeps every stop of the old 200-step grid
                # (300, 500, ...) and also lands exactly on 3000, so the
                # default value and the maximum are both valid slider stops.
                max_chars_slider = gr.Slider(
                    minimum=300, maximum=3000, step=100,
                    label="Max Characters",
                    info="Sets the TOTAL character limit for the input text. Longer text requires more processing time and memory.",
                    value=3000 # Increased default to show capability
                )

            submit_button = gr.Button("Generate Speech", variant="primary")

        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Speech", interactive=False)

    # The input order must match the parameter order of text_to_speech().
    submit_button.click(
        fn=text_to_speech,
        inputs=[
            text_input,
            audio_input,
            exaggeration_slider,
            cfg_slider,
            temp_slider,
            seed_input,
            max_chars_slider
        ],
        outputs=audio_output,
        api_name="tts"
    )

# Launch the Gradio app
demo.launch(share=True, debug=True)