<a href="https://colab.research.google.com/github/gleidsonnunes/scripts/blob/master/Narrador_XTTS_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Instalar dependencias
# Resetar tudo
!pip uninstall -y torch torchaudio torchvision transformers TTS
!pip cache purge

# Instalar Torch CPU-only (sem dependência de Triton)
!pip install torch==2.1.2 torchaudio==2.1.2 torchvision>=0.11 --index-url https://download.pytorch.org/whl/cpu

# Instalar versão estável da TTS que usa XTTS
!pip install TTS==0.21.1

# (opcional) instalar transformers caso necessário para outras tarefas
!pip install transformers==4.36.2

!pip install gradio pymupdf ebooklib pydub bs4

!apt-get update && apt-get install -y ffmpeg

In [None]:
#@title Enviar Voz
# Upload a reference voice sample and store it as a real WAV file under the
# fixed name the interface cell reads (its SPEAKER_WAV constant).
import os
import shutil

from google.colab import files
from pydub import AudioSegment

uploaded = files.upload()
# If several audio files are uploaded, the last one processed wins.
for name in uploaded:
    ext = os.path.splitext(name)[1].lower()  # case-insensitive: ".MP3" also matches
    if ext == ".wav":
        shutil.move(name, "voz_clonada_sample.wav")
    elif ext == ".mp3":
        # Renaming an MP3 to ".wav" leaves MP3 data behind a WAV name;
        # transcode it so the file content matches its extension.
        AudioSegment.from_file(name, format="mp3").export(
            "voz_clonada_sample.wav", format="wav"
        )
        os.remove(name)

In [None]:
#@title Interface
import gradio as gr
from TTS.api import TTS
import fitz  # PyMuPDF
from ebooklib import epub
from bs4 import BeautifulSoup
import os
from pydub import AudioSegment
import torch # Import torch
from ebooklib import ITEM_DOCUMENT

SPEAKER_WAV = "voz_clonada_sample.wav"
OUTPUT_DIR = "audios"

# Create the output directory from the constant so the two cannot drift apart.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Accept the Coqui terms automatically; set before the model is created below.
os.environ["COQUI_TOS_AGREED"] = "1"

# Classes referenced by the XTTS checkpoint, allowlisted for torch.load
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.models.xtts import XttsAudioConfig

# `add_safe_globals` does not exist on every torch release, so the call is
# guarded: where the function is missing this is a no-op, where it exists the
# config classes are registered before the checkpoint is loaded.
if hasattr(torch.serialization, "add_safe_globals"):
    torch.serialization.add_safe_globals(
        [XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs]
    )

# CPU inference (gpu=False).
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True, gpu=False)

# Choices offered by the emotion dropdown in the interface.
emotions = ["neutral", "happy", "sad", "angry", "excited", "sleepy", "whispering", "shouting"]

def extract_text(file):
    """Extract the plain text of an uploaded PDF or EPUB.

    Args:
        file: The value handed over by the file component. Accepted forms are
            an object exposing the path as ``.name``, a plain path string, or
            ``None`` when nothing has been selected.

    Returns:
        The extracted text with surrounding whitespace stripped. An empty
        string is returned when no file is given or the extension is neither
        ``.pdf`` nor ``.epub``.
    """
    if file is None:
        return ""

    # Use `.name` when present, otherwise treat the value itself as the path.
    path = getattr(file, "name", file)
    ext = os.path.splitext(path)[-1].lower()

    if ext == ".pdf":
        # The context manager closes the document once its pages are read.
        with fitz.open(path) as doc:
            pages = [page.get_text() for page in doc]
        return "".join(pages).strip()

    if ext == ".epub":
        book = epub.read_epub(path)
        # Only document items are parsed; each one is followed by a blank line.
        sections = [
            BeautifulSoup(item.get_content(), "html.parser").get_text() + "\n\n"
            for item in book.get_items()
            if item.get_type() == ITEM_DOCUMENT
        ]
        return "".join(sections).strip()

    return ""

def narrar(texto, emocao, progress=gr.Progress(track_tqdm=True)):
    """Synthesize ``texto`` with the cloned voice and export WAV and MP3 files.

    Args:
        texto: Text to narrate. Empty or whitespace-only text is skipped.
        emocao: Emotion label, forwarded as ``emotion`` to ``tts_to_file``.
            NOTE(review): whether the XTTS model actually honours this value
            is not visible from this file -- confirm against the TTS API.
        progress: Gradio progress tracker (mirrors tqdm progress bars).

    Returns:
        A ``(wav_path, mp3_path)`` tuple, or ``(None, None)`` when there is
        nothing to narrate.

    Raises:
        gr.Error: If the speaker reference sample has not been uploaded yet.
    """
    if not texto or not texto.strip():
        return None, None

    # Fail early with a readable message instead of an opaque backend error.
    if not os.path.isfile(SPEAKER_WAV):
        raise gr.Error(
            "Amostra de voz não encontrada. Execute a célula 'Enviar Voz' primeiro."
        )

    # The directory may have been removed since the cell was first executed.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    wav_path = os.path.join(OUTPUT_DIR, "narracao.wav")
    mp3_path = os.path.join(OUTPUT_DIR, "narracao.mp3")

    tts.tts_to_file(
        text=texto,
        speaker_wav=SPEAKER_WAV,
        language="pt",
        emotion=emocao,
        file_path=wav_path,
    )

    # Re-encode the generated WAV as MP3 for the download component.
    AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3")

    return wav_path, mp3_path

# Build the Gradio interface: upload a book, extract its text, then narrate it.
with gr.Blocks() as demo:
    gr.Markdown("## Narrador XTTS com sua voz preferida 🎤")

    # Inputs: the source document and the emotion passed to `narrar`.
    with gr.Row():
        arquivo = gr.File(label="Carregar PDF ou EPUB", file_types=[".pdf", ".epub"])
        emocao = gr.Dropdown(emotions, label="Emoção", value="neutral")

    # The textbox is the output of extraction and the text input of narration.
    texto_extraido = gr.Textbox(label="Texto extraído", lines=10)
    carregar_btn = gr.Button("Extrair texto")

    # Fill the textbox with the text extracted from the uploaded file.
    carregar_btn.click(fn=extract_text, inputs=arquivo, outputs=texto_extraido)

    # Outputs: an audio component plus a file component for the MP3.
    with gr.Row():
        narrar_btn = gr.Button("Narrar")
        audio_saida = gr.Audio(label="Ouvir áudio")
        download_mp3 = gr.File(label="Baixar em MP3")

    # `narrar` returns (wav_path, mp3_path), mapped onto the two outputs in order.
    narrar_btn.click(fn=narrar, inputs=[texto_extraido, emocao], outputs=[audio_saida, download_mp3])

# Launch with sharing and debug mode enabled.
demo.launch(share=True, debug=True)