In [None]:
import os, sys, subprocess

# Clone the Qwen3-TTS repository unless the target directory already exists.
# NOTE: os.system return codes are never checked in this cell, so a failed
# clone/install is only visible in the cell output.
if not os.path.exists('/content/qwen3_tts'):
    os.system('git clone https://github.com/QwenLM/Qwen3-TTS /content/qwen3_tts')
else:
    print('✅ Repo già presente')

# ── System packages (sox binary + rubberband CLI) ───────────────
# FIX (original author's note): pydub looks for the sox BINARY on PATH,
# not for the pip package of the same name.
os.system('apt-get install -y -q sox libsox-fmt-all rubberband-cli')

# ── Python dependencies ────────────────────────────────
os.system('pip install -q transformers==4.57.3 accelerate==1.12.0')
os.system('pip install -q soundfile librosa torchaudio onnxruntime einops sox')
os.system('pip install -q /content/qwen3_tts')
os.system('pip install -q gradio pydub pyloudnorm noisereduce')
os.system('pip install -q pyrubberband')

# ── flash-attn (optional, ~20% faster per the original author) ─
# On Colab T4 + Python 3.12 there is no pre-built wheel, so pip tries to
# compile from source (>10 min). The attempt is abandoned after 90 s and
# the notebook falls back to sdpa (PyTorch's native attention).
print('⏳ Verifica flash-attn (max 90s)...')
_fa_proc = subprocess.Popen(
    [sys.executable, '-m', 'pip', 'install', 'flash-attn', '-q'],
    stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
try:
    # communicate() drains both pipes while waiting, up to the 90 s budget.
    _out, _err = _fa_proc.communicate(timeout=90)
    if _fa_proc.returncode == 0:
        print('✅ flash-attn installato! (inferenza più veloce)')
    else:
        print('ℹ️ flash-attn non disponibile → si usa sdpa (ok)')
except subprocess.TimeoutExpired:
    _fa_proc.kill()
    _fa_proc.communicate()  # drain the buffers and reap the child (no zombie)
    print('ℹ️ flash-attn richiede compilazione (>10 min) → saltato, si usa sdpa')
except Exception as _e:
    # Best effort: any other failure is reported and the install is skipped.
    print(f'ℹ️ flash-attn: {_e} → si usa sdpa')

print('✅ Tutto installato. Riavvio...')
# Send signal 9 to this very process; the message above announces this as a
# restart, so the following cells run in a fresh interpreter.
os.kill(os.getpid(), 9)


In [None]:
import sys, torch

# Put the cloned repository first on the import path, then import the model class.
sys.path.insert(0, '/content/qwen3_tts')
from qwen_tts import Qwen3TTSModel
# NOTE (original author): a 'flash-attn is not installed' warning printed by
#       this import is expected and harmless: PyTorch sdpa is used instead.

# Report the interpreter version (first 6 characters of sys.version) and GPU status.
print(f'✅ Python: {sys.version[:6]}')
print(f'✅ Qwen3TTSModel importato')
print(f'🎮 CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    # Convert byte counts to GiB for display.
    total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    free  = torch.cuda.mem_get_info()[0] / 1024**3
    print(f'   GPU: {torch.cuda.get_device_name(0)}')
    print(f'   VRAM: {free:.1f}GB liberi / {total:.1f}GB totali')
else:
    print('⚠️ Nessuna GPU → vai su Runtime → Change runtime type → T4 GPU')


In [None]:
print('📦 Caricamento modello...')

# ─────────────────────────────────────────────
# Pick the model (uncomment the one you want):
# ─────────────────────────────────────────────
MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"          # Standard voice clone
# MODEL_ID = "Qwen/Qwen3-TTS-12Hz-0.6B-Base"        # Fast voice clone (half the VRAM)
# MODEL_ID = "Qwen/Qwen3-TTS-25Hz-1.7B-Base"        # Long-form voice clone (>500 words)
# MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign" # Build a voice from a text description
# MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" # 9 built-in Qwen voices

# Detect flash-attn: use it when importable, otherwise fall back to sdpa.
try:
    import flash_attn
    attn_impl = "flash_attention_2"
    print('⚡ flash-attn rilevato → flash_attention_2')
except ImportError:
    attn_impl = "sdpa"
    print('ℹ️ flash-attn non trovato → sdpa (ok, differenza ~15%)')

# The model is always placed on the first CUDA device in bfloat16.
model = Qwen3TTSModel.from_pretrained(
    MODEL_ID,
    device_map="cuda:0",
    dtype=torch.bfloat16,
    attn_implementation=attn_impl,
)

# Derive the operating mode from the model id; the generation functions
# refuse to run when the loaded model does not match their mode.
if "VoiceDesign" in MODEL_ID:
    MODEL_TYPE = "design"
elif "CustomVoice" in MODEL_ID:
    MODEL_TYPE = "custom"
else:
    MODEL_TYPE = "clone"

# NOTE(review): this value is inferred from the "25Hz" tag in the model id and
# is only used for display; written audio uses the `sr` returned by the model.
# Confirm that the 25Hz variant really outputs 25000 Hz audio.
SAMPLE_RATE = 25000 if "25Hz" in MODEL_ID else 24000
print(f"✅ Modello pronto! | tipo: {MODEL_TYPE} | SR: {SAMPLE_RATE} Hz | attn: {attn_impl}")


In [None]:
import os, time, hashlib, tempfile, re, subprocess, sys, importlib.util
import gradio as gr
import torch, numpy as np
import librosa, soundfile as sf
import pyrubberband as pyrb
from pydub import AudioSegment
from pydub.silence import detect_leading_silence
import pyloudnorm as pyln
import noisereduce as nr

# ─────────────────────────────────────────────
# (1) Whisper  [default 'small']
# ─────────────────────────────────────────────
# Install faster-whisper on demand: only when the package cannot be located,
# and raise (check=True) if pip exits with an error.
if importlib.util.find_spec('faster_whisper') is None:
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'faster-whisper'], check=True)

from faster_whisper import WhisperModel

_whisper = None
def get_whisper(model_size='small'):
    """Return the shared faster-whisper model for ``model_size``.

    The instance is kept in the module-level ``_whisper`` and tagged with a
    private ``_size`` attribute; requesting a different size replaces it.
    The model runs on CUDA with float16 when a GPU is available, otherwise
    on CPU with int8.
    """
    global _whisper
    if _whisper is not None and getattr(_whisper, '_size', None) == model_size:
        return _whisper
    if torch.cuda.is_available():
        device, precision = 'cuda', 'float16'
    else:
        device, precision = 'cpu', 'int8'
    _whisper = WhisperModel(model_size, device=device, compute_type=precision)
    _whisper._size = model_size
    return _whisper

def trascrivi_reference(audio_path, lang='it', whisper_size='small'):
    """Transcribe ``audio_path`` and return the text as one stripped string.

    Uses the shared model from ``get_whisper(whisper_size)`` with the VAD
    filter enabled; segment texts are stripped and joined with single spaces.
    """
    recognizer = get_whisper(whisper_size)
    segments, _info = recognizer.transcribe(audio_path, language=lang, vad_filter=True)
    pieces = (segment.text.strip() for segment in segments)
    return ' '.join(pieces).strip()

# ─────────────────────────────────────────────
# (2) Preprocessing testo IT
# ─────────────────────────────────────────────
def preprocessa_testo_it(testo: str):
    """Normalise Italian text before it is handed to the TTS model.

    Steps, in order:
      1. remove guillemets and straight double quotes, straighten curly
         apostrophes;
      2. collapse ellipses ("…" or two or more dots) into a single comma;
      3. turn a sentence-ending period (followed by whitespace or end of
         text) into a comma, leaving periods inside tokens such as "3.14";
      4. turn em/en dashes into commas;
      5. drop round, square and curly brackets;
      6. collapse whitespace runs and strip the result.

    Args:
        testo: raw input text.

    Returns:
        The cleaned text.
    """
    testo = testo.replace('«', '').replace('»', '').replace('"', '')
    testo = testo.replace('\u2018', "'").replace('\u2019', "'")
    # Ellipses must be handled BEFORE single periods: otherwise the last dot of
    # "..." is rewritten on its own and the leftover ".." becomes a second
    # comma ("Ciao... mondo" -> "Ciao,, mondo").
    testo = testo.replace('…', ',')
    testo = re.sub(r'\.{2,}', ',', testo)
    testo = re.sub(r'\.(\s+|$)', r',\1', testo)
    testo = testo.replace('—', ',').replace('–', ',')
    testo = re.sub(r'[()\[\]{}]', '', testo)
    testo = re.sub(r'\s+', ' ', testo).strip()
    return testo

def split_testo_chunked(testo, max_chars=400):
    """Greedily pack sentences of ``testo`` into chunks of about ``max_chars``.

    The text is split after ``,``, ``.``, ``!`` or ``?`` followed by
    whitespace. Pieces are appended to the running chunk while its length,
    plus the piece, plus one separator stays within ``max_chars``; otherwise
    the running chunk is emitted and the piece starts a new one. A single
    piece longer than ``max_chars`` is kept whole. If nothing is produced,
    the original text is returned as the only chunk.
    """
    pezzi = []
    buffer = ''
    for frase in re.split(r'(?<=[,.!?])\s+', testo):
        if len(buffer) + len(frase) + 1 <= max_chars:
            buffer = f'{buffer} {frase}' if buffer else frase
            continue
        if buffer:
            pezzi.append(buffer.strip())
        buffer = frase
    if buffer:
        pezzi.append(buffer.strip())
    return pezzi or [testo]

# ─────────────────────────────────────────────
# (3) Reference: SNR score + 24kHz
# ─────────────────────────────────────────────
def trim_silence(audio, threshold=-40, padding_ms=120):
    """Strip leading and trailing silence, then pad both ends with silence.

    Args:
        audio: pydub ``AudioSegment`` to trim.
        threshold: silence threshold passed to ``detect_leading_silence``.
        padding_ms: milliseconds of silence added before and after.

    Returns:
        A new ``AudioSegment``: padding + trimmed audio + padding.
    """
    lead_ms = detect_leading_silence(audio, silence_threshold=threshold)
    # Trailing silence is the leading silence of the reversed clip.
    tail_ms = detect_leading_silence(audio.reverse(), silence_threshold=threshold)
    voiced = audio[lead_ms:len(audio) - tail_ms]
    pad = AudioSegment.silent(duration=padding_ms)
    return pad + voiced + pad

def snr_score(chunk):
    """Return a rough signal-to-noise score for an audio chunk (higher is cleaner).

    Signal power is the mean squared sample value. Noise is estimated as the
    variance of the residual left after a 5-tap moving average. An empty
    chunk scores 0.0.

    Args:
        chunk: object exposing ``get_array_of_samples()``.
    """
    pcm = np.array(chunk.get_array_of_samples(), dtype=np.float64)
    if pcm.size == 0:
        return 0.0
    kernel = np.ones(5) / 5
    residual = pcm - np.convolve(pcm, kernel, mode='same')
    power = np.mean(pcm ** 2)
    # The small epsilon keeps the ratio finite when the residual is flat.
    return power / (np.var(residual) + 1e-9)

def prepara_reference(filepath, denoise=False, target_ms=18_000):
    """Prepare a reference clip for voice cloning and return the path of a temp WAV.

    The audio is converted to mono 24 kHz and trimmed of leading/trailing
    silence. If it is longer than ``target_ms``, candidate windows of
    ``target_ms`` are taken every 1.5 s and the one with the highest
    ``snr_score`` is kept. Optionally a light noise reduction is applied.

    Args:
        filepath: path of the uploaded audio file.
        denoise: when true, run ``noisereduce`` (prop_decrease=0.35) on the clip.
        target_ms: maximum length of the reference, in milliseconds.

    Returns:
        Path of a new ``.wav`` temp file. The file is not deleted here; the
        caller owns it.
    """
    audio = AudioSegment.from_file(filepath)
    audio = audio.set_channels(1).set_frame_rate(24000)
    audio = trim_silence(audio)
    if len(audio) > target_ms:
        step = 1500
        chunks = [audio[i:i + target_ms] for i in range(0, len(audio) - target_ms, step)]
        audio = max(chunks, key=snr_score)
    # Only the unique path is needed: close the handle right away instead of
    # leaking one open file descriptor per call.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        out_path = tmp.name
    # export() returns the open output handle; close it for the same reason.
    audio.export(out_path, format='wav').close()
    if denoise:
        import soundfile as _sf
        import noisereduce as _nr
        data, sr = _sf.read(out_path)
        data = _nr.reduce_noise(y=data, sr=sr, prop_decrease=0.35)
        _sf.write(out_path, data, sr)
    return out_path

# ─────────────────────────────────────────────
# (4) Post-processing
# ─────────────────────────────────────────────
def postprocessa_output(wav_path, target_lufs=-18.0):
    """Trim silence and loudness-normalise the WAV at ``wav_path`` in place.

    Args:
        wav_path: path of the WAV file to rewrite.
        target_lufs: integrated loudness target passed to pyloudnorm.

    Returns:
        ``wav_path`` (the file is overwritten).
    """
    audio = AudioSegment.from_wav(wav_path)
    audio = trim_silence(audio, threshold=-50, padding_ms=150)
    audio.export(wav_path, format='wav')
    data, rate = sf.read(wav_path)
    # pyloudnorm raises ValueError for audio shorter than its gating block.
    # An all-silent clip trims down to just 2 x 150 ms of padding, which is
    # below the 400 ms block, so skip normalisation instead of crashing.
    block_sec = 0.400
    if len(data) < block_sec * rate:
        return wav_path
    meter = pyln.Meter(rate, block_size=block_sec)
    loudness = meter.integrated_loudness(data)
    # Near-silent audio measures extremely low; leave it untouched rather
    # than applying a huge gain.
    if loudness > -70:
        data = pyln.normalize.loudness(data, loudness, target_lufs)
        sf.write(wav_path, data, rate)
    return wav_path

# ─────────────────────────────────────────────
# (5) Target durata – pyrubberband
# ─────────────────────────────────────────────
def durata_wav(path):
    """Return the duration, in seconds, of the audio file at ``path``."""
    samples, sample_rate = sf.read(path)
    return len(samples) / sample_rate

def adatta_durata(wav_path, target_sec):
    """Time-stretch the WAV at ``wav_path`` in place towards ``target_sec``.

    The tempo change is limited to the 0.7x-1.4x range, so the target may
    not be reached exactly.

    Args:
        wav_path: path of the WAV file to rewrite.
        target_sec: desired duration in seconds.

    Returns:
        ``(wav_path, rate, current_sec)`` where ``rate`` is the tempo
        multiplier applied (>1 means faster/shorter) and ``current_sec`` is
        the duration before stretching.
    """
    data, sr = sf.read(wav_path)
    current_sec = len(data) / sr
    # Nothing sensible to do without a positive target or with empty audio.
    if target_sec <= 0 or current_sec <= 0:
        return wav_path, 1.0, current_sec
    rate = max(0.7, min(1.4, current_sec / target_sec))
    # pyrubberband's `rate` is a tempo multiplier: a higher value plays faster
    # and yields a shorter clip. Audio that is too long (current > target)
    # therefore needs rate > 1; passing 1/rate stretched it the wrong way.
    stretched = pyrb.time_stretch(data.astype(np.float32), sr, rate)
    sf.write(wav_path, stretched, sr)
    return wav_path, rate, current_sec

# ─────────────────────────────────────────────
# (6) Cache voice_clone_prompt
# ─────────────────────────────────────────────
def _hash_file(path):
    """Return the hexadecimal MD5 digest of the bytes stored at ``path``."""
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        digest.update(handle.read())
    return digest.hexdigest()

_prompt_cache = {}
def get_voice_prompt(ref_wav, ref_text, xvec_only):
    """Return the voice-clone prompt for a reference, building it at most once.

    Prompts are memoised in ``_prompt_cache`` under a key made of the
    reference file's content hash, the reference text and the
    x-vector-only flag.
    """
    xvec_flag = bool(xvec_only)
    cache_key = (_hash_file(ref_wav), ref_text, xvec_flag)
    try:
        return _prompt_cache[cache_key]
    except KeyError:
        pass
    built = model.create_voice_clone_prompt(
        ref_audio=ref_wav, ref_text=ref_text, x_vector_only_mode=xvec_flag)
    _prompt_cache[cache_key] = built
    return built

# ─────────────────────────────────────────────
# (7) Preset IT + gen_kwargs builder
# ─────────────────────────────────────────────
# Sampling presets offered in the UI, keyed by their dropdown label.
PRESET = {
    '🇮🇹 IT – Naturale YouTube (consigliato)': {
        'temperature': 0.85,
        'top_p': 0.95,
        'top_k': 50,
        'repetition_penalty': 1.06,
        'max_new_tokens': 3072,
    },
    '🇮🇹 IT – Super stabile (meno errori)': {
        'temperature': 0.70,
        'top_p': 0.85,
        'top_k': 40,
        'repetition_penalty': 1.10,
        'max_new_tokens': 3072,
    },
    '🇮🇹 IT – Più espressivo (rischio errori)': {
        'temperature': 1.00,
        'top_p': 0.95,
        'top_k': 70,
        'repetition_penalty': 1.05,
        'max_new_tokens': 3072,
    },
}

def applica_preset(nome):
    """Return the preset values as a tuple in the order the UI sliders expect.

    Order: temperature, top_p, top_k, repetition_penalty, max_new_tokens.
    Raises ``KeyError`` for an unknown preset name.
    """
    chosen = PRESET[nome]
    slider_order = ('temperature', 'top_p', 'top_k', 'repetition_penalty', 'max_new_tokens')
    return tuple(chosen[key] for key in slider_order)

def build_gen_kwargs(max_new_tokens, top_k, top_p, temperature, repetition_penalty):
    """Build the sampling keyword arguments passed to the model's generate calls.

    UI values are coerced to ``int``/``float``. The ``subtalker_*`` entries
    mirror the main ``top_k``/``top_p``/``temperature`` settings, and
    sampling is always enabled.
    """
    token_budget = int(max_new_tokens)
    k = int(top_k)
    p = float(top_p)
    temp = float(temperature)
    penalty = float(repetition_penalty)
    return {
        'max_new_tokens': token_budget,
        'do_sample': True,
        'top_k': k,
        'top_p': p,
        'temperature': temp,
        'repetition_penalty': penalty,
        'subtalker_dosample': True,
        'subtalker_top_k': k,
        'subtalker_top_p': p,
        'subtalker_temperature': temp,
    }

# ─────────────────────────────────────────────
# (8a) Voice Clone
# ─────────────────────────────────────────────
def genera_voice_clone(testo, ref_file, ref_text_manual, auto_ref_text, whisper_size,
                       xvec_only, use_cache, denoise_ref, do_postprocess, use_chunking,
                       preset_name, temperature, top_p, top_k, rep_pen, max_tok,
                       target_min, target_sec):
    """Gradio handler: synthesise ``testo`` with a voice cloned from ``ref_file``.

    Pipeline: preprocess the text, prepare the reference clip, obtain the
    reference transcript (manual or via Whisper), generate audio chunk by
    chunk, concatenate, optionally time-stretch to the target duration and
    optionally loudness-normalise.

    ``preset_name`` is accepted only because the UI passes it; the sampling
    values arrive through the individual slider arguments.

    Returns:
        ``(output_wav_path, preprocessed_text, ref_text_used, info_message)``.

    Raises:
        gr.Error: on empty text, missing reference, or when the loaded model
            is not a Base (clone) model.
    """
    if not testo or not testo.strip():
        raise gr.Error('Inserisci un testo.')
    if ref_file is None:
        raise gr.Error('Carica un audio di riferimento.')
    if MODEL_TYPE != 'clone':
        raise gr.Error(f'Modello attivo ({MODEL_ID}) non è un modello Base.')
    t0 = time.time()
    # A cleared gr.Number field arrives as None; treat it as 0 (no target)
    # instead of crashing in float(None).
    target_totale = float(target_min or 0) * 60 + float(target_sec or 0)
    testo_proc = preprocessa_testo_it(testo)
    ref_wav = prepara_reference(ref_file, denoise=denoise_ref)
    ref_text = (ref_text_manual or '').strip()
    if (not xvec_only) and (not ref_text) and auto_ref_text:
        ref_text = trascrivi_reference(ref_wav, lang='it', whisper_size=whisper_size)
    warn = ''
    if (not xvec_only) and (not ref_text):
        # NOTE(review): generation still proceeds with an empty ref_text;
        # confirm the model accepts that outside x-vector-only mode.
        warn = '⚠️ ref_text vuoto: attiva auto-trascrizione o inseriscilo manualmente.'
    gen_kwargs = build_gen_kwargs(max_tok, top_k, top_p, temperature, rep_pen)
    chunks = split_testo_chunked(testo_proc) if (use_chunking and len(testo_proc) > 400) else [testo_proc]
    if use_cache:
        prompt = get_voice_prompt(ref_wav, ref_text, xvec_only)
    all_wavs = []
    for chunk in chunks:
        if use_cache:
            wavs, sr = model.generate_voice_clone(text=chunk, language='Italian', voice_clone_prompt=prompt, **gen_kwargs)
        else:
            wavs, sr = model.generate_voice_clone(text=chunk, language='Italian', ref_audio=ref_wav,
                                                   ref_text=ref_text, x_vector_only_mode=bool(xvec_only), **gen_kwargs)
        all_wavs.append(wavs[0])
    final_wav = np.concatenate(all_wavs) if len(all_wavs) > 1 else all_wavs[0]
    out_path = '/content/output_qwen3_clone.wav'
    sf.write(out_path, final_wav, sr)
    stretch = 1.0
    if target_totale > 0:
        out_path, stretch, _ = adatta_durata(out_path, target_totale)
    if do_postprocess:
        postprocessa_output(out_path)
    dur = durata_wav(out_path)
    info = f'✅ {time.time()-t0:.1f}s | {len(chunks)} chunk(s) | durata {dur:.1f}s | stretch {stretch:.3f}x'
    if target_totale > 0:
        info += f' | delta {dur-target_totale:+.1f}s'
    if warn:
        info = warn + '\n' + info
    return out_path, testo_proc, ref_text, info

# ─────────────────────────────────────────────
# (8b) Voice Design
# ─────────────────────────────────────────────
def genera_voice_design(testo, voice_desc, do_postprocess, use_chunking,
                        preset_name, temperature, top_p, top_k, rep_pen, max_tok,
                        target_min, target_sec):
    """Gradio handler: synthesise ``testo`` with a voice built from ``voice_desc``.

    The text is preprocessed, optionally chunked, generated chunk by chunk,
    concatenated, optionally time-stretched to the target duration and
    optionally loudness-normalised.

    ``preset_name`` is accepted only because the UI passes it; the sampling
    values arrive through the individual slider arguments.

    Returns:
        ``(output_wav_path, preprocessed_text, info_message)``.

    Raises:
        gr.Error: on empty text or description, or when the loaded model is
            not a VoiceDesign model.
    """
    if not testo or not testo.strip():
        raise gr.Error('Inserisci un testo.')
    if not voice_desc or not voice_desc.strip():
        raise gr.Error('Inserisci una descrizione.')
    if MODEL_TYPE != 'design':
        raise gr.Error(f'Modello attivo ({MODEL_ID}) non è VoiceDesign.')
    t0 = time.time()
    # A cleared gr.Number field arrives as None; treat it as 0 (no target)
    # instead of crashing in float(None).
    target_totale = float(target_min or 0) * 60 + float(target_sec or 0)
    testo_proc = preprocessa_testo_it(testo)
    gen_kwargs = build_gen_kwargs(max_tok, top_k, top_p, temperature, rep_pen)
    chunks = split_testo_chunked(testo_proc) if (use_chunking and len(testo_proc) > 400) else [testo_proc]
    all_wavs = []
    for chunk in chunks:
        # NOTE(review): keyword names are passed as written here; confirm them
        # against the installed qwen_tts version.
        wavs, sr = model.generate_voice_design(text=chunk, language='Italian', voice_description=voice_desc, **gen_kwargs)
        all_wavs.append(wavs[0])
    final_wav = np.concatenate(all_wavs) if len(all_wavs) > 1 else all_wavs[0]
    out_path = '/content/output_qwen3_design.wav'
    sf.write(out_path, final_wav, sr)
    stretch = 1.0
    if target_totale > 0:
        out_path, stretch, _ = adatta_durata(out_path, target_totale)
    if do_postprocess:
        postprocessa_output(out_path)
    dur = durata_wav(out_path)
    return out_path, testo_proc, f'✅ {time.time()-t0:.1f}s | {len(chunks)} chunk(s) | {dur:.1f}s | stretch {stretch:.3f}x'

# ─────────────────────────────────────────────
# (8c) Custom Voice
# ─────────────────────────────────────────────
# Voice names offered in the dropdown.
# NOTE(review): confirm these match the speakers shipped with the CustomVoice model.
CUSTOM_VOICES = ['Chelsie','Ethan','Emma','Dylan','Chloe','Aria','Marcus','Zara','Leo']

def genera_custom_voice(testo, voice_dd, voice_manual, do_postprocess, use_chunking,
                        preset_name, temperature, top_p, top_k, rep_pen, max_tok,
                        target_min, target_sec):
    """Gradio handler: synthesise ``testo`` with one of the model's built-in voices.

    A non-blank ``voice_manual`` overrides the dropdown choice ``voice_dd``.
    The text is preprocessed, optionally chunked, generated chunk by chunk,
    concatenated, optionally time-stretched to the target duration and
    optionally loudness-normalised.

    ``preset_name`` is accepted only because the UI passes it; the sampling
    values arrive through the individual slider arguments.

    Returns:
        ``(output_wav_path, preprocessed_text, info_message)``.

    Raises:
        gr.Error: on empty text, or when the loaded model is not a
            CustomVoice model.
    """
    if not testo or not testo.strip():
        raise gr.Error('Inserisci un testo.')
    if MODEL_TYPE != 'custom':
        raise gr.Error(f'Modello attivo ({MODEL_ID}) non è CustomVoice.')
    # Tolerate a missing (None) manual name instead of failing on .strip().
    manual_name = (voice_manual or '').strip()
    voice_name = manual_name if manual_name else voice_dd
    t0 = time.time()
    # A cleared gr.Number field arrives as None; treat it as 0 (no target)
    # instead of crashing in float(None).
    target_totale = float(target_min or 0) * 60 + float(target_sec or 0)
    testo_proc = preprocessa_testo_it(testo)
    gen_kwargs = build_gen_kwargs(max_tok, top_k, top_p, temperature, rep_pen)
    chunks = split_testo_chunked(testo_proc) if (use_chunking and len(testo_proc) > 400) else [testo_proc]
    all_wavs = []
    for chunk in chunks:
        # NOTE(review): method and keyword names are used as written here;
        # confirm them against the installed qwen_tts version.
        wavs, sr = model.generate(text=chunk, language='Italian', voice_name=voice_name, **gen_kwargs)
        all_wavs.append(wavs[0])
    final_wav = np.concatenate(all_wavs) if len(all_wavs) > 1 else all_wavs[0]
    out_path = '/content/output_qwen3_custom.wav'
    sf.write(out_path, final_wav, sr)
    stretch = 1.0
    if target_totale > 0:
        out_path, stretch, _ = adatta_durata(out_path, target_totale)
    if do_postprocess:
        postprocessa_output(out_path)
    dur = durata_wav(out_path)
    return out_path, testo_proc, f'✅ {time.time()-t0:.1f}s | {voice_name} | {len(chunks)} chunk(s) | {dur:.1f}s'

# ─────────────────────────────────────────────
# (9) Gradio UI v3 — 3 tab
# ─────────────────────────────────────────────
def _shared_controls(suffix):
    """Create the controls every tab shares and return them.

    Must be called inside an active Gradio layout context: the components
    are created in the order they should appear. ``suffix`` is currently
    unused by the body.

    Returns:
        ``(do_pp, chunking, preset, t_min, t_sec, temp, tp, tk, rp, mt)``.
    """
    do_pp    = gr.Checkbox(label='🎚️ Normalizza output (-18 LUFS)', value=True)
    chunking = gr.Checkbox(label='✂️ Chunking testi lunghi (>400 char)', value=True)
    preset   = gr.Dropdown(label='🎛️ Preset IT', choices=list(PRESET.keys()),
                           value='🇮🇹 IT – Naturale YouTube (consigliato)')
    gr.Markdown('### 🎬 Target durata (opzionale)')
    with gr.Row():
        # Both at 0 means "no target duration" for the generation handlers.
        t_min = gr.Number(label='Min', value=0, minimum=0, maximum=60, step=1)
        t_sec = gr.Number(label='Sec', value=0, minimum=0, maximum=59, step=1)
    with gr.Accordion('⚙️ Avanzate', open=False):
        temp = gr.Slider(label='temperature', minimum=0.3, maximum=1.3, value=0.85, step=0.05)
        tp   = gr.Slider(label='top_p', minimum=0.5, maximum=1.0, value=0.95, step=0.05)
        tk   = gr.Slider(label='top_k', minimum=1, maximum=200, value=50, step=1)
        rp   = gr.Slider(label='repetition_penalty', minimum=1.0, maximum=1.3, value=1.06, step=0.01)
        mt   = gr.Slider(label='max_new_tokens', minimum=512, maximum=8192, value=3072, step=256)
    # Selecting a preset overwrites the five advanced sliders with its values.
    preset.change(fn=applica_preset, inputs=preset, outputs=[temp, tp, tk, rp, mt])
    return do_pp, chunking, preset, t_min, t_sec, temp, tp, tk, rp, mt

# Build the three-tab interface. Each tab has its own copy of the shared
# controls and a button wired to the matching generation handler.
with gr.Blocks(title='Qwen3-TTS Pro IT v3') as ui:
    gr.Markdown('# 🎤 Qwen3‑TTS — Pro Italiano v3')
    # Header showing which model is loaded (values fixed at build time).
    gr.Markdown(
        f'**Modello:** `{MODEL_ID}` | **Tipo:** `{MODEL_TYPE}` | **SR:** {SAMPLE_RATE} Hz\n\n'
        '**Fix v2:** bfloat16 · 24kHz · pyrubberband · denoise 0.35 | '
        '**Nuovo v3:** Whisper small · SNR ref · Chunking · VoiceDesign · CustomVoice'
    )
    with gr.Tabs():
        # ── Tab 1: voice cloning from a reference clip ──
        with gr.TabItem('🎤 Voice Clone'):
            gr.Markdown('Clona una voce da audio di riferimento. Richiede modello `*-Base`.')
            with gr.Row():
                with gr.Column(scale=1):
                    ref_audio  = gr.Audio(label='🎤 Reference (WAV/MP3)', type='filepath', sources=['upload'])
                    ref_text_m = gr.Textbox(label='📝 Trascrizione reference', placeholder='Opzionale ma consigliato', lines=2)
                    auto_ref   = gr.Checkbox(label='🧠 Auto‑trascrivi con Whisper', value=True)
                    wh_size    = gr.Dropdown(label='Whisper model', choices=['tiny','base','small'], value='small')
                    xvec       = gr.Checkbox(label='⚡ x_vector_only_mode', value=False)
                    cache      = gr.Checkbox(label='⚡ Cache voice_clone_prompt', value=True)
                    den_ref    = gr.Checkbox(label='🔇 Denoise reference', value=False)
                    do_pp_vc, chunking_vc, preset_vc, tmin_vc, tsec_vc, temp_vc, tp_vc, tk_vc, rp_vc, mt_vc = _shared_controls('vc')
                with gr.Column(scale=1):
                    testo_vc = gr.Textbox(label='📄 Testo Italiano', lines=12, placeholder='Incolla il tuo voiceover…')
                    btn_vc   = gr.Button('🚀 Genera Voice Clone', variant='primary')
                    audio_vc = gr.Audio(label='🔊 Output', type='filepath')
                    proc_vc  = gr.Textbox(label='Testo preprocessato', lines=3)
                    rtext_vc = gr.Textbox(label='ref_text usato', lines=2)
                    info_vc  = gr.Textbox(label='Info', lines=3)
            # The input order must match genera_voice_clone's parameter order.
            btn_vc.click(fn=genera_voice_clone,
                inputs=[testo_vc, ref_audio, ref_text_m, auto_ref, wh_size, xvec, cache, den_ref,
                        do_pp_vc, chunking_vc, preset_vc, temp_vc, tp_vc, tk_vc, rp_vc, mt_vc, tmin_vc, tsec_vc],
                outputs=[audio_vc, proc_vc, rtext_vc, info_vc])

        # ── Tab 2: voice built from a text description ──
        with gr.TabItem('🎨 Voice Design'):
            gr.Markdown('Crea voce da descrizione testuale. Richiede `*-VoiceDesign`.')
            with gr.Row():
                with gr.Column(scale=1):
                    vdesc = gr.Textbox(label='🎨 Descrizione voce',
                        placeholder='Es: Voce femminile italiana, calda, ideale per documentari.', lines=4)
                    do_pp_vd, chunking_vd, preset_vd, tmin_vd, tsec_vd, temp_vd, tp_vd, tk_vd, rp_vd, mt_vd = _shared_controls('vd')
                with gr.Column(scale=1):
                    testo_vd = gr.Textbox(label='📄 Testo Italiano', lines=12)
                    btn_vd   = gr.Button('🎨 Genera Voice Design', variant='primary')
                    audio_vd = gr.Audio(label='🔊 Output', type='filepath')
                    proc_vd  = gr.Textbox(label='Testo preprocessato', lines=3)
                    info_vd  = gr.Textbox(label='Info', lines=2)
            # The input order must match genera_voice_design's parameter order.
            btn_vd.click(fn=genera_voice_design,
                inputs=[testo_vd, vdesc, do_pp_vd, chunking_vd, preset_vd,
                        temp_vd, tp_vd, tk_vd, rp_vd, mt_vd, tmin_vd, tsec_vd],
                outputs=[audio_vd, proc_vd, info_vd])

        # ── Tab 3: built-in voices ──
        with gr.TabItem('👤 Custom Voice'):
            gr.Markdown('9 voci predefinite Qwen. Richiede `*-CustomVoice`.')
            with gr.Row():
                with gr.Column(scale=1):
                    vdd  = gr.Dropdown(label='👤 Voce predefinita', choices=CUSTOM_VOICES, value='Chelsie')
                    vman = gr.Textbox(label='✏️ Nome voce manuale (sovrascrive dropdown)', lines=1, placeholder='Lascia vuoto per dropdown')
                    do_pp_cv, chunking_cv, preset_cv, tmin_cv, tsec_cv, temp_cv, tp_cv, tk_cv, rp_cv, mt_cv = _shared_controls('cv')
                with gr.Column(scale=1):
                    testo_cv = gr.Textbox(label='📄 Testo Italiano', lines=12)
                    btn_cv   = gr.Button('👤 Genera Custom Voice', variant='primary')
                    audio_cv = gr.Audio(label='🔊 Output', type='filepath')
                    proc_cv  = gr.Textbox(label='Testo preprocessato', lines=3)
                    info_cv  = gr.Textbox(label='Info', lines=2)
            # The input order must match genera_custom_voice's parameter order.
            btn_cv.click(fn=genera_custom_voice,
                inputs=[testo_cv, vdd, vman, do_pp_cv, chunking_cv, preset_cv,
                        temp_cv, tp_cv, tk_cv, rp_cv, mt_cv, tmin_cv, tsec_cv],
                outputs=[audio_cv, proc_cv, info_cv])

# Enable the request queue with a default concurrency limit of 1, then launch
# with a public share link.
ui.queue(default_concurrency_limit=1)
ui.launch(share=True)
