# VieNeu TTS Demo

This notebook demonstrates text-to-speech synthesis using VieNeu SDK with:
1. Preset voices
2. Custom voice cloning
3. Batch speech synthesis

## New: All-in-One Function
All functionality has been combined into a single `run_vieneu_tts_batch()` function below.

In [1]:
# All-in-One VieNeu TTS Function
import sys
import datetime
import soundfile as sf
from pathlib import Path
from vieneu import Vieneu


def run_vieneu_tts_batch(
    text_samples=None,
    output_base_dir="/home/lamquy/Project/TTS/results/VieNeu-TTS",
    notebook_dir="/home/lamquy/Project/TTS/VieNeu-TTS",
    preset_voice="Binh",
    use_custom_voice=False,
    sample_audio_path=None,
    sample_audio_text_path=None,
    custom_voice_name="MyCustomVoice",
    temperature=1.0,
    top_k=50,
    sample_rate=24000
):
    """
    Run batch TTS synthesis with the VieNeu SDK and write one WAV file per text.

    Args:
        text_samples (list[str] | None): Texts to synthesize. If None, TEXT_SAMPLES
            is imported from text_sample.py.
        output_base_dir (str | Path): Base directory; a timestamped sub-directory
            is created inside it for this run.
        notebook_dir (str | Path): Directory holding the default reference clip
            (examples/audio_ref/example.wav).
        preset_voice (str): Name of the preset voice to select (e.g. "Binh").
        use_custom_voice (bool): If True, clone a voice from the sample audio and
            synthesize with it; falls back to the preset voice when the sample
            audio file does not exist.
        sample_audio_path (str | Path | None): Reference audio for voice cloning.
        sample_audio_text_path (str | Path | None): Text file containing the
            transcript of the reference audio. If None, a built-in default
            transcript is used.
        custom_voice_name (str): Name under which the cloned voice is saved.
        temperature (float): Sampling temperature passed to `tts.infer`.
        top_k (int): Top-k sampling parameter passed to `tts.infer`.
        sample_rate (int): Sample rate used when writing the WAV files.

    Returns:
        list[Path]: Paths of the generated audio files, in input order.
    """
    # ====== SETUP & CONFIGURATION ======
    notebook_dir = Path(notebook_dir)
    output_base_dir = Path(output_base_dir)

    # Import text samples if not provided
    if text_samples is None:
        sys.path.insert(0, '/home/lamquy/Project/TTS')
        from text_sample import TEXT_SAMPLES
        text_samples = TEXT_SAMPLES

    # Set sample audio path defaults
    if sample_audio_path is None:
        sample_audio_path = notebook_dir / "examples" / "audio_ref" / "example.wav"
    else:
        sample_audio_path = Path(sample_audio_path)

    # Resolve the reference transcript. The None check must come BEFORE opening the
    # file: open(None) raises TypeError, which previously made the default
    # (sample_audio_text_path=None) unusable.
    if sample_audio_text_path is None:
        sample_audio_text = "v√≠ d·ª• 2. t√≠nh trung b√¨nh c·ªßa d√£y s·ªë."
    else:
        # Explicit UTF-8 so the transcript does not depend on the platform locale
        with open(sample_audio_text_path, "r", encoding="utf-8") as f:
            sample_audio_text = f.read()

    # ====== INITIALIZE VieNeu SDK ======
    print("üöÄ Initializing VieNeu SDK...")
    tts = Vieneu()
    print("‚úÖ SDK initialized successfully")

    try:
        # ====== SELECT VOICE ======
        # List all available preset voices
        available_voices = tts.list_preset_voices()
        print("üìã Available preset voices:", available_voices)

        # Select a preset voice (also the fallback when cloning is skipped)
        current_voice = tts.get_preset_voice(preset_voice)
        print(f"‚úÖ Selected voice: {preset_voice}")

        # ====== CLONE CUSTOM VOICE (OPTIONAL) ======
        if use_custom_voice:
            # Check if sample audio exists
            if sample_audio_path.exists():
                print(f"üéôÔ∏è Cloning voice from: {sample_audio_path.name}")

                # Clone voice and save with custom name
                custom_voice = tts.clone_voice(
                    audio_path=sample_audio_path,
                    text=sample_audio_text,
                    name=custom_voice_name
                )

                print(f"‚úÖ Voice cloned and saved as: '{custom_voice_name}'")

                # Switch to the new custom voice
                current_voice = custom_voice

                # Verify it was added to the voice list
                print("üìã Updated voice list:", tts.list_preset_voices())
            else:
                print(f"‚ö†Ô∏è Sample audio not found at: {sample_audio_path}")
                print("   Continuing with preset voice...")

        # ====== BATCH SPEECH SYNTHESIS ======
        # Create output directory with timestamp
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        output_dir = output_base_dir / timestamp
        output_dir.mkdir(parents=True, exist_ok=True)

        print(f"üìÅ Output directory: {output_dir}")
        print(f"üìù Total samples to process: {len(text_samples)}")
        output = []
        # Process each text sample
        for idx, text in enumerate(text_samples, start=1):
            # Show progress
            print(f"\nüéß Sample {idx}/{len(text_samples)}: {text[:50]}...")

            # Generate audio
            audio = tts.infer(
                text=text,
                voice=current_voice,
                temperature=temperature,
                top_k=top_k
            )

            # Save to file
            output_file = output_dir / f"sample_{idx}.wav"
            output.append(output_file)
            sf.write(str(output_file), audio, sample_rate)
            print(f"   üíæ Saved: {output_file}")

        print("\n‚úÖ All samples processed successfully!")

        return output

    finally:
        # ====== CLEANUP ======
        # Always release the TTS engine, even if synthesis failed part-way
        tts.close()
        print("‚úÖ TTS engine closed")

  from .autonotebook import tqdm as notebook_tqdm
Skipping import of cpp extensions due to incompatible torch version 2.7.1+cu118 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info


In [2]:
def compute_wer_and_plot(
    ref_text,
    audio_path,
    asr_model_name="large",
    language="vi"
):
    """
    Transcribe an audio file with Whisper, score the transcript against a
    reference text, and draw a bar chart of the word-level error breakdown.

    ref_text: str - reference text
    audio_path: str - path to the audio file
    asr_model_name: str - model name handed to whisper.load_model
    language: str - language handed to model.transcribe
    return: dict with keys "reference", "hypothesis", "WER",
            "error_breakdown" and "figure"
    """
    # Speech recognition: audio -> raw hypothesis text (path is stringified first)
    asr = whisper.load_model(asr_model_name)
    transcript = asr.transcribe(str(audio_path), language=language)["text"]

    # Identical normalization on both sides: lower-case and trim whitespace
    ref_text = ref_text.lower().strip()
    hyp_text = transcript.strip().lower().strip()

    # Overall score, then the per-category counts behind it
    wer_score = wer(ref_text, hyp_text)
    alignment = process_words(ref_text, hyp_text)
    error_counts = dict(zip(
        ("Correct", "Substitution", "Deletion", "Insertion"),
        (
            alignment.hits,
            alignment.substitutions,
            alignment.deletions,
            alignment.insertions,
        ),
    ))

    # One bar per category
    fig, ax = plt.subplots()
    ax.bar(error_counts.keys(), error_counts.values())
    ax.set_title("Word Error Rate Breakdown")
    ax.set_ylabel("Count")
    ax.set_xlabel("Type")
    plt.tight_layout()

    summary = {
        "reference": ref_text,
        "hypothesis": hyp_text,
        "WER": wer_score,
        "error_breakdown": error_counts,
        "figure": fig,
    }
    return summary

## Usage Examples

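Run the cells below to install and import the evaluation dependencies. The batch TTS call itself appears further down; as written it clones a custom voice (`use_custom_voice=True`) rather than using only the default preset voice.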

In [3]:
!pip install openai-whisper matplotlib jiwer




In [4]:
import sys
import datetime
import soundfile as sf
from pathlib import Path
from vieneu import Vieneu
import whisper
import matplotlib.pyplot as plt
from jiwer import wer, process_words

In [5]:
# Re-import whisper from scratch: evict any cached module entry first so the
# import statement below has to resolve the package again.
import importlib
import sys

sys.modules.pop('whisper', None)
import whisper

# Report what was actually imported (version string, presence of load_model)
print(f"Whisper version: {whisper.__version__}")
print(f"Has load_model: {hasattr(whisper, 'load_model')}")

Whisper version: 20250625
Has load_model: True


In [1]:
import json
import datetime
from pathlib import Path

import whisper
import matplotlib.pyplot as plt
import torch
import torchaudio
import torch.nn.functional as F

from jiwer import wer, cer, process_words, process_characters
from speechbrain.inference.speaker import SpeakerRecognition

TARGET_SR = 16000

def _load_wav_mono_resample(path: str, target_sr: int = TARGET_SR) -> torch.Tensor:
    wav, sr = torchaudio.load(path)  # [C, T]
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)   # mono
    if sr != target_sr:
        wav = torchaudio.functional.resample(wav, sr, target_sr)
    return wav.float()  # [1, T]

class SpeakerSimECAPA:
    """
    Lazy-load ECAPA model 1 l·∫ßn, d√πng l·∫°i nhi·ªÅu l·∫ßn (ƒë·ª° t·∫£i l·∫°i m·ªói file).
    """
    def __init__(self, device: str | None = None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model = SpeakerRecognition.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb",
            run_opts={"device": self.device},
        )

    @torch.inference_mode()
    def cosine_similarity(self, ref_audio_path: str, gen_audio_path: str) -> float:
        ref = _load_wav_mono_resample(ref_audio_path).squeeze(0).unsqueeze(0).to(self.device)  # [1,T]
        gen = _load_wav_mono_resample(gen_audio_path).squeeze(0).unsqueeze(0).to(self.device)

        emb_ref = self.model.encode_batch(ref).squeeze()
        emb_gen = self.model.encode_batch(gen).squeeze()

        sim = F.cosine_similarity(emb_ref, emb_gen, dim=0).item()
        return float(sim)

# Module-level slot for the shared SpeakerSimECAPA instance. It starts empty so
# that nothing is loaded at import time; get_speaker_sim_model() fills it on demand.
spk_sim = None

def get_speaker_sim_model():
    """Return the shared SpeakerSimECAPA instance, creating it on the first call."""
    global spk_sim
    if spk_sim is not None:
        return spk_sim
    spk_sim = SpeakerSimECAPA()
    return spk_sim

def metric_calculate(
    audio_path,
    ref_text,
    model_name,
    asr_model_name="large",
    language="vi",
    result_dir="../results",
    ref_audio_path=None,         # reference (original speaker) audio for speaker similarity
    speaker_sim_model=None,      # optional SpeakerSimECAPA instance to reuse across calls
):
    """
    Transcribe a generated audio file with Whisper, compute WER and CER against
    `ref_text` (plus speaker similarity when `ref_audio_path` is given), and
    save everything to a JSON file inside `result_dir`.

    Returns:
        str: path of the written JSON file,
             "<result_dir>/<model_name>_<audio file stem>_metrics.json".
    """
    def _tally(alignment):
        """Pull the hit/substitution/deletion/insertion counts off a jiwer result."""
        return {
            "Correct": alignment.hits,
            "Substitution": alignment.substitutions,
            "Deletion": alignment.deletions,
            "Insertion": alignment.insertions,
        }

    # Speech recognition (the Whisper model is loaded on every call)
    asr = whisper.load_model(asr_model_name)
    transcription = asr.transcribe(str(audio_path), language=language)
    hyp_text = transcription["text"].strip()

    # Scores are computed on lower-cased, trimmed text; the JSON keeps the raw strings
    ref_norm = ref_text.lower().strip()
    hyp_norm = hyp_text.lower().strip()

    wer_score = wer(ref_norm, hyp_norm)
    cer_score = cer(ref_norm, hyp_norm)
    word_error_counts = _tally(process_words(ref_norm, hyp_norm))
    char_error_counts = _tally(process_characters(ref_norm, hyp_norm))

    # Speaker similarity is only computed when a reference recording is supplied
    if ref_audio_path is None:
        speaker_similarity = None
        speaker_ref_audio = None
    else:
        scorer = speaker_sim_model if speaker_sim_model is not None else get_speaker_sim_model()
        speaker_similarity = scorer.cosine_similarity(str(ref_audio_path), str(audio_path))
        speaker_ref_audio = str(ref_audio_path)

    # Output location; the file name has no timestamp, so a repeated run for the
    # same model/audio pair overwrites the previous file
    out_dir = Path(result_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    json_path = out_dir / f"{model_name}_{Path(audio_path).stem}_metrics.json"

    report = {
        "model_name": model_name,
        "audio_file": str(audio_path),
        "reference": ref_text,
        "hypothesis": hyp_text,
        "WER": wer_score,
        "CER": cer_score,
        "word_error_breakdown": word_error_counts,
        "char_error_breakdown": char_error_counts,
        "speaker_ref_audio": speaker_ref_audio,
        "speaker_similarity_cosine": speaker_similarity,
        "asr_model": asr_model_name,
        "timestamp": datetime.datetime.now().strftime('%Y%m%d_%H%M%S'),
    }

    with open(json_path, 'w', encoding='utf-8') as handle:
        json.dump(report, handle, ensure_ascii=False, indent=2)

    print(f"üíæ Saved metrics (WER+CER+SIM) to: {json_path}")
    return str(json_path)

def _plot_percent_bars(ax, model_names, scores, colors, ylabel, title):
    """Draw one percentage bar chart on `ax` and write each value above its bar."""
    bars = ax.bar(model_names, scores, color=colors)
    ax.set_xlabel("Model TTS")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    # 20% headroom above the tallest bar. Fall back to a fixed range when there
    # are no scores or every score is 0, so the limits never collapse to (0, 0).
    tallest = max(scores, default=0)
    ax.set_ylim(0, tallest * 1.2 if tallest > 0 else 10)
    for bar, score in zip(bars, scores):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                f'{score:.2f}%', ha='center', va='bottom', fontsize=10)


def plot_comparison(json_paths, result_dir="../results", chart_filename=None):
    """
    Plot a side-by-side comparison of WER, CER and speaker similarity for several
    metrics JSON files and save the chart as an image.

    Args:
        json_paths: iterable of paths to metrics JSON files (as written by
            metric_calculate).
        result_dir: directory the chart is written to (created if missing).
        chart_filename: file name of the chart; defaults to
            "metrics_comparison.png".

    Returns:
        str: path of the saved chart.
    """
    # Load all JSON data
    all_results = []
    for json_path in json_paths:
        with open(json_path, 'r', encoding='utf-8') as f:
            all_results.append(json.load(f))

    model_names = [r.get("model_name", "Unknown") for r in all_results]
    # `or 0` also covers an explicit null in the JSON, not just a missing key
    wer_scores = [(r.get("WER") or 0) * 100 for r in all_results]  # %
    cer_scores = [(r.get("CER") or 0) * 100 for r in all_results]  # %
    sim_scores = [r.get("speaker_similarity_cosine") for r in all_results]  # float or None

    # Create figure with 3 subplots
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
    colors = plt.cm.Set2(range(len(model_names)))

    # Plots 1 and 2: WER and CER share the same layout
    _plot_percent_bars(ax1, model_names, wer_scores, colors,
                       "WER (%)", "So s√°nh Word Error Rate (WER)")
    _plot_percent_bars(ax2, model_names, cer_scores, colors,
                       "CER (%)", "So s√°nh Character Error Rate (CER)")

    # Plot 3: Speaker Similarity
    # A missing similarity (None) is drawn as a 0-height bar and labelled "N/A"
    sim_plot_vals = [(s if s is not None else 0.0) for s in sim_scores]
    bars3 = ax3.bar(model_names, sim_plot_vals, color=colors)
    ax3.set_xlabel("Model TTS")
    ax3.set_ylabel("Cosine Similarity")
    ax3.set_title("So s√°nh Speaker Similarity (Voice Cloning)")
    ax3.set_ylim(0, 1.0)

    for bar, s in zip(bars3, sim_scores):
        label = f"{s:.3f}" if isinstance(s, (float, int)) else "N/A"
        ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                 label, ha='center', va='bottom', fontsize=10)

    fig.tight_layout()

    # Save figure
    result_path = Path(result_dir)
    result_path.mkdir(parents=True, exist_ok=True)

    if chart_filename is None:
        chart_filename = "metrics_comparison.png"

    chart_path = result_path / chart_filename
    # Save and close this specific figure rather than whatever pyplot considers current
    fig.savefig(chart_path, dpi=150, bbox_inches='tight')
    plt.close(fig)

    print(f"üìä Saved comparison chart to: {chart_path}")
    return str(chart_path)

def load_wer_results(json_path):
    """
    Read previously saved metric results from a JSON file.

    Args:
        json_path: str - path to the JSON file

    Returns:
        dict - the saved results, as parsed from the file
    """
    with open(json_path, encoding='utf-8') as handle:
        saved_results = json.load(handle)
    return saved_results




# Example run: transcribe one generated clip, score it against its reference text,
# compare it with the reference speaker recording, and write the metrics JSON
# under ../results/json. The commented-out audio_path lines are alternative clips
# from other TTS models.
metric_calculate(
    audio_path="/home/lamquy/Project/TTS/results/F5-TTS-Vietnamese/20260116_143338/sample_1.wav", 
    # audio_path="/home/lamquy/Project/TTS/results/XTTSv2/20260116_143925.wav", 
    # audio_path="/home/lamquy/Project/TTS/results/VieNeu-TTS/20260116_142158/sample_1.wav", 
    ref_audio_path="/home/lamquy/Project/TTS/F5-TTS-Vietnamese/samples/khoi/khoi.wav",
    ref_text="H√† N·ªôi, tr√°i tim c·ªßa Vi·ªát Nam, l√† m·ªôt th√†nh ph·ªë ng√†n nƒÉm vƒÉn hi·∫øn v·ªõi b·ªÅ d√†y l·ªãch s·ª≠ v√† vƒÉn h√≥a ƒë·ªôc ƒë√°o. B∆∞·ªõc ch√¢n tr√™n nh·ªØng con ph·ªë c·ªï k√≠nh quanh H·ªì Ho√†n Ki·∫øm, du kh√°ch nh∆∞ ƒë∆∞·ª£c du h√†nh ng∆∞·ª£c th·ªùi gian, chi√™m ng∆∞·ª°ng ki·∫øn tr√∫c Ph√°p c·ªï ƒëi·ªÉn h√≤a quy·ªán v·ªõi n√©t ki·∫øn tr√∫c truy·ªÅn th·ªëng Vi·ªát Nam. M·ªói con ph·ªë trong khu ph·ªë c·ªï mang m·ªôt t√™n g·ªçi ƒë·∫∑c tr∆∞ng, ph·∫£n √°nh ngh·ªÅ th·ªß c√¥ng truy·ªÅn th·ªëng t·ª´ng th·ªãnh h√†nh n∆°i ƒë√¢y nh∆∞ ph·ªë H√†ng B·∫°c, H√†ng ƒê√†o, H√†ng M√£. ·∫®m th·ª±c H√† N·ªôi c≈©ng l√† m·ªôt ƒëi·ªÉm nh·∫•n ƒë·∫∑c bi·ªát, t·ª´ t√¥ ph·ªü n√≥ng h·ªïi bu·ªïi s√°ng, b√∫n ch·∫£ th∆°m l·ª´ng tr∆∞a h√®, ƒë·∫øn ch√® Th√°i ng·ªçt ng√†o chi·ªÅu thu. Nh·ªØng m√≥n ƒÉn d√¢n d√£ n√†y ƒë√£ tr·ªü th√†nh bi·ªÉu t∆∞·ª£ng c·ªßa vƒÉn h√≥a ·∫©m th·ª±c Vi·ªát, ƒë∆∞·ª£c c·∫£ th·∫ø gi·ªõi y√™u m·∫øn. Ng∆∞·ªùi H√† N·ªôi n·ªïi ti·∫øng v·ªõi t√≠nh c√°ch hi·ªÅn h√≤a, l·ªãch thi·ªáp nh∆∞ng c≈©ng r·∫•t c·∫ßu to√†n trong t·ª´ng chi ti·∫øt nh·ªè, t·ª´ c√°ch pha tr√† sen cho ƒë·∫øn c√°ch ch·ªçn hoa sen t√¢y ƒë·ªÉ th∆∞·ªüng tr√†.", 
    model_name="F5-TTS-Vietnamese", 
    asr_model_name="large", 
    language="vi", 
    result_dir="../results/json"
)

# plot_comparison([
#     "/home/lamquy/Project/TTS/results/json/F5-TTS_sample_1_metrics.json",
#     "/home/lamquy/Project/TTS/results/json/VieNeu-TTS_sample_2_metrics.json",
#     "/home/lamquy/Project/TTS/results/json/XTTSv2_20260116_114606_metrics.json"
# ])

ModuleNotFoundError: No module named 'speechbrain'

In [8]:
# Texts to synthesize. Only the final long paragraph is currently enabled; the
# commented-out entries are alternative test sentences grouped by what they exercise.
TEXT_SAMPLES = [
    # Vietnamese alphabet
    # "A ƒÇ √Ç B C D ƒê E √ä G H I K L M N O √î ∆† P Q R S T U ∆Ø V X Y",
    
    # # Long paragraph - stability test
    # "Ti·∫øng Vi·ªát l√† ng√¥n ng·ªØ gi√†u thanh ƒëi·ªáu v√† h√¨nh ·∫£nh, ph·∫£n √°nh ƒë·ªùi s·ªëng tinh t·∫ø c·ªßa con ng∆∞·ªùi Vi·ªát Nam. Trong giao ti·∫øp h·∫±ng ng√†y, ch√∫ng ta s·ª≠ d·ª•ng ti·∫øng Vi·ªát ƒë·ªÉ chia s·∫ª c·∫£m x√∫c, truy·ªÅn ƒë·∫°t th√¥ng tin v√† k·∫øt n·ªëi c·ªông ƒë·ªìng. Ng√¥n ng·ªØ n√†y kh√¥ng ch·ªâ c√≥ t·ª´ v·ª±ng phong ph√∫ m√† c√≤n c√≥ h·ªá th·ªëng d·∫•u thanh ƒë·∫∑c tr∆∞ng, gi√∫p c√¢u n√≥i tr·ªü n√™n sinh ƒë·ªông, r√µ nghƒ©a v√† c·∫£m x√∫c. Vi·ªác b·∫£o t·ªìn v√† ph√°t tri·ªÉn ti·∫øng Vi·ªát l√† tr√°ch nhi·ªám chung c·ªßa x√£ h·ªôi trong th·ªùi ƒë·∫°i s·ªë h√≥a nay.",
    
    # # Proper names and place names
    # "Nguy·ªÖn √Åi Qu·ªëc ƒë√£ t·ª´ng vi·∫øt v·ªÅ nh·ªØng chuy·∫øn chu du d√†i d·∫±ng d·∫∑c qua ch√¢u √Çu gi·ªØa m√πa ƒë√¥ng r√©t m∆∞·ªõt.",
    
    # # Difficult consonants (ch, tr, s, x)
    # "Ch·ªã Tr√∫c nh·∫∑t nh·∫°nh t·ª´ng chi·∫øc ch√©n s·ª© s·ª©t s·∫πo tr√™n chi·∫øc ch√µng tre tr∆∞·ªõc hi√™n nh√†.",
    
    # # Short sentences
    # "ƒê√¢y l√† ch·ªØ g",
    # "N·∫øu b·∫°n kh√¥ng bi·∫øt m√¨nh ƒëang ·ªü ƒë√¢u, th√¨ b·∫•t c·ª© con ƒë∆∞·ªùng n√†o c≈©ng s·∫Ω d·∫´n b·∫°n ƒë·∫øn ƒë√≥.",
    # "M·ªçi th·ª© ƒë√£ h√≥a ƒëi√™n v·ªõi ch·ªìng ng√†y h√¥m nay",
    "H√† N·ªôi, tr√°i tim c·ªßa Vi·ªát Nam, l√† m·ªôt th√†nh ph·ªë ng√†n nƒÉm vƒÉn hi·∫øn v·ªõi b·ªÅ d√†y l·ªãch s·ª≠ v√† vƒÉn h√≥a ƒë·ªôc ƒë√°o. B∆∞·ªõc ch√¢n tr√™n nh·ªØng con ph·ªë c·ªï k√≠nh quanh H·ªì Ho√†n Ki·∫øm, du kh√°ch nh∆∞ ƒë∆∞·ª£c du h√†nh ng∆∞·ª£c th·ªùi gian, chi√™m ng∆∞·ª°ng ki·∫øn tr√∫c Ph√°p c·ªï ƒëi·ªÉn h√≤a quy·ªán v·ªõi n√©t ki·∫øn tr√∫c truy·ªÅn th·ªëng Vi·ªát Nam. M·ªói con ph·ªë trong khu ph·ªë c·ªï mang m·ªôt t√™n g·ªçi ƒë·∫∑c tr∆∞ng, ph·∫£n √°nh ngh·ªÅ th·ªß c√¥ng truy·ªÅn th·ªëng t·ª´ng th·ªãnh h√†nh n∆°i ƒë√¢y nh∆∞ ph·ªë H√†ng B·∫°c, H√†ng ƒê√†o, H√†ng M√£. ·∫®m th·ª±c H√† N·ªôi c≈©ng l√† m·ªôt ƒëi·ªÉm nh·∫•n ƒë·∫∑c bi·ªát, t·ª´ t√¥ ph·ªü n√≥ng h·ªïi bu·ªïi s√°ng, b√∫n ch·∫£ th∆°m l·ª´ng tr∆∞a h√®, ƒë·∫øn ch√® Th√°i ng·ªçt ng√†o chi·ªÅu thu. Nh·ªØng m√≥n ƒÉn d√¢n d√£ n√†y ƒë√£ tr·ªü th√†nh bi·ªÉu t∆∞·ª£ng c·ªßa vƒÉn h√≥a ·∫©m th·ª±c Vi·ªát, ƒë∆∞·ª£c c·∫£ th·∫ø gi·ªõi y√™u m·∫øn. Ng∆∞·ªùi H√† N·ªôi n·ªïi ti·∫øng v·ªõi t√≠nh c√°ch hi·ªÅn h√≤a, l·ªãch thi·ªáp nh∆∞ng c≈©ng r·∫•t c·∫ßu to√†n trong t·ª´ng chi ti·∫øt nh·ªè, t·ª´ c√°ch pha tr√† sen cho ƒë·∫øn c√°ch ch·ªçn hoa sen t√¢y ƒë·ªÉ th∆∞·ªüng tr√†.",
    

]

"""
    Run batch TTS synthesis using VieNeu SDK.
    
    Args:
        text_samples (list): List of text strings to synthesize. If None, will import from text_sample.py
        output_base_dir (str|Path): Base directory for output files; a timestamped sub-directory is created per run
        notebook_dir (str|Path): Directory containing the notebook and examples
        preset_voice (str): Name of preset voice to use (e.g., "Binh")
        use_custom_voice (bool): Whether to clone and use a custom voice
        sample_audio_path (str|Path): Path to sample audio for voice cloning
        sample_audio_text_path (str|Path): Path to a text file containing the transcript of the sample audio
        custom_voice_name (str): Name to save the cloned voice as
        temperature (float): Temperature for synthesis (0.1=stable, 1.0+=expressive)
        top_k (int): Top-k sampling parameter
        sample_rate (int): Audio sample rate for output files
        
    Returns:
        list[Path]: Paths of the generated audio files, one per text sample
    """
# Synthesize TEXT_SAMPLES with a voice cloned from the example_ngoc_huyen reference
# clip and its transcript file.
# NOTE: despite its name, `output_dir` receives the list of generated WAV file
# paths returned by run_vieneu_tts_batch, not a directory.
output_dir = run_vieneu_tts_batch(
    text_samples= TEXT_SAMPLES,
    output_base_dir="/home/lamquy/Project/TTS/results/VieNeu-TTS",
    notebook_dir="/home/lamquy/Project/TTS/VieNeu-TTS",
    # preset_voice="TTT2",
    use_custom_voice=True,
    sample_audio_path="/home/lamquy/Project/TTS/VieNeu-TTS/examples/audio_ref/example_ngoc_huyen.wav",
    sample_audio_text_path="/home/lamquy/Project/TTS/VieNeu-TTS/examples/audio_ref/example_ngoc_huyen.txt",
    custom_voice_name="ngoc_huyen",
    temperature=1.0,
    top_k=50,   
    sample_rate=24000
)
print(output_dir)



üöÄ Initializing VieNeu SDK...
Loading backbone from: pnnbao-ump/VieNeu-TTS-0.3B-q4-gguf on cpu ...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


Loading codec from: neuphonic/distill-neucodec on cpu ...
‚úÖ SDK initialized successfully
üìã Available preset voices: ['Binh', 'Tuyen', 'Vinh', 'Doan', 'Ly', 'Ngoc']
‚úÖ Selected voice: Binh
üéôÔ∏è Cloning voice from: example_ngoc_huyen.wav
‚úÖ Voice 'ngoc_huyen' saved to /home/lamquy/Project/TTS/VieNeu-TTS/vieneu/assets/samples
‚úÖ Voice cloned and saved as: 'ngoc_huyen'
üìã Updated voice list: ['Binh', 'Tuyen', 'Vinh', 'Doan', 'Ly', 'Ngoc', 'ngoc_huyen']
üìÅ Output directory: /home/lamquy/Project/TTS/results/VieNeu-TTS/20260116_142158
üìù Total samples to process: 1

üéß Sample 1/1: H√† N·ªôi, tr√°i tim c·ªßa Vi·ªát Nam, l√† m·ªôt th√†nh ph·ªë ng...
   üíæ Saved: /home/lamquy/Project/TTS/results/VieNeu-TTS/20260116_142158/sample_1.wav

‚úÖ All samples processed successfully!
‚úÖ TTS engine closed
[PosixPath('/home/lamquy/Project/TTS/results/VieNeu-TTS/20260116_142158/sample_1.wav')]
