# Test TTS

In [1]:
import io
import json
import logging
import os
import re
from io import BytesIO
from threading import Thread

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
from dotenv import load_dotenv
from pydub import AudioSegment

# from huggingface_hub import login
# Optimized
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sentence_transformers import SentenceTransformer, util
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    TextIteratorStreamer,
    pipeline,
)
from TTS.tts.configs.xtts_config import XttsConfig  # type: ignore
from TTS.tts.models.xtts import Xtts  # type: ignore

from routes.Xu_ly_text import Xu_ly_text_de_doc
from vi_cleaner.vi_cleaner import ViCleaner  # type: ignore

# Configure logging instead of using print statements
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Target device for the TTS model; DeepSpeed is only enabled for CUDA.
device = 'cuda'
use_deepspeed = device == 'cuda'


def clear_gpu_cache():
    """Empty the CUDA caching allocator when CUDA is available and selected."""
    if not torch.cuda.is_available() or device != 'cuda':
        return
    torch.cuda.empty_cache()
   

  from tqdm.autonotebook import tqdm, trange


In [2]:
     
# def load_model_tts(xtts_checkpoint, xtts_config, xtts_vocab):
#     clear_gpu_cache()
#     config = XttsConfig()
#     config.load_json(xtts_config)
#     XTTS_MODEL = Xtts.init_from_config(config)

#     use_deepspeed = torch.cuda.is_available()

#     XTTS_MODEL.load_checkpoint(
#         config,
#         checkpoint_path=xtts_checkpoint,
#         vocab_path=xtts_vocab,
#         use_deepspeed=use_deepspeed,
#     )
#     if torch.cuda.is_available():
#         XTTS_MODEL.cuda()
#         # Set model to use bfloat16 if supported
#         if torch.cuda.get_device_capability() >= (8, 0):  # Check for bfloat16 support (Ampere+ GPUs)
#             XTTS_MODEL = XTTS_MODEL.to(dtype=torch.bfloat16)
#         else:
#             logging.info("Warning: bfloat16 not supported on this device.")
#             XTTS_MODEL.eval()
#     return XTTS_MODEL
def load_model_tts(xtts_checkpoint, xtts_config, xtts_vocab):
    """Build an XTTS model from its config and load the checkpoint weights.

    Args:
        xtts_checkpoint: Path to the model checkpoint, passed as
            ``checkpoint_path`` to ``load_checkpoint``.
        xtts_config: Path to the JSON config read by ``XttsConfig.load_json``.
        xtts_vocab: Path to the vocabulary file, passed as ``vocab_path``.

    Returns:
        The loaded model, switched to eval mode. When CUDA is available the
        model is moved to the GPU and, on devices with compute capability
        >= 8.0, its weights are cast to bfloat16.
    """
    clear_gpu_cache()
    config = XttsConfig()
    config.load_json(xtts_config)
    model = Xtts.init_from_config(config)

    # DeepSpeed is only requested when a CUDA device is present.
    cuda_available = torch.cuda.is_available()

    model.load_checkpoint(
        config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        use_deepspeed=cuda_available,
    )

    if cuda_available:
        model.cuda()
        # Compute capability >= 8.0 is used as the bfloat16 support check.
        if torch.cuda.get_device_capability() >= (8, 0):
            # NOTE(review): with bfloat16 weights, float32 inputs trigger
            # "Input type ... and weight type ... should be the same";
            # callers must cast inputs to the model dtype (or use autocast).
            model = model.to(dtype=torch.bfloat16)
        else:
            logging.warning("bfloat16 not supported on this device.")

    # Always switch to inference mode; previously this only happened in the
    # non-bfloat16 GPU branch.
    model.eval()
    return model

def run_tts(text, lang="vi"):
    """Synthesize ``text`` with the loaded viXTTS model and write a WAV file.

    Uses the module-level ``vixtts_model`` and ``reference_audio``.

    Args:
        text: Raw input text. It is normalized and split into chunks, and each
            non-blank chunk is synthesized separately.
        lang: Language code forwarded to the model's ``inference`` call.

    Returns:
        The path of the written WAV file (``'test_nghich.wav'``). If the model
        or the reference audio is not set, the tuple
        ``(message, None, None)`` is returned instead; this legacy shape is
        kept so existing callers keep working.

    Raises:
        RuntimeError: If no non-blank chunk is left to synthesize.
    """
    if vixtts_model is None or not reference_audio:
        return "You need to run the previous step to load the model !!", None, None

    gpt_cond_latent, speaker_embedding = vixtts_model.get_conditioning_latents(
        audio_path=reference_audio,
        gpt_cond_len=vixtts_model.config.gpt_cond_len,
        max_ref_length=vixtts_model.config.max_ref_len,
        sound_norm_refs=vixtts_model.config.sound_norm_refs,
    )

    # Normalize the text, then split it into chunks for the model.
    tts_texts = split_sentences(normalize_vietnamese_text(text))
    logging.info("TTS chunks: %s", tts_texts)

    keep_len = -1  # drop the final sample of every chunk
    wav_chunks = []
    # A distinct loop name avoids shadowing the ``text`` parameter.
    for chunk_text in tts_texts:
        if not chunk_text.strip():
            continue

        wav_chunk = vixtts_model.inference(
            text=chunk_text,
            language=lang,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            temperature=0.01,  # 0.3
            length_penalty=1.0,  # 1.0
            repetition_penalty=50.0,  # 10.0
            top_k=5,  # 30
            top_p=0.95,  # 0.85
        )
        wav_chunks.append(torch.tensor(wav_chunk["wav"][:keep_len]))

    if not wav_chunks:
        raise RuntimeError("No text left to synthesize after normalization")

    out_wav = torch.cat(wav_chunks, dim=0).cpu()
    out_path = 'test_nghich.wav'
    # ``torchaudio`` is never imported in this file; write through the
    # already-imported ``soundfile`` (1-D array = mono) at 24 kHz instead.
    sf.write(out_path, out_wav.float().numpy(), 24000)

    return out_path


def normalize_vietnamese_text(text):
    """Clean ``text`` with ViCleaner, post-process it for reading, and lowercase it."""
    cleaned = ViCleaner(text).clean()
    readable = Xu_ly_text_de_doc(cleaned)
    return readable.lower()


def split_sentences(text, max_length=245, max_words=40):
    """Split ``text`` into chunks suitable for one TTS inference call each.

    Newlines, ``;``, ``?`` and ``!`` are first turned into periods. The text is
    then cut into fragments ending at ``,`` or ``.``, and consecutive fragments
    are joined with a single space while the joined chunk stays shorter than
    ``max_length`` characters. A single fragment longer than ``max_length`` is
    cut into pieces of at most ``max_words`` space-separated words.

    Args:
        text: Text to split.
        max_length: Upper bound (exclusive) on the length of a joined chunk.
        max_words: Number of words per piece when an over-long fragment has
            to be cut.

    Returns:
        A list of non-empty chunks in reading order. Empty input yields ``[]``.
    """
    text = text.translate(
        str.maketrans({"\n": ". ", ";": ".", "?": ".", "!": "."})
    )

    # The delimiter is optional so that a trailing fragment without a final
    # "," or "." is kept instead of being silently dropped.
    sentences = []
    for fragment in re.findall(r"[^,.]+[,.]?", text):
        fragment = fragment.strip()
        # Skip fragments that hold nothing but whitespace/punctuation.
        if fragment.strip(",."):
            sentences.append(fragment)

    grouped_sentences = []
    current_group = ""

    for sentence in sentences:
        if len(current_group) + len(sentence) + 1 < max_length:
            # Still room: append to the current chunk.
            current_group = f"{current_group} {sentence}" if current_group else sentence
        elif len(sentence) > max_length:
            # Over-long fragment: flush what we have, then cut it by words.
            if current_group:
                grouped_sentences.append(current_group)
                current_group = ""
            words = sentence.split(" ")
            grouped_sentences.extend(
                " ".join(words[start:start + max_words])
                for start in range(0, len(words), max_words)
            )
        else:
            # Chunk is full: close it (never emit an empty chunk) and start a
            # new one with the current fragment.
            if current_group:
                grouped_sentences.append(current_group)
            current_group = sentence

    if current_group:
        grouped_sentences.append(current_group)

    return grouped_sentences

# -------------------------------------------------------------------  LOAD MODEL ----------------------------------------------------------------------
print("Loading TTS models... ")

# Load model TTS capleaf/viXTTS
tts_model_path = os.getenv("PROJECTCB1_TTS_MODEL")
if not tts_model_path:
    # Fail early with a clear message instead of building "None/model.pth".
    raise RuntimeError("Environment variable PROJECTCB1_TTS_MODEL is not set")

vixtts_model = load_model_tts(
    xtts_checkpoint=f"{tts_model_path}/model.pth",
    xtts_config=f"{tts_model_path}/config.json",
    xtts_vocab=f"{tts_model_path}/vocab.json",
)

logging.info("Done TTS")
# Load reference audio for tts (the voice sample); run_tts refuses to run
# when this is unset.
reference_audio = os.getenv("PROJECTCB1_REFERENCE_AUDIO")

def convert_to_wav(audio_bytes):
    """Convert encoded audio bytes into an in-memory WAV stream.

    Args:
        audio_bytes: Bytes of an audio file, read through
            ``AudioSegment.from_file``.

    Returns:
        A ``BytesIO`` holding the WAV export, rewound to position 0.

    Raises:
        ValueError: If reading or exporting the audio fails; the original
            exception is attached as the cause.
    """
    try:
        # Read the audio from an in-memory buffer with pydub.
        audio = AudioSegment.from_file(BytesIO(audio_bytes))
        # Export it as WAV into a second in-memory buffer.
        wav_io = BytesIO()
        audio.export(wav_io, format="wav")
        wav_io.seek(0)  # rewind so callers can read from the start
        return wav_io
    except Exception as e:
        raise ValueError(f"Error converting audio to WAV: {e}") from e

Loading TTS models... 


  return torch.load(f, map_location=map_location, **kwargs)
GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


[2024-12-13 19:22:49,901] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2024-12-13 19:22:50,544] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed info: version=0.16.1, git-hash=unknown, git-branch=unknown
[2024-12-13 19:22:50,546] [INFO] [logging.py:128:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
[2024-12-13 19:22:50,642] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed-Inference config: {'layer_id': 0, 'hidden_size': 1024, 'intermediate_size': 4096, 'heads': 16, 'num_hidden_layers': -1, 'dtype': torch.float32, 'pre_layer_norm': True, 'norm_type': <NormType.LayerNorm: 1>, 'local_rank': -1, 'stochastic_mode': False, 'epsilon': 1e-05, 'mp_size': 1, 'scale_attention': True, 'triangular_masking': True, 'local_attention': False, 'window_size': 1, 'rotary_dim': -1, 'rotate_half': False, 'rotate_every_two': True, 'return_tuple': True, 'mlp_after_attn': True, 'mlp_act_func_type': <ActivationFuncType.GELU: 1>, 'training_mp_size': 1, 'bigscience_bloom': False, 'max_out_tokens': 1024, 'min_out_tokens': 1, 'scale_attn

Using /home/andv/.cache/torch_extensions/py310_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/andv/.cache/torch_extensions/py310_cu124/transformer_inference/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module transformer_inference...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Time to load transformer_inference op: 0.008982181549072266 seconds


Loading extension module transformer_inference...
2024-12-13 19:22:50,976 - INFO - Done TTS


In [3]:
text = "Xin chào"
lang = "vi"
gpt_cond_latent, speaker_embedding = vixtts_model.get_conditioning_latents(
    audio_path=reference_audio,
    gpt_cond_len=vixtts_model.config.gpt_cond_len,
    max_ref_length=vixtts_model.config.max_ref_len,
    sound_norm_refs=vixtts_model.config.sound_norm_refs,
)

# Normalize the text and split it into chunks.
tts_text = normalize_vietnamese_text(text)
tts_texts = split_sentences(tts_text)
print(tts_texts)

# Cast the conditioning tensors to whatever dtype the model weights use
# (instead of hard-coding bfloat16). This does not depend on the chunk, so it
# is done once, before the loop.
input_dtype = next(vixtts_model.parameters()).dtype
gpt_cond_latent = gpt_cond_latent.to(dtype=input_dtype)
speaker_embedding = speaker_embedding.to(dtype=input_dtype)

print(f"Model dtype: {input_dtype}")
print(f"GPT Latent dtype: {gpt_cond_latent.dtype}")
print(f"Speaker Embedding dtype: {speaker_embedding.dtype}")

wav_chunks = []
for text in tts_texts:
    if not text.strip():
        continue
    # Inference is disabled in this debugging cell; the commented-out call
    # that follows is the intended loop body.


#     wav_chunk = vixtts_model.inference(
#         text=text,
#         language=lang,
#         gpt_cond_latent=gpt_cond_latent,
#         speaker_embedding=speaker_embedding,
#         temperature=0.01,  # 0.3
#         length_penalty=1.0,  # 1.0
#         repetition_penalty=50.0,  # 10.0
#         top_k=5,  # 30
#         top_p=0.95,  # 0.85
#     )

    
#     keep_len = -1
#     wav_chunk["wav"] = torch.tensor(wav_chunk["wav"][:keep_len])
#     wav_chunks.append(wav_chunk["wav"])



# out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
# out_path = 'test_nghich.wav'
# torchaudio.save(out_path, out_wav, 24000)

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (CUDABFloat16Type) should be the same

In [None]:
# audio_file = run_tts(text="""Vốn của từng dự án trong chương trình mục tiêu quốc gia phát triển kinh tế - xã hội vùng đồng bào dân tộc thiểu số và miền núi giai đoạn 2021 - 2030. Giai đoạn I: Từ Năm 2021 đến năm 2025
# Dự kiến nhu cầu vốn và nguồn vốn để thực hiện Dự án 1: 18.177,448 tỷ đồng, trong đó:
# - Ngân sách trung ương: 7.840,553 tỷ đồng (vốn đầu tư: 4.565,965 tỷ đồng; vốn sự nghiệp: 3.274,588 tỷ đồng);
# - Ngân sách địa phương: 640,321 tỷ đồng;
# - Vốn vay tín dụng chính sách: 9.291,096 tỷ đồng;
# - Vốn huy động hợp pháp khác: 405,478 tỷ đồng.
# """)

# from IPython.display import Audio
# Audio(audio_file)

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (CUDABFloat16Type) should be the same

In [None]:
def load_model_tts(xtts_checkpoint, xtts_config, xtts_vocab):
    """Build an XTTS model from its config and load the checkpoint weights.

    Args:
        xtts_checkpoint: Path to the model checkpoint, passed as
            ``checkpoint_path`` to ``load_checkpoint``.
        xtts_config: Path to the JSON config read by ``XttsConfig.load_json``.
        xtts_vocab: Path to the vocabulary file, passed as ``vocab_path``.

    Returns:
        The loaded model, switched to eval mode. When CUDA is available the
        model is moved to the GPU and, on devices with compute capability
        >= 8.0, its weights are cast to bfloat16.
    """
    clear_gpu_cache()
    config = XttsConfig()
    config.load_json(xtts_config)
    model = Xtts.init_from_config(config)

    # DeepSpeed is only requested when a CUDA device is present.
    cuda_available = torch.cuda.is_available()

    model.load_checkpoint(
        config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        use_deepspeed=cuda_available,
    )

    if cuda_available:
        model.cuda()
        # Compute capability >= 8.0 is used as the bfloat16 support check.
        if torch.cuda.get_device_capability() >= (8, 0):
            # NOTE(review): with bfloat16 weights, float32 inputs trigger
            # "Input type ... and weight type ... should be the same";
            # callers must cast inputs to the model dtype (or use autocast).
            model = model.to(dtype=torch.bfloat16)
        else:
            logging.warning("bfloat16 not supported on this device.")

    # Always switch to inference mode; previously this only happened in the
    # non-bfloat16 GPU branch.
    model.eval()
    return model






In [None]:

gpt_cond_latent, speaker_embedding = vixtts_model.get_conditioning_latents(
    audio_path=reference_audio,
    gpt_cond_len=vixtts_model.config.gpt_cond_len,
    max_ref_length=vixtts_model.config.max_ref_len,
    sound_norm_refs=vixtts_model.config.sound_norm_refs,
)

# Cast the conditioning tensors to the dtype of the model weights. This is
# independent of the chunk being synthesized, so it is done once up front.
input_dtype = next(vixtts_model.parameters()).dtype
gpt_cond_latent = gpt_cond_latent.to(dtype=input_dtype)
speaker_embedding = speaker_embedding.to(dtype=input_dtype)

# Normalize the text and split it into chunks.
tts_text = normalize_vietnamese_text(text)
tts_texts = split_sentences(tts_text)
print(tts_texts)

keep_len = -1  # drop the final sample of every chunk
wav_chunks = []
# A dedicated loop name keeps the global ``text`` intact, so re-running the
# cell synthesizes the full text again rather than only the last chunk.
for chunk_text in tts_texts:
    if not chunk_text.strip():
        continue

    wav_chunk = vixtts_model.inference(
        text=chunk_text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.01,  # 0.3
        length_penalty=1.0,  # 1.0
        repetition_penalty=50.0,  # 10.0
        top_k=5,  # 30
        top_p=0.95,  # 0.85
    )
    wav_chunks.append(torch.tensor(wav_chunk["wav"][:keep_len]))

out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
out_path = 'test_nghich.wav'
# ``torchaudio`` is never imported in this notebook; write the mono signal
# through the already-imported ``soundfile`` at 24 kHz instead.
sf.write(out_path, out_wav.squeeze(0).float().numpy(), 24000)