In [1]:
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    AutoModelForCausalLM,
    AutoTokenizer,
    TextStreamer,
)
from sentence_transformers import CrossEncoder, SentenceTransformer, util
from routes.Xu_ly_text import Xu_ly_text, Xu_ly_text_de_doc
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from pydub import AudioSegment
from dotenv import load_dotenv
from vinorm import TTSnorm
import soundfile as sf
from io import BytesIO
import pandas as pd
import numpy as np
import librosa
import timeit
import torch
import json
import io
import re
import os

# from huggingface_hub import login
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Configure Chrome options
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--ignore-certificate-errors")  # Bỏ qua lỗi chứng chỉ SSL
options.add_argument("--allow-insecure-localhost")  # Cho phép kết nối không an toàn
options.add_argument(
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
# service=Service("/usr/bin/chromedriver"),
# Initialize the WebDriver
driver = webdriver.Chrome(options=options)

load_dotenv()

device = "cuda" if torch.cuda.is_available() else "cpu"


# --------------------------Define function to load Model and Data---------------------------
# Hàm nội bộ
def load_embedding_model(embedding_model_path, device):
    try:
        embedding_model = SentenceTransformer(
            model_name_or_path=embedding_model_path,
            device=device,
            model_kwargs={"torch_dtype": "bfloat16"},
            trust_remote_code=True,
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load embedding model: {e}")
    return embedding_model


# def load_reranking_model(pr_model_path):
#     pr_model = CrossEncoder(model_name=pr_model_path, device=device)
#     return pr_model


def load_embeddings(embeddings_path, device):
    try:
        text_chunks_and_embedding_df = pd.read_csv(embeddings_path)

        # Convert the embedding column from JSON strings to lists of floats
        text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df[
            "embedding"
        ].apply(json.loads)

        # Convert embeddings to PyTorch tensors
        embeddings = torch.tensor(
            np.array(text_chunks_and_embedding_df["embedding"].tolist()),
            dtype=torch.bfloat16,
        ).to(device)

        pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")
    except Exception as e:
        raise RuntimeError(f"Failed to load embeddings: {e}")

    return embeddings, pages_and_chunks


def clear_gpu_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def load_model_tts(xtts_checkpoint, xtts_config, xtts_vocab):
    clear_gpu_cache()
    config = XttsConfig()
    config.load_json(xtts_config)
    XTTS_MODEL = Xtts.init_from_config(config)

    use_deepspeed = torch.cuda.is_available()

    XTTS_MODEL.load_checkpoint(
        config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        use_deepspeed=use_deepspeed,
    )
    if torch.cuda.is_available():
        XTTS_MODEL.cuda()
        XTTS_MODEL.eval()
    return XTTS_MODEL


def load_model_stt(stt_model_path: str):
    processor = Wav2Vec2Processor.from_pretrained(stt_model_path)
    stt_model = Wav2Vec2ForCTC.from_pretrained(stt_model_path).to(device)
    return processor, stt_model


def load_chat_model(model_path, device):
    try:
        if device == "cuda":
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                device_map=device,
                attn_implementation="flash_attention_2",
            )
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            # model.eval()
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float32,
                device_map=device,
            )
            tokenizer = AutoTokenizer.from_pretrained(model_path)
    except Exception as e:
        raise RuntimeError(f"Failed to load language model: {e}")

    return model, tokenizer


# -----------------------Load model and data--------------------------------------------------

print("Loading models... ")
# Load model embedding
eb_model_path = os.getenv("PROJECTCB1_EMBEDDING_MODEL")
embedding_model = load_embedding_model(
    embedding_model_path=eb_model_path, device=device
)

# Load data
embeddings_path = os.getenv("PROJECTCB1_DATA_FINAL")
embeddings, pages_and_chunks = load_embeddings(
    embeddings_path=embeddings_path, device=device
)

# Load model TTS capleaf/viXTTS
tts_model_path = os.getenv("PROJECTCB1_TTS_MODEL")
vixtts_model = load_model_tts(
    xtts_checkpoint=f"{tts_model_path}/model.pth",
    xtts_config=f"{tts_model_path}/config.json",
    xtts_vocab=f"{tts_model_path}/vocab.json",
)

# Load reference audio for tts
reference_audio = os.getenv("PROJECTCB1_REFERENCE_AUDIO")  # Mẫu giọng nói

# Load model STT nguyenvulebinh/wav2vec2-base-vietnamese-250h
stt_model_path = os.getenv("PROJECTCB1_STT_MODEL")
processor, stt_model = load_model_stt(stt_model_path=stt_model_path)

# Load LLM
llm_path = os.getenv("PROJECTCB1_LLM_MODEL")
model, tokenizer = load_chat_model(llm_path, device=device)
# ------------------------------------------------------------------------------------

# Load reranking
# rr_model_path = "embedding_model/PhoRanker"
# reranking_model = load_reranking_model(rr_model_path)

# Dowload TTS capleaf/viXTTS
# from huggingface_hub import snapshot_download

# snapshot_download(
#     repo_id="capleaf/viXTTS", repo_type="model", local_dir="Model/TTS_model"
# )


print("Models Loaded!")


# ------------------------------------------Text processing-------------------------
def normalize_vietnamese_text(text):
    text = Xu_ly_text_de_doc(text)
    text = (
        TTSnorm(text, unknown=False, lower=False, rule=True)
        .replace("  ", " ")
        .replace(":", ".")
        .replace("!.", "!")
        .replace("?.", "?")
        .replace(" .", ".")
        .replace(" ,", ",")
        .replace('"', "")
        .replace("'", "")
        .replace("+", " ")
        .replace("..", ".")
        .replace("AI", "Ây Ai")
        .replace("A.I", "Ây Ai")
    )

    return text.lower()


def split_sentences(text, max_length=245):
    text = (
        text.replace("\n", ". ").replace(";", ".").replace("?", ".").replace("!", ".")
    )

    sentences = re.findall(r"[^,.]+[,.]", text)
    grouped_sentences = []
    current_group = ""

    for sentence in sentences:
        # Nếu thêm câu vào mà không vượt quá giới hạn max_length
        if len(current_group) + len(sentence) + 1 < max_length:
            if current_group:
                current_group += " " + sentence  # Ghép câu mới vào câu trước đó
            else:
                current_group = sentence  # Câu đầu tiên của nhóm
        elif len(sentence) > max_length:  # Xử lý
            if current_group:
                grouped_sentences.append(current_group)
                current_group = ""
            k = 0
            tamthoi = []
            for i in sentence.split(" "):
                tamthoi += [i]
                if len(tamthoi) >= 40:
                    grouped_sentences += [" ".join(tamthoi)]
                    tamthoi = []
            if tamthoi:
                grouped_sentences += [" ".join(tamthoi)]
        else:
            grouped_sentences.append(current_group)  # Thêm nhóm vào list
            current_group = sentence  # Khởi tạo nhóm mới với câu hiện tại

    if current_group:
        grouped_sentences.append(current_group)  # Thêm nhóm cuối cùng vào list

    return grouped_sentences


import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

rerank_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-v2-m3")
rerank_model = AutoModelForSequenceClassification.from_pretrained(
    "BAAI/bge-reranker-v2-m3", torch_dtype=torch.bfloat16, device_map="cuda:0"
)
rerank_model.eval()


# Retrieval with rerank
def retrieve_relevant_resources(
    query: str,
    number_result_embedding: int = 20,
    number_result_reranking: int = 5,
    threshold: int = -4,
):
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    # cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
    dot_scores = util.dot_score(query_embedding, embeddings)[0]

    # Get top scores with a threshold
    # scores, indices = torch.topk(input=cosine_scores, k=n_resources_to_return)
    scores, indices = torch.topk(input=dot_scores, k=number_result_embedding)
    print(scores)

    context_items = [pages_and_chunks[i] for i in indices]
    results = [item["Final_Answer"] for item in context_items]

    pairs = [[query, result] for result in results]

    with torch.no_grad():
        inputs = rerank_tokenizer(
            pairs, padding=True, truncation=True, return_tensors="pt", max_length=1024
        )
        inputs = {
            key: value.to("cuda") for key, value in inputs.items()
        }  # Move all inputs to the same device as the model

        # Compute scores
        rerank_scores = rerank_model(**inputs, return_dict=True).logits.view(
            -1,
        )

        top_scores, top_indices = torch.topk(rerank_scores, k=number_result_reranking)
        # Help me add script to only take the score > -3
        filtered_indices = top_indices[top_scores > threshold]
        rerank_result = [results[i] for i in filtered_indices]

    # return results, scores, top_scores, rerank_result
    return rerank_result


# Không sử dung các câu dẫn dắt, hãy trả về trực tiếp câu trả lời.
# Đảm bảo câu trả lời giải thích rõ nhất có thể.
# Prompt formatter
def prompt_formatter_root(query: str, results: list) -> str:
    context = '- "' + '"\n\n- "'.join(results) + '"'
    base_prompt = """Hãy cho bản thân không gian để suy nghĩ bằng cách trích xuất các đoạn văn có liên quan từ ngữ cảnh dưới đây trước khi trả lời câu hỏi của người dùng.
Sử dụng các đoạn ngữ cảnh sau để trả lời câu hỏi của người dùng:

{context}

Câu hỏi của người dùng: "{query}"
Không sử dung các câu dẫn dắt, hãy trả về trực tiếp câu trả lời. Đảm bảo câu trả lời giải thích rõ nhất có thể. 
Trả lời:"""
    return base_prompt.format(context=context, query=query)


# ----------------------------------Output function-----------------------------------------------
def run_stt(audio_bytes):
    # Đọc tệp âm thanh từ byte
    audio = AudioSegment.from_file(BytesIO(audio_bytes))

    # Chuyển đổi âm thanh thành mảng numpy
    samples = np.array(audio.get_array_of_samples())

    # Đảm bảo là mono (1 kênh)
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels))
        samples = samples.mean(
            axis=1
        )  # Lấy trung bình giá trị của tất cả các kênh để chuyển sang mono

    # Chuẩn hóa lại tần số mẫu về 16000 Hz
    samples_16k = librosa.resample(
        samples.astype(np.float32), orig_sr=audio.frame_rate, target_sr=16000
    )

    # Tokenize dữ liệu đầu vào
    input_values = processor(
        samples_16k, return_tensors="pt", padding="longest", sampling_rate=16000
    ).input_values

    # Chuyển sang GPU và chuyển đổi sang float
    input_values = input_values.to(device).float()

    # Lấy kết quả dự đoán từ mô hình
    logits = stt_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Giải mã kết quả dự đoán thành văn bản
    transcription = processor.batch_decode(predicted_ids)[0]
    text = Xu_ly_text(transcription)
    return text


def run_tts(text, lang="vi"):
    if vixtts_model is None or not reference_audio:
        return "You need to run the previous step to load the model !!", None, None

    gpt_cond_latent, speaker_embedding = vixtts_model.get_conditioning_latents(
        audio_path=reference_audio,
        gpt_cond_len=vixtts_model.config.gpt_cond_len,
        max_ref_length=vixtts_model.config.max_ref_len,
        sound_norm_refs=vixtts_model.config.sound_norm_refs,
    )

    # Chuẩn hóa
    tts_text = normalize_vietnamese_text(text)
    tts_texts = split_sentences(tts_text)
    print(tts_texts)
    wav_chunks = []
    for text in tts_texts:
        if text.strip() == "":
            continue

        wav_chunk = vixtts_model.inference(
            text=text,
            language=lang,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            temperature=0.01,  # 0.3
            length_penalty=1.0,  # 1.0
            repetition_penalty=50.0,  # 10.0
            top_k=5,  # 30
            top_p=0.95,  # 0.85
        )

        keep_len = -1
        wav_chunk["wav"] = torch.tensor(wav_chunk["wav"][:keep_len])
        wav_chunks.append(wav_chunk["wav"])

    out_wav = (
        torch.cat(wav_chunks, dim=0).squeeze(0).cpu().numpy()
    )  # Chuyển sang numpy array

    # Chuyển đổi Tensor thành định dạng WAV
    buffer = io.BytesIO()

    # Ghi âm thanh vào buffer, đảm bảo dữ liệu đầu vào là numpy array và định dạng đúng
    try:
        sf.write(buffer, out_wav, 24000, format="WAV")
        buffer.seek(0)
        wav_data = buffer.read()
    except Exception as e:
        print(f"Error writing WAV file: {e}")
        return None, None, None

    return wav_data


#  Phân biệt rõ ràng, dự án và tiểu dự án là hoàn toàn khác nhau


# ---------------------------Web searching----------------------------
def fetch_links(query: str, max_links: int = 10):
    """Fetch links from Google search results."""
    url = f"https://www.google.com/search?q={query}"
    driver.get(url)
    links = []

    try:
        # Wait for search results to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//a[@jsname="UWckNb"]'))
        )
        link_elements = driver.find_elements(By.XPATH, '//a[@jsname="UWckNb"]')

        for link_element in link_elements[:max_links]:
            href = link_element.get_attribute("href")
            if href:
                links.append(href)
                print(f"Link found: {href}")
    except Exception as e:
        print(f"Error fetching links: {e}")

    return links


def fetch_page_content(url: str):
    """Fetch page content for a given URL."""
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        content = driver.find_element(By.TAG_NAME, "body").text
        print(f"Content fetched from {url}")
        return content
    except Exception as e:
        print(f"Error fetching content from {url}: {e}")
        return None


def web_searching(query: str):
    """Perform web search and retrieve page content."""
    results_web_searching = []
    links = fetch_links(query)
    for link in links:
        if any(bad_domain in link for bad_domain in ["youtube.com"]):
            print(f"Skipping: {link}")
            continue

        content = fetch_page_content(link)
        if content:
            results_web_searching.append(content)
        else:
            continue
        if len(results_web_searching) == 2:
            break
    # Return first valid content
    text_web_searching = "\n\n".join(
        [f'- "{content}"' for content in results_web_searching]
    )
    text_web_searching = text_web_searching[:20000]
    return text_web_searching


def web_searching(query: str):
    """Perform web search and retrieve page content."""
    results_web_searching = []
    links = fetch_links(query)
    for link in links:
        if any(bad_domain in link for bad_domain in ["youtube.com"]):
            print(f"Skipping: {link}")
            continue

        content = fetch_page_content(link)
        if content:
            results_web_searching.append(content)
        else:
            continue
        if len(results_web_searching) == 2:
            break
    # Return first valid content
    text_web_searching = "\n\n".join(
        [f'- "{content}"' for content in results_web_searching]
    )
    text_web_searching = text_web_searching[:20000]
    return text_web_searching


# ---------------------------------Ask---------------------------
from transformers import TextIteratorStreamer
from threading import Thread


def ask(query: str) -> str:
    messages = [
        {
            "role": "system",
            "content": """Bạn là một trợ lí tiếng Việt hữu ích. Hãy trả lời câu hỏi của người dùng một cách chính xác.""",
        },
    ]
    results = retrieve_relevant_resources(
        query, number_result_embedding=20, number_result_reranking=5, threshold=-4
    )
    if len(results) == 0:
        web_search_result = web_searching(query=query)

        prompt = f"""Hãy cho bản thân không gian để suy nghĩ bằng cách trích xuất các đoạn văn có liên quan từ ngữ cảnh dưới đây trước khi trả lời câu hỏi của người dùng.
Sử dụng các đoạn ngữ cảnh sau để trả lời câu hỏi của người dùng:

{web_search_result}

Câu hỏi của người dùng: "{query}"
Không sử dung các câu dẫn dắt, hãy trả về trực tiếp câu trả lời. Đảm bảo câu trả lời giải thích đầy đủ, rõ ràng nhất có thể. 
Trả lời:"""
    else:
        prompt = prompt_formatter_root(query, results)
    messages.append({"role": "user", "content": prompt})

    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # model_inputs = tokenizer([text], return_tensors="pt").to(device)

    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    thread = Thread(
        target=model.generate,
        kwargs={
            "input_ids": inputs["input_ids"],
            "streamer": streamer,
            "do_sample": True,
            "max_new_tokens": 1024,
            "temperature": 0.01,
            # "top_k": 40,
            # "top_p": 0.95,
            # "repetition_penalty": 1.05,
        },
    )
    thread.start()  # now start the thread

    # for this example we'll both print out the new text and save it to a file
    # -----------------------------
    for new_text in streamer:
        yield new_text + ""
        # We can now process this text however we want,
        # for this example we'll print the text to stdout and
        # at the same time save the output to a text file.
        # In reality however we can do any processing we want in this loop
        # meaning we can fit this to just about any usecase we want!
        print(new_text, end="")

    thread.join()  # join our thread


  from .autonotebook import tqdm as notebook_tqdm


Loading models... 


  return torch.load(f, map_location=map_location, **kwargs)
GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


[2024-12-04 17:12:05,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2024-12-04 17:12:05,985] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed info: version=0.16.0, git-hash=unknown, git-branch=unknown
[2024-12-04 17:12:05,986] [INFO] [logging.py:128:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
[2024-12-04 17:12:06,087] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed-Inference config: {'layer_id': 0, 'hidden_size': 1024, 'intermediate_size': 4096, 'heads': 16, 'num_hidden_layers': -1, 'dtype': torch.float32, 'pre_layer_norm': True, 'norm_type': <NormType.LayerNorm: 1>, 'local_rank': -1, 'stochastic_mode': False, 'epsilon': 1e-05, 'mp_size': 1, 'scale_attention': True, 'triangular_masking': True, 'local_attention': False, 'window_size': 1, 'rotary_dim': -1, 'rotate_half': False, 'rotate_every_two': True, 'return_tuple': True, 'mlp_after_attn': True, 'mlp_act_func_type': <ActivationFuncType.GELU: 1>, 'training_mp_size': 1, 'bigscience_bloom': False, 'max_out_tokens': 1024, 'min_out_tokens': 1, 'scale_attn

Using /home/andv/.cache/torch_extensions/py310_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/andv/.cache/torch_extensions/py310_cu124/transformer_inference/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module transformer_inference...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/11] /usr/local/cuda-12.4/bin/nvcc --generate-dependencies-with-compile --dependency-output pointwise_ops.cuda.o.d -DTORCH_EXTENSION_NAME=transformer_inference -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/andv/important/chatbot_vnuis/.venv/lib/python3.10/site-packages/deepspeed/ops/csrc/transformer/inference/includes -I/home/andv/important/chatbot_vnuis/.venv/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -isystem /home/andv/important/chatbot_vnuis/.venv/lib/python3.10/site-packages/torch/include -isystem /home/andv/important/chatbot_vnuis/.venv/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/andv/important/chatbot_vnuis/.venv/lib/python3.10/site-packages/torch/include/TH -isystem /home/andv/important/chatbot_vnuis/.venv/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda-12.4/include -isystem /usr/include/python3.10 -D_GLIB

Loading extension module transformer_inference...


RuntimeError: Failed to load language model: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.