In [1]:
import torch
import numpy as np
import pandas as pd
from sentence_transformers import CrossEncoder, SentenceTransformer, util
from underthesea import sent_tokenize
from vinorm import TTSnorm
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
import io
import soundfile as sf  # Th∆∞ vi·ªán x·ª≠ l√Ω √¢m thanh
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from pydub import AudioSegment
from io import BytesIO
import librosa
from Xu_ly_text import Xu_ly_text, Xu_ly_text_de_doc
import re
from dotenv import load_dotenv
import os
# from huggingface_hub import login

# Load environment variables through python-dotenv's load_dotenv (it reads a
# .env file when one is found) so the os.getenv lookups below can see them.
load_dotenv()

# Device string shared by the model loaders below: "cuda" when PyTorch reports
# a usable GPU, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

# Disabled: optional Hugging Face login (kept for reference).
# # Get the Hugging Face access token from environment variables
# hf_token = os.getenv("PROJECTCB1_HUGGINGFACE_ACCESS_TOKEN")

# # Log in to Hugging Face using the access token
# if hf_token:
#     login(token=hf_token)
# else:
#     print(
#         "Access token not found. Please set the HUGGINGFACE_ACCESS_TOKEN in your .env file."
#     )

# Locations of the embedding model and of the CSV with precomputed embeddings.
# os.getenv returns None when a variable is unset; nothing here validates that.
eb_model_path = os.getenv("PROJECTCB1_EMBEDDING_MODEL")
embeddings_path = os.getenv("PROJECTCB1_EMBEDDING_DATA_PATH")


# Internal helper functions
def load_embedding_model(embedding_model_path):
    """Build the sentence-embedding model on the module-level ``device``.

    Args:
        embedding_model_path: Value forwarded to ``SentenceTransformer`` as
            ``model_name_or_path``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(model_name_or_path=embedding_model_path, device=device)


def load_reranking_model(pr_model_path):
    """Build the cross-encoder used for reranking on the module-level ``device``.

    Args:
        pr_model_path: Value forwarded to ``CrossEncoder`` as ``model_name``.

    Returns:
        The constructed ``CrossEncoder`` instance.
    """
    # NOTE(review): trust_remote_code=True presumably allows code shipped with
    # the checkpoint to run - only point this at sources you trust.
    encoder_options = {
        "model_name": pr_model_path,
        "device": device,
        "trust_remote_code": True,
    }
    return CrossEncoder(**encoder_options)


def load_embeddings(embeddings_path):
    """Read the precomputed chunk embeddings from a CSV file.

    The ``embedding`` column is stored as text: a bracketed, whitespace-separated
    list of numbers. Each cell is parsed back into a NumPy vector.

    Args:
        embeddings_path: Path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        A tuple ``(embeddings, pages_and_chunks)`` where ``embeddings`` is a
        float32 tensor on the module-level ``device`` built from all parsed
        vectors, and ``pages_and_chunks`` is the list of row dicts (one per CSV
        row, with the parsed vector under ``"embedding"``).
    """

    def _parse_vector(raw):
        # Drop the surrounding brackets, then let NumPy split on whitespace.
        return np.fromstring(raw.strip("[]"), sep=" ")

    chunks_df = pd.read_csv(embeddings_path)
    chunks_df["embedding"] = chunks_df["embedding"].apply(_parse_vector)

    records = chunks_df.to_dict(orient="records")

    stacked_vectors = np.array(chunks_df["embedding"].tolist())
    embedding_matrix = torch.tensor(stacked_vectors, dtype=torch.float32).to(device)
    return embedding_matrix, records


def clear_gpu_cache():
    """Call ``torch.cuda.empty_cache()`` when CUDA is available; otherwise do nothing."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()


def load_model_tts(xtts_checkpoint, xtts_config, xtts_vocab):
    """Create an XTTS model from a config file and load its checkpoint.

    Args:
        xtts_checkpoint: Passed to ``load_checkpoint`` as ``checkpoint_path``.
        xtts_config: Path handed to ``XttsConfig.load_json``.
        xtts_vocab: Passed to ``load_checkpoint`` as ``vocab_path``.

    Returns:
        The initialised model; moved to CUDA when CUDA is available.
    """
    clear_gpu_cache()

    tts_config = XttsConfig()
    tts_config.load_json(xtts_config)
    tts_model = Xtts.init_from_config(tts_config)

    # DeepSpeed is requested exactly when a CUDA device is present.
    cuda_ready = torch.cuda.is_available()
    tts_model.load_checkpoint(
        tts_config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        use_deepspeed=cuda_ready,
    )

    if cuda_ready:
        tts_model.cuda()
    return tts_model


def _spell_out_ai(text):
    """Replace the acronym "AI" / "A.I" with its spelled-out reading.

    A match is left untouched when the character directly before or after it is
    an upper-case letter, so upper-case words that merely contain "AI"
    (e.g. "HAI", "MAIL") are not rewritten mid-word. Mixed-case names such as
    "OpenAI" are still rewritten, as before.
    """

    def _swap(match):
        source = match.string
        before = source[match.start() - 1] if match.start() > 0 else ""
        after = source[match.end()] if match.end() < len(source) else ""
        if before.isupper() or after.isupper():
            return match.group(0)
        return "√Çy Ai"

    return re.sub(r"A\.?I", _swap, text)


def normalize_vietnamese_text(text):
    """Prepare raw text for speech synthesis.

    Steps, in order: project-specific cleanup (``Xu_ly_text_de_doc``), ``TTSnorm``
    normalisation, a fixed series of punctuation clean-ups, spelling out the
    acronym "AI", and finally lower-casing.

    Args:
        text: Input string.

    Returns:
        The normalised, lower-cased string.
    """
    text = Xu_ly_text_de_doc(text)
    text = (
        TTSnorm(text, unknown=False, lower=False, rule=True)
        .replace("  ", " ")
        .replace(":", ".")
        .replace("!.", "!")
        .replace("?.", "?")
        .replace(" .", ".")
        .replace(" ,", ",")
        .replace('"', "")
        .replace("'", "")
        .replace("+", " ")
        .replace("..", ".")
    )
    # Must run before lower(): the acronym is only recognisable in upper case.
    text = _spell_out_ai(text)
    return text.lower()


def split_sentences(text, max_length=245, max_words=40):
    """Split text into chunks short enough to synthesise one at a time.

    Newlines, ``;``, ``?`` and ``!`` are first turned into periods. The text is
    then cut after every ``,`` or ``.`` and consecutive fragments are joined
    (with a single space) for as long as the group stays under ``max_length``
    characters. A single fragment longer than ``max_length`` is emitted on its
    own, cut into pieces of at most ``max_words`` space-separated words.

    Args:
        text: Text to split.
        max_length: Character budget for one group.
        max_words: Words per piece when an over-long fragment has to be cut.
            Must be a positive integer.

    Returns:
        List of chunk strings in reading order. Empty groups are never emitted,
        and a final fragment without closing punctuation is kept.
    """
    text = (
        text.replace("\n", ". ").replace(";", ".").replace("?", ".").replace("!", ".")
    )

    # The optional "[,.]?" keeps a trailing fragment that has no closing comma
    # or period; whitespace-only leftovers are discarded.
    fragments = [
        fragment
        for fragment in re.findall(r"[^,.]+[,.]?", text)
        if fragment.strip()
    ]

    grouped_sentences = []
    current_group = ""

    for sentence in fragments:
        if len(current_group) + len(sentence) + 1 < max_length:
            # Still within budget: extend the current group.
            current_group = f"{current_group} {sentence}" if current_group else sentence
        elif len(sentence) > max_length:
            # Over-long fragment: flush what we have, then cut it by word count.
            if current_group:
                grouped_sentences.append(current_group)
                current_group = ""
            words = sentence.split(" ")
            for start in range(0, len(words), max_words):
                grouped_sentences.append(" ".join(words[start:start + max_words]))
        else:
            # Fragment fits on its own but not in the current group: start a new one.
            if current_group:
                grouped_sentences.append(current_group)
            current_group = sentence

    if current_group:
        grouped_sentences.append(current_group)

    return grouped_sentences


# Instantiate the models (runs when the cell executes)

print("Loading models... ")
# Load model embedding

embedding_model = load_embedding_model(eb_model_path)

# Load reranking (disabled)
# rr_model_path = "itdainb/PhoRanker"
# reranking_model = load_reranking_model(rr_model_path)

# Download TTS capleaf/viXTTS (disabled)
# from huggingface_hub import snapshot_download

# snapshot_download(
#     repo_id="capleaf/viXTTS", repo_type="model", local_dir="Model/TTS_model"
# )

# Directory expected to contain model.pth, config.json and vocab.json.
# If the variable is unset, os.getenv returns None and the f-strings below
# become "None/model.pth" etc.
tts_model_path = os.getenv("PROJECTCB1_TTS_MODEL")
# Load model TTS capleaf/viXTTS
vixtts_model = load_model_tts(
    xtts_checkpoint=f"{tts_model_path}/model.pth",
    xtts_config=f"{tts_model_path}/config.json",
    xtts_vocab=f"{tts_model_path}/vocab.json",
)

embeddings, pages_and_chunks = load_embeddings(embeddings_path)  # Load embeddings
reference_audio = os.getenv("PROJECTCB1_REFERENCE_AUDIO")  # Reference voice sample

# Load model STT nguyenvulebinh/wav2vec2-base-vietnamese-250h

stt_model_path = os.getenv("PROJECTCB1_STT_MODEL")
processor = Wav2Vec2Processor.from_pretrained(stt_model_path)
# NOTE(review): run_stt() reads this global `model`, but a later notebook cell
# rebinds `model` to a causal language model. After that cell has run,
# run_stt() would call the wrong model; a dedicated name would avoid the clash.
model = Wav2Vec2ForCTC.from_pretrained(stt_model_path).to(device)
print("Models Loaded!")

# processor.save_pretrained(stt_model_path)
# model.save_pretrained(stt_model_path)
# Functions used by the API


def run_stt(audio_bytes):
    """Transcribe an encoded audio clip to text.

    The clip is decoded with pydub, averaged down to one channel when it has
    several, resampled to 16 kHz and run through the module-level ``processor``
    and ``model``. The greedy (argmax) decoding of the logits is post-processed
    with ``Xu_ly_text``.

    Args:
        audio_bytes: Raw bytes of an audio file that
            ``AudioSegment.from_file`` can decode.

    Returns:
        The value returned by ``Xu_ly_text`` for the first decoded transcription.
    """
    # NOTE(review): a later notebook cell rebinds the global `model` to a causal
    # LM; this function only works while `model` is still the Wav2Vec2 model.
    audio = AudioSegment.from_file(BytesIO(audio_bytes))

    # Samples arrive interleaved across channels as one flat integer array.
    samples = np.array(audio.get_array_of_samples())

    # Down-mix to mono by averaging the channels.
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels))
        samples = samples.mean(axis=1)

    # Resample to the 16 kHz rate that is passed to the processor below.
    samples_16k = librosa.resample(
        samples.astype(np.float32), orig_sr=audio.frame_rate, target_sr=16000
    )

    input_values = processor(
        samples_16k, return_tensors="pt", padding="longest", sampling_rate=16000
    ).input_values
    input_values = input_values.to(device).float()

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    transcription = processor.batch_decode(predicted_ids)[0]
    return Xu_ly_text(transcription)


def run_tts(text, lang="vi"):
    """Synthesise speech for ``text`` and return it as WAV bytes.

    The text is normalised, split into chunks, and each non-blank chunk is
    synthesised with the module-level ``vixtts_model`` using the voice in
    ``reference_audio``. The audio chunks are concatenated and encoded as WAV.

    Args:
        text: Text to speak.
        lang: Language code forwarded to the model's ``inference`` call.

    Returns:
        ``bytes`` holding a WAV file on success.
        NOTE(review): the failure paths return 3-tuples instead -
        ``(message, None, None)`` when the model or reference audio is missing,
        and ``(None, None, None)`` when there is nothing to synthesise or the
        WAV encoding fails - so callers must check the result type.
    """
    if vixtts_model is None or not reference_audio:
        return "You need to run the previous step to load the model !!", None, None

    # Normalise and split first so blank input exits before any model work.
    tts_text = normalize_vietnamese_text(text)
    tts_texts = split_sentences(tts_text)
    print(tts_texts)

    speakable_chunks = [chunk for chunk in tts_texts if chunk.strip() != ""]
    if not speakable_chunks:
        # torch.cat below would raise on an empty list; report the failure the
        # same way as the WAV-encoding error path does.
        print("No speakable text after normalization; nothing to synthesize.")
        return None, None, None

    gpt_cond_latent, speaker_embedding = vixtts_model.get_conditioning_latents(
        audio_path=reference_audio,
        gpt_cond_len=vixtts_model.config.gpt_cond_len,
        max_ref_length=vixtts_model.config.max_ref_len,
        sound_norm_refs=vixtts_model.config.sound_norm_refs,
    )

    wav_chunks = []
    for chunk_text in speakable_chunks:
        wav_chunk = vixtts_model.inference(
            text=chunk_text,
            language=lang,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            temperature=0.01,  # 0.3
            length_penalty=1.0,  # 1.0
            repetition_penalty=50.0,  # 10.0
            top_k=5,  # 30
            top_p=0.95,  # 0.85
        )

        # Slicing with -1 drops the final sample of every chunk.
        keep_len = -1
        wav_chunks.append(torch.tensor(wav_chunk["wav"][:keep_len]))

    out_wav = torch.cat(wav_chunks, dim=0).squeeze(0).cpu().numpy()

    # Encode in memory. The 24000 Hz rate is hard-coded here.
    # NOTE(review): presumably the model's output rate - confirm against its config.
    buffer = io.BytesIO()
    try:
        sf.write(buffer, out_wav, 24000, format="WAV")
        buffer.seek(0)
        wav_data = buffer.read()
    except Exception as e:
        print(f"Error writing WAV file: {e}")
        return None, None, None

    return wav_data




  from tqdm.autonotebook import tqdm, trange


Loading models... 


  return torch.load(f, map_location=map_location, **kwargs)
GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From üëâv4.50üëà onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


[2024-10-25 15:10:53,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2024-10-25 15:10:54,582] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.15.2, git-hash=unknown, git-branch=unknown
[2024-10-25 15:10:54,584] [INFO] [logging.py:96:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
[2024-10-25 15:10:54,681] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed-Inference config: {'layer_id': 0, 'hidden_size': 1024, 'intermediate_size': 4096, 'heads': 16, 'num_hidden_layers': -1, 'dtype': torch.float32, 'pre_layer_norm': True, 'norm_type': <NormType.LayerNorm: 1>, 'local_rank': -1, 'stochastic_mode': False, 'epsilon': 1e-05, 'mp_size': 1, 'scale_attention': True, 'triangular_masking': True, 'local_attention': False, 'window_size': 1, 'rotary_dim': -1, 'rotate_half': False, 'rotate_every_two': True, 'return_tuple': True, 'mlp_after_attn': True, 'mlp_act_func_type': <ActivationFuncType.GELU: 1>, 'training_mp_size': 1, 'bigscience_bloom': False, 'max_out_tokens': 1024, 'min_out_tokens': 1, 'scale_attn_by

Using /home/andv/.cache/torch_extensions/py310_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/andv/.cache/torch_extensions/py310_cu124/transformer_inference/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module transformer_inference...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Time to load transformer_inference op: 0.04592084884643555 seconds


Loading extension module transformer_inference...


Models Loaded!


In [2]:
def retrieve_relevant_resources(query: str, n_resources_to_return: int = 5, threshold: float = 0.01):
    """Return the stored answers most similar to ``query``.

    The query is embedded with the module-level ``embedding_model`` and compared
    by cosine similarity against the precomputed ``embeddings``. The best
    ``n_resources_to_return`` candidates are taken and those whose score is not
    strictly greater than ``threshold`` are dropped.

    Args:
        query: Question text.
        n_resources_to_return: Maximum number of answers to return. Values
            larger than the number of stored embeddings are clamped.
        threshold: Minimum cosine similarity (exclusive) a candidate must reach.

    Returns:
        List of the ``"Final_Answer"`` values of the selected records, best
        match first. May be shorter than requested, or empty.
    """
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = min(n_resources_to_return, cosine_scores.shape[0])
    scores, indices = torch.topk(input=cosine_scores, k=k)

    # topk returns scores in descending order; keep only those above the threshold.
    top_indices = [
        index.item()
        for score, index in zip(scores, indices)
        if score.item() > threshold
    ]

    context_items = [pages_and_chunks[i] for i in top_indices]
    results = [item["Final_Answer"] for item in context_items]
    # pr_results = reranking_model.rank(query, results, return_documents=True, top_k=3)

    return results

# Example usage (the function returns only the list of answers, not the scores):
# results = retrieve_relevant_resources("What is the capital of Japan?", n_resources_to_return=3)
# print(results)


In [24]:
# # Function to format the prompt
# def prompt_formatter(query: str, results: list) -> str:
#     context = "* " + "\n\n* ".join(results)
#     base_prompt = """H√£y cho b·∫£n th√¢n kh√¥ng gian ƒë·ªÉ suy nghƒ© b·∫±ng c√°ch tr√≠ch xu·∫•t c√°c ƒëo·∫°n vƒÉn c√≥ li√™n quan t·ª´ ng·ªØ c·∫£nh d∆∞·ªõi ƒë√¢y tr∆∞·ªõc khi tr·∫£ l·ªùi c√¢u h·ªèi:
# \n{context}
# \nC√°c ƒëo·∫°n vƒÉn c√≥ li√™n quan: <tr√≠ch xu·∫•t c√°c ƒëo·∫°n vƒÉn c√≥ li√™n quan t·ª´ ng·ªØ c·∫£nh t·∫°i ƒë√¢y>
# C√¢u h·ªèi: {query}
# """
#     formatted_prompt = base_prompt.format(context=context, query=query)
#     prompt = f"### C√¢u h·ªèi: {formatted_prompt}\n\n### Tr·∫£ l·ªùi:" 
#     return prompt



In [24]:
# def prompt_formatter(query: str, results: list) -> str:
#     context = "* " + "\n\n* ".join(results)
#     base_prompt = """H√£y ƒë·ªÉ cho b·∫£n th√¢n ƒë·ªÉ suy nghƒ© b·∫±ng c√°ch s·ª≠ d·ª•ng c√°c m·ª•c ng·ªØ c·∫£nh sau ƒë·ªÉ tr·∫£ l·ªùi c√¢u h·ªèi c·ªßa ng∆∞·ªùi d√πng:\n\n{context}\n\nC√¢u h·ªèi c·ªßa ng∆∞·ªùi d√πng: {query}"""
#     formatted_prompt = base_prompt.format(context=context, query=query)
#     prompt = f"""<|im_start|>system
# B·∫°n l√† m·ªôt tr·ª£ l√≠ AI h·ªØu √≠ch. H√£y tr·∫£ l·ªùi ng∆∞·ªùi d√πng m·ªôt c√°ch ch√≠nh x√°c.
# <|im_end|>
# <|im_start|>user
# {formatted_prompt}<|im_end|>
# <|im_start|>assistant
# """
#     return prompt

In [None]:
# Function to format the prompt
def prompt_formatter(query: str, results: list) -> str:
    """Build the LLM prompt from the retrieved passages and the question.

    Args:
        query: Question inserted into the ``{query}`` slot of the template.
        results: Passage strings; rendered as a "* "-prefixed list separated by
            blank lines and inserted into the ``{context}`` slot.

    Returns:
        The filled-in template string.
    """
    template = """H√£y cho b·∫£n th√¢n kh√¥ng gian ƒë·ªÉ suy nghƒ© b·∫±ng c√°ch tr√≠ch xu·∫•t c√°c ƒëo·∫°n vƒÉn c√≥ li√™n quan t·ª´ ng·ªØ c·∫£nh d∆∞·ªõi ƒë√¢y tr∆∞·ªõc khi tr·∫£ l·ªùi c√¢u h·ªèi:
\n{context}
\nC√°c ƒëo·∫°n vƒÉn c√≥ li√™n quan: <tr√≠ch xu·∫•t c√°c ƒëo·∫°n vƒÉn c√≥ li√™n quan t·ª´ ng·ªØ c·∫£nh t·∫°i ƒë√¢y>
C√¢u h·ªèi: {query}
"""
    # The leading bullet is always present, even when there are no passages.
    bullet_list = "".join(["* ", "\n\n* ".join(results)])
    return template.format(context=bullet_list, query=query)

In [25]:
# Smoke test: retrieve context for one sample question and build its prompt.
# retrieve_relevant_resources is called with its defaults
# (n_resources_to_return=5, threshold=0.01).
query = "N·ªôi dung d·ª± √°n 1 l√† g√¨?"
results = retrieve_relevant_resources(query)
prompt = prompt_formatter(query, results)

In [26]:
# Inspect the assembled prompt.
# NOTE(review): the output saved under this cell begins with a "###" header
# line that the current prompt_formatter template does not emit, so it was
# produced by an earlier definition. Re-run the notebook top to bottom to
# refresh it.
print(prompt)

### C√¢u h·ªèi: H√£y cho b·∫£n th√¢n kh√¥ng gian ƒë·ªÉ suy nghƒ© b·∫±ng c√°ch tr√≠ch xu·∫•t c√°c ƒëo·∫°n vƒÉn c√≥ li√™n quan t·ª´ ng·ªØ c·∫£nh d∆∞·ªõi ƒë√¢y tr∆∞·ªõc khi tr·∫£ l·ªùi c√¢u h·ªèi:

* N·ªôi dung c·ª• th·ªÉ c·ªßa D·ª± √°n 1 l√† g√¨? "N·ªôi dung c·ª• th·ªÉ c·ªßa D·ª± √°n 1 bao g·ªìm:
- N·ªôi dung s·ªë 01: H·ªó tr·ª£ ƒë·∫•t ·ªü: CƒÉn c·ª© qu·ªπ ƒë·∫•t, h·∫°n m·ª©c ƒë·∫•t ·ªü v√† kh·∫£ nƒÉng ng√¢n s√°ch, ·ª¶y ban nh√¢n d√¢n c·∫•p t·ªânh xem x√©t, quy·∫øt ƒë·ªãnh giao ƒë·∫•t ƒë·ªÉ l√†m nh√† ·ªü cho c√°c ƒë·ªëi t∆∞·ª£ng n√™u tr√™n ph√π h·ª£p v·ªõi ƒëi·ªÅu ki·ªán, t·∫≠p qu√°n ·ªü ƒë·ªãa ph∆∞∆°ng v√† ph√°p lu·∫≠t v·ªÅ ƒë·∫•t ƒëai, c·ª• th·ªÉ:
+ ·ªû nh·ªØng n∆°i c√≥ ƒëi·ªÅu ki·ªán v·ªÅ ƒë·∫•t ƒëai, ch√≠nh quy·ªÅn ƒë·ªãa ph∆∞∆°ng s·ª≠ d·ª•ng s·ªë ti·ªÅn h·ªó tr·ª£ t·ª´ ng√¢n s√°ch ƒë·ªÉ t·∫°o m·∫∑t b·∫±ng, l√†m h·∫° t·∫ßng k·ªπ thu·∫≠t ƒë·ªÉ c·∫•p ƒë·∫•t ·ªü cho c√°c ƒë·ªëi t∆∞·ª£ng ƒë∆∞·ª£c th·ª• h∆∞·ªüng;
+ ·ªû c√°c ƒë·ªãa ph∆∞∆°ng kh√¥ng c√≥ ƒëi·ªÅu ki·ªán v·ªÅ ƒë·∫•t ƒëai, ch√≠n

In [10]:
# coding: utf8
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Hugging Face repo id (or local path) of the causal LM that answers questions.
model_path = "vinai/PhoGPT-4B-Chat"
# model_path = "vilm/vinallama-2.7b-chat"
# model_path = "/home/andv/important/Chatbot/finetune_llm/phogpt-mergerd-with-config-1"
# model_path = "/home/andv/important/Chatbot/finetune"

# Optional config overrides (disabled):
# config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
# config.init_device = "cuda"
# config.attn_config['attn_impl'] = 'flash' # If installed: this will use either Flash Attention V1 or V2 depending on what is installed

# Place the LLM on the notebook-wide `device` (set in the first cell) instead
# of a hard-coded "cuda", so the cell also runs on a CPU-only host.
# NOTE(review): this rebinds the global `model`, which the first cell assigned
# to the Wav2Vec2 speech-to-text model that run_stt() calls. After this cell
# runs, run_stt() would use the LLM; consider a dedicated name for one of them.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    # config=config,
    torch_dtype=torch.bfloat16,  # If your GPU does not support bfloat16, use torch.float16
    trust_remote_code=True,
).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)




In [27]:
def ask(query: str, return_full_text: bool = True):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves up to three passages with similarity above 0.5, builds the prompt
    with ``prompt_formatter`` and samples a completion from the module-level
    ``model`` / ``tokenizer``.

    Args:
        query: The user's question.
        return_full_text: When True (default, the original behaviour) the whole
            decoded sequence returned by ``generate`` is returned. When False,
            the prompt tokens are sliced off first so only the newly generated
            text is returned.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    results = retrieve_relevant_resources(query=query, n_resources_to_return=3, threshold=0.5)
    input_prompt = prompt_formatter(query=query, results=results)

    encoded = tokenizer(input_prompt, return_tensors="pt")
    # Follow the model's own device rather than assuming "cuda".
    target_device = model.device
    prompt_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded["attention_mask"].to(target_device)

    outputs = model.generate(
        inputs=prompt_ids,
        attention_mask=attention_mask,
        do_sample=True,
        temperature=0.8,
        top_k=50,
        top_p=0.9,
        max_new_tokens=1024,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    if not return_full_text:
        # Drop the leading prompt tokens so only the continuation is decoded.
        outputs = outputs[:, prompt_ids.shape[-1]:]

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return response

In [None]:
# End-to-end check: retrieval, prompt construction and generation for one question.
print(ask("N·ªôi dung d·ª± √°n 1 l√† g√¨, li·ªát k√™ c√°c n·ªôi dung ƒë·∫•y?"))