In [1]:
from llama_index.core import Document, StorageContext, VectorStoreIndex, QueryBundle
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from qdrant_client import QdrantClient, AsyncQdrantClient
import tqdm, uuid
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever


In [3]:
import datasets

data = datasets.load_dataset("kiethuynhanh/vnlegal-dataset", split="train")

def build_full_answer(example):
    if example["context"].strip() == "":
        return {"full_answer": example["answer"]}
    else:
        return {"full_answer": f"Dựa vào {example['context']}\n{example['answer']}"}

data = data.map(build_full_answer)

df = data.to_pandas()
df.head()

Unnamed: 0,field,question,context,answer,created_at,full_answer
0,thue-phi-le-phi,"Thời điểm thực hiện khấu trừ, xác định số thuế...",,Căn cứ Điều 5 Nghị định 117/2025/NĐ-CP quy địn...,06/07/2025,Căn cứ Điều 5 Nghị định 117/2025/NĐ-CP quy địn...
1,thue-phi-le-phi,Chứng từ thanh toán không dùng tiền mặt được q...,,Căn cứ tại Điều 26 Nghị định 181/2025/NĐ-CP qu...,06/07/2025,Căn cứ tại Điều 26 Nghị định 181/2025/NĐ-CP qu...
2,thue-phi-le-phi,"Trách nhiệm của người bán hàng hóa, cung cấp d...",,Tại Điều 21 Nghị định 123/2020/NĐ-CP có quy đị...,06/07/2025,Tại Điều 21 Nghị định 123/2020/NĐ-CP có quy đị...
3,thue-phi-le-phi,"Căn cứ tính thuế xuất khẩu, thuế nhập khẩu đối...",,"Theo quy định Điều 5 Luật Thuế xuất khẩu, thuế...",06/07/2025,"Theo quy định Điều 5 Luật Thuế xuất khẩu, thuế..."
4,thue-phi-le-phi,TOÀN VĂN Công văn 1735 CT CS 2025 giới thiệu n...,,"Ngày 13 tháng 6 năm 2025, Cục Thuế đã ban hành...",06/07/2025,"Ngày 13 tháng 6 năm 2025, Cục Thuế đã ban hành..."


In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings

embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
Settings.embed_model = embed_model

In [5]:
from llama_index.core.postprocessor import SentenceTransformerRerank

rerank_model = SentenceTransformerRerank(
    model="jinaai/jina-reranker-v2-base-multilingual", top_n=20
)

In [6]:
client = QdrantClient(host="localhost", port=6333,)
aclient = AsyncQdrantClient(host="localhost", port=6333)

In [None]:
# collection_name = "test_collection"

In [9]:
import json

with open("../data/json/corpus.json", "r") as f:
    corpus = json.load(f)
    
documents = []
documents.extend(
    Document(id_=id, text=context)
    for id, context in corpus.items()
)
    
from llama_index.core.schema import TextNode

nodes = []
for doc in documents:
    node = TextNode(text=doc.text, metadata=doc.metadata)
    node.id_ = doc.id_
    node.embedding = embed_model.get_text_embedding(doc.text)
    nodes.append(node)

In [None]:
# from llama_index.core import StorageContext

# vector_store = QdrantVectorStore(
#     client=client, 
#     collection_name=collection_name, 
#     enable_hybrid=True,
#     fastembed_sparse_model="Qdrant/bm25"
# )
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# index = VectorStoreIndex(nodes, storage_context=storage_context)
# query_engine = index.as_query_engine(
#     vector_store_query_mode="hybrid",
#     sparse_top_k=10, # số lượng kết quả lấy ra từ search dạng sparse (tức BM25 / lexical search).
#     similarity_top_k=10, # số lượng kết quả lấy ra từ search dạng dense (embedding similarity search).
#     hybrid_top_k=10, #giữ lại số lượng kết quả tốt nhất (từ cả hai nguồn).
# )

# # response = query_engine.query("Thời điểm thực hiện khấu trừ, xác định số thuế phải khấu trừ theo Nghị định 117 ra sao?")
# # display(Markdown(f"<b>{response}</b>"))

# response = query_engine.retrieve(
#     "How was Llama2 specifically trained differently from Llama1?"
# )

In [10]:
def create_hydrid_retriever(nodes_, similarity_top_k=20):
    vector_store = QdrantVectorStore(
                        collection_name="thue-phi-le-phi_all-MiniLM-L6-v2",
                        client=client,
                        aclient=aclient,
                        enable_hybrid=True,
                        fastembed_sparse_model="Qdrant/bm25",
                        dense_vector_name="dense",
                        sparse_vector_name="bm25",
                        text_key="raw_context"
                    )
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
    return index.as_retriever(similarity_top_k=similarity_top_k)

In [11]:
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.query_engine import RetrieverQueryEngine


# kết hợp bằng Reciprocal Rank Fusion
def create_hydrid_rrf_retriever(nodes_, dense_top_k, sparse_top_k, similarity_top_k):
    
    vector_store = QdrantVectorStore(
                        collection_name="thue-phi-le-phi_all-MiniLM-L6-v2",
                        client=client,
                        aclient=aclient,
                        enable_hybrid=True,
                        fastembed_sparse_model="Qdrant/bm25",
                        dense_vector_name="dense",
                        sparse_vector_name="bm25",
                        text_key="raw_context"
                    )

    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    vector_index = VectorStoreIndex.from_vector_store(vector_store)

        
    # retriever cho dense (embedding-based)
    dense_retriever = create_embedding_retriever(nodes_, similarity_top_k=dense_top_k)

    # retriever cho BM25 (sparse)
    bm25_retriever = create_bm25_retriever(nodes_, similarity_top_k=sparse_top_k)


    return QueryFusionRetriever(
            [dense_retriever, bm25_retriever],
            similarity_top_k=similarity_top_k,
            mode="reciprocal_rerank",  # Reciprocal Rank Fusion
        )


In [12]:
def create_embedding_retriever(nodes_, similarity_top_k=2):
    """Function to create an embedding retriever for a list of nodes"""
    vector_index = VectorStoreIndex(nodes_)
    return vector_index.as_retriever(similarity_top_k=similarity_top_k)

In [13]:
def create_bm25_retriever(nodes_, similarity_top_k=2):
    """Function to create a bm25 retriever for a list of nodes"""
    return BM25Retriever.from_defaults(
        nodes=nodes_,
        similarity_top_k=similarity_top_k,
    )

In [None]:
# from llama_index.core.schema import NodeWithScore
# from typing import List

# class EmbeddingBM25RerankerRetriever(BaseRetriever):
#     """Custom retriever that uses both embedding and bm25 retrievers and reranker"""

#     def __init__(
#         self,
#         vector_retriever: VectorIndexRetriever,
#         bm25_retriever: BM25Retriever,
#         reranker: SentenceTransformerRerank,
#     ) -> None:
#         """Init params."""

#         self._vector_retriever = vector_retriever
#         self.bm25_retriever = bm25_retriever
#         self.reranker = reranker

#         super().__init__()

#     def _retrieve(self, query_bundle: QueryBundle, use_rff=False) -> List[NodeWithScore]:
#         """Retrieve nodes given query."""
#         vector_nodes = self._vector_retriever.retrieve(query_bundle)
#         bm25_nodes = self.bm25_retriever.retrieve(query_bundle)

#         vector_nodes.extend(bm25_nodes)
    
        
#         retriever = self.reranker.postprocess_nodes(
#                 vector_nodes, query_bundle
#             )

#         return retriever

In [None]:
# from llama_index.core.schema import NodeWithScore, QueryBundle
# from typing import List


# class EmbeddingBM25RerankerRetriever(BaseRetriever):
#     """Custom retriever that uses both embedding and bm25 retrievers with optional RRF + reranker"""

#     def __init__(
#         self,
#         vector_retriever: VectorIndexRetriever,
#         bm25_retriever: BM25Retriever,
#         reranker: SentenceTransformerRerank,
#         similarity_top_k: int = 20,
#         use_rrf: bool = False,   # thêm flag
#     ) -> None:
#         """Init params."""
#         self._vector_retriever = vector_retriever
#         self._bm25_retriever = bm25_retriever
#         self._reranker = reranker
#         self._similarity_top_k = similarity_top_k
#         self._use_rrf = use_rrf

#         super().__init__()

#     def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
#         """Retrieve nodes given query."""
#         vector_nodes = self._vector_retriever.retrieve(query_bundle)
#         bm25_nodes = self._bm25_retriever.retrieve(query_bundle)

#         if self._use_rrf:
#             # dùng Reciprocal Rank Fusion
#             retriever = QueryFusionRetriever(
#                 [self._vector_retriever, self._bm25_retriever],
#                 similarity_top_k=self._similarity_top_k,
#                 use_async=True,
#                 mode="reciprocal_rerank",  # Reciprocal Rank Fusion
#             )
#             fused_nodes = retriever.retrieve(query_bundle)
#         else:
#             # đơn giản chỉ gộp 2 list
#             fused_nodes = vector_nodes + bm25_nodes
#             # rerank cuối cùng
#         retriever = self._reranker.postprocess_nodes(
#             fused_nodes, query_bundle
#         )
        
#         return retriever

In [62]:
from llama_index.core.schema import NodeWithScore, QueryBundle
from typing import List

class EmbeddingBM25RerankerRetriever(BaseRetriever):
    """Custom retriever that uses both embedding and bm25 retrievers with optional RRF + reranker"""

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        bm25_retriever: BM25Retriever,
        reranker: SentenceTransformerRerank,
        similarity_top_k: int = 20,
        use_rrf: bool = False,   # thêm flag
    ) -> None:
        self._vector_retriever = vector_retriever
        self._bm25_retriever = bm25_retriever
        self._reranker = reranker
        self._similarity_top_k = similarity_top_k
        self._use_rrf = use_rrf
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""
        if self._use_rrf:
            # Reciprocal Rank Fusion
            fusion_retriever = QueryFusionRetriever(
                retrievers=[self._vector_retriever, self._bm25_retriever],
                similarity_top_k=self._similarity_top_k,
                mode="reciprocal_rerank",  # hoặc "rrf" tùy version
                use_async=True,
            )
            fused_nodes = fusion_retriever.retrieve(query_bundle)
        else:
            # lấy riêng lẻ rồi gộp
            vector_nodes = self._vector_retriever.retrieve(query_bundle)
            bm25_nodes = self._bm25_retriever.retrieve(query_bundle)
            fused_nodes = vector_nodes + bm25_nodes

        # rerank kết quả cuối
        return self._reranker.postprocess_nodes(fused_nodes, query_bundle)

In [None]:
embedding_retriever = create_embedding_retriever(
    nodes, similarity_top_k=20
)
bm25_retriever = create_bm25_retriever(
    nodes, similarity_top_k=20
)

In [63]:
embedding_bm25_retriever_rerank = EmbeddingBM25RerankerRetriever(
    embedding_retriever, bm25_retriever, reranker=rerank_model, similarity_top_k=20, use_rrf=False
)

embedding_bm25_rff_retriever_rerank = EmbeddingBM25RerankerRetriever(
    embedding_retriever, bm25_retriever, reranker=rerank_model, similarity_top_k=20, use_rrf=True
)

In [37]:
hybrid_retriever = create_hydrid_retriever(nodes, similarity_top_k=20)

hybrid_rrf_retriever = create_hydrid_rrf_retriever(nodes, dense_top_k=20, sparse_top_k=20, similarity_top_k=20)

In [38]:
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.llama_dataset.legacy.embedding import EmbeddingQAFinetuneDataset

async def retrieval_results(retriever, eval_dataset):
    """Function to get retrieval results for a retriever and evaluation dataset"""

    metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        metrics, retriever=retriever
    )

    return await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [39]:
import pandas as pd

def display_results(name, eval_results):
    """Display results from evaluate."""

    metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

    metric_dicts = [eval_result.metric_vals_dict for eval_result in eval_results]

    full_df = pd.DataFrame(metric_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    return pd.DataFrame(columns)

In [40]:
import json
import random

# Load dữ liệu gốc
with open("../data/json/qa_dataset.json", "r") as f:
    qa_data = json.load(f)

# Lấy ngẫu nhiên 100 câu hỏi
sampled_queries = dict(random.sample(qa_data["queries"].items(), 100))

# Giữ lại relevant_docs tương ứng
sampled_relevant_docs = {
    q: qa_data["relevant_docs"][q] for q in sampled_queries if q in qa_data["relevant_docs"]
}

# Tạo dataset nhỏ
qa_dataset = EmbeddingQAFinetuneDataset(
    queries=sampled_queries,
    corpus=qa_data["corpus"],
    relevant_docs=sampled_relevant_docs,
)

print(f"Tổng số query gốc: {len(qa_data['queries'])}")
print(f"Tổng số query đã chọn: {len(sampled_queries)}")

Tổng số query gốc: 5799
Tổng số query đã chọn: 100


since Python 3.9 and will be removed in a subsequent version.
  sampled_queries = dict(random.sample(qa_data["queries"].items(), 100))


In [64]:
hybrid_retriever_results = await retrieval_results(
    hybrid_retriever, qa_dataset
)

hybrid_rrf_retriever_results = await retrieval_results(
    hybrid_rrf_retriever, qa_dataset
)

In [None]:
embedding_retriever_results = await retrieval_results(
    embedding_retriever, qa_dataset
)
bm25_retriever_results = await retrieval_results(bm25_retriever, qa_dataset)

In [65]:
embedding_bm25_rff_retriever_rerank_results = await retrieval_results(
    embedding_bm25_rff_retriever_rerank, qa_dataset
)

embedding_bm25_retriever_rerank_results = await retrieval_results(
    embedding_bm25_retriever_rerank, qa_dataset
)

In [66]:
pd.concat(
    [
        display_results("Embedding Retriever", embedding_retriever_results),
        display_results("BM25 Retriever", bm25_retriever_results),
        display_results(
            "Embedding + BM25 Retriever + Reranker",
            embedding_bm25_retriever_rerank_results,
        ),
        display_results("Hybrid Retriever", hybrid_retriever_results),
        display_results("Hybrid RRF Retriever", hybrid_rrf_retriever_results),
    ],
    ignore_index=True,
    axis=0,
)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Embedding Retriever,0.49,0.240473,0.026021,0.49,0.240473,0.297767
1,BM25 Retriever,0.84,0.584213,0.044921,0.84,0.584213,0.64452
2,Embedding + BM25 Retriever + Reranker,0.87,0.601723,0.052327,0.87,0.601723,0.665781
3,Hybrid Retriever,0.5,0.237796,0.025026,0.5,0.237796,0.297157
4,Hybrid RRF Retriever,0.73,0.37016,0.0365,0.73,0.37016,0.451655
