### Load in the Data

In [2]:
from llama_index.core import SimpleDirectoryReader

# Build the reader for the data folder first, then load every file it finds
# into a list of documents.
reader = SimpleDirectoryReader("../data/")
docs = reader.load_data()

In [16]:
# Sanity check on the loaded documents: report how many there are and preview
# the final one.
separator = "=" * 20
print(separator)
print("Total Docs: ", len(docs))
print(separator)

# The index is derived from the list length rather than hard-coded, so the
# label always names the document that is actually printed and the cell keeps
# working when the number of files in the data folder changes.
if docs:
    last_idx = len(docs) - 1
    print(f"Doc {last_idx}: ", docs[last_idx])
else:
    print("No documents were loaded.")

Total Docs:  67
Doc 0:  Doc ID: 3890cb21-b902-4185-8476-a588882bdbba
Text: Preprint. Instructions Given an instruction and an output, rate
whether the response appears to be a helpful and informative answer to
the query, from 1 (lowest) - 5 (highest). We call this score perceived
utility. The detailed criterion is as follows: 5: The response
provides a complete, highly detailed, and informative response to the
query, f...


In [65]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.vector_stores import VectorStoreQueryResult
from qdrant_client import QdrantClient, AsyncQdrantClient
from dotenv import load_dotenv
import os
# Load variables from a .env file into the environment; later cells read
# QDRANT_API and QDRANT_API_KEY through os.getenv. As the last expression in
# the cell, the call's return value is displayed as the cell output.
load_dotenv()

True

In [43]:
# Read the Qdrant connection settings once so the sync and async clients are
# guaranteed to be configured identically.
qdrant_url = os.getenv("QDRANT_API")
qdrant_api_key = os.getenv("QDRANT_API_KEY")

# Fail fast with an actionable message: os.getenv returns None for an unset
# variable, and that None would otherwise be handed to the clients as `url`.
if not qdrant_url:
    raise RuntimeError(
        "QDRANT_API is not set; define it in your .env file or environment."
    )

# Blocking client and its asyncio counterpart, both pointing at the same URL.
client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
aclient = AsyncQdrantClient(url=qdrant_url, api_key=qdrant_api_key)

2025-12-31 19:38:16,087 - INFO - HTTP Request: GET https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:6333 "HTTP/1.1 200 OK"
2025-12-31 19:38:17,380 - INFO - HTTP Request: GET https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:6333 "HTTP/1.1 200 OK"


In [44]:
# Qdrant-backed vector store with hybrid mode switched on and a BM25 sparse
# model configured. Both the sync and the async client are supplied.
# NOTE(review): the collection is named "llama2_paper", yet the query issued
# later in this notebook asks about Self-RAG; confirm the name is intended.
vector_store = QdrantVectorStore(
    collection_name="llama2_paper",
    enable_hybrid=True,
    fastembed_sparse_model="Qdrant/bm25",
    batch_size=20,  # presumably points per upload request; confirm
    client=client,
    aclient=aclient,
)

2025-12-31 19:38:45,557 - INFO - HTTP Request: GET https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:6333/collections/llama2_paper/exists "HTTP/1.1 200 OK"
2025-12-31 19:38:45,804 - INFO - HTTP Request: GET https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:6333/collections/llama2_paper/exists "HTTP/1.1 200 OK"
2025-12-31 19:38:46,498 - INFO - HTTP Request: GET https://huggingface.co/api/models/Qdrant/bm25 "HTTP/1.1 200 OK"
2025-12-31 19:38:46,757 - INFO - HTTP Request: GET https://huggingface.co/api/models/Qdrant/bm25/tree/e499a1f8d6bec960aab5533a0941bf914e70faf9?recursive=false&expand=false "HTTP/1.1 200 OK"
2025-12-31 19:38:46,984 - INFO - HTTP Request: GET https://huggingface.co/api/models/Qdrant/bm25/revision/main "HTTP/1.1 200 OK"
Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]2025-12-31 19:38:47,216 - INFO - HTTP Request: HEAD https://huggingface.co/Qdrant/bm25/resolve/e499a1f8d6bec960aab5533a0941bf914e70faf9/arabic.tx

In [45]:
# Global chunk size, set on the shared Settings object before any indexing.
Settings.chunk_size = 512

# Storage context that routes vector storage to the Qdrant-backed store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [47]:
# Building the index embeds every document and upserts the resulting points
# into the remote Qdrant collection. That collection outlives the kernel, so
# re-running this cell unguarded would pay for the embeddings again and upload
# the same content a second time. Ingest only when the collection is missing
# or empty; otherwise attach to what is already stored.
collection_name = vector_store.collection_name
already_ingested = (
    client.collection_exists(collection_name)
    and client.count(collection_name=collection_name, exact=True).count > 0
)

if already_ingested:
    # NOTE(review): a partially ingested collection also lands here; delete
    # the collection first if a full re-ingest is wanted.
    index = VectorStoreIndex.from_vector_store(vector_store)
else:
    index = VectorStoreIndex.from_documents(
        docs,
        storage_context=storage_context,
        use_async=True,
    )

2025-12-31 19:49:23,810 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-31 19:49:24,191 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-31 19:49:24,221 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-31 19:49:25,948 - INFO - HTTP Request: GET https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:6333/collections/llama2_paper/exists "HTTP/1.1 200 OK"
2025-12-31 19:49:26,204 - INFO - HTTP Request: GET https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:6333/collections/llama2_paper "HTTP/1.1 200 OK"
2025-12-31 19:49:28,627 - INFO - HTTP Request: PUT https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:6333/collections/llama2_paper/points?wait=true "HTTP/1.1 200 OK"
2025-12-31 19:49:29,185 - INFO - HTTP Request: PUT https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:633

In [59]:
# Query engine over the index with explicit dense and sparse top-k settings.
# NOTE(review): no vector_store_query_mode is passed here; sparse_top_k
# presumably only takes effect in hybrid mode. Confirm this is intended.
query_engine = index.as_query_engine(
    similarity_top_k=5,
    sparse_top_k=10,
)

In [63]:
response = await query_engine.aquery(
    "What is Self RAG"
)

2025-12-31 20:39:18,860 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-12-31 20:39:19,534 - INFO - HTTP Request: GET https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:6333/collections/llama2_paper "HTTP/1.1 200 OK"
2025-12-31 20:39:19,970 - INFO - HTTP Request: POST https://aeb81fab-425b-4042-8d61-110ec22c43c7.us-east4-0.gcp.cloud.qdrant.io:6333/collections/llama2_paper/points/query/batch "HTTP/1.1 200 OK"
2025-12-31 20:39:21,779 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [64]:
# Display only the answer text stored on the response object.
response.response

'Self-RAG is a framework that enhances the quality and factuality of large language models (LLMs) by incorporating retrieval on demand and self-reflection. It trains a single arbitrary LM to retrieve, generate, and critique text passages and its own generation using special tokens called reflection tokens. This framework enables the LM to adaptively retrieve passages, reflect on them, and generate responses tailored to diverse task requirements, ultimately improving performance on various tasks compared to state-of-the-art LLMs and retrieval-augmented models.'

## === HYBRID RETRIEVAL ===