# Tutorial on Building Advance RAG Application using Re-ranking and Evaluation 

Loading the environment variables from `.env` file

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

1. Initialize the LLM - Llama-3-8B on Groq.
2. Initialize the Qdrant Vector DB
3. Initialize the Cohere reranker

In [4]:
import os
import json

import warnings
warnings.filterwarnings("ignore")

from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
from qdrant_client import QdrantClient

#Obtain the API keys from the Environment Variables
chat_groq_api = os.environ["GROQ_API_KEY"]
qdrant_api = os.environ["QDRANT_API_KEY"]
qdrant_url = os.environ["QDRANT_API_URL"]
cohere_api_key = os.environ["COHERE_API_KEY"]

# Initialize Groq LLM (Llama3)
llm = ChatGroq(
    temperature = 0.2,
    model_name = "llama3-8b-8192",
    api_key = chat_groq_api
)

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize Qdrant vector store
vector_store = Qdrant(
    client=QdrantClient(url=qdrant_url, api_key=qdrant_api),
    collection_name = "adv_rag",
    embeddings=embeddings
)


# Initialize Cohere ReRanker
reranker = CohereRerank(cohere_api_key=cohere_api_key)

Adding Sample Dataset to the Qdrant Vector DB

In [5]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from transformers import AutoTokenizer
from qdrant_client.http import models

### Loading the data from JSON
with open("data.json","r") as f:
    sample_data = json.load(f)

### Adding the sample data to the Qdrant DB
def add_sample_data_to_qdrant(data, collection_name="adv_rag"):
    # Initialize the embedding model
    embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    
    # Initialize Qdrant client
    qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api)
    
    # Create the collection if it doesn't exist
    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE)
    )
    
    # Initialize Qdrant vector store
    vector_store = Qdrant(
    client=QdrantClient(url=qdrant_url, api_key=qdrant_api),
    collection_name =collection_name,
    embeddings=embed_model
    )
    
    # Add data to Qdrant
    texts = [f"{item['title']}\n\n{item['content']}" for item in data]
    vector_store.add_texts(texts)
    
    print(f"Added {len(texts)} documents to Qdrant collection '{collection_name}'")

# Add the sample data to Qdrant
add_sample_data_to_qdrant(sample_data)

Added 35 documents to Qdrant collection 'adv_rag'


Define Function to Generate Multiple (5 here) queries for a given user query

In [6]:
query_generator_template = """
Generate 5 different search queries based on the following user question:
User question: {question}

Output the queries in a numbered list.
"""

query_generator_prompt = PromptTemplate(
    input_variables=["question"],
    template=query_generator_template
)

query_generator_chain = LLMChain(llm=llm, prompt=query_generator_prompt)

def generate_queries(question):
    result = query_generator_chain.invoke({"question": question})
    queries = [q.strip() for q in result['text'].split("\n") if q.strip() and q[0].isdigit()]
    return queries

Response Generation with Retrieval and Reranking using Cohere Reranker which performs ranking based on their relevance to the provided query.

In [7]:
def retrieve_and_rerank(query, top_k=5):
    base_retriever = vector_store.as_retriever(search_kwargs={"k": top_k * 2})
    retriever = ContextualCompressionRetriever(
        base_compressor=reranker,
        base_retriever=base_retriever
    )
    return retriever.get_relevant_documents(query)

response_template = """
Based on the following context, please answer the question:

Context:
{context}

Question: {question}

Provide a detailed and informative answer:
"""

response_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=response_template
)

response_chain = LLMChain(llm=llm, prompt=response_prompt)

def generate_response(question, context):
    return response_chain.invoke({"question": question, "context": context})['text']

Evaluating the responses based on Relevance, Accuracy and Completeness using the LLM

In [8]:
evaluation_template = """
Evaluate the following response to the given question based on relevance, accuracy, and completeness.
Provide a score from 1 to 10 for each criterion, where 1 is the lowest and 10 is the highest.

Question: {question}
Response: {response}

Evaluation:
1. Relevance (1-10):
2. Accuracy (1-10):
3. Completeness (1-10):

Explanation:
"""

evaluation_prompt = PromptTemplate(
    input_variables=["question", "response"],
    template=evaluation_template
)

evaluation_chain = LLMChain(llm=llm, prompt=evaluation_prompt)

def evaluate_response(question, response):
    return evaluation_chain.invoke({"question": question, "response": response})['text']

Entire RAG Pipline

In [9]:
import re
import cohere

co = cohere.Client(cohere_api_key)

def rerank_responses(evaluated_responses,user_question):
        reranked_responses = []
        for response_set in evaluated_responses:
            rerank = co.rerank(
                model = 'rerank-english-v1.0',
                query = user_question,
                documents=response_set[1],
                top_n = 1
            )
            reranked_responses.append(rerank)
        return reranked_responses
def parse_evaluation_score(evaluation):
    scores = re.findall(r'\d+', evaluation)
    return sum(map(int, scores)) / len(scores) if scores else 0

# Entire Pipeline for Adv RAG
def advanced_rag(user_question, num_responses=3):
    # Generate multiple queries
    queries = generate_queries(user_question)
    
    # Obtaining the documents through retrieval and reranking
    all_documents = []
    for query in queries:
        print("Generated Query : ",query)
        # Retrieve and rerank documents for each query
        documents = retrieve_and_rerank(query,top_k=5)
        all_documents.extend(documents)
    
    # Remove duplicates and get the top 5 unique documents
    unique_documents = list({doc.page_content: doc for doc in all_documents}.values())[:5]
    
    # Generate multiple responses
    responses = []
    for i in range(num_responses):
        context = "\n".join([doc.page_content for doc in unique_documents])
        response = generate_response(user_question, context)
        responses.append(response)
    
    # Evaluate responses
    evaluated_responses = []
    for response in responses:
        evaluation = evaluate_response(user_question, response)
        evaluated_responses.append((response, evaluation))
    
    #########################################################################################
    # If you have premium access to Cohere API, you can try to Rank the Evaluated outputs too
    # Since I use a trial version, it only allows 10 API calls/min. 
    # Please enable the below code in case you want to rerank the evaluated output
    '''
    final_responses = rerank_responses(evaluated_responses,user_question)
    print("Ranked Responses : ",final_responses)
    '''    
    #########################################################################################

    # Sort responses by evaluation score
    sorted_responses = sorted(evaluated_responses, key=lambda x: parse_evaluation_score(x[1]), reverse=True)
    

    return sorted_responses[0][0]  # Return the highest-rated response



In [10]:
# Sample User Query
user_question = "What are the main challenges in implementing quantum computing?"
print(f"USER QUERY: {user_question}")
print(f"{'-'*(20+len(user_question))}\n")

best_response = advanced_rag(user_question,num_responses=3)
print("\n\n")


print(f"User Question: {user_question}")
print(f"{'-'*(20+len(user_question))}\n")
print(f"Best Response after Retrieval, Reranking and Evaluation: \n\n{best_response}")

USER QUERY: What are the main challenges in implementing quantum computing?
-----------------------------------------------------------------------------------



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated Query :  1. "Challenges of implementing quantum computing"
Generated Query :  2. "Barriers to quantum computing adoption"
Generated Query :  3. "Quantum computing implementation difficulties"
Generated Query :  4. "Obstacles to scaling up quantum computing"
Generated Query :  5. "Quantum computing integration challenges"



User Question: What are the main challenges in implementing quantum computing?
-----------------------------------------------------------------------------------

Best Response after Retrieval, Reranking and Evaluation: 

According to the provided context, the main challenges in implementing quantum computing are:

1. **Maintaining Quantum Coherence**: Qubits are extremely sensitive to environmental disturbances, making it difficult to maintain their quantum state. This is a significant hurdle in building reliable quantum computers.

2. **Error Correction**: Quantum states are inherently prone to errors, which can be caused by decoherence, noise, and othe

# End of Tutorial