# Package Installation and Imports

The cell below imports the packages this notebook needs; they must already be installed in the active environment.


In [2]:
import os
import nest_asyncio

nest_asyncio.apply()
from dotenv import load_dotenv

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.core import Settings
from langchain_groq import chat_models, ChatGroq

### Read Docs

In [5]:
# Directory holding the source documents to load.
# Set the RAG_DATA_DIR environment variable to point at your own copy; the
# original author's absolute path is kept as the fallback so existing runs
# behave exactly as before.
data_dir = os.environ.get(
    "RAG_DATA_DIR",
    "/data_hdd_16t/khanhtran/LLM/RAG/Data/5.Choose_chunksize_data",
)

# SimpleDirectoryReader yields a list of Document objects; the captured
# output of this cell shows one Document per PDF page (see `page_label`).
documents = SimpleDirectoryReader(data_dir).load_data(num_workers=1)

# Display the first Document as a quick sanity check of the load.
documents[0]

Document(id_='b547800a-f986-4e4d-a72c-0787eb18bad9', embedding=None, metadata={'page_label': '1', 'file_name': '2303.08774v6.pdf', 'file_path': '/data_hdd_16t/khanhtran/LLM/RAG/Data/5.Choose_chunksize_data/2303.08774v6.pdf', 'file_type': 'application/pdf', 'file_size': 5245564, 'creation_date': '2025-10-16', 'last_modified_date': '2025-10-16'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='GPT-4 Technical Report\nOpenAI∗\nAbstract\nWe report the development of GPT-4, a large-scale, multimodal model which can\naccept image and text inputs and produce text outputs. While less capable than\nhumans in many real-world scenarios, GPT-4 exhibit

### Create evaluation questions and pick k out of them

In [8]:
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.langchain import LangChainLLM
import random   

# Number of generated questions to keep for the evaluation run.
num_eval_questions = 15
# Fixed seed so the sampled subset of questions is the same on every run.
QUESTION_SAMPLE_SEED = 42

# LLM used only for question generation, wrapped so LlamaIndex can drive a
# LangChain chat model.
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)
llm_wrapped = LangChainLLM(llm=llm)

# NOTE: the placeholder must be written with single braces. Doubled braces
# (`{{context_str}}`) are the format-string escape for literal braces, so the
# node text would not be substituted cleanly into the prompt.
custom_question_prompt = PromptTemplate(
    template="""
    You are a expert question generator. For the following context, 
    generate a diverse and detailed question that can be answered by the context to test understanding of the contents. \n
    Context: {context_str} \n
    Question:
    """ 
)

# Questions are generated from a five-document slice of the loaded corpus.
data_generator = DatasetGenerator.from_documents(
    documents[20:25],
    llm=llm_wrapped,
    text_question_template=custom_question_prompt,
)

eval_questions = data_generator.generate_questions_from_nodes()

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample size at the number of questions actually generated.
sample_size = min(num_eval_questions, len(eval_questions))
if sample_size < num_eval_questions:
    print(
        f"Only {len(eval_questions)} questions were generated; "
        f"using all of them instead of {num_eval_questions}."
    )

# A dedicated, seeded RNG keeps the selection reproducible without touching
# the global `random` state used elsewhere in the notebook.
k_eval_questions = random.Random(QUESTION_SAMPLE_SEED).sample(eval_questions, sample_size)

  return cls(
2025-10-16 14:13:25,435 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-16 14:13:26,111 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-16 14:13:27,070 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-16 14:13:27,620 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-16 14:13:28,182 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-16 14:13:28,825 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-16 14:13:29,509 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
  return QueryResponseDataset(queries=queries, responses=responses_dict)


### Define the metric evaluators and customize the llama_index faithfulness evaluator prompt so that it relies only on the context

In [12]:
# Judge LLM used by both evaluators. Evaluators constructed without an
# explicit `llm` argument pick up the model configured on `Settings`.
llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)
Settings.llm = llm
faithfulness_llm = FaithfulnessEvaluator()

# Stricter faithfulness prompt: the judge must answer from the supplied
# context only. The "Paris" example shows a true statement that is still
# answered NO because the context does not state it explicitly.
faithfulness_new_prompt_template = PromptTemplate(""" Please tell if a given piece of information is directly supported by the context.
    You need to answer with either YES or NO.
    Answer YES if any part of the context explicitly supports the information, even if most of the context is unrelated. If the context does not explicitly support the information, answer NO. Some examples are provided below.

    Information: Apple pie is generally double-crusted.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: YES

    Information: Apple pies taste bad.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: NO

    Information: Paris is the capital of France.
    Context: This document describes a day trip in Paris. You will visit famous landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.
    Answer: NO

    Information: {query_str}
    Context: {context_str}
    Answer:

    """)

# The faithfulness evaluator stores its main prompt under "eval_template".
# Keys it does not recognize are silently ignored, so the previous
# placeholder key ("your_prompt_key") left the default prompt in place.
# The evaluator's refine prompt is intentionally left at its default.
faithfulness_llm.update_prompts({"eval_template": faithfulness_new_prompt_template})

# Relevancy evaluator with its default prompts.
relevancy_llm = RelevancyEvaluator()

### Function to evaluate metrics for each chunk size

In [43]:
import time

def evaluate_response_time_and_accuracy(chunk_size, eval_questions, source_documents=None):
    """
    Measure average response time, faithfulness and relevancy for one chunk size.

    Builds a vector index with the given chunk size, answers every evaluation
    question through a query engine backed by the Groq-hosted
    ``llama-3.1-8b-instant`` model, and scores each answer with the
    module-level ``faithfulness_llm`` and ``relevancy_llm`` evaluators.

    Side effects: overwrites ``Settings.llm``, ``Settings.chunk_size`` and
    ``Settings.chunk_overlap`` (global LlamaIndex configuration).

    Parameters:
    chunk_size (int): Chunk size applied when splitting the documents; the
        chunk overlap is set to one fifth of it.
    eval_questions (list[str]): Questions to run through the query engine.
    source_documents (list | None): Documents to index. Defaults to
        ``documents[20:25]``, the slice the evaluation questions in this
        notebook are generated from, so the retriever can actually reach the
        passages the questions are about.

    Returns:
    tuple: (average response time in seconds, fraction of answers passing the
        faithfulness check, fraction of answers passing the relevancy check).

    Raises:
    ValueError: If ``eval_questions`` is empty.
    """
    # Fail fast, before any indexing or API traffic, instead of hitting a
    # ZeroDivisionError when the averages are computed.
    if not eval_questions:
        raise ValueError("eval_questions must contain at least one question")

    # NOTE(review): the index was previously built from documents[15:20],
    # which does not overlap the documents[20:25] slice used for question
    # generation. Confirm that indexing the question source is the intent.
    if source_documents is None:
        source_documents = documents[20:25]

    total_response_time: float = 0.0
    total_faithfulness: int = 0
    total_relevancy: int = 0

    llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

    Settings.llm = llm
    Settings.chunk_size = chunk_size
    Settings.chunk_overlap = chunk_size // 5

    vector_index = VectorStoreIndex.from_documents(source_documents)

    query_engine = vector_index.as_query_engine(similarity_top_k=5)
    num_questions = len(eval_questions)

    for question in eval_questions:
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()
        response_vector = query_engine.query(question)
        elapsed_time = time.perf_counter() - start_time

        # Randomized pause between API calls (presumably to stay under the
        # provider's rate limit); it is deliberately outside the timed region.
        time.sleep(random.uniform(1.5, 3.0))

        faithfulness_result = faithfulness_llm.evaluate_response(
            response=response_vector
        ).passing

        time.sleep(random.uniform(1.5, 3.0))

        relevancy_result = relevancy_llm.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        # bool() makes a missing verdict (None) count as "not passing"
        # instead of raising a TypeError on addition.
        total_faithfulness += bool(faithfulness_result)
        total_relevancy += bool(relevancy_result)

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

### Test different chunk sizes 

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Use the all-MiniLM-L6-v2 sentence-transformers model for embeddings and
# register it as the global LlamaIndex embedding model.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
Settings.embed_model = embed_model

# Index over the full document set (not referenced again within this cell).
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Chunk sizes to benchmark; extend this list to compare several settings.
chunk_sizes = [128]

for chunk_size in chunk_sizes:
    # Run the full evaluation for this chunk size, then report the averages.
    (
        avg_response_time,
        avg_faithfulness,
        avg_relevancy,
    ) = evaluate_response_time_and_accuracy(chunk_size, k_eval_questions)
    print(
        f"Chunk size {chunk_size} - "
        f"Average Response time: {avg_response_time:.2f}s, "
        f"Average Faithfulness: {avg_faithfulness:.2f}, "
        f"Average Relevancy: {avg_relevancy:.2f}"
    )