<a href="https://colab.research.google.com/github/duper203/RAG_Techniques_with_upstage/blob/main/upstage/04_choose_chunk_size.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Choose Chunk Size

### Import libraries and environment variables


In [None]:
! pip3 install -qU llama-index-llms-langchain langchain_community langchain_upstage llama_index llama-index-llms-upstage llama-index-embeddings-upstage

In [None]:
import os
import random

import nest_asyncio
from google.colab import userdata

# Apply the nest_asyncio patch before any library code runs
# (presumably needed because the notebook kernel already runs an event loop).
nest_asyncio.apply()

# Copy each API key from the Colab secret store into the process environment.
for secret_name in ("UPSTAGE_API_KEY", "OPENAI_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)
# from dotenv import load_dotenv

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.prompts import PromptTemplate

from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)

from llama_index.llms.openai import OpenAI
import time


## Read Docs


In [None]:
# Directory (relative to the notebook) that holds the source documents.
data_dir = "data"

# Load every file found in the directory.
directory_reader = SimpleDirectoryReader(input_dir=data_dir)
documents = directory_reader.load_data()

## Create evaluation questions and pick k out of them


In [None]:
import random
# Import the LangChain class under an alias so it does not shadow the
# llama_index PromptTemplate imported at the top of the notebook.
from langchain.prompts import PromptTemplate as LangChainPromptTemplate
from langchain.chains import LLMChain
from langchain_upstage import ChatUpstage

# Initialize the Upstage LLM client
llm_langchain = ChatUpstage(model='solar-pro')

# Define a prompt for generating questions
prompt_template = """
You are given a document. Your task is to generate a set of evaluation questions based on the content of the document.

Document:
{document_text}

Generate a list of questions based on this document:
"""

prompt = LangChainPromptTemplate(template=prompt_template, input_variables=["document_text"])

# Initialize an LLM chain
llm_chain = LLMChain(llm=llm_langchain, prompt=prompt)

# Use the first 20 documents for evaluation
eval_documents = documents[0:20]
eval_questions = []

# Generate questions from each document
for document in eval_documents:
    document_text = document.text  # Assuming document.text holds the text data
    questions = llm_chain.run(document_text)
    # Keep only non-empty lines: the raw completion is split on newlines, and
    # blank separator lines must never be sampled as "questions".
    eval_questions.extend(
        line.strip() for line in questions.split("\n") if line.strip()
    )

# Select k random evaluation questions.
# random.sample raises ValueError when asked for more items than are available,
# so cap the sample size at the number of questions actually generated.
num_eval_questions = 25
sample_size = min(num_eval_questions, len(eval_questions))
k_eval_questions = random.sample(eval_questions, sample_size)

# Print or use the evaluation questions
print(k_eval_questions)


['13. What is the relationship between increased CO2 levels in the atmosphere and ocean acidification?', '6. How does engaging local communities in restoration projects contribute to their sustainability and long-term success?', '5. How do scientists use ice core samples, tree rings, and ocean sediments to study climate change?', '10. How can balancing energy needs with ecological conservation be achieved in the context of hydroelectric power?', '7. What are some effective measures for reducing emissions from vehicles, industries, and power plants to improve air quality and public health?', '4. What are the benefits and challenges of transitioning to electric vehicles on a large scale?', '1. What are some examples of indigenous practices that contribute to sustainable land and resource management?', '9. What are the three main types of fossil fuels mentioned in the document?', '8. How can youth engagement in climate action be further empowered and supported?', '8. What are some best pr

## Define metrics evaluators and modify llama_index faithfulness evaluator prompt to rely on the context


In [None]:
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core import Settings
# Import llama_index's prompt class under an explicit alias: the bare name
# `PromptTemplate` is rebound to LangChain's class by the question-generation
# cell, and the evaluator needs a llama_index prompt object.
from llama_index.core.prompts import PromptTemplate as LlamaPromptTemplate

# Judge model used by both evaluators (deterministic: temperature=0).
gpt4 = OpenAI(temperature=0, model="gpt-4o")

# Define Faithfulness and Relevancy Evaluators which are based on GPT-4
faithfulness_gpt4 = FaithfulnessEvaluator(llm=gpt4)

# Stricter faithfulness prompt: answer YES only when the context explicitly
# supports the information (see the third example, where general knowledge that
# is not stated in the context yields NO).
faithfulness_new_prompt_template = LlamaPromptTemplate(
    template=""" Please tell if a given piece of information is directly supported by the context.
    You need to answer with either YES or NO.
    Answer YES if any part of the context explicitly supports the information, even if most of the context is unrelated. If the context does not explicitly support the information, answer NO. Some examples are provided below.

    Information: Apple pie is generally double-crusted.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: YES

    Information: Apple pies taste bad.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: NO

    Information: Paris is the capital of France.
    Context: This document describes a day trip in Paris. You will visit famous landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.
    Answer: NO

    Information: {query_str}
    Context: {context_str}
    Answer:

    """)

# The evaluator exposes its evaluation prompt under the key "eval_template".
# The previous placeholder key matched no prompt, so the custom template was
# never applied.
faithfulness_gpt4.update_prompts({"eval_template": faithfulness_new_prompt_template})
relevancy_gpt4 = RelevancyEvaluator(llm=gpt4)

## Function to evaluate metrics for each chunk size


In [None]:
from llama_index.llms.upstage import Upstage
from llama_index.core.llms import ChatMessage
from llama_index.embeddings.upstage import UpstageEmbedding

# Name of the Upstage embedding model; the evaluation function below assigns
# the resulting object to Settings.embed_model and passes it to the index.
EMBEDDING_MODEL_NAME = "solar-embedding-1-large"
embed_model = UpstageEmbedding(model=EMBEDDING_MODEL_NAME)

In [24]:
# Define function to calculate average response time, average faithfulness and average relevancy metrics for given chunk size
# Generate response and evaluate it.
from langchain_upstage import UpstageEmbeddings
def evaluate_response_time_and_accuracy(chunk_size, eval_questions):
    """
    Measure average query latency, faithfulness and relevancy for one chunk size.

    A vector index is built over the module-level `eval_documents` with the given
    chunk size (overlap is chunk_size // 5). Each question is answered by a query
    engine backed by the Upstage `solar-pro` model, and every response is judged
    by the module-level `faithfulness_gpt4` and `relevancy_gpt4` evaluators.

    Note: this function overwrites the global llama_index `Settings`
    (llm, embed_model, chunk_size, chunk_overlap).

    Parameters:
    chunk_size (int): The size of data chunks being processed.
    eval_questions (iterable of str): The questions to run through the query engine.

    Returns:
    tuple: (average response time in seconds, fraction of responses passing the
    faithfulness check, fraction of responses passing the relevancy check).

    Raises:
    ValueError: If `eval_questions` is empty.
    """
    # Materialize once so that len() works for any iterable, and fail fast on
    # empty input instead of hitting a ZeroDivisionError after the (expensive)
    # index build.
    eval_questions = list(eval_questions)
    num_questions = len(eval_questions)
    if num_questions == 0:
        raise ValueError("eval_questions must contain at least one question")

    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    # Global llama_index configuration used while building and querying the index.
    Settings.llm = ChatUpstage(model='solar-pro')
    Settings.embed_model = embed_model
    Settings.chunk_size = chunk_size
    Settings.chunk_overlap = chunk_size//5

    vector_index = VectorStoreIndex.from_documents(
        eval_documents, embed_model=Settings.embed_model
    )
    # build query engine
    query_engine = vector_index.as_query_engine(similarity_top_k=5)

    # Iterate over each question in eval_questions to compute metrics.
    # While BatchEvalRunner can be used for faster evaluations (see: https://docs.llamaindex.ai/en/latest/examples/evaluation/batch_eval.html),
    # we're using a loop here to specifically measure response time for different chunk sizes.
    for question in eval_questions:
        # perf_counter is monotonic, so the measurement cannot be distorted by
        # wall-clock adjustments. Only the query itself is timed, not the evaluation.
        start_time = time.perf_counter()
        response_vector = query_engine.query(question)
        elapsed_time = time.perf_counter() - start_time

        faithfulness_result = faithfulness_gpt4.evaluate_response(
            response=response_vector
        ).passing

        relevancy_result = relevancy_gpt4.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        # `.passing` is added directly to the running totals, so each average
        # below is the fraction of responses that passed.
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

## Test different chunk sizes


In [25]:
# Chunk sizes to compare.
chunk_sizes = [128, 256]

# Evaluate each configuration on the sampled questions and report its averages.
for size in chunk_sizes:
    metrics = evaluate_response_time_and_accuracy(size, k_eval_questions)
    mean_latency, mean_faithfulness, mean_relevancy = metrics
    print(f"Chunk size {size} - Average Response time: {mean_latency:.2f}s, Average Faithfulness: {mean_faithfulness:.2f}, Average Relevancy: {mean_relevancy:.2f}")

Chunk size 128 - Average Response time: 3.65s, Average Faithfulness: 1.00, Average Relevancy: 1.00
Chunk size 256 - Average Response time: 3.92s, Average Faithfulness: 1.00, Average Relevancy: 1.00
