<a href="https://colab.research.google.com/github/duper203/RAG_Techniques_with_upstage/blob/main/upstage/04_choose_chunk_size.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Choose Chunk Size

### Import libraries and environment variables


In [6]:
! pip3 install -qU langchain-upstage langchain llama_index llama-index-llms-upstage llama-index-embeddings-upstage

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.0 MB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.0/1.0 MB[0m [31m16.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
from google.colab import userdata

os.environ["UPSTAGE_API_KEY"] = userdata.get("UPSTAGE_API_KEY")


import nest_asyncio
import random

nest_asyncio.apply()
# from dotenv import load_dotenv

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.prompts import PromptTemplate

from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.llms.openai import OpenAI
import time
import os
# load_dotenv()

## Read Docs


In [4]:
data_dir = "data"
documents = SimpleDirectoryReader(data_dir).load_data()

## Create evaluation questions and pick k out of them


In [8]:
import random
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_upstage import ChatUpstage

# Initialize the Upstage LLM client
llm_langchain = ChatUpstage()

# Define a prompt for generating questions
prompt_template = """
You are given a document. Your task is to generate a set of evaluation questions based on the content of the document.

Document:
{document_text}

Generate a list of questions based on this document:
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["document_text"])

# Initialize an LLM chain
llm_chain = LLMChain(llm=llm_langchain, prompt=prompt)

# Use the first 20 documents for evaluation
eval_documents = documents[0:20]
eval_questions = []

# Generate questions from each document
for document in eval_documents:
    document_text = document.text  # Assuming document.text holds the text data
    questions = llm_chain.run(document_text)
    eval_questions.extend(questions.split("\n"))

# Select k random evaluation questions
num_eval_questions = 25
k_eval_questions = random.sample(eval_questions, num_eval_questions)

# Print or use the evaluation questions
print(k_eval_questions)


['10. What are the potential benefits of reducing CO2 emissions and enhancing marine protected areas for the environment and human society?', '5. What is the role of forests in carbon storage and how does deforestation impact this process?', '9. What are some ways to lobby for and form coalitions with like-minded organizations to advocate for strong climate policies?', '7. What is the significance of youth leadership in driving climate action?', '4. How can promoting a culture of sustainability, resilience, and stewardship foster long-term climate action?', '3. What are the impacts of glacial retreat on water supplies for millions of people?', '9. How have global temperatures risen since the late 19th century?', '8. What are ecosystem services, and how can they be supported through climate action?', '13. How does the urban heat island effect contribute to the severity of heatwaves in cities?', '6. How can inclusive climate policies ensure equitable solutions for all affected population

## Define metrics evaluators and modify llama_index faithfullness evaluator prompt to rely on the context


In [16]:
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core import Settings

# Settings.llm = llm

# We will use Upstage Solar for evaluating the responses
solar = OpenAI(api_key=os.environ["UPSTAGE_API_KEY"],base_url="https://api.upstage.ai/v1/solar")
# solar = ChatUpstage(model='solar-pro')
# Define Faithfulness and Relevancy Evaluators which are based on GPT-4
faithfulness_gpt4 = FaithfulnessEvaluator(llm=solar)

faithfulness_new_prompt_template = PromptTemplate(
    template=""" Please tell if a given piece of information is directly supported by the context.
    You need to answer with either YES or NO.
    Answer YES if any part of the context explicitly supports the information, even if most of the context is unrelated. If the context does not explicitly support the information, answer NO. Some examples are provided below.

    Information: Apple pie is generally double-crusted.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: YES

    Information: Apple pies taste bad.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: NO

    Information: Paris is the capital of France.
    Context: This document describes a day trip in Paris. You will visit famous landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.
    Answer: NO

    Information: {query_str}
    Context: {context_str}
    Answer:

    """)

faithfulness_gpt4.update_prompts({"your_prompt_key": faithfulness_new_prompt_template}) # Update the prompts dictionary with the new prompt template
relevancy_gpt4 = RelevancyEvaluator(llm=solar)

## Function to evaluate metrics for each chunk size


In [25]:
from llama_index.llms.upstage import Upstage
from llama_index.core.llms import ChatMessage
from llama_index.embeddings.upstage import UpstageEmbedding

os.environ["UPSTAGE_API_KEY"] = userdata.get("UPSTAGE_API_KEY")

llm_llamaindex = solar
embed_model = UpstageEmbedding(model="solar-embedding-1-large")



In [26]:
# Define function to calculate average response time, average faithfulness and average relevancy metrics for given chunk size
# Generate response and evaluate it.
from langchain_upstage import UpstageEmbeddings
def evaluate_response_time_and_accuracy(chunk_size, eval_questions):
    """
    Evaluate the average response time, faithfulness, and relevancy of responses generated by GPT-3.5-turbo for a given chunk size.

    Parameters:
    chunk_size (int): The size of data chunks being processed.

    Returns:
    tuple: A tuple containing the average response time, faithfulness, and relevancy metrics.
    """

    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    # service_context = ServiceContext.from_defaults(llm=llm, chunk_size=chunk_size, chunk_overlap=chunk_size//5)
    Settings.llm = llm_llamaindex
    Settings.embed_model = embed_model
    Settings.chunk_size = chunk_size
    Settings.chunk_overlap = chunk_size//5

    vector_index = VectorStoreIndex.from_documents(
        eval_documents, embed_model=Settings.embed_model
    )
    # build query engine
    query_engine = vector_index.as_query_engine(similarity_top_k=5)
    num_questions = len(eval_questions)

    # Iterate over each question in eval_questions to compute metrics.
    # While BatchEvalRunner can be used for faster evaluations (see: https://docs.llamaindex.ai/en/latest/examples/evaluation/batch_eval.html),
    # we're using a loop here to specifically measure response time for different chunk sizes.
    for question in eval_questions:
        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time

        faithfulness_result = faithfulness_gpt4.evaluate_response(
            response=response_vector
        ).passing

        relevancy_result = relevancy_gpt4.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

In [23]:
pip install llama-index-llms-langchain langchain-community

Installing collected packages: python-dotenv, pydantic-settings, langchain-community
Successfully installed langchain-community-0.3.3 pydantic-settings-2.6.0 python-dotenv-1.0.1


## Test different chunk sizes


In [27]:
chunk_sizes = [128, 256]

for chunk_size in chunk_sizes:
  avg_response_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(chunk_size, k_eval_questions)
  print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}")

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: up_gg262********************TDj0. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}