In [29]:
from llama_index.llms.ollama import Ollama
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings,
)
from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
import os
import time
import nest_asyncio

nest_asyncio.apply()

In [30]:
llm = Ollama(model="llama3.1", request_timeout=300.0)
Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [31]:
documents = SimpleDirectoryReader(input_files=["gemma.pdf"]).load_data()

In [32]:
eval_documents = documents[:1]
data_generator = DatasetGenerator.from_documents(eval_documents, llm=llm)
eval_questions = data_generator.generate_questions_from_nodes()

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [33]:
print(eval_questions)

['Here are 10 questions for the quiz/exam based on the given text:', 'What is the name of the family of lightweight, state-of-the-art open models introduced by Gemma?', 'Which model family was used as inspiration to develop Gemma models?', 'How many sizes of Gemma models are released, and what are their respective parameter counts?', 'In which domains does Gemma achieve strong generalist capabilities in text, alongside state-of-the-art understanding and reasoning skills at scale?', 'What is the total token count used for training Gemma models?', 'Which Google model family was used as a base for developing Gemma?', 'What are the different types of checkpoints released along with the open-source codebase for inference and serving in Gemma?', 'What is the purpose of releasing both pre-trained and fine-tuned checkpoints, according to the authors of Gemma?', 'In which research areas do the authors of Gemma believe the responsible release of LLMs will have a significant impact?', 'Which publ

In [34]:
faithfulness = FaithfulnessEvaluator()
relevancy = RelevancyEvaluator()

In [35]:
def evaluate(chunk_size, eval_questions):
    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    vector_index = VectorStoreIndex.from_documents(eval_documents)

    query_engine = vector_index.as_query_engine()
    num_questions = len(eval_questions)

    for question in eval_questions:
        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time

        faithfulness_result = faithfulness.evaluate_response(response=response_vector).passing
        relevancy_result = relevancy.evaluate_response(response=response_vector, query=question).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy


In [36]:
chunk_sizes = [128, 256, 512, 1024, 2048]

for chunk in chunk_sizes:
    avg_response_time, avg_faithfulness, avg_relevancy = evaluate(chunk, eval_questions)
    print(f"Chunk size: {chunk} - Avg. Response Time: {avg_response_time:.2f}s - Avg. Faithfulness: {avg_faithfulness:.2f} - Avg. Relevancy: {avg_relevancy:.2f}")

Chunk size: 128 - Avg. Response Time: 10.10s - Avg. Faithfulness: 0.91 - Avg. Relevancy: 1.00
Chunk size: 256 - Avg. Response Time: 13.65s - Avg. Faithfulness: 1.00 - Avg. Relevancy: 1.00
Chunk size: 512 - Avg. Response Time: 8.47s - Avg. Faithfulness: 0.91 - Avg. Relevancy: 1.00
Chunk size: 1024 - Avg. Response Time: 10.05s - Avg. Faithfulness: 1.00 - Avg. Relevancy: 1.00
Chunk size: 2048 - Avg. Response Time: 14.34s - Avg. Faithfulness: 1.00 - Avg. Relevancy: 1.00
