## LlamaIndex Bottoms-Up Development - Evaluation Baseline

In [1]:
import dotenv
import openai
import os

"""
Setup
"""
dotenv.load_dotenv('../.env')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

openai.api_key = OPENAI_API_KEY

In [2]:
from custom.markdown_docs_reader import MarkdownDocsReader
from llama_index.core import SimpleDirectoryReader

def load_markdown_docs(filepath):
    """Load Markdown docs from a directory, excluding all other file types."""
    loader = SimpleDirectoryReader(
        input_dir=filepath, 
        required_exts=[".md"],
        file_extractor={".md": MarkdownDocsReader()},
        recursive=True
    )
    
    documents = loader.load_data()
    
    # exclude some metadata from the LLM
    for document in documents:
        document.excluded_llm_metadata_keys = ["File Name", "Content Type", "Header Path"]
        
    return documents

In [3]:
# load our documents from each folder.
# we keep them separate for now, in order to create seperate indexes later
getting_started_docs = load_markdown_docs("data/docs/getting_started")
community_docs = load_markdown_docs("data/docs/community")
data_docs = load_markdown_docs("data/docs/core_modules/data_modules")
agent_docs = load_markdown_docs("data/docs/core_modules/agent_modules")
model_docs = load_markdown_docs("data/docs/core_modules/model_modules")
query_docs = load_markdown_docs("data/docs/core_modules/query_modules")
supporting_docs = load_markdown_docs("data/docs/core_modules/supporting_modules")
tutorials_docs = load_markdown_docs("data/docs/end_to_end_tutorials")
contributing_docs = load_markdown_docs("data/docs/development")

### Create Indexes

In [4]:
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage

# create a vector store index for each folder
try:
    getting_started_index = load_index_from_storage(StorageContext.from_defaults(persist_dir='data/storage/getting_started_index'))
    community_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="data/storage/community_index"))
    data_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="data/storage/data_index"))
    agent_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="data/storage/agent_index"))
    model_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="data/storage/model_index"))
    query_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="data/storage/query_index"))
    supporting_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="data/storage/supporting_index"))
    tutorials_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="data/storage/tutorials_index"))
    contributing_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="data/storage/contributing_index"))
except Exception as e:
    print(e)
    getting_started_index = VectorStoreIndex.from_documents(getting_started_docs)
    getting_started_index.storage_context.persist(persist_dir="data/storage/getting_started_index")
    
    community_index = VectorStoreIndex.from_documents(community_docs)
    community_index.storage_context.persist(persist_dir="data/storage/community_index")

    data_index = VectorStoreIndex.from_documents(data_docs)
    data_index.storage_context.persist(persist_dir="data/storage/data_index")

    agent_index = VectorStoreIndex.from_documents(agent_docs)
    agent_index.storage_context.persist(persist_dir="data/storage/agent_index")

    model_index = VectorStoreIndex.from_documents(model_docs)
    model_index.storage_context.persist(persist_dir="data/storage/model_index")

    query_index = VectorStoreIndex.from_documents(query_docs)
    query_index.storage_context.persist(persist_dir="data/storage/query_index")    

    supporting_index = VectorStoreIndex.from_documents(supporting_docs)
    supporting_index.storage_context.persist(persist_dir="data/storage/supporting_index")

    tutorials_index = VectorStoreIndex.from_documents(tutorials_docs)
    tutorials_index.storage_context.persist(persist_dir="data/storage/tutorials_index")

    contributing_index = VectorStoreIndex.from_documents(contributing_docs)
    contributing_index.storage_context.persist(persist_dir="data/storage/contributing_index")

[Errno 2] No such file or directory: '/Users/johan/Desktop/Development/PycharmProjects/LlamaIndex-experiments/query_engine-llamaindex_documentation/data/storage/getting_started_index/docstore.json'


### Create Query Engine Tool
Since we have so many indexes, we can create a query engine tool for each and then use them in a single query engine!

In [6]:
from llama_index.core.tools import QueryEngineTool

# create a query engine tool for each folder
getting_started_tool = QueryEngineTool.from_defaults(
    query_engine=getting_started_index.as_query_engine(), 
    name="Getting Started", 
    description="Useful for answering questions about installing and running llama index, as well as basic explanations of how llama index works."
)

community_tool = QueryEngineTool.from_defaults(
    query_engine=community_index.as_query_engine(),
    name="Community",
    description="Useful for answering questions about integrations and other apps built by the community."
)

data_tool = QueryEngineTool.from_defaults(
    query_engine=data_index.as_query_engine(),
    name="Data Modules",
    description="Useful for answering questions about data loaders, documents, nodes, and index structures."
)

agent_tool = QueryEngineTool.from_defaults(
    query_engine=agent_index.as_query_engine(),
    name="Agent Modules",
    description="Useful for answering questions about data agents, agent configurations, and tools."
)

model_tool = QueryEngineTool.from_defaults(
    query_engine=model_index.as_query_engine(),
    name="Model Modules",
    description="Useful for answering questions about using and configuring LLMs, embedding modles, and prompts."
)

query_tool = QueryEngineTool.from_defaults(
    query_engine=query_index.as_query_engine(),
    name="Query Modules",
    description="Useful for answering questions about query engines, query configurations, and using various parts of the query engine pipeline."
)

supporting_tool = QueryEngineTool.from_defaults(
    query_engine=supporting_index.as_query_engine(),
    name="Supporting Modules",
    description="Useful for answering questions about supporting modules, such as callbacks, service context, and avaluation."
)

tutorials_tool = QueryEngineTool.from_defaults(
    query_engine=tutorials_index.as_query_engine(),
    name="Tutorials",
    description="Useful for answering questions about end-to-end tutorials and giving examples of specific use-cases."
)

contributing_tool = QueryEngineTool.from_defaults(
    query_engine=contributing_index.as_query_engine(),
    name="Contributing",
    description="Useful for answering questions about contributing to llama index, including how to contribute to the codebase and how to build documentation."
)

### Create Unified Query Engine

In [7]:
import nest_asyncio
nest_asyncio.apply()

from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.response_synthesizers import get_response_synthesizer

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[
        getting_started_tool,
        community_tool,
        data_tool,
        agent_tool,
        model_tool,
        query_tool,
        supporting_tool,
        tutorials_tool,
        contributing_tool
    ],
    # enable this for streaming
    # response_synthesizer=get_response_synthesizer(streaming=True),
    verbose=False
)

### Test the Query Engine!

In [8]:
response = query_engine.query("How do I install llama index?")
print(str(response))

To install llama index, you can use the following command: pip install llama-index.


## Evaluate the Baseline!

Now that we have our baseline query engine created, we can create a basic evaluation pipleine.

Our pipeline will:
- Generate a small dataset of questions
- Save/cache these questions (so we can properly compare performance later!)
- Evaluate both response quality and hallucination

To do this reliably, we need to use an LLM smarter that `gpt-3.5-turbo`, so we will setup `gpt-4` for the evaluation process.

### Generate the Dataset

In order to make the question generation more efficient, we can remove small documents and combine all documents into a giant single document.

I also modified the question generation prompt, to generate a single question for each chunk, along with extra context for what it is reading.

In [13]:
from llama_index.core import Document

documents = SimpleDirectoryReader("data/docs", recursive=True, required_exts=[".md"]).load_data()

all_text = ""

for doc in documents:
    all_text += doc.text
    
giant_document = Document(text=all_text)

In [32]:
import os
import random
random.seed(42)

from llama_index.core.indices.service_context import ServiceContext
from llama_index.core.settings import Settings
from llama_index.core.prompts import Prompt
from llama_index.llms import openai
from llama_index.core.evaluation import DatasetGenerator

llm = openai.OpenAI(llm="gpt-4", temperature=0)

question_dataset = []
if os.path.exists("data/question_dataset.txt"):
    with open("data/question_dataset.txt", "r") as f:
        for line in f:
            question_dataset.append(line.strip())
else:
    # generate questions
    data_generator = DatasetGenerator.from_documents(
        [giant_document],
        text_question_template=Prompt(
            "A sample from the LlamaIndex documentation is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Using the documentation sample, carefully follow the instructions below:\n"
            "{query_str}"
        ),
        question_gen_query=(
            "You are an evaluator for a search pipeline. Your task is to write a single question "
            "using the provided documentation sample above to test the search pipeline. The question should "
            "reference specific names, functions, and terms. Restrict the question to the "
            "context information provided.\n"
            "Question: "
        ),
        # set this to be low, so we can generate more questions
        # service_context=gpt4_service_context
        llm=llm
    )
    generated_questions = data_generator.generate_questions_from_nodes()

    # randomly pick 40 questions from each dataset
    generated_questions = random.sample(generated_questions, 40)
    question_dataset.extend(generated_questions)

    print(f"Generated {len(question_dataset)} questions.")

    # save the questions!
    with open("data/question_dataset.txt", "w") as f:
        for question in question_dataset:
            f.write(f"{question.strip()}\n")
    

In [17]:
question_dataset

['What are the possible settings for the LLM and how can the user set the prompt for term extraction?',
 'How can I convert tools to LangChain tools using the provided documentation sample?',
 'What is the purpose of the `GuidancePydanticProgram` class in the LlamaIndex documentation?',
 'What is the purpose of the SubQuestionQueryEngine class in LlamaIndex?',
 'What is the purpose of the `query_wrapper_prompt` in the `HuggingFaceLLM` class?',
 'What embedding model does LlamaIndex use by default?',
 'What are the available options for the storage backend of the index store in LlamaIndex?',
 'What is the function used to specify the metadata visible to the embedding model and how can it be customized?',
 'What are the node postprocessors available in the LlamaIndex documentation?',
 'What is the purpose of the LoadAndSearchToolSpec in the LlamaIndex documentation?',
 'What is the purpose of the DEFAULT_REFINE_PROMPT_SEL_LC in the LlamaIndex documentation?',
 "What is the purpose of the

## Evaluate with the Dataset

Now that we have our dataset, let's measure performance!
<br>

#### Evaluating Response for Hallucination

In [20]:
import time
import asyncio
import nest_asyncio
nest_asyncio.apply()

from llama_index.core import Response


def evaluate_query_engine(evaluator, query_engine, questions):
    async def run_query(query_engine, q):
        try:
            return await query_engine.aquery(q)
        except:
            return Response(response="Error, query failed.")

    total_correct = 0
    all_results = []
    for batch_size in range(0, len(questions), 5):
        batch_qs = questions[batch_size:batch_size+5]

        tasks = [run_query(query_engine, q) for q in batch_qs]
        responses = asyncio.run(asyncio.gather(*tasks))
        print(f"finished batch {(batch_size // 5) + 1} out of {len(questions) // 5}")

        for response in responses:
            eval_result = 1 if "YES" in evaluator.evaluate(response) else 0
            total_correct += eval_result
            all_results.append(eval_result)
        
        # helps avoid rate limits
        time.sleep(1)

    return total_correct, all_results
    

In [33]:
from llama_index.core.evaluation import ResponseEvaluator

# gpt-4 evaluator!
evaluator = ResponseEvaluator(llm=llm)

total_correct, all_results = evaluate_query_engine(evaluator, query_engine, question_dataset)

print(f"Hallucination? Scored {total_correct} out of {len(question_dataset)} questions correctly.")

finished batch 1 out of 8


ValueError: contexts and response must be provided

### Investigate Hallucinations

In [34]:
import numpy as np

hallucinated_questions = np.array(question_dataset)[np.array(all_results) == 0]
print(all_results)
print(hallucinated_questions)

NameError: name 'all_results' is not defined

In [38]:
response = query_engine.query('How well does Llama-index work with Django?')

In [39]:
print(str(response))
print("-------")
print(response.get_formatted_sources(length=256))

Llama-index works efficiently with Django by allowing for the integration of data loaders like GoogleDocsReader. Additionally, Llama-index can be configured to work efficiently with Django by turning existing data loaders into tools that agents can use, enabling the loading, indexing, and querying of data within a single tool call. This approach abstracts away complexities and provides an ad-hoc index to overcome prompt window limitations for API calls, making Llama-index a suitable choice for working with Django.
-------
> Source (Doc id: 24f74b93-c572-42a0-9db7-a304afaf617b): Sub question: Can Llama-index be integrated with Django?
Response: Yes, Llama-index can be integrated with Django.

> Source (Doc id: 96ae4336-45e4-49e7-82a1-9c2acac8c3c5): Sub question: What are the data loaders available for Llama-index when using Django?
Response: Llama-index provides data loaders for GoogleDocsReader when using Django.

> Source (Doc id: d8ec1daa-4619-45b6-ab0c-001058eba292): Sub question: H