# Setup

In [None]:
# %pip install -r requirements.txt

# Environment Variables

In [5]:
# Get environment variables.
# NOTE(review): presumably these are the API credentials needed by the helpers
# imported from `utils` (OpenAI, Cohere) — confirm which keys .env must define.

from dotenv import load_dotenv

# Load environment variables from .env file.
# The call is the cell's last expression, so its return value is displayed as
# the cell output (True in the saved run).
load_dotenv()

True

# Utilities

In [6]:
# Import the utils module.
#
# The cells below use these names without defining them in any visible cell, so
# they are presumably supplied by this wildcard import (TODO confirm):
#   process_directory, chunk_docs_recursive, create_embeddings_openai,
#   create_vector_store_qdrant, create_retriever_qdrant,
#   create_chat_prompt_template, create_chain_openai, run_ragas_evaluation,
#   get_time_string, answer_correctness, answer_relevancy, context_precision,
#   context_recall, faithfulness
# NOTE(review): an explicit import list would make each name's origin visible.

from utils import *

# Pre-Processing

In [7]:
# Load the documents matched by the HTML glob under docs/10k/html
docs = process_directory(
	glob="**/*.html",
	path="docs/10k/html",
)
print(f"\nNumber of docs = {len(docs)}")

# Split the loaded documents into chunks for embedding
chunks = chunk_docs_recursive(
	docs=docs,
)
print(f"\nNumber of chunks = {len(chunks)}")

  0%|          | 0/2 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
 50%|█████     | 1/2 [00:01<00:01,  1.64s/it]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|██████████| 2/2 [00:03<00:00,  1.50s/it]


Number of docs = 2

Number of chunks = 2344





In [8]:
# Embeddings object handed to the vector store below
embeddings = create_embeddings_openai()

# Vector store holding the chunks (location ":memory:", collection "test-1a")
vector_store = create_vector_store_qdrant(
	location=":memory:",
	collection_name="test-1a",
	vector_size=1536,
	embeddings=embeddings,
	docs=chunks,
)
print("\nCreated a vector store")

# Retriever built on top of the vector store
retriever = create_retriever_qdrant(vector_store)
print("\nCreated a retriever")

# Prompt template shared by the chains created in the test sections
chat_prompt_template = create_chat_prompt_template()
print("\nCreated a prompt template")


Created a vector store

Created a retriever

Created a prompt template


# Test 1a - OpenAI and Qdrant

In [None]:
# Create the chain using naive retrieval
def run_test_1a():
	"""Build and return the Test 1a chain (naive retrieval, no Cohere option).

	Reads the notebook-level ``chat_prompt_template`` and ``retriever`` and
	passes them, with ``model="gpt-4o"``, to ``create_chain_openai``.

	Returns:
		Whatever ``create_chain_openai`` returns, so the caller can bind it,
		e.g. ``chain = run_test_1a()``, before running the post-processing cells.
	"""
	chain = create_chain_openai(
		model="gpt-4o",
		prompt_template=chat_prompt_template,
		retriever=retriever,
	)
	print("\nCreated a chain")
	# Return the chain: it used to be a local that was dropped on exit, which
	# made the result of this function unusable by any other cell.
	return chain

# Test 1b - OpenAI, Qdrant and Cohere

In [None]:
# Build the chain with the Cohere contextual compression option switched on
chain = create_chain_openai(
	model="gpt-4o",
	prompt_template=chat_prompt_template,
	retriever=retriever,
	use_cohere=True,
)
print("\nCreated a chain")

# Post-Processing

In [None]:
# Smoke-test the chain: ask each sample question and print what comes back
questions = [
	"What is the annual revenue of Uber?",
	"What is the annual revenue of Lyft?",
	"How does Uber's revenue compare to Lyft's revenue?",
	"Summarize Lyft's risk factors",
]

for question in questions:
	# Echo the question before invoking the chain so it shows even if the call fails
	print(question)
	# The chain is invoked with a dict keyed by "question"
	result = chain.invoke(dict(question=question))
	# Full result first, then only the content of its "response" entry
	print(result)
	print(result["response"].content)
	print("\n*****")

In [None]:
# Evaluate the chain using Ragas
import os

ragas_results, ragas_results_df = run_ragas_evaluation(
	chain,
	testset_name="testsets/10k_testset.csv",
	eval_metrics=[
		answer_correctness,
		answer_relevancy,
		context_precision,
		context_recall,
		faithfulness,
	],
)

# Write the results to disk.
# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would abort the cell after the evaluation has
# already been run.
results_path = f"evaluations/10x_test1_testset_evaluation_{get_time_string()}.csv"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
ragas_results_df.to_csv(results_path)

# Show the summary results
print(ragas_results)

In [None]:
# Build a heatmap showing the Ragas results

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# Keep only the metric columns; each becomes one heatmap column
heatmap_data = ragas_results_df[
	[
		"answer_correctness",
		"answer_relevancy",
		"context_precision",
		"context_recall",
		"faithfulness",
	]
]

# Two-stop colormap: low values map to red, high values to green
cmap = LinearSegmentedColormap.from_list("green_red", ["red", "green"])

plt.figure(figsize=(10, 8))
sns.heatmap(heatmap_data, annot=True, fmt=".2f", linewidths=0.5, cmap=cmap)

# Label every row with the first 100 characters of its "user_input" value.
# seaborn draws row i across [i, i + 1] on the y axis, so the tick must sit at
# i + 0.5 to be centred on the row; ticks at 0..n-1 land on the row boundaries.
row_labels = ragas_results_df["user_input"].str[:100]
row_centers = [row + 0.5 for row in range(len(row_labels))]
plt.yticks(ticks=row_centers, labels=row_labels, rotation=0)
plt.show()