# Setup

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# Every package is installed with -U and no version pin, so a fresh run may pick up
# newer releases than the ones this notebook was written against.
# NOTE(review): later cells also import `langchain`, `qdrant_client`, `datasets` and
# `pandas`, none of which is installed explicitly here — presumably they arrive as
# transitive dependencies; confirm on a clean environment.
%pip install -qU IProgress
%pip install -qU ipywidgets
%pip install -qU python-dotenv
%pip install -qU tqdm

# LangChain building blocks (core abstractions, community loaders, OpenAI and Qdrant integrations)
%pip install -qU langchain-core
%pip install -qU langchain-community
%pip install -qU langchain-openai 
%pip install -qU langchain-qdrant 

# PDF parsing and RAG evaluation
%pip install -qU pymupdf
%pip install -qU ragas

# Environment Variables

In [None]:
# Get environment variables

from dotenv import load_dotenv


# Load environment variables from .env file
# NOTE(review): the OpenAI clients built later in this notebook are constructed without
# an explicit API key, so they presumably read it from the environment populated here —
# confirm the expected variable name.
load_dotenv()

# Utilities

In [4]:
# Load docs from a directory

from langchain_community.document_loaders import DirectoryLoader 
from langchain_community.document_loaders import PyMuPDFLoader

def process_directory(path: str, glob: str, loader_cls: type, use_multithreading: bool = True) -> list:
	"""Load every file under `path` that matches `glob` into a list of documents.

	Args:
		path: Directory to scan.
		glob: Glob pattern selecting the files to load (e.g. '**/*.pdf').
		loader_cls: Loader *class* (not its name) that DirectoryLoader uses for each
			matched file, e.g. PyMuPDFLoader.
		use_multithreading: Forwarded to DirectoryLoader.

	Returns:
		Whatever `DirectoryLoader.load()` returns (a list of loaded documents).
	"""
	loader = DirectoryLoader(
		path=path,
		glob=glob,
		show_progress=True,  # always display a progress bar while loading
		loader_cls=loader_cls,
		use_multithreading=use_multithreading,
	)

	return loader.load()

#####

def test_process_directory():
	"""Manual smoke check: load all PDFs under docs/ and print how many documents came back."""
	loaded = process_directory(path='docs/', glob='**/*.pdf', loader_cls=PyMuPDFLoader, use_multithreading=True)
	doc_count = len(loaded)
	print(doc_count)

# test_process_directory()

In [5]:
# Create embeddings using OpenAI

from langchain_openai import OpenAIEmbeddings

def create_embeddings_openai(model="text-embedding-ada-002") -> OpenAIEmbeddings:
	"""Return an OpenAIEmbeddings client configured for the given embedding model name."""
	return OpenAIEmbeddings(model=model)

#####

def test_create_embeddings_openai():
	"""Manual smoke check: embed one sample question with the default model and print the vector."""
	sample_query = "What is the annual revenue of Uber?"

	# Build the client and embed in one go; the printed value is the raw embedding.
	query_vector = create_embeddings_openai().embed_query(sample_query)

	print(query_vector)

# test_create_embeddings_openai()

In [6]:
# Create a text splitter using recursive character text splitter

# https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/recursive_text_splitter/

from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_docs_recursive(docs: list, chunk_size=500, chunk_overlap=50) -> list:
	"""Split `docs` into chunks with a RecursiveCharacterTextSplitter.

	Args:
		docs: Documents to split.
		chunk_size: Forwarded to the splitter as `chunk_size`.
		chunk_overlap: Forwarded to the splitter as `chunk_overlap`.

	Returns:
		The result of the splitter's `split_documents(docs)`.
	"""
	splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
	splitter = RecursiveCharacterTextSplitter(**splitter_options)
	return splitter.split_documents(docs)

#####

def test_chunk_docs_recursive(): 
	"""Manual smoke check: chunk the PDFs under docs/ and print the chunk count and first chunk."""
	loaded = process_directory(path="docs/", glob="**/*.pdf", loader_cls=PyMuPDFLoader, use_multithreading=True)
	chunks = chunk_docs_recursive(docs=loaded)

	print(f"\nNumber of chunks = {len(chunks)}\n")
	# Indexing [0] assumes at least one chunk was produced.
	print(f"First chunk = {chunks[0].page_content}")

# test_chunk_docs_recursive()

In [None]:
# Create a Qdrant vector store

from langchain_core.embeddings import Embeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

def create_qdrant_vector_store(
	location: str,
	collection_name: str,
	vector_size: int,
	embeddings: Embeddings,
	docs: list,
) -> QdrantVectorStore:
	"""Create a Qdrant collection, wrap it in a QdrantVectorStore and index `docs` into it.

	Args:
		location: Passed to `QdrantClient(location=...)`.
		collection_name: Name of the collection to create and use.
		vector_size: Vector dimension configured on the new collection.
			NOTE(review): presumably must match the output size of `embeddings` — confirm.
		embeddings: Embedding model handed to the vector store.
		docs: Documents added to the store after creation.

	Returns:
		The populated QdrantVectorStore.
	"""
	client = QdrantClient(location=location)

	# The collection is always created with cosine distance.
	# NOTE(review): no existence check is made first; behaviour when the collection
	# already exists depends on qdrant-client — confirm before using a persistent location.
	vector_params = VectorParams(size=vector_size, distance=Distance.COSINE)
	client.create_collection(collection_name=collection_name, vectors_config=vector_params)

	store = QdrantVectorStore(client=client, collection_name=collection_name, embedding=embeddings)
	store.add_documents(docs)

	return store

#####

def test_create_qdrant_vector_store():
	"""Manual smoke check: index the chunked PDFs into a ':memory:' store and print its collection name."""
	loaded = process_directory(path="docs/", glob="**/*.pdf", loader_cls=PyMuPDFLoader, use_multithreading=True)
	pieces = chunk_docs_recursive(docs=loaded)

	embedder = create_embeddings_openai()
	store = create_qdrant_vector_store(
		location=":memory:",
		collection_name="holiday-test",
		vector_size=1536,
		embeddings=embedder,
		docs=pieces,
	)

	print(store.collection_name)

# test_create_qdrant_vector_store()

In [8]:
# Create a Qdrant retriever

from langchain_core.retrievers import BaseRetriever
from langchain_qdrant import QdrantVectorStore

def create_retriever_qdrant(vector_store: QdrantVectorStore) -> BaseRetriever:
	"""Expose `vector_store` as a retriever, using `as_retriever()` with its default settings."""
	return vector_store.as_retriever()

#####

def test_create_retriever_qdrant(text):
	"""Manual smoke check: build the retriever over the chunked PDFs and print the top hit for `text`."""
	loaded = process_directory(path="docs/", glob="**/*.pdf", loader_cls=PyMuPDFLoader, use_multithreading=True)
	pieces = chunk_docs_recursive(docs=loaded)

	embedder = create_embeddings_openai()
	store = create_qdrant_vector_store(
		location=":memory:",
		collection_name="holiday-test",
		vector_size=1536,
		embeddings=embedder,
		docs=pieces,
	)

	hits = create_retriever_qdrant(store).invoke(text)
	# Indexing [0] assumes the retriever returned at least one document.
	print(hits[0])

# test_create_retriever_qdrant("What is the annual revenue for Uber?")

In [9]:
# Create a prompt template

# https://python.langchain.com/v0.1/docs/modules/model_io/prompts/quick_start/#chatprompttemplate
# https://python.langchain.com/v0.2/api_reference/core/prompts/langchain_core.prompts.chat.ChatPromptTemplate.html

from langchain.prompts import ChatPromptTemplate

def create_chat_prompt_template(template: str = None) -> ChatPromptTemplate:
	"""Build the chat prompt used by the RAG chain.

	Args:
		template: Prompt text containing the placeholders the chain fills in
			(the built-in default uses `{question}` and `{context}`). When None,
			the built-in 10K-report analyst prompt below is used.

	Returns:
		A ChatPromptTemplate created with `ChatPromptTemplate.from_template`.
	"""
	
	if template is None:
		template = '''
		You are an expert assistant designed to help users analyze and answer questions about 10K annual reports filed by publicly traded companies. Users may ask about specific sections, financial metrics, trends, or comparisons across multiple reports. Your role is to provide accurate, concise, and relevant answers, referencing appropriate sections or data points where applicable.

		When responding, adhere to the following principles:

		Understand the Question: Identify whether the query is focused on a specific company, year, or metric, or if it spans multiple reports for comparison.
		Clarify Uncertainty: If a user's question is unclear, ask for clarification or additional context.
		Locate and Reference Information: Use relevant sections of the 10K report(s), such as MD&A, Financial Statements, Risk Factors, or Notes to Financial Statements, to back up your answers.
		Synthesize Data: Provide summaries or insights when the question involves comparing data or trends across multiple reports.
		Stay Objective: Avoid providing subjective opinions or interpretations beyond the factual content in the reports.
		Example User Queries and Expected Responses:

		"What was the revenue for Company X in 2022 and 2023?"

		Locate and report the revenue figures from the Income Statements of the respective 10K reports for 2022 and 2023.
		"What are the main risk factors for Company Y in its latest report?"

		Summarize the key risk factors from the most recent 10K report's "Risk Factors" section.
		"How did the operating income of Company Z change over the last three years?"

		Extract operating income figures from the 10K reports for the past three years and provide a brief comparison.
		"Compare the debt levels of Company A and Company B in 2023."

		Retrieve debt-related figures from the Balance Sheets or Notes to Financial Statements of both companies and summarize the comparison.
		"What trends are evident in Company W's R&D expenses over the last five years?"

		Summarize trends using data from the Income Statements or footnotes for R&D expenses across five consecutive 10K reports.
		Assumptions and Constraints:

		Only use information from 10K reports provided in the context below. 
		For complex queries spanning multiple reports, provide a structured summary highlighting key comparisons or trends.
		If certain information is unavailable, state so clearly and suggest alternative approaches to obtain it.

		Now it's your turn!
		
		{question}

		{context}
		'''
	
	prompt = ChatPromptTemplate.from_template(template)

	return prompt

#####

def test_create_chat_prompt_template():
	"""Manual smoke check: build the default prompt template and print it."""
	print(create_chat_prompt_template())

# test_create_chat_prompt_template()

In [10]:
# Create a Langchain chain

from operator import itemgetter

from langchain_core.retrievers import BaseRetriever
from langchain_core.runnables import RunnablePassthrough, RunnableSerializable
from langchain_openai import ChatOpenAI

def create_chain (model: str, 
				  prompt_template: ChatPromptTemplate, 
				  retriever: BaseRetriever
				  ) -> RunnableSerializable:
	"""Assemble the question -> retrieve -> prompt -> LLM chain.

	The chain is invoked with a dict holding a "question" key. The question is sent
	to `retriever` to obtain "context"; question and context are then fed through
	`prompt_template` into a ChatOpenAI model. The chain's output is a dict with
	"response" (the model output) and "context" (what the retriever returned).

	Args:
		model: Model name passed to ChatOpenAI.
		prompt_template: Prompt that receives the "question" and "context" values.
		retriever: Retriever queried with the question text.

	Returns:
		The composed runnable.
	"""
	llm = ChatOpenAI(model=model)

	pick_question = itemgetter("question")
	pick_context = itemgetter("context")

	# Stage 1: look up context for the question while keeping the question itself.
	retrieval_stage = {"context": pick_question | retriever, "question": pick_question}
	# Stage 2: re-assigns "context" from the existing "context" value.
	# NOTE(review): this looks like a no-op pass-through; kept to preserve the chain shape.
	passthrough_stage = RunnablePassthrough.assign(context=pick_context)
	# Stage 3: generate the answer and carry the retrieved context to the output.
	answer_stage = {"response": prompt_template | llm, "context": pick_context}

	return retrieval_stage | passthrough_stage | answer_stage

#####

def test_create_chain_qdrant():
	"""Manual smoke check: build the full chain over the PDFs in docs/ and print the answer to one question."""
	loaded = process_directory(path="docs/", glob="**/*.pdf", loader_cls=PyMuPDFLoader, use_multithreading=True)
	pieces = chunk_docs_recursive(docs=loaded)

	embedder = create_embeddings_openai()
	store = create_qdrant_vector_store(
		location=":memory:",
		collection_name="holiday-test",
		vector_size=1536,
		embeddings=embedder,
		docs=pieces,
	)

	doc_retriever = create_retriever_qdrant(store)
	prompt = create_chat_prompt_template()
	rag_chain = create_chain("gpt-4o-mini", prompt, doc_retriever)

	payload = {"question" : "What is the annual revenue of Uber?"}
	print(rag_chain.invoke(payload))

# test_create_chain_qdrant()

In [11]:
# Generate answers from a chain using a list of questions

from typing import List, Tuple

from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.runnables import RunnableSerializable


def generate_answers_contexts(chain: RunnableSerializable, 
							  questions: list
							  ) -> Tuple[List, List]:
	"""Run every question through `chain` and collect the answers and retrieved contexts.

	Each question and the chain's raw result are printed as they are processed.

	Args:
		chain: Runnable invoked with `{"question": ...}`; its result must provide
			"response" (an object with `.content`) and "context" (an iterable of
			objects with `.page_content`).
		questions: Questions to ask, in order.

	Returns:
		A tuple `(answers, contexts)`: `answers[i]` is the response content for
		`questions[i]` and `contexts[i]` is the list of page contents retrieved for it.
	"""
	collected_answers = []
	collected_contexts = []

	for current_question in questions:
		print(current_question)

		result = chain.invoke({"question" : current_question})
		print(result)

		# Keep only the answer text and the raw text of each retrieved document.
		collected_answers.append(result["response"].content)
		page_texts = [doc.page_content for doc in result["context"]]
		collected_contexts.append(page_texts)

	return collected_answers, collected_contexts

#####

def test_generate_answers_contexts():
	"""Manual smoke check: answer three sample questions and print how many answers/contexts were collected."""
	loaded = process_directory(path="docs/", glob="**/*.pdf", loader_cls=PyMuPDFLoader, use_multithreading=True)
	pieces = chunk_docs_recursive(docs=loaded)

	embedder = create_embeddings_openai()
	store = create_qdrant_vector_store(
		location=":memory:",
		collection_name="holiday-test",
		vector_size=1536,
		embeddings=embedder,
		docs=pieces,
	)

	doc_retriever = create_retriever_qdrant(store)
	prompt = create_chat_prompt_template()
	rag_chain = create_chain("gpt-4o-mini", prompt, doc_retriever)

	sample_questions = [
		"What is the annual revenue of Lyft?",
		"What is the annual revenue of Uber?",
		"Which company has a larger annual revenue - Lyft or Uber?",
	]

	answers, contexts = generate_answers_contexts(chain=rag_chain, questions=sample_questions)

	print(f"Total number of answers = {len(answers)}")
	print(f"Total number of contexts = {len(contexts)}")

# test_generate_answers_contexts()

In [23]:
# Run a Ragas evaluation 

import time
from typing import Tuple

import pandas as pd
from pandas import DataFrame

from datasets import Dataset
from langchain_core.runnables import RunnableSerializable
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)

def run_ragas_evaluation(chain: RunnableSerializable, 
						questions: list, 
						groundtruths: list, 
						eval_metrics: list = None
						):
	"""Answer `questions` with `chain` and score the results with Ragas.

	Args:
		chain: Chain used to produce an answer and retrieved contexts per question.
		questions: Questions to evaluate.
		groundtruths: Reference answers, one per question and in the same order.
		eval_metrics: Ragas metrics to compute. When None, uses answer_correctness,
			answer_relevancy, context_recall, context_precision and faithfulness.

	Returns:
		A tuple `(ragas_results, ragas_results_df)`: the object returned by
		`ragas.evaluate` and its `to_pandas()` DataFrame.

	Raises:
		ValueError: If `questions` and `groundtruths` differ in length.
	"""
	# Fail fast: a length mismatch would otherwise only surface when the dataset is
	# built, i.e. after every question has already been sent through the chain.
	if len(questions) != len(groundtruths):
		raise ValueError(
			f"questions ({len(questions)}) and groundtruths ({len(groundtruths)}) must have the same length"
		)

	# Resolve the default here rather than in the signature so that no list object
	# is shared between calls.
	if eval_metrics is None:
		eval_metrics = [answer_correctness, answer_relevancy, context_recall, context_precision, faithfulness]

	answers, contexts = generate_answers_contexts(chain=chain, questions=questions)

	# Create the input dataset 
	input_dataset = Dataset.from_dict({"question" : questions,       	# Supplied by the caller
										"answer" : answers,             # From the chain
										"contexts" : contexts,          # From the chain
										"ground_truth" : groundtruths   # Supplied by the caller
										})

	# Run the Ragas evaluation using the input dataset and eval metrics
	ragas_results = evaluate(input_dataset, eval_metrics)
	ragas_results_df = ragas_results.to_pandas()
	
	return ragas_results, ragas_results_df
	
#####

def test_run_ragas_evaluation():
	"""Manual end-to-end check: evaluate the chain on the small test set and save the scores.

	Reads questions and ground truths from testsets/10k_test_testset.csv, runs the
	Ragas evaluation, writes the per-question results to a timestamped CSV under
	evaluations/ and prints the aggregate result.
	"""
	import os

	docs = process_directory("docs/", "**/*.pdf", PyMuPDFLoader, True)
	chunks = chunk_docs_recursive(docs=docs)
	
	embeddings = create_embeddings_openai()
	vector_store = create_qdrant_vector_store(":memory:", "holiday-test", 1536, embeddings, chunks)
	
	retriever = create_retriever_qdrant(vector_store)
	chat_prompt_template = create_chat_prompt_template()
	chain = create_chain("gpt-4o-mini", chat_prompt_template, retriever)
	
	testset_df = pd.read_csv("testsets/10k_test_testset.csv")	# This testset contains a small number of questions
	# Coerce every cell to str so non-string values (e.g. empty cells) do not reach the chain.
	questions = [str(question) for question in testset_df["question"].values.tolist()]
	groundtruths = [str(ground_truth) for ground_truth in testset_df["ground_truth"].values.tolist()]
	eval_metrics = [answer_correctness, answer_relevancy, context_precision, context_recall, faithfulness]
	ragas_results, ragas_results_df = run_ragas_evaluation(chain, questions, groundtruths, eval_metrics)
	
	# Make sure the output directory exists so the finished evaluation is not lost
	# to a failed write.
	os.makedirs("evaluations", exist_ok=True)
	timestr = time.strftime("%Y%m%d%H%M%S")
	ragas_results_df.to_csv(f"evaluations/10x_test_testset_evaluation_{timestr}.csv")
	
	print(ragas_results)

# test_run_ragas_evaluation()

# Test 1a - OpenAI and Qdrant

In [None]:
# Build chain using OpenAI and Qdrant

# The names bound in this cell (in particular `chain`) are notebook globals that the
# following cells rely on.
embeddings = create_embeddings_openai()
# Load every file under docs/ matching '**/*.pdf' and split the loaded documents into chunks.
docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
chunks = chunk_docs_recursive(docs=docs)
print(f'\nNumber of chunks = {len(chunks)}\n')
# NOTE(review): 1536 is the collection's vector size and presumably has to equal the
# output dimension of the embedding model created above — confirm if the model changes.
# NOTE(review): ':memory:' presumably selects a non-persistent in-process Qdrant — confirm.
vector_store = create_qdrant_vector_store(':memory:', 'holiday-test', 1536, embeddings, chunks)
retriever = create_retriever_qdrant(vector_store)
chat_prompt_template = create_chat_prompt_template()
# This cell uses 'gpt-4o', whereas the helper test functions above use 'gpt-4o-mini'.
chain = create_chain('gpt-4o', chat_prompt_template, retriever)

In [None]:
# Test the chain with a few questions 

# Ad-hoc sanity questions. This binds the notebook-global `questions`, which the
# Ragas evaluation cell later rebinds to the questions read from the test set.
questions = ["What is the annual revenue of Uber?",
"What is the annual revenue of Lyft?",
"How does Uber's revenue compare to Lyft's revenue?",]

for question in questions:
	print(question)
	# `result` is the chain's output dict with "response" and "context" entries.
	result = chain.invoke({"question" : question})
	# Print the full result first (including the retrieved context), then just the answer text.
	print(result)
	print(result["response"].content)
	print("\n*****")

In [None]:
# Evaluate the chain using Ragas

import os
import time

import pandas as pd

# Get the questions and groundtruths from the full test set
testset_df = pd.read_csv("testsets/10k_full_testset.csv")

# Coerce every cell to str so non-string values (e.g. empty cells) do not reach the chain
questions = testset_df["question"].values.tolist()
questions = [str(question) for question in questions]

groundtruths = testset_df["ground_truth"].values.tolist()
groundtruths = [str(ground_truth) for ground_truth in groundtruths]  

# Specify the eval metrics
eval_metrics = [answer_correctness, answer_relevancy, context_precision, context_recall, faithfulness]

# Run the Ragas evaluation against the chain built above
ragas_results, ragas_results_df = run_ragas_evaluation(chain, questions, groundtruths, eval_metrics)

# Write the results to disk; create the output directory first so the finished
# evaluation is not lost to a failed write
os.makedirs("evaluations", exist_ok=True)
timestr = time.strftime("%Y%m%d%H%M%S")
ragas_results_df.to_csv(f"evaluations/10x_test1_testset_evaluation_{timestr}.csv")

# Show the results
print(ragas_results)