# Setup

In [None]:
%pip install -qU langchain-cohere langchain-community langchain-core==0.2.40 langchain-openai langchain-qdrant
%pip install -qU docx2txt
%pip install -qU IProgress
%pip install -qU ipywidgets
%pip install -qU pymupdf
%pip install -qU python-dotenv
%pip install -qU ragas==0.1.20
%pip install -qU tqdm
%pip install -qU unstructured 

In [None]:
# Get environment variables

import os
from dotenv import load_dotenv
import uuid

# Load environment variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key(s) read by the OpenAI clients
# created in later cells — confirm which variables the .env file must define.
load_dotenv()

# Utilities

In [None]:
from langchain_community.document_loaders import DirectoryLoader 
from langchain_community.document_loaders import PyMuPDFLoader

def process_directory(path: str, glob: str, loader_cls: type, use_multithreading: bool = True) -> list:
	"""Load every file under ``path`` that matches ``glob`` using ``loader_cls``.

	Args:
		path: Directory to scan.
		glob: Glob pattern selecting the files to load (e.g. ``'**/*.pdf'``).
		loader_cls: Loader *class* (not a string) that DirectoryLoader uses for
			each matching file, e.g. ``PyMuPDFLoader``.
		use_multithreading: Forwarded to DirectoryLoader unchanged.

	Returns:
		The list of documents returned by ``DirectoryLoader.load()``.
	"""
	loader = DirectoryLoader(
		path=path,
		glob=glob,
		show_progress=True,
		loader_cls=loader_cls,
		use_multithreading=use_multithreading,
	)
	return loader.load()


def test_process_directory():
	"""Smoke test: load the PDFs under docs/ and print how many documents were loaded."""
	pdf_docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
	loaded = [*pdf_docs]
	print(len(loaded))

test_process_directory()

In [None]:
# Create embeddings using OpenAI

from langchain_openai import OpenAIEmbeddings

def create_embeddings_openai(model='text-embedding-ada-002') -> OpenAIEmbeddings:
	"""Return an ``OpenAIEmbeddings`` instance configured for ``model``.

	Args:
		model: Name of the OpenAI embedding model to use.
	"""
	return OpenAIEmbeddings(model=model)

#####

def test_create_embeddings_openai():
	"""Smoke test: embed one sample query, print the vector, and return the embeddings object."""
	sample_query = 'What is the annual revenue of Uber?'
	embedder = create_embeddings_openai()
	print(embedder.embed_query(sample_query))
	return embedder

test_create_embeddings_openai()

In [None]:
# Create a text splitter using recursive character text splitter

# https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/recursive_text_splitter/

from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_docs_recursive(docs: list, chunk_size=500, chunk_overlap=50) -> list:
	"""Split ``docs`` into chunks with a RecursiveCharacterTextSplitter.

	Args:
		docs: Documents to split.
		chunk_size: Passed to the splitter as ``chunk_size``.
		chunk_overlap: Passed to the splitter as ``chunk_overlap``.

	Returns:
		Whatever ``split_documents`` returns for ``docs``.
	"""
	splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	return splitter.split_documents(docs)

#####

def test_chunk_docs_recursive():
	"""Smoke test: load the PDFs under docs/, chunk them, and print the count and first chunk."""
	docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
	chunks = chunk_docs_recursive(docs=docs)

	print(f'\nNumber of chunks = {len(chunks)}\n')
	# Guard the preview: indexing chunks[0] raises IndexError when nothing was loaded.
	if chunks:
		print(f'First chunk = {chunks[0].page_content}')
	else:
		print('No chunks were produced; check that docs/ contains PDF files.')

test_chunk_docs_recursive()

In [None]:
# Create a Qdrant vector store

from langchain_core.embeddings import Embeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

def create_qdrant_vector_store(
	location: str,
	collection_name: str,
	vector_size: int,
	embeddings: Embeddings,
	docs: list,
) -> QdrantVectorStore:
	"""Create a Qdrant collection, wrap it in a QdrantVectorStore, and add ``docs`` to it.

	Args:
		location: Passed to ``QdrantClient(location=...)``.
		collection_name: Name of the collection to create and use.
		vector_size: Vector size configured on the new collection.
		embeddings: Embeddings object handed to the vector store.
		docs: Documents added to the store before it is returned.

	Returns:
		The populated QdrantVectorStore.
	"""
	client = QdrantClient(location=location)

	# The collection is configured for cosine distance over vectors of `vector_size`.
	vector_params = VectorParams(size=vector_size, distance=Distance.COSINE)
	client.create_collection(collection_name=collection_name, vectors_config=vector_params)

	store = QdrantVectorStore(client=client, collection_name=collection_name, embedding=embeddings)
	store.add_documents(docs)
	return store

#####

def test_create_qdrant_vector_store():
	"""Smoke test: build an in-memory Qdrant store from the chunked PDFs and print its collection name."""
	embedder = create_embeddings_openai()
	pdf_docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
	pieces = chunk_docs_recursive(docs=pdf_docs)
	print(f'\nNumber of chunks = {len(pieces)}\n')
	store = create_qdrant_vector_store(':memory:', 'holiday-test', 1536, embedder, pieces)
	print(store.collection_name)

test_create_qdrant_vector_store()

In [None]:
# Create a Qdrant retriever

from langchain_core.retrievers import BaseRetriever
from langchain_qdrant import QdrantVectorStore

def create_retriever_qdrant(vector_store: QdrantVectorStore) -> BaseRetriever:
	"""Return the retriever produced by ``vector_store.as_retriever()`` (called with no arguments)."""
	return vector_store.as_retriever()

#####

def test_create_retriever_qdrant(text: str = None):
	"""Smoke test: build a retriever over the chunked PDFs and, if ``text`` is given, print the top hit.

	Args:
		text: Optional query. When falsy, the retriever is built but not queried.
	"""
	embeddings = create_embeddings_openai()
	docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
	chunks = chunk_docs_recursive(docs=docs)
	print(f'\nNumber of chunks = {len(chunks)}\n')
	vector_store = create_qdrant_vector_store(":memory:", "holiday-test", 1536, embeddings, chunks)
	retriever = create_retriever_qdrant(vector_store)
	if text:
		# Keep the retrieved documents under their own name instead of overwriting `docs`.
		hits = retriever.invoke(text)
		# Guard the preview: indexing hits[0] raises IndexError on an empty result.
		if hits:
			print(hits[0])
		else:
			print('Retriever returned no documents.')

print('\nQDRANT')
test_create_retriever_qdrant('What is the annual revenue for Uber?')

In [None]:
# Create a prompt template

# https://python.langchain.com/v0.1/docs/modules/model_io/prompts/quick_start/#chatprompttemplate
# https://python.langchain.com/v0.2/api_reference/core/prompts/langchain_core.prompts.chat.ChatPromptTemplate.html

from langchain.prompts import ChatPromptTemplate

def create_chat_prompt_template(template: str = None) -> ChatPromptTemplate:
	"""Build the chat prompt used by the RAG chain.

	Args:
		template: Prompt text to use. When ``None``, a built-in 10K-report
			assistant prompt is used; that default contains the ``{question}``
			and ``{context}`` placeholders.

	Returns:
		The result of ``ChatPromptTemplate.from_template(template)``.
	"""
	
	if template is None:
		template = '''
		You are an expert assistant designed to help users analyze and answer questions about 10K annual reports filed by publicly traded companies. Users may ask about specific sections, financial metrics, trends, or comparisons across multiple reports. Your role is to provide accurate, concise, and relevant answers, referencing appropriate sections or data points where applicable.

		When responding, adhere to the following principles:

		Understand the Question: Identify whether the query is focused on a specific company, year, or metric, or if it spans multiple reports for comparison.
		Clarify Uncertainty: If a user's question is unclear, ask for clarification or additional context.
		Locate and Reference Information: Use relevant sections of the 10K report(s), such as MD&A, Financial Statements, Risk Factors, or Notes to Financial Statements, to back up your answers.
		Synthesize Data: Provide summaries or insights when the question involves comparing data or trends across multiple reports.
		Stay Objective: Avoid providing subjective opinions or interpretations beyond the factual content in the reports.
		Example User Queries and Expected Responses:

		"What was the revenue for Company X in 2022 and 2023?"

		Locate and report the revenue figures from the Income Statements of the respective 10K reports for 2022 and 2023.
		"What are the main risk factors for Company Y in its latest report?"

		Summarize the key risk factors from the most recent 10K report's "Risk Factors" section.
		"How did the operating income of Company Z change over the last three years?"

		Extract operating income figures from the 10K reports for the past three years and provide a brief comparison.
		"Compare the debt levels of Company A and Company B in 2023."

		Retrieve debt-related figures from the Balance Sheets or Notes to Financial Statements of both companies and summarize the comparison.
		"What trends are evident in Company W's R&D expenses over the last five years?"

		Summarize trends using data from the Income Statements or footnotes for R&D expenses across five consecutive 10K reports.
		Assumptions and Constraints:

		Only use information from 10K reports provided in the context below. 
		For complex queries spanning multiple reports, provide a structured summary highlighting key comparisons or trends.
		If certain information is unavailable, state so clearly and suggest alternative approaches to obtain it.

		Now it's your turn!
		
		{question}

		{context}
		'''
	
	return ChatPromptTemplate.from_template(template)

#####

def test_create_chat_prompt_template():
	"""Smoke test: build the default prompt template and print it."""
	print(create_chat_prompt_template())

test_create_chat_prompt_template()

In [None]:
# Create a LangChain chain.

from langchain_core.retrievers import BaseRetriever
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableSerializable
from langchain_openai import ChatOpenAI
from operator import itemgetter

def create_chain(
	model: str,
	prompt_template: ChatPromptTemplate,
	retriever: BaseRetriever,
) -> RunnableSerializable:
	"""Assemble a retrieval-augmented chain around a ChatOpenAI model.

	The chain is invoked with a dict holding a ``"question"`` key and yields a
	dict with ``"response"`` (output of ``prompt_template | llm``) and
	``"context"`` (what the retriever returned for the question).

	Args:
		model: Model name passed to ``ChatOpenAI``.
		prompt_template: Prompt that receives the ``question`` and ``context`` values.
		retriever: Retriever queried with the question text.
	"""
	llm = ChatOpenAI(model=model)

	# Step 1: look up context for the question and carry the question forward.
	retrieval_step = {
		"context": itemgetter("question") | retriever,
		"question": itemgetter("question"),
	}

	# Step 2: re-assigns `context` to itself, so the data is unchanged. It is kept
	# because it places a Runnable between the two plain dicts; without it the
	# `|` operator would be applied to two ordinary dicts rather than building a chain.
	passthrough_step = RunnablePassthrough.assign(context=itemgetter("context"))

	# Step 3: generate the answer and return it alongside the retrieved context.
	answer_step = {
		"response": prompt_template | llm,
		"context": itemgetter("context"),
	}

	return retrieval_step | passthrough_step | answer_step

#####

def test_create_chain_qdrant():
	"""Smoke test: build the full Qdrant-backed RAG chain, ask one question, and print the result."""
	embedder = create_embeddings_openai()
	pdf_docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
	pieces = chunk_docs_recursive(docs=pdf_docs)
	print(f'\nNumber of chunks = {len(pieces)}\n')

	store = create_qdrant_vector_store(":memory:", "holiday-test", 1536, embedder, pieces)
	qdrant_retriever = create_retriever_qdrant(store)
	prompt = create_chat_prompt_template()
	rag_chain = create_chain('gpt-4o', prompt, qdrant_retriever)

	answer = rag_chain.invoke({'question' : 'What is the annual revenue of Uber?'})
	print(answer)

print('\nQDRANT')
test_create_chain_qdrant()

# Create Synthetic Testset

In [None]:
# https://docs.ragas.io/en/stable/getstarted/rag_testset_generation/
# I'm using an older version of Ragas because I couldn't get the current version to work

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.testset.generator import RunConfig
from ragas.testset.generator import TestsetGenerator

# Load the PDFs and split them into chunks for test-set generation
docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
chunks = chunk_docs_recursive(docs=docs, chunk_size=1000, chunk_overlap=200)

generator_llm = ChatOpenAI(model="gpt-4o")
critic_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
embeddings = OpenAIEmbeddings()

# Create the generator
generator = TestsetGenerator.from_langchain(generator_llm=generator_llm, critic_llm=critic_llm, embeddings=embeddings)

# https://docs.ragas.io/en/v0.1.21/howtos/customisations/run_config.html
# The default is 16 workers; a smaller number is used here to avoid rate limits
run_config = RunConfig(max_workers=8)

# Number of questions to generate
test_size = 20

# Question-type distribution: multi-context questions only
distributions = {simple: 0.0, multi_context: 1.0, reasoning: 0.0}

# Generate the testset from the chunks prepared above.
# Previously the unsplit `docs` were passed here and `chunks` was computed but never used.
testset = generator.generate_with_langchain_docs(documents=chunks, test_size=test_size, distributions=distributions, run_config=run_config)

# Write the testset to disk
testset_name = "10k_multi_context_testset.csv"
print(f"Saving {testset_name}")
testset_df = testset.to_pandas()
testset_df.to_csv(testset_name)
testset_df

# Test 1 - Simple RAG Chain Using Qdrant and PyMuPDFLoader

In [None]:
# Build the RAG chain on an in-memory Qdrant vector store populated from the PDFs in docs/.
# These are module-level names; `chain` is used by the question loop in the following cell.

embeddings = create_embeddings_openai()
docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
chunks = chunk_docs_recursive(docs=docs)
print(f'\nNumber of chunks = {len(chunks)}\n')
# NOTE(review): 1536 is presumably the vector size of the default embedding model
# ('text-embedding-ada-002') — confirm if the embedding model is changed.
vector_store = create_qdrant_vector_store(':memory:', 'holiday-test', 1536, embeddings, chunks)
retriever = create_retriever_qdrant(vector_store)
chat_prompt_template = create_chat_prompt_template()
chain = create_chain('gpt-4o', chat_prompt_template, retriever)

In [None]:
# Sample questions to run through the RAG chain
questions = [
	"What is the annual revenue of Uber?",
	"What is the annual revenue of Lyft?",
	"How does Uber's revenue compare to Lyft's revenue?",
]

# For each question, print the question, the raw chain result, then just the answer text
for question in questions:
	print(question)
	result = chain.invoke({"question": question})
	print(result)
	print(result["response"].content)
	print("\n*****")