# Setup

In [None]:
%pip install -qU IProgress
%pip install -qU ipywidgets
%pip install -qU python-dotenv
%pip install -qU tqdm

%pip install -qU langchain-core==0.2.40
%pip install langchain-openai

%pip install -qU pymupdf
%pip install -qU ragas==0.1.20

# Environment Variables

In [None]:
# Get environment variables

from dotenv import load_dotenv


# Load environment variables from the local .env file.
# NOTE(review): presumably this is where the OpenAI API key used by the ChatOpenAI /
# OpenAIEmbeddings clients further down comes from — confirm the .env file defines it.
load_dotenv()

# Utilities

In [None]:
# Load docs from a directory

from langchain_community.document_loaders import DirectoryLoader 
from langchain_community.document_loaders import PyMuPDFLoader

def process_directory(path: str, glob: str, loader_cls: type, use_multithreading: bool = True) -> list:
	"""Load every file under ``path`` that matches ``glob``.

	Args:
		path: Directory handed to ``DirectoryLoader``.
		glob: Glob pattern selecting the files to load (e.g. ``"**/*.pdf"``).
		loader_cls: The loader *class* (not its name as a string) that
			``DirectoryLoader`` should use for each file, e.g. ``PyMuPDFLoader``.
		use_multithreading: Forwarded unchanged to ``DirectoryLoader``.

	Returns:
		The result of ``DirectoryLoader.load()``.
	"""
	loader = DirectoryLoader(
		path=path,
		glob=glob,
		show_progress=True,  # always show a progress bar while loading
		loader_cls=loader_cls,
		use_multithreading=use_multithreading,
	)
	return loader.load()

#####

def test_process_directory():
	"""Smoke check: load the PDFs under docs/ and print how many documents came back."""
	loaded = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
	doc_count = len(loaded)
	print(doc_count)

# Runs every time this cell is executed.
test_process_directory()

In [4]:
# Create a text splitter using recursive character text splitter

# https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/recursive_text_splitter/

from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_docs_recursive(docs: list, chunk_size=500, chunk_overlap=50) -> list:
	"""Split ``docs`` with a ``RecursiveCharacterTextSplitter``.

	Args:
		docs: Documents to split.
		chunk_size: Forwarded to the splitter as ``chunk_size``.
		chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

	Returns:
		The result of the splitter's ``split_documents(docs)``.
	"""
	splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
	return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

#####

def test_chunk_docs_recursive(): 
	docs = process_directory("docs/", "**/*.pdf", PyMuPDFLoader, True)
	chunks = chunk_docs_recursive(docs=docs)
	
	print(f"\nNumber of chunks = {len(chunks)}\n")
	print(f"First chunk = {chunks[0].page_content}")

# test_chunk_docs_recursive()

# Create Synthetic Testset

In [None]:
# https://docs.ragas.io/en/stable/getstarted/rag_testset_generation/
# I'm using an older version of Ragas because I couldn't get the current version to work

from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from ragas.testset.evolutions import multi_context, reasoning, simple
from ragas.testset.generator import RunConfig, TestsetGenerator

# Load the source PDFs
docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)

# Split the documents into chunks
# NOTE(review): `chunks` is built here, but the generator call below is given `docs`,
# not `chunks`. Either pass `chunks` to the generator or drop this step - confirm which
# one was intended.
chunks = chunk_docs_recursive(docs=docs, chunk_size=1000, chunk_overlap=200)

# Models handed to the generator
generator_llm = ChatOpenAI(model="gpt-4o-mini")
critic_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
embeddings = OpenAIEmbeddings()

# Create the generator
generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

# https://docs.ragas.io/en/v0.1.21/howtos/customisations/run_config.html
# max_workers is set to 16 (noted as the default); lower it if you run into rate limits
run_config = RunConfig(max_workers=16)

# Set the number of questions
test_size = 10

# Set the distribution of question types
distributions = {simple: 0.2, multi_context: 0.4, reasoning: 0.4}

# Generate the testset
testset = generator.generate_with_langchain_docs(
    documents=docs,
    test_size=test_size,
    distributions=distributions,
    run_config=run_config,
)

# Write the testset to disk as CSV, then display it
testset_name = "10k_testset.csv"
print(f"Saving {testset_name}")
testset_df = testset.to_pandas()
testset_df.to_csv(testset_name)
testset_df