In [None]:
%pip install -qU langchain-cohere langchain-community langchain-core==0.2.40 langchain-openai langchain-qdrant
%pip install -qU docx2txt
%pip install -qU IProgress
%pip install -qU ipywidgets
%pip install -qU pymupdf
%pip install -qU python-dotenv
%pip install -qU ragas
%pip install -qU tqdm
%pip install -qU unstructured 

In [None]:
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import SingleHopSpecificQuerySynthesizer, MultiHopSpecificQuerySynthesizer

# Load the PDFs found under docs/ and split them into overlapping chunks.
# NOTE(review): process_directory, chunk_docs_recursive and PyMuPDFLoader are
# defined/imported outside this cell; the trailing positional `True` should be
# confirmed against process_directory's signature.
docs = process_directory('docs/', '**/*.pdf', PyMuPDFLoader, True)
chunks = chunk_docs_recursive(docs=docs, chunk_size=1000, chunk_overlap=200)

# Ragas expects its own LLM and embedding interfaces, so both LangChain
# clients are wrapped before they are handed to the generator. Passing a raw
# OpenAIEmbeddings() instance skips that adaptation layer.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

generator = TestsetGenerator(llm=evaluator_llm, embedding_model=evaluator_embeddings)

# Split generated questions evenly between multi-hop and single-hop queries;
# each entry is a (synthesizer, weight) pair.
query_distribution = [
    (MultiHopSpecificQuerySynthesizer(llm=evaluator_llm), 0.5),
    (SingleHopSpecificQuerySynthesizer(llm=evaluator_llm), 0.5),
]

# Generate the test set from the chunks. testset_size is deliberately small
# (2 samples) to keep the run quick; raise it for a real evaluation set.
dataset = generator.generate_with_langchain_docs(
    chunks,
    testset_size=2,
    query_distribution=query_distribution,
)

# Convert the generated dataset to a pandas DataFrame and show it.
df = dataset.to_pandas()
print(df)

# Persist the generated test set to CSV for later inspection.
output_csv_path = "10k_testset.csv"
df.to_csv(output_csv_path, index=False)
print(f"Generated testset saved to {output_csv_path}")