In [1]:
import os
from dotenv import load_dotenv

# Locate the `.env` file one directory above the current working directory
# and load it. The call is left as the cell's final expression so its return
# value is displayed.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
dotenv_path = os.path.join(parent_dir, ".env")
load_dotenv(dotenv_path)

True

In [2]:
from typing import List
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

def env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The variable parsed as an ``int``, or ``default`` when it is missing.

    Raises:
        ValueError: If the variable is set to something that is not an
            integer. The message names the offending variable.
    """
    raw = os.getenv(name)
    # os.getenv returns None for an unset variable; int(None) would raise an
    # opaque TypeError, so a missing or blank value falls back to the default.
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from None


# Fallbacks mirror the LangChain text-splitter defaults (chunk_size=4000,
# chunk_overlap=200), so an unset variable behaves as if the argument had
# not been passed to the splitter at all.
CHUNK_SIZE = env_int("CHUNK_SIZE", 4000)
CHUNK_OVERLAP = env_int("CHUNK_OVERLAP", 200)

def load_and_split_documents(path: str = "./data") -> List[Document]:
    """Load the PDFs found under ``path`` and break their text into chunks.

    Files are matched with the glob pattern ``**/*.pdf`` and read with
    ``PyPDFLoader``. The text of each loaded document is split using the
    module-level ``CHUNK_SIZE`` and ``CHUNK_OVERLAP`` settings. Every chunk
    becomes a new ``Document`` that carries a copy of its source document's
    metadata plus a ``chunk`` key holding the chunk's index within that source.

    Args:
        path: Directory to search for PDF files.

    Returns:
        The chunk documents, in the order the source documents were loaded.
    """
    print("Loading documents...")
    pdf_loader = DirectoryLoader(path=path, glob="**/*.pdf", loader_cls=PyPDFLoader)
    documents = pdf_loader.load()
    print(f"Loaded {len(documents)} documents.")

    splitter = RecursiveCharacterTextSplitter(
        # Candidate split points, in the order they are handed to the splitter.
        separators=[
            "\n\n",
            "\n",
            " ",
            ".",
            ",",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            "",
        ],
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
    )

    # One Document per chunk; the metadata dict is rebuilt for every chunk so
    # the source document's own metadata is never mutated.
    split_documents = [
        Document(page_content=piece, metadata={**source.metadata, "chunk": index})
        for source in documents
        for index, piece in enumerate(splitter.split_text(source.page_content))
    ]
    print(f"Split into {len(split_documents)} chunks.")
    return split_documents

In [3]:
# Load and chunk the PDFs under ./data; `documents` holds the resulting chunks.
documents = load_and_split_documents(path="./data")

Loading documents...
Loaded 86 documents.
Split into 903 chunks.


In [4]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_ollama.llms import OllamaLLM 
from langchain_ollama.embeddings import OllamaEmbeddings

# Ollama settings read from the environment; each is None when unset.
OLLAMA_API_BASE = os.environ.get("OLLAMA_API_BASE")
CHAT_MODEL = os.environ.get("OLLAMA_CHAT_MODEL")
EMBEDDING_MODEL = os.environ.get("OLLAMA_EMBEDDING_MODEL")

# Chat model and its ragas wrapper.
ollama_llm = OllamaLLM(base_url=OLLAMA_API_BASE, model=CHAT_MODEL)
ollama_llm_wrapper = LangchainLLMWrapper(ollama_llm)

# Embedding model and its ragas wrapper.
ollama_embedding = OllamaEmbeddings(base_url=OLLAMA_API_BASE, model=EMBEDDING_MODEL)
ollama_embedding_wrapper = LangchainEmbeddingsWrapper(ollama_embedding)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from ragas.testset import TestsetGenerator

# Build the ragas test-set generator from the plain LangChain Ollama objects
# (the chat model and the embedding model), not from the ragas wrappers.
generator = TestsetGenerator.from_langchain(
    embedding_model=ollama_embedding,
    llm=ollama_llm,
)

In [6]:
# Number of test samples to generate, read from the environment.
# os.getenv returns None for an unset variable and int(None) fails with an
# opaque TypeError, so the value is checked first. No fallback is applied:
# the size drives a long-running generation and should be chosen explicitly.
_test_set_size_raw = os.getenv("TEST_SET_SIZE")
if _test_set_size_raw is None or not _test_set_size_raw.strip():
    raise ValueError(
        "Environment variable TEST_SET_SIZE is not set; "
        "define it as an integer (for example in the .env file)."
    )
try:
    TEST_SET_SIZE = int(_test_set_size_raw)
except ValueError:
    raise ValueError(
        f"Environment variable TEST_SET_SIZE must be an integer, got {_test_set_size_raw!r}"
    ) from None

# NOTE(review): the recorded run of this cell ended with
# "TypeError: 'float' object is not iterable" after repeated SummaryExtractor
# parse failures. Presumably the failures swallowed by raise_exceptions=False
# leave missing values that a later step tries to iterate over -- TODO confirm,
# and consider raise_exceptions=True while debugging so the first failure
# surfaces at its source.
testset = generator.generate_with_langchain_docs(
    documents=documents,
    testset_size=TEST_SET_SIZE,
    with_debugging_logs=True,
    raise_exceptions=False,
)

Applying SummaryExtractor:   9%|▉         | 56/636 [17:59<3:11:40, 19.83s/it]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt summary_extractor_prompt failed to parse output: The output parser failed to parse the output including retries.
unable to apply transformation: The output parser failed to parse the output including retries.
Applying SummaryExtractor:  21%|██        | 135/636 [46:28<2:33:39, 18.40s/it]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output pars

TypeError: 'float' object is not iterable