In [3]:
import os
from dotenv import load_dotenv

# Locate the .env file one directory above the current working directory and
# load it into the process environment. The call stays last so that its return
# value is what the cell displays.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
dotenv_path = os.path.join(parent_dir, '.env')
load_dotenv(dotenv_path=dotenv_path)

True

In [4]:
from typing import List
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

# Splitter settings are read from the environment (populated from .env).
# os.getenv returns None for an unset variable and int(None) raises an opaque
# TypeError, so fall back to a default when the variable is unset or empty.
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE") or 1000)      # default: 1000
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP") or 200)  # default: 200

def load_and_split_documents(path: str = "./data") -> List[Document]:
    """Load the PDFs found under ``path`` and cut their text into chunks.

    Files matching the glob ``**/*.pdf`` are read through ``DirectoryLoader``
    using ``PyPDFLoader``. The text of every loaded document is then split by
    a ``RecursiveCharacterTextSplitter`` configured from the module-level
    ``CHUNK_SIZE`` and ``CHUNK_OVERLAP`` values.

    Args:
        path: Directory handed to ``DirectoryLoader``.

    Returns:
        One ``Document`` per chunk. Each carries a copy of its source
        document's metadata plus a ``chunk`` key holding the chunk's
        zero-based position within that source document.
    """
    print("Loading documents...")
    pdf_loader = DirectoryLoader(
        path=path, glob="**/*.pdf", loader_cls=PyPDFLoader, show_progress=True
    )
    loaded = pdf_loader.load()
    print(f"Loaded {len(loaded)} documents.")

    # Separators are tried in the listed order; the empty string is the
    # last-resort fallback.
    splitter = RecursiveCharacterTextSplitter(
        separators=[
            "\n\n",
            "\n",
            " ",
            ".",
            ",",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            "",
        ],
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
    )

    # Chunk numbering restarts at 0 for every source document.
    split_documents = [
        Document(page_content=piece, metadata={**source.metadata, 'chunk': index})
        for source in loaded
        for index, piece in enumerate(splitter.split_text(source.page_content))
    ]
    print(f"Split into {len(split_documents)} chunks.")
    return split_documents

In [5]:
# Build the chunked corpus from the PDFs under ./data.
documents = load_and_split_documents(path="./data")

Loading documents...


100%|██████████| 2/2 [00:00<00:00,  2.29it/s]

Loaded 40 documents.
Split into 361 chunks.





In [6]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_ollama.llms import OllamaLLM 
from langchain_ollama.embeddings import OllamaEmbeddings

# Model names are read from the environment.
CHAT_MODEL = os.environ.get("OLLAMA_CHAT_MODEL")
EMBEDDING_MODEL = os.environ.get("OLLAMA_EMBEDDING_MODEL")
# NOTE(review): the endpoint is hard-coded and the os.getenv("OLLAMA_API_BASE")
# lookup is disabled — confirm whether this override is still wanted.
OLLAMA_API_BASE = "http://localhost:4444"

# LangChain clients for text generation and embeddings, both on the same server.
ollama_llm = OllamaLLM(model=CHAT_MODEL, base_url=OLLAMA_API_BASE)
ollama_embedding = OllamaEmbeddings(model=EMBEDDING_MODEL, base_url=OLLAMA_API_BASE)

# ragas wrappers around the LangChain clients.
ollama_llm_wrapper = LangchainLLMWrapper(ollama_llm)
ollama_embedding_wrapper = LangchainEmbeddingsWrapper(ollama_embedding)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from ragas.testset import TestsetGenerator

# Create the ragas test-set generator from the LangChain LLM and embedding clients.
generator = TestsetGenerator.from_langchain(llm=ollama_llm, embedding_model=ollama_embedding)

In [None]:
# Number of test samples to request. os.getenv returns None for an unset
# variable and int(None) raises an opaque TypeError, so fall back to 10 when
# TEST_SET_SIZE is unset or empty.
TEST_SET_SIZE = int(os.getenv("TEST_SET_SIZE") or 10)

# Generate the test set from the chunked documents; the debugging-log and
# exception flags are passed through unchanged.
testset = generator.generate_with_langchain_docs(
    documents=documents,
    testset_size=TEST_SET_SIZE,
    with_debugging_logs=True,
    raise_exceptions=False,
)

Applying CustomNodeFilter:   0%|          | 0/361 [00:00<?, ?it/s]            Node 35a2f06b-20f9-477a-a0c4-64b6d362d55f does not have a summary. Skipping filtering.
Node c1b39d9d-d2b2-488b-8371-883198434ace does not have a summary. Skipping filtering.
Node bd0c661b-0aca-4bd3-a087-001a7cea1b1b does not have a summary. Skipping filtering.
Node dcc554b6-49bb-42fa-a559-205bdeb258a5 does not have a summary. Skipping filtering.
Node 44b2c067-be8c-486e-9fe0-c53168dcda9f does not have a summary. Skipping filtering.
Node 9682d0d9-577c-48cf-80e0-77ea2c38ec98 does not have a summary. Skipping filtering.
Node e3000b89-7526-4be7-9e24-fc2363c09885 does not have a summary. Skipping filtering.
Node 290ab5a7-3890-4b61-8a5b-610c9510bc8d does not have a summary. Skipping filtering.
Node af8a8a7c-67b0-4e13-9fff-bfce32c0cfc2 does not have a summary. Skipping filtering.
Node 76d1fe96-0717-48ea-aeec-4d62eff26e71 does not have a summary. Skipping filtering.
Node 11099ea5-9f56-4036-acec-c6fb73c1b738 does not h

RagasOutputParserException: The output parser failed to parse the output including retries.

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt persona_generation_prompt failed to parse output: The output parser failed to parse the output including retries.
