In [1]:
import os
import random

from dotenv import load_dotenv, find_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator

In [2]:
# Pull variables from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())

# Connection settings: ChatOpenAI is pointed at the DeepSeek endpoint via base_url.
BASE_URL = "https://api.deepseek.com"
MODEL = "deepseek-chat"
API_KEY = os.environ["DEEPSEEK_API_KEY"]  # raises KeyError when the variable is not set

# Embeddings used by the test set generator.
# NOTE(review): the recorded run warns that this checkpoint is not a
# sentence-transformers model and that mean pooling is applied instead.
generator_embeddings = HuggingFaceEmbeddings(
    model_name="bert-base-german-dbmdz-uncased",
)

# Chat model, wrapped so that ragas can drive it.
chat_model = ChatOpenAI(model=MODEL, api_key=API_KEY, base_url=BASE_URL)
generator_llm = LangchainLLMWrapper(chat_model)

No sentence-transformers model found with name bert-base-german-dbmdz-uncased. Creating a new one with mean pooling.


In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the source PDF and cut it into chunks for test set generation.
loader = PyPDFLoader("../data/Kompendium1.pdf")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)

# Load first, then split: the explicit two-step form of loader.load_and_split(text_splitter).
documents = text_splitter.split_documents(loader.load())

In [4]:
# How many chunks did the split produce? Indices into this list are sampled in the next step.
len(documents)

413

In [5]:
# Randomly select a subset of the chunks to save money on LLM usage.
# A dedicated, seeded generator keeps the selection identical across
# "Restart & Run All", so the generated test set is reproducible.
SUBSET_SEED = 42
SUBSET_SIZE = 20
FIRST_INDEX = 6  # NOTE(review): presumably skips front matter - confirm
LAST_INDEX = min(400, len(documents))  # exclusive; clamped so indexing cannot run past the list

candidate_indices = range(FIRST_INDEX, LAST_INDEX)
random_numbers = random.Random(SUBSET_SEED).sample(
    candidate_indices,
    min(SUBSET_SIZE, len(candidate_indices)),  # never request more items than are available
)
docs_subset = [documents[i] for i in random_numbers]

In [6]:
# The LLM is already wrapped with LangchainLLMWrapper; wrap the LangChain
# embeddings the same way so that ragas receives its own embeddings interface
# instead of the raw LangChain object.
from ragas.embeddings import LangchainEmbeddingsWrapper

generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=LangchainEmbeddingsWrapper(generator_embeddings),
)

In [7]:
from ragas.testset.synthesizers.single_hop.specific import (
    SingleHopSpecificQuerySynthesizer,
)

distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1.0),
]

for query, _ in distribution:
    prompts = await query.adapt_prompts("deutsch", llm=generator_llm)
    query.set_prompts(**prompts)

In [8]:
# Generate the synthetic test set from the sampled chunks using the configured query mix.
dataset = generator.generate_with_langchain_docs(
    docs_subset,
    testset_size=20,
    query_distribution=distribution,
)

# Convert to a DataFrame and preview the first rows.
dataset_df = dataset.to_pandas()
dataset_df.head()

Applying SummaryExtractor:   0%|          | 0/17 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/20 [00:00<?, ?it/s]

Node e6a31911-5ed8-4d72-a5a0-5c4c8a2c85b7 does not have a summary. Skipping filtering.
Node d2f6d06b-c30f-49ea-8547-c0946a8d734d does not have a summary. Skipping filtering.
Node 2f4a7312-5c1c-44b5-99c3-43be8ad0a59a does not have a summary. Skipping filtering.
unable to apply transformation: Failed to parse QuestionPotentialOutput from completion {"properties": {"score": {"description": "1 to 5 score", "title": "Score", "type": "integer"}}, "required": ["score"], "title": "QuestionPotentialOutput", "type": "object"}. Got: 1 validation error for QuestionPotentialOutput
score
  Field required [type=missing, input_value={'properties': {'score': ...tput', 'type': 'object'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
unable to apply transformation: Failed to parse QuestionPotentialOutput from completion {"properties": {"score": {"des

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/57 [00:00<?, ?it/s]

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/18 [00:00<?, ?it/s]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What is SVB and how does it relate to occupati...,"[sicherung übertragen bekommt, wird nur dann n...",The SVB (Ärztliche Dienste der SVB) is involve...,single_hop_specifc_query_synthesizer
1,Can you explain what the Allgemeinen Sozialver...,[zu steigern und damit das Ausmaß der Beanspru...,The Allgemeinen Sozialversicherungsgesetzes (A...,single_hop_specifc_query_synthesizer
2,What is the contribution of Baumgartner E. in ...,[tiert!\nLiteratur:\n1. Baumgartner E./Wensel ...,"Baumgartner E., along with Wensel H.G., contri...",single_hop_specifc_query_synthesizer
3,What is the protective function of the spine i...,[3) Detailanalyse\na) Heben und Tragen - Arbei...,The spine provides a protective function for t...,single_hop_specifc_query_synthesizer
4,What can cause elevated IgE levels in occupati...,[Werte können während der Alle rgieschübe auft...,Elevated IgE levels can occur during allergy a...,single_hop_specifc_query_synthesizer
