# ragas golden dataset

In [1]:
# 1. Install via uv, pinning only ragas, langchain-community, and pypdf2

!uv pip install -qU ragas==0.2.15 langchain langchain-community langchain-openai openai pypdf rapidfuzz

from google.colab import userdata
import os
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

from uuid import uuid4

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = f"ragas-golden-dataset-{uuid4().hex[0:8]}"
os.environ["LANGSMITH_API_KEY"] = userdata.get("LANGCHAIN_API_KEY")

# 2. Download PDFs
!mkdir -p data

!curl -L https://arxiv.org/pdf/2505.10468.pdf \
     -o data/ai_agents_vs_agentic_ai_2505.10468.pdf

!curl -L https://arxiv.org/pdf/2505.06913.pdf \
     -o data/redteamllm_agentic_ai_framework_2505.06913.pdf

!curl -L https://arxiv.org/pdf/2505.06817.pdf \
     -o data/control_plane_scalable_design_pattern_2505.06817.pdf

# 3. Read every *.pdf found in data/ into LangChain Documents
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader

# silent_errors=True: a file that cannot be read is skipped instead of aborting the load.
loader = PyPDFDirectoryLoader("data/", glob="*.pdf", silent_errors=True)

# Each returned Document carries page_content plus a metadata dict.
docs = loader.load()
print(f"Loaded {len(docs)} pages across all PDFs")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   249  100   249    0     0    444      0 --:--:-- --:--:-- --:--:--   444
100 3121k  100 3121k    0     0  4615k      0 --:--:-- --:--:-- --:--:-- 4615k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   249  100   249    0     0   1162      0 --:--:-- --:--:-- --:--:--  1169
100  546k  100  546k    0     0  1870k      0 --:--:-- --:--:-- --:--:-- 1870k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   249  100   249    0     0    387      0 --:--:-- --:--:-- --:--:--   387
100  273k  100  273k    0     0   378k      0 --:--:-- --:--:-- --:--:--  378k
Loaded 50 pages across all PDFs


In [2]:
from ragas.testset import TestsetGenerator
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# 1. Wrap the OpenAI embedding and chat models so ragas can drive them
generator_embeddings = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(model="text-embedding-3-small")
)
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))

generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)

# 2. Generate the test set from the loaded pages (the knowledge graph is built
#    internally as part of this call). testset_size is the requested size; the
#    recorded run produced 12 samples for a request of 10.
#    The resulting dataset is pushed to Hugging Face in a later cell.
ragas_dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

# 3. Keep a handle on the knowledge graph and persist it as JSON
kg = generator.knowledge_graph
kg.save("my_ragas_knowledge_graph.json")

# 4. (Optional) Report how many nodes and relationships the graph holds
print(f"Nodes:    {len(kg.nodes)}")
print(f"Edges:    {len(kg.relationships)}")

Applying HeadlinesExtractor:   0%|          | 0/44 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/50 [00:00<?, ?it/s]

ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/64 [00:00<?, ?it/s]



Applying CustomNodeFilter:   0%|          | 0/51 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/166 [00:00<?, ?it/s]



Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/12 [00:00<?, ?it/s]

Nodes:    121
Edges:    755


In [3]:
import numpy as np
import pandas as pd

# 1. Pull the generated samples out as a list of plain dict records
records = ragas_dataset.to_list()

# 2. Replace any numpy-array value with an ordinary list; every other value
#    passes through unchanged
cleaned = [
    {
        key: value.tolist() if isinstance(value, np.ndarray) else value
        for key, value in record.items()
    }
    for record in records
]

# 3. Assemble the DataFrame and write it out without the index column
ragas_df = pd.DataFrame(cleaned)
ragas_df.to_csv("ragas_golden_dataset.csv", index=False)
print(f"Saved {len(ragas_df)} rows to CSV")

Saved 12 rows to CSV


In [12]:
# Convert the generated test set via its to_hf_dataset() method; the result is
# what a later cell uploads with push_to_hub().
ragas_testset_hf = ragas_dataset.to_hf_dataset()

In [10]:
# Authenticate with the Hugging Face Hub (renders an interactive login widget).
# This cell only logs in; the actual upload of ragas_testset_hf happens in the
# push_to_hub cell.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
# Upload ragas_testset_hf to the Hugging Face Hub dataset repo
# "dwb2023/ragas-golden-dataset". Requires a prior successful Hub login.
ragas_testset_hf.push_to_hub("dwb2023/ragas-golden-dataset")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dwb2023/ragas-golden-dataset/commit/e350e308da54d49fbc4ad51f48e7086990315340', commit_message='Upload dataset', commit_description='', oid='e350e308da54d49fbc4ad51f48e7086990315340', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dwb2023/ragas-golden-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dwb2023/ragas-golden-dataset'), pr_revision=None, pr_num=None)

In [None]:
# 6. (Optional) Create a pandas DataFrame directly from the test set and export it.
# NOTE(review): this rebinds ragas_df and rewrites ragas_golden_dataset.csv, both
# of which the earlier DataFrame-building cell already produced. Confirm whether
# both export paths are needed, since whichever cell runs last wins.
ragas_df = ragas_dataset.to_pandas()
# ragas_df  (uncomment to display the frame)
ragas_df.to_csv('ragas_golden_dataset.csv', index=False)

In [4]:
# Sanity check on what the PDF loader returned: container type and size.
print(f"Type of docs: {type(docs)}")
print(f"Number of documents: {len(docs)}")

# Peek at the first document only when at least one was loaded.
if docs:
    first_doc = docs[0]
    print(f"Type of first document: {type(first_doc)}")
    # Show a short content preview followed by the full metadata dict.
    print(f"Content of first document (first 100 chars): {first_doc.page_content[:100]}...")
    print(f"Metadata of first document: {first_doc.metadata}")

Type of docs: <class 'list'>
Number of documents: 50
Type of first document: <class 'langchain_core.documents.base.Document'>
Content of first document (first 100 chars): arXiv:2505.06817v1  [cs.AI]  11 May 2025
Control Plane as a Tool: A Scalable Design Pattern for
Agen...
Metadata of first document: {'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:f38b2be)', 'creationdate': '', 'author': 'Sivasathivel Kandasamy', 'doi': 'https://doi.org/10.48550/arXiv.2505.06817', 'license': 'http://creativecommons.org/licenses/by-nc-nd/4.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'title': 'Control Plane as a Tool: A Scalable Design Pattern for Agentic AI Systems', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2505.06817v1', 'source': 'data/control_plane_scalable_design_pattern_2505.06817.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}
