Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/ragas/dataset_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ class SingleTurnSample(BaseSample):
The reference answer for the query.
rubric : Optional[Dict[str, str]]
Evaluation rubric for the sample.
persona_name : Optional[str]
Name of the persona used in query generation.
query_style : Optional[str]
Style of the generated query (e.g., formal, casual).
query_length : Optional[str]
Length category of the query (e.g., short, medium, long).
"""

user_input: t.Optional[str] = None
Expand All @@ -86,6 +92,9 @@ class SingleTurnSample(BaseSample):
multi_responses: t.Optional[t.List[str]] = None
reference: t.Optional[str] = None
rubrics: t.Optional[t.Dict[str, str]] = None
persona_name: t.Optional[str] = None
query_style: t.Optional[str] = None
query_length: t.Optional[str] = None


class MultiTurnSample(BaseSample):
Expand Down
3 changes: 3 additions & 0 deletions src/ragas/testset/synthesizers/single_hop/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,7 @@ async def _generate_sample(
user_input=response.query,
reference=response.answer,
reference_contexts=[reference_context],
persona_name=getattr(scenario.persona, "name", None),
query_style=getattr(scenario.style, "name", None),
query_length=getattr(scenario.length, "name", None),
)
27 changes: 27 additions & 0 deletions tests/unit/test_dataset_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,33 @@ def test_evaluation_dataset_load_from_hf(eval_sample):
assert loaded_dataset == dataset


def test_single_turn_sample_metadata_roundtrip_hf_and_jsonl(tmpdir):
sample = SingleTurnSample(
user_input="Q",
response="A",
reference_contexts=["ctx"],
persona_name="Researcher",
query_style="FORMAL",
query_length="SHORT",
)
dataset = EvaluationDataset(samples=[sample])

# HF round-trip
hf = dataset.to_hf_dataset()
loaded_hf = EvaluationDataset.from_hf_dataset(hf)
assert loaded_hf.samples[0].persona_name == "Researcher"
assert loaded_hf.samples[0].query_style == "FORMAL"
assert loaded_hf.samples[0].query_length == "SHORT"

# JSONL round-trip
jsonl_path = tmpdir / "ds.jsonl"
dataset.to_jsonl(jsonl_path)
loaded_jsonl = EvaluationDataset.from_jsonl(jsonl_path)
assert loaded_jsonl.samples[0].persona_name == "Researcher"
assert loaded_jsonl.samples[0].query_style == "FORMAL"
assert loaded_jsonl.samples[0].query_length == "SHORT"


@pytest.mark.parametrize("eval_sample", samples)
def test_single_type_evaluation_dataset(eval_sample):
single_turn_sample = SingleTurnSample(user_input="What is X", response="Y")
Expand Down
16 changes: 12 additions & 4 deletions tests/unit/test_knowledge_graph_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -951,7 +951,8 @@ def test_performance_find_n_indirect_clusters_large_web_constant_n(
curr_time = results[i]["time"]

# Skip performance check if previous time is too small to measure accurately
if prev_time < 1e-6: # Less than 1 microsecond
# Increased threshold to account for timing variance on CI (especially Windows)
if prev_time < 1e-4: # Less than 100 microseconds
print(
f"Skipping performance check for size {results[i]['size']} vs {results[i - 1]['size']}: "
f"previous time too small ({prev_time:.9f}s)"
Expand All @@ -961,12 +962,19 @@ def test_performance_find_n_indirect_clusters_large_web_constant_n(
time_ratio = curr_time / prev_time

scaled_size_ratio = size_ratio**2.5
# Add tolerance for platform variance; operations can be noisy on Windows runners
if prev_time < 1e-3:
tolerance_factor = 3.0
else:
tolerance_factor = 2.0
tolerance_threshold = scaled_size_ratio * tolerance_factor

print(
f"Size ratio: {size_ratio:.2f}, (Scaled: {scaled_size_ratio:.2f}), Time ratio: {time_ratio:.2f}"
f"Size ratio: {size_ratio:.2f}, (Scaled: {scaled_size_ratio:.2f}), Time ratio: {time_ratio:.2f}, Tolerance: {tolerance_threshold:.2f}"
)

assert time_ratio < scaled_size_ratio, (
f"Time complexity growing faster than expected: size {results[i]['size']} vs {results[i - 1]['size']}, time ratio {time_ratio:.2f} vs {scaled_size_ratio:.2f}"
assert time_ratio < tolerance_threshold, (
f"Time complexity growing faster than expected: size {results[i]['size']} vs {results[i - 1]['size']}, time ratio {time_ratio:.2f} vs {tolerance_threshold:.2f}"
)


Expand Down
42 changes: 42 additions & 0 deletions tests/unit/test_single_hop_query_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,48 @@ async def test_generate_scenarios_with_tuple_entities(fake_llm):
assert len(scenarios) > 0


@pytest.mark.asyncio
async def test_generate_sample_includes_metadata(fake_llm):
node = Node(type=NodeType.CHUNK)
node.add_property("page_content", "Context about microservices and patterns.")
persona = Persona(name="Engineer", role_description="Builds systems")

synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)

# Stub the prompt to avoid LLM dependency and return deterministic values
class StubPrompt(PydanticPrompt):
async def generate(self, data, llm, callbacks=None): # type: ignore[override]
class R:
query = "What is microservices?"
answer = "Microservices are loosely coupled services."

return R()

synthesizer.generate_query_reference_prompt = StubPrompt()

# Build a minimal scenario
from ragas.testset.synthesizers.base import QueryLength, QueryStyle
from ragas.testset.synthesizers.single_hop.base import SingleHopScenario

scenario = SingleHopScenario(
nodes=[node],
persona=persona,
style=QueryStyle.PERFECT_GRAMMAR,
length=QueryLength.MEDIUM,
term="microservices",
)

sample = await synthesizer._generate_sample(scenario, callbacks=None) # type: ignore[arg-type]

assert sample.user_input == "What is microservices?"
assert sample.reference == "Microservices are loosely coupled services."
assert sample.reference_contexts == ["Context about microservices and patterns."]
# New metadata fields
assert sample.persona_name == "Engineer"
assert sample.query_style == "PERFECT_GRAMMAR"
assert sample.query_length == "MEDIUM"


@pytest.mark.asyncio
async def test_generate_scenarios_with_string_entities(fake_llm):
"""Test that _generate_scenarios still works with string-formatted entities."""
Expand Down
Loading