Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion src/ragas/testset/synthesizers/single_hop/specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,44 @@ class SingleHopSpecificQuerySynthesizer(SingleHopQuerySynthesizer):
theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt()
property_name: str = "entities"

def _extract_themes_from_items(self, items: t.Any) -> t.List[str]:
    """
    Extract unique theme names from various formats.

    Handles multiple data formats that might appear during synthesis:
    - List[Tuple[str, str]]: Entity pairs (from overlap detection)
    - List[List[str]]: Entity pairs as lists
    - List[str]: Direct entity names
    - Dict[str, Any]: Keys as entity names

    Non-string keys and non-string elements are ignored. Duplicates are
    dropped while keeping the order in which each theme was first seen,
    so the same input always yields the same output.

    Parameters
    ----------
    items : t.Any
        The items to extract themes from.

    Returns
    -------
    t.List[str]
        List of unique theme strings in first-seen order. Empty when
        ``items`` is neither a dict nor a list.
    """
    if isinstance(items, dict):
        # Dict keys are already unique and ordered; keep only the strings
        # so the declared List[str] return type actually holds.
        return [key for key in items if isinstance(key, str)]

    if not isinstance(items, list):
        return []

    # A dict used as an ordered set: insertion order is preserved, unlike
    # a plain ``set`` whose iteration order for strings varies per process.
    seen: t.Dict[str, None] = {}
    for item in items:
        if isinstance(item, str):
            seen[item] = None
        elif isinstance(item, (tuple, list)):
            # Extract strings from pairs/sequences
            for element in item:
                if isinstance(element, str):
                    seen[element] = None

    return list(seen)

def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[Node]:
node_type_dict = defaultdict(int)
for node in knowledge_graph.nodes:
Expand Down Expand Up @@ -101,7 +139,14 @@ async def _generate_scenarios(
for node in nodes:
if len(scenarios) >= n:
break
themes = node.properties.get(self.property_name, [""])
raw_themes = node.properties.get(self.property_name, [])
# Extract themes from potentially mixed data types (handles tuples, lists, strings)
themes = self._extract_themes_from_items(raw_themes)

if not themes: # Skip if no themes extracted
logger.debug("No themes extracted from node %s. Skipping.", node.id)
continue

prompt_input = ThemesPersonasInput(themes=themes, personas=persona_list)
persona_concepts = await self.theme_persona_matching_prompt.generate(
data=prompt_input, llm=self.llm, callbacks=callbacks
Expand Down
147 changes: 147 additions & 0 deletions tests/unit/test_single_hop_query_synthesizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import typing as t

import pytest

from ragas.prompt import PydanticPrompt
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
from ragas.testset.persona import Persona
from ragas.testset.synthesizers.prompts import PersonaThemesMapping, ThemesPersonasInput
from ragas.testset.synthesizers.single_hop.specific import (
SingleHopSpecificQuerySynthesizer,
)


class MockThemePersonaMatchingPrompt(PydanticPrompt):
    """Prompt stand-in that assigns every input theme to every persona."""

    async def generate(self, data: ThemesPersonasInput, llm, callbacks=None):
        """Return a mapping from each persona name to the full theme list.

        ``llm`` and ``callbacks`` are accepted but not used.
        """
        persona_names = (persona.name for persona in data.personas)
        # Every persona shares the same themes list object as its value.
        return PersonaThemesMapping(mapping=dict.fromkeys(persona_names, data.themes))


def test_extract_themes_from_items_with_strings(fake_llm):
    """A flat list of strings yields exactly those strings as themes."""
    expected = {"Theme1", "Theme2", "Theme3"}
    synth = SingleHopSpecificQuerySynthesizer(llm=fake_llm)

    extracted = synth._extract_themes_from_items(["Theme1", "Theme2", "Theme3"])

    assert set(extracted) == expected


def test_extract_themes_from_items_with_tuples(fake_llm):
    """Tuple pairs are flattened into their distinct string members (the bug fix)."""
    synth = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
    # Tuple-shaped entities are the format behind the ValidationError in issue #2368.
    entity_pairs = [("Entity1", "Entity1"), ("Entity2", "Entity2")]

    extracted = synth._extract_themes_from_items(entity_pairs)

    assert set(extracted) == {"Entity1", "Entity2"}


def test_extract_themes_from_items_with_mixed_formats(fake_llm):
    """A list mixing a string, a tuple and a nested list is handled in one pass."""
    synth = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
    mixed_items = [
        "Theme1",
        ("Entity2", "Entity2"),
        ["Entity3", "Entity3"],
    ]

    extracted = synth._extract_themes_from_items(mixed_items)

    assert set(extracted) == {"Theme1", "Entity2", "Entity3"}


def test_extract_themes_from_items_with_dict(fake_llm):
    """For dict input the keys, not the values, become the themes."""
    synth = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
    theme_dict = {"Theme1": "value1", "Theme2": "value2"}

    extracted = synth._extract_themes_from_items(theme_dict)

    assert set(extracted) == {"Theme1", "Theme2"}


def test_extract_themes_from_items_empty_input(fake_llm):
    """An empty list, ``None`` and a bare string all produce an empty result."""
    synth = SingleHopSpecificQuerySynthesizer(llm=fake_llm)

    # Checked in the same order as before: empty list, None, bare string.
    for unusable in ([], None, "invalid"):
        assert synth._extract_themes_from_items(unusable) == []


def test_extract_themes_from_items_with_nested_empty_tuples(fake_llm):
    """Test _extract_themes_from_items skips empty sequences and non-string elements."""
    synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)

    # The empty tuple and empty list exercise the case the test is named for:
    # nested sequences that contribute nothing must not break extraction.
    items = [(), ("Theme1", 123), [], (456, "Theme2"), ("Theme3", "Theme3")]
    themes = synthesizer._extract_themes_from_items(items)

    # Only string elements should be extracted
    assert set(themes) == {"Theme1", "Theme2", "Theme3"}


@pytest.mark.asyncio
async def test_generate_scenarios_with_tuple_entities(fake_llm):
    """Scenario generation succeeds when a node's entities are stored as tuples.

    Covers the fix for issue #2368, where an ``entities`` property holding
    tuples led to a ValidationError.
    """
    # A single chunk node carrying the problematic tuple-shaped entities.
    chunk = Node(type=NodeType.CHUNK)
    chunk.add_property("entities", [("Entity1", "Entity1"), ("Entity2", "Entity2")])
    graph = KnowledgeGraph(nodes=[chunk])

    researcher = Persona(
        name="Researcher",
        role_description="A researcher interested in entities.",
    )

    synth = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
    # Swap in the mock so theme/persona matching does not go through the real prompt.
    synth.theme_persona_matching_prompt = MockThemePersonaMatchingPrompt()

    # Must complete without raising ValidationError.
    generated = await synth._generate_scenarios(
        n=2,
        knowledge_graph=graph,
        persona_list=[researcher],
        callbacks=None,
    )

    # At least one scenario has to come back.
    assert len(generated) > 0


@pytest.mark.asyncio
async def test_generate_scenarios_with_string_entities(fake_llm):
    """Scenario generation keeps working for plain string entities."""
    # A single chunk node with the ordinary list-of-strings entity format.
    chunk = Node(type=NodeType.CHUNK)
    chunk.add_property("entities", ["Entity1", "Entity2", "Entity3"])
    graph = KnowledgeGraph(nodes=[chunk])

    researcher = Persona(
        name="Researcher",
        role_description="A researcher interested in entities.",
    )

    synth = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
    # Swap in the mock so theme/persona matching does not go through the real prompt.
    synth.theme_persona_matching_prompt = MockThemePersonaMatchingPrompt()

    # Behaviour for string entities should be unchanged.
    generated = await synth._generate_scenarios(
        n=2,
        knowledge_graph=graph,
        persona_list=[researcher],
        callbacks=None,
    )

    # At least one scenario has to come back.
    assert len(generated) > 0
Loading