Merged
44 commits
c8042d9
add persona and topic extraction
shahules786 Nov 4, 2024
b965463
update init
shahules786 Nov 4, 2024
33fa657
add scikit learn to dev
shahules786 Nov 4, 2024
6e2c387
add persona to init
shahules786 Nov 4, 2024
1b8fe6e
add scikit-learn
shahules786 Nov 4, 2024
de73e68
add description
shahules786 Nov 4, 2024
3cb471c
find indirect and direct clusters
shahules786 Nov 4, 2024
f537ccc
add similarity builder
shahules786 Nov 4, 2024
3b7854f
add single hop
shahules786 Nov 4, 2024
9eae691
move prompts
shahules786 Nov 4, 2024
46b6a70
delete older query types
shahules786 Nov 4, 2024
caf0e4b
add new question types
shahules786 Nov 4, 2024
6e04a7c
add base
shahules786 Nov 4, 2024
67de667
add non llm rel builder
shahules786 Nov 5, 2024
65bb30f
add imports to init
shahules786 Nov 5, 2024
a2a1f56
more fixes
shahules786 Nov 5, 2024
632401f
modify dfs to return unique paths
shahules786 Nov 5, 2024
eebc42d
Merge branch 'main' into testgen-imp-queries
shahules786 Nov 5, 2024
1d3f15a
redo headline extraction
shahules786 Nov 5, 2024
fe7d795
add new default transforms
shahules786 Nov 5, 2024
4e3fb5a
add persona
shahules786 Nov 5, 2024
bc43072
add defaults
shahules786 Nov 5, 2024
6c0c64b
add callback
shahules786 Nov 5, 2024
91cc47a
reformat persona generation
shahules786 Nov 5, 2024
cf6b81c
add default filter
shahules786 Nov 5, 2024
2d967ce
fix merge
shahules786 Nov 5, 2024
b5262ae
fix typo
shahules786 Nov 5, 2024
1e8b8e3
rename PersonaList
shahules786 Nov 5, 2024
d38806f
make it a dataclass
shahules786 Nov 5, 2024
70b6629
simplify prompt
shahules786 Nov 5, 2024
6ab5488
redo persona
shahules786 Nov 5, 2024
dbfab95
type fixes
shahules786 Nov 5, 2024
3019d22
add names
shahules786 Nov 5, 2024
e816496
modify tests for improved version
shahules786 Nov 5, 2024
57b0923
Introduce epsilon in denominators to avoid division by zero (#1622)
jltham Nov 5, 2024
ea8da43
modify headline extraction
shahules786 Nov 6, 2024
31a3c02
add new base class
shahules786 Nov 6, 2024
fa435b9
Merge branch 'main' into testgen-imp-queries
shahules786 Nov 6, 2024
b75d419
remove redundant extractor
shahules786 Nov 6, 2024
ee66e46
add docs
shahules786 Nov 6, 2024
16786bd
reflect changes in main
shahules786 Nov 6, 2024
83c4213
remove scikit learn
shahules786 Nov 6, 2024
f23b5a9
updated test
shahules786 Nov 6, 2024
b546da6
change default query distribution
shahules786 Nov 6, 2024
6 changes: 3 additions & 3 deletions docs/getstarted/rag_testset_generation.md
@@ -141,9 +141,9 @@ query_distribution = default_query_distribution(generator_llm)
```
```
[
(AbstractQuerySynthesizer(llm=generator_llm), 0.25),
(ComparativeAbstractQuerySynthesizer(llm=generator_llm), 0.25),
(SpecificQuerySynthesizer(llm=generator_llm), 0.5),
(SingleHopSpecificQuerySynthesizer(llm=llm), 0.5),
(MultiHopAbstractQuerySynthesizer(llm=llm), 0.25),
(MultiHopSpecificQuerySynthesizer(llm=llm), 0.25),
]
```

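For readers following along in the guide, a minimal sketch of inspecting the updated distribution; it assumes `generator_llm` is an already-wrapped `BaseRagasLLM` set up earlier in that document and not part of this diff:

```python
# Sketch only: `generator_llm` is assumed to be a BaseRagasLLM wrapper created
# earlier in the getting-started guide; it is not part of this diff.
from ragas.testset.synthesizers import default_query_distribution

query_distribution = default_query_distribution(generator_llm)
for synthesizer, weight in query_distribution:
    print(type(synthesizer).__name__, weight)
# Per the docs change above, this should list:
#   SingleHopSpecificQuerySynthesizer 0.5
#   MultiHopAbstractQuerySynthesizer 0.25
#   MultiHopSpecificQuerySynthesizer 0.25
```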
4 changes: 2 additions & 2 deletions docs/references/testset_schema.md
@@ -15,12 +15,12 @@
members:
- BaseScenario

::: ragas.testset.synthesizers.specific_query.SpecificQueryScenario
::: ragas.testset.synthesizers.single_hop.specific.SingleHopSpecificQuerySynthesizer
options:
show_root_heading: True
show_root_full_path: False

::: ragas.testset.synthesizers.abstract_query.AbstractQueryScenario
::: ragas.testset.synthesizers.multi_hop.specific.MultiHopSpecificQuerySynthesizer
options:
show_root_heading: True
show_root_full_path: False
2 changes: 2 additions & 0 deletions src/ragas/metrics/_string.py
@@ -13,6 +13,7 @@ class DistanceMeasure(Enum):
LEVENSHTEIN = "levenshtein"
HAMMING = "hamming"
JARO = "jaro"
JARO_WINKLER = "jaro_winkler"


@dataclass
@@ -77,6 +78,7 @@ def __post_init__(self):
DistanceMeasure.LEVENSHTEIN: distance.Levenshtein,
DistanceMeasure.HAMMING: distance.Hamming,
DistanceMeasure.JARO: distance.Jaro,
DistanceMeasure.JARO_WINKLER: distance.JaroWinkler,
}

def init(self, run_config: RunConfig):
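A small sketch of what the new enum member wires up; it assumes only that `DistanceMeasure` is importable from the module path shown above and that rapidfuzz, the backend already used for the other measures, is installed:

```python
# Sketch: the metric class that consumes DistanceMeasure is unchanged and not
# shown in this diff; this only exercises the enum and its rapidfuzz backend.
from rapidfuzz import distance

from ragas.metrics._string import DistanceMeasure

print(DistanceMeasure.JARO_WINKLER.value)  # "jaro_winkler"

# Jaro-Winkler boosts the plain Jaro score for strings sharing a common prefix:
print(distance.Jaro.normalized_similarity("ragas", "ragae"))
print(distance.JaroWinkler.normalized_similarity("ragas", "ragae"))
```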
100 changes: 84 additions & 16 deletions src/ragas/testset/graph.py
@@ -206,11 +206,15 @@ def __repr__(self) -> str:
def __str__(self) -> str:
return self.__repr__()

def find_clusters(
self, relationship_condition: t.Callable[[Relationship], bool] = lambda _: True
def find_indirect_clusters(
self,
relationship_condition: t.Callable[[Relationship], bool] = lambda _: True,
depth_limit: int = 3,
) -> t.List[t.Set[Node]]:
"""
Finds clusters of nodes in the knowledge graph based on a relationship condition.
Finds indirect clusters of nodes in the knowledge graph based on a relationship condition.
Here if A -> B -> C -> D, then A, B, C, and D form a cluster. If there's also a path A -> B -> C -> E,
it will form a separate cluster.

Parameters
----------
@@ -223,31 +227,95 @@ def find_clusters(
A list of sets, where each set contains nodes that form a cluster.
"""
clusters = []
visited = set()
visited_paths = set()

relationships = [
rel for rel in self.relationships if relationship_condition(rel)
]

def dfs(node: Node, cluster: t.Set[Node]):
visited.add(node)
def dfs(node: Node, cluster: t.Set[Node], depth: int, path: t.Tuple[Node, ...]):
if depth >= depth_limit or path in visited_paths:
return
visited_paths.add(path)
cluster.add(node)

for rel in relationships:
if rel.source == node and rel.target not in visited:
dfs(rel.target, cluster)
# if the relationship is bidirectional, we need to check the reverse
neighbor = None
if rel.source == node and rel.target not in cluster:
neighbor = rel.target
elif (
rel.bidirectional
and rel.target == node
and rel.source not in visited
and rel.source not in cluster
):
dfs(rel.source, cluster)
neighbor = rel.source

if neighbor is not None:
dfs(neighbor, cluster.copy(), depth + 1, path + (neighbor,))

# Add completed path-based cluster
if len(cluster) > 1:
clusters.append(cluster)

for node in self.nodes:
if node not in visited:
cluster = set()
dfs(node, cluster)
if len(cluster) > 1:
initial_cluster = set()
dfs(node, initial_cluster, 0, (node,))

# Remove duplicates by converting clusters to frozensets
unique_clusters = [
set(cluster) for cluster in set(frozenset(c) for c in clusters)
]

return unique_clusters

def find_direct_clusters(
self, relationship_condition: t.Callable[[Relationship], bool] = lambda _: True
) -> t.Dict[Node, t.List[t.Set[Node]]]:
"""
Finds direct clusters of nodes in the knowledge graph based on a relationship condition.
Here if A->B, and A->C, then A, B, and C form a cluster.

Parameters
----------
relationship_condition : Callable[[Relationship], bool], optional
A function that takes a Relationship and returns a boolean, by default lambda _: True

Returns
-------
List[Set[Node]]
A list of sets, where each set contains nodes that form a cluster.
"""

clusters = []
relationships = [
rel for rel in self.relationships if relationship_condition(rel)
]
for node in self.nodes:
cluster = set()
cluster.add(node)
for rel in relationships:
if rel.bidirectional:
if rel.source == node:
cluster.add(rel.target)
elif rel.target == node:
cluster.add(rel.source)
else:
if rel.source == node:
cluster.add(rel.target)

if len(cluster) > 1:
if cluster not in clusters:
clusters.append(cluster)

return clusters
# Remove subsets from clusters
unique_clusters = []
for cluster in clusters:
if not any(cluster < other for other in clusters):
unique_clusters.append(cluster)
clusters = unique_clusters

cluster_dict = {}
for cluster in clusters:
cluster_dict.update({cluster.pop(): cluster})

return cluster_dict
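To make the indirect/direct distinction concrete, here is a hedged usage sketch; the `Node` and `Relationship` constructor arguments are assumptions, since their definitions sit outside this diff:

```python
# Illustrative only: Node/Relationship keyword arguments are assumed from the
# rest of graph.py, which this diff does not show.
from ragas.testset.graph import KnowledgeGraph, Node, Relationship

a, b, c, d = (Node(properties={"name": name}) for name in "ABCD")
rels = [
    Relationship(source=a, target=b, type="next"),
    Relationship(source=b, target=c, type="next"),
    Relationship(source=c, target=d, type="next"),
]
kg = KnowledgeGraph(nodes=[a, b, c, d], relationships=rels)

# Indirect clusters: nodes reachable along a path, bounded by depth_limit.
print(kg.find_indirect_clusters(depth_limit=3))

# Direct clusters: a dict keyed by one node of each group of immediate neighbours.
print(kg.find_direct_clusters())
```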
38 changes: 38 additions & 0 deletions src/ragas/testset/graph_queries.py
@@ -0,0 +1,38 @@
import typing as t

from ragas.testset.graph import KnowledgeGraph, Node


def get_child_nodes(node: Node, graph: KnowledgeGraph, level: int = 1) -> t.List[Node]:
"""
Get the child nodes of a given node up to a specified level.

Parameters
----------
node : Node
The node to get the children of.
graph : KnowledgeGraph
The knowledge graph containing the node.
level : int
The maximum level to which child nodes are searched.

Returns
-------
List[Node]
The list of child nodes up to the specified level.
"""
children = []

# Helper function to perform depth-limited search for child nodes
def dfs(current_node: Node, current_level: int):
if current_level > level:
return
for rel in graph.relationships:
if rel.source == current_node and rel.type == "child":
children.append(rel.target)
dfs(rel.target, current_level + 1)

# Start the depth-limited search from the initial node at level 1
dfs(node, 1)

return children
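A short sketch of the helper in use, under the same assumptions about `Node`/`Relationship` construction as above:

```python
# Sketch: a two-level "child" chain; constructor arguments are assumed, as above.
from ragas.testset.graph import KnowledgeGraph, Node, Relationship
from ragas.testset.graph_queries import get_child_nodes

root, mid, leaf = (Node(properties={"name": n}) for n in ("root", "mid", "leaf"))
kg = KnowledgeGraph(
    nodes=[root, mid, leaf],
    relationships=[
        Relationship(source=root, target=mid, type="child"),
        Relationship(source=mid, target=leaf, type="child"),
    ],
)

print(get_child_nodes(root, kg, level=1))  # [mid]
print(get_child_nodes(root, kg, level=2))  # [mid, leaf]
```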
31 changes: 11 additions & 20 deletions src/ragas/testset/synthesizers/__init__.py
@@ -1,38 +1,29 @@
import typing as t

from ragas.llms import BaseRagasLLM

from .abstract_query import (
AbstractQuerySynthesizer,
ComparativeAbstractQuerySynthesizer,
from ragas.testset.synthesizers.multi_hop import (
MultiHopAbstractQuerySynthesizer,
MultiHopSpecificQuerySynthesizer,
)
from ragas.testset.synthesizers.single_hop.specific import (
SingleHopSpecificQuerySynthesizer,
)

from .base import BaseSynthesizer
from .base_query import QuerySynthesizer
from .specific_query import SpecificQuerySynthesizer

QueryDistribution = t.List[t.Tuple[BaseSynthesizer, float]]


def default_query_distribution(llm: BaseRagasLLM) -> QueryDistribution:
"""
Default query distribution for the test set.

By default, 25% of the queries are generated using `AbstractQuerySynthesizer`,
25% are generated using `ComparativeAbstractQuerySynthesizer`, and 50% are
generated using `SpecificQuerySynthesizer`.
"""
""" """
return [
(AbstractQuerySynthesizer(llm=llm), 0.25),
(ComparativeAbstractQuerySynthesizer(llm=llm), 0.25),
(SpecificQuerySynthesizer(llm=llm), 0.5),
(SingleHopSpecificQuerySynthesizer(llm=llm), 0.5),
(MultiHopAbstractQuerySynthesizer(llm=llm), 0.25),
(MultiHopSpecificQuerySynthesizer(llm=llm), 0.25),
]


__all__ = [
"BaseSynthesizer",
"QuerySynthesizer",
"AbstractQuerySynthesizer",
"ComparativeAbstractQuerySynthesizer",
"SpecificQuerySynthesizer",
"default_query_distribution",
]
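Because `QueryDistribution` is just a list of (synthesizer, weight) pairs, callers can pass their own mix instead of the default; a hypothetical example with illustrative weights:

```python
# Hypothetical custom distribution; the weights are illustrative and, like the
# default, should sum to 1.0.
from ragas.llms import BaseRagasLLM
from ragas.testset.synthesizers import (
    MultiHopAbstractQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer,
    QueryDistribution,
    SingleHopSpecificQuerySynthesizer,
)


def skewed_distribution(llm: BaseRagasLLM) -> QueryDistribution:
    return [
        (SingleHopSpecificQuerySynthesizer(llm=llm), 0.7),
        (MultiHopAbstractQuerySynthesizer(llm=llm), 0.2),
        (MultiHopSpecificQuerySynthesizer(llm=llm), 0.1),
    ]
```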