From d7fd4440839d3fef6ffddb4cd36a87cc9b61b540 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 12 Nov 2024 20:58:14 +0530 Subject: [PATCH 01/18] add value error --- src/ragas/testset/persona.py | 3 +++ src/ragas/testset/synthesizers/multi_hop/abstract.py | 3 +++ src/ragas/testset/synthesizers/multi_hop/specific.py | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/src/ragas/testset/persona.py b/src/ragas/testset/persona.py index 09a56663b..89ac471bd 100644 --- a/src/ragas/testset/persona.py +++ b/src/ragas/testset/persona.py @@ -92,6 +92,9 @@ def generate_personas_from_kg( """ nodes = [node for node in kg.nodes if filter_fn(node)] + if len(nodes) == 0: + raise ValueError("No nodes satisfied the given filter. Try changing the filter.") + summaries = [node.properties.get("summary") for node in nodes] summaries = [summary for summary in summaries if isinstance(summary, str)] diff --git a/src/ragas/testset/synthesizers/multi_hop/abstract.py b/src/ragas/testset/synthesizers/multi_hop/abstract.py index 20162ff4e..f80630bc1 100644 --- a/src/ragas/testset/synthesizers/multi_hop/abstract.py +++ b/src/ragas/testset/synthesizers/multi_hop/abstract.py @@ -69,6 +69,9 @@ async def _generate_scenarios( ) logger.info("found %d clusters", len(node_clusters)) scenarios = [] + + if len(node_clusters) == 0: + raise ValueError("No clusters found in the knowledge graph. Try changing the relationship condition.") num_sample_per_cluster = int(np.ceil(n / len(node_clusters))) diff --git a/src/ragas/testset/synthesizers/multi_hop/specific.py b/src/ragas/testset/synthesizers/multi_hop/specific.py index b71af16c3..ac3f071b0 100644 --- a/src/ragas/testset/synthesizers/multi_hop/specific.py +++ b/src/ragas/testset/synthesizers/multi_hop/specific.py @@ -80,6 +80,10 @@ async def _generate_scenarios( logger.info("found %d clusters", len(cluster_dict)) scenarios = [] + + if len(node_clusters) == 0: + raise ValueError("No clusters found in the knowledge graph. Try changing the relationship condition.") + num_sample_per_cluster = int(np.ceil(n / len(node_clusters))) for cluster in node_clusters: From 0d19cce30d243a0cecf67c4583bc4b2318e32bec Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 12 Nov 2024 20:58:24 +0530 Subject: [PATCH 02/18] tuning defaults --- src/ragas/testset/transforms/default.py | 121 +++++++++++++++--------- 1 file changed, 78 insertions(+), 43 deletions(-) diff --git a/src/ragas/testset/transforms/default.py b/src/ragas/testset/transforms/default.py index 071c42756..76eb50a41 100644 --- a/src/ragas/testset/transforms/default.py +++ b/src/ragas/testset/transforms/default.py @@ -25,8 +25,10 @@ from .engine import Transforms +from langchain_core.documents import Document as LCDocument def default_transforms( + documents: t.List[LCDocument], llm: BaseRagasLLM, embedding_model: BaseRagasEmbeddings, ) -> Transforms: @@ -46,52 +48,85 @@ A list of transformation steps to be applied to the knowledge graph. 
""" - - headline_extractor = HeadlinesExtractor(llm=llm) - splitter = HeadlineSplitter(min_tokens=500) - - def summary_filter(node): + + def count_doc_length_bins(documents, bin_ranges): + data = [num_tokens_from_string(doc.properties["page_content"]) for doc in documents] + bins = {f"{start}-{end}": 0 for start, end in bin_ranges} + + for num in data: + for start, end in bin_ranges: + if start <= num <= end: + bins[f"{start}-{end}"] += 1 + break # Move to the next number once it’s placed in a bin + + return bins + + def filter_doc_with_num_tokens(node, min_num_tokens=500): return ( node.type == NodeType.DOCUMENT - and num_tokens_from_string(node.properties["page_content"]) > 500 + and num_tokens_from_string(node.properties["page_content"]) > min_num_tokens + ) + + def filter_docs(node): + return node.type == NodeType.DOCUMENT + + def filter_chunks(node): + return node.type == NodeType.CHUNK + + bin_ranges = [(0, 500), (501, 100000)] + result = count_doc_length_bins(data, bin_ranges) + result = {k: v/len(documents) for k, v in result.items()} + + transforms = [] + + if result["501-100000"] > 0.1: + headline_extractor = HeadlinesExtractor(llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node)) + splitter = HeadlineSplitter(min_tokens=500) + summary_extractor = SummaryExtractor( + llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node) + ) + + theme_extractor = ThemesExtractor(llm=llm, filter_nodes=lambda node: filter_docs(node)) + ner_extractor = NERExtractor( + llm=llm, filter_nodes=lambda node: filter_chunks(node) + ) + + summary_emb_extractor = EmbeddingExtractor( + embedding_model=embedding_model, + property_name="summary_embedding", + embed_property_name="summary", + filter_nodes=lambda node: filter_doc_with_num_tokens(node), ) - summary_extractor = SummaryExtractor( - llm=llm, filter_nodes=lambda node: summary_filter(node) - ) - - theme_extractor = ThemesExtractor(llm=llm) - ner_extractor = NERExtractor( - llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK - ) - - summary_emb_extractor = EmbeddingExtractor( - embedding_model=embedding_model, - property_name="summary_embedding", - embed_property_name="summary", - filter_nodes=lambda node: summary_filter(node), - ) - - cosine_sim_builder = CosineSimilarityBuilder( - property_name="summary_embedding", - new_property_name="summary_similarity", - threshold=0.7, - filter_nodes=lambda node: summary_filter(node), - ) - - ner_overlap_sim = OverlapScoreBuilder( - threshold=0.01, filter_nodes=lambda node: node.type == NodeType.CHUNK - ) - - node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK) - - transforms = [ - headline_extractor, - splitter, - summary_extractor, - node_filter, - Parallel(summary_emb_extractor, theme_extractor, ner_extractor), - Parallel(cosine_sim_builder, ner_overlap_sim), - ] + cosine_sim_builder = CosineSimilarityBuilder( + property_name="summary_embedding", + new_property_name="summary_similarity", + threshold=0.7, + filter_nodes=lambda node: filter_doc_with_num_tokens(node), + ) + + + ner_overlap_sim = OverlapScoreBuilder( + threshold=0.01, filter_nodes=lambda node: filter_chunks(node) + ) + + node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: filter_chunks(node)) + + transforms = [ + headline_extractor, + splitter, + summary_extractor, + node_filter, + Parallel(summary_emb_extractor, theme_extractor, ner_extractor), + Parallel(cosine_sim_builder, ner_overlap_sim), + ] + else: + ner_extractor = NERExtractor( + llm=llm, 
filter_nodes=lambda node: filter_chunks(node) + ) + ner_overlap_sim = OverlapScoreBuilder( + threshold=0.01, filter_nodes=lambda node: filter_chunks(node) + ) + return transforms From 134153626155ff92a2494f51efaa2a3e2bc88a4a Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 13:40:52 +0530 Subject: [PATCH 03/18] tune default transforms --- src/ragas/testset/transforms/default.py | 82 +++++++++++++++++-------- 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/src/ragas/testset/transforms/default.py b/src/ragas/testset/transforms/default.py index 76eb50a41..b81909773 100644 --- a/src/ragas/testset/transforms/default.py +++ b/src/ragas/testset/transforms/default.py @@ -27,6 +27,7 @@ from langchain_core.documents import Document as LCDocument + def default_transforms( documents: t.List[LCDocument], llm: BaseRagasLLM, @@ -48,11 +49,13 @@ def default_transforms( A list of transformation steps to be applied to the knowledge graph. """ - + def count_doc_length_bins(documents, bin_ranges): - data = [num_tokens_from_string(doc.properties["page_content"]) for doc in documents] + data = [ + num_tokens_from_string(doc.page_content) for doc in documents + ] bins = {f"{start}-{end}": 0 for start, end in bin_ranges} - + for num in data: for start, end in bin_ranges: if start <= num <= end: @@ -60,37 +63,41 @@ def count_doc_length_bins(documents, bin_ranges): break # Move to the next number once it’s placed in a bin return bins - + def filter_doc_with_num_tokens(node, min_num_tokens=500): return ( node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties["page_content"]) > min_num_tokens ) - + def filter_docs(node): return node.type == NodeType.DOCUMENT - + def filter_chunks(node): return node.type == NodeType.CHUNK - - bin_ranges = [(0, 500), (501, 100000)] - result = count_doc_length_bins(data, bin_ranges) - result = {k: v/len(documents) for k, v in result.items()} - + + bin_ranges = [(0,100),(101, 500), (501, 100000)] + result = count_doc_length_bins(documents, bin_ranges) + result = {k: v / len(documents) for k, v in result.items()} + transforms = [] - - if result["501-100000"] > 0.1: - headline_extractor = HeadlinesExtractor(llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node)) + + if result["501-100000"] >= 0.5: + headline_extractor = HeadlinesExtractor( + llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node) + ) splitter = HeadlineSplitter(min_tokens=500) summary_extractor = SummaryExtractor( llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node) ) - - theme_extractor = ThemesExtractor(llm=llm, filter_nodes=lambda node: filter_docs(node)) + + theme_extractor = ThemesExtractor( + llm=llm, filter_nodes=lambda node: filter_docs(node) + ) ner_extractor = NERExtractor( llm=llm, filter_nodes=lambda node: filter_chunks(node) ) - + summary_emb_extractor = EmbeddingExtractor( embedding_model=embedding_model, property_name="summary_embedding", @@ -104,13 +111,14 @@ def filter_chunks(node): threshold=0.7, filter_nodes=lambda node: filter_doc_with_num_tokens(node), ) - ner_overlap_sim = OverlapScoreBuilder( threshold=0.01, filter_nodes=lambda node: filter_chunks(node) ) - - node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: filter_chunks(node)) + + node_filter = CustomNodeFilter( + llm=llm, filter_nodes=lambda node: filter_chunks(node) + ) transforms = [ headline_extractor, @@ -120,13 +128,37 @@ def filter_chunks(node): Parallel(summary_emb_extractor, theme_extractor, ner_extractor), Parallel(cosine_sim_builder, 
ner_overlap_sim), ] - else: - ner_extractor = NERExtractor( - llm=llm, filter_nodes=lambda node: filter_chunks(node) + elif result["101-500"] >= 0.5: + summary_extractor = SummaryExtractor( + llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100) ) - ner_overlap_sim = OverlapScoreBuilder( - threshold=0.01, filter_nodes=lambda node: filter_chunks(node) + summary_emb_extractor = EmbeddingExtractor( + embedding_model=embedding_model, + property_name="summary_embedding", + embed_property_name="summary", + filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100), + ) + + cosine_sim_builder = CosineSimilarityBuilder( + property_name="summary_embedding", + new_property_name="summary_similarity", + threshold=0.7, + filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100), ) + ner_extractor = NERExtractor(llm=llm) + ner_overlap_sim = OverlapScoreBuilder(threshold=0.01) + theme_extractor = ThemesExtractor( + llm=llm, filter_nodes=lambda node: filter_docs(node) + ) + node_filter = CustomNodeFilter( + llm=llm + ) + + transforms = [ + summary_extractor, + node_filter, + Parallel(summary_emb_extractor, theme_extractor, ner_extractor), + ner_overlap_sim] return transforms From 0ffaa636c60762287e2585ad02890b9a322f81a5 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 13:41:12 +0530 Subject: [PATCH 04/18] add better filtering --- .../synthesizers/multi_hop/abstract.py | 27 +++++++---- .../synthesizers/multi_hop/specific.py | 46 +++++++++++-------- .../synthesizers/single_hop/specific.py | 46 ++++++++++++++----- 3 files changed, 79 insertions(+), 40 deletions(-) diff --git a/src/ragas/testset/synthesizers/multi_hop/abstract.py b/src/ragas/testset/synthesizers/multi_hop/abstract.py index f80630bc1..ccab69346 100644 --- a/src/ragas/testset/synthesizers/multi_hop/abstract.py +++ b/src/ragas/testset/synthesizers/multi_hop/abstract.py @@ -7,7 +7,7 @@ import numpy as np from ragas.prompt import PydanticPrompt -from ragas.testset.graph import KnowledgeGraph +from ragas.testset.graph import KnowledgeGraph, Node from ragas.testset.graph_queries import get_child_nodes from ragas.testset.persona import Persona, PersonaList from ragas.testset.synthesizers.multi_hop.base import ( @@ -42,6 +42,17 @@ class MultiHopAbstractQuerySynthesizer(MultiHopQuerySynthesizer): concept_combination_prompt: PydanticPrompt = ConceptCombinationPrompt() theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt() + def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Set[Node]]: + + node_clusters = knowledge_graph.find_indirect_clusters( + relationship_condition=lambda rel: ( + True if rel.get_property("summary_similarity") else False + ), + depth_limit=3, + ) + logger.info("found %d clusters", len(node_clusters)) + return node_clusters + async def _generate_scenarios( self, n: int, @@ -61,17 +72,13 @@ async def _generate_scenarios( 4. Sample diverse combinations of scenarios to get n samples """ - node_clusters = knowledge_graph.find_indirect_clusters( - relationship_condition=lambda rel: ( - True if rel.get_property("summary_similarity") else False - ), - depth_limit=3, - ) - logger.info("found %d clusters", len(node_clusters)) + node_clusters = self.get_node_clusters(knowledge_graph) scenarios = [] - + if len(node_clusters) == 0: - raise ValueError("No clusters found in the knowledge graph. Try changing the relationship condition.") + raise ValueError( + "No clusters found in the knowledge graph. Try changing the relationship condition." 
+            ) num_sample_per_cluster = int(np.ceil(n / len(node_clusters))) diff --git a/src/ragas/testset/synthesizers/multi_hop/specific.py b/src/ragas/testset/synthesizers/multi_hop/specific.py index ac3f071b0..02d483278 100644 --- a/src/ragas/testset/synthesizers/multi_hop/specific.py +++ b/src/ragas/testset/synthesizers/multi_hop/specific.py @@ -7,7 +7,7 @@ import numpy as np from ragas.prompt import PydanticPrompt -from ragas.testset.graph import KnowledgeGraph +from ragas.testset.graph import KnowledgeGraph, Node from ragas.testset.persona import Persona, PersonaList from ragas.testset.synthesizers.multi_hop.base import ( MultiHopQuerySynthesizer, @@ -38,9 +38,26 @@ class MultiHopSpecificQuerySynthesizer(MultiHopQuerySynthesizer): """ name: str = "multi_hop_specific_query_synthesizer" + relation_type: str = "entities_overlap" + property_name: str = "entities" theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt() generate_query_reference_prompt: PydanticPrompt = QueryAnswerGenerationPrompt() + def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Tuple[Node, Node]]: + + cluster_dict = knowledge_graph.find_direct_clusters( + relationship_condition=lambda rel: ( + True if rel.type == self.relation_type else False + ) + ) + logger.info("found %d clusters", len(cluster_dict)) + node_clusters = [] + for key_node, list_of_nodes in cluster_dict.items(): + for node in list_of_nodes: + node_clusters.append((key_node, node)) + + return node_clusters + async def _generate_scenarios( self, n: int, @@ -61,30 +78,21 @@ async def _generate_scenarios( 4. Return the list of scenarios """ - cluster_dict = knowledge_graph.find_direct_clusters( - relationship_condition=lambda rel: ( - True if rel.type == "entities_overlap" else False + node_clusters = self.get_node_clusters(knowledge_graph) + + if len(node_clusters) == 0: + raise ValueError( + "No clusters found in the knowledge graph. Try changing the relationship condition." ) - ) + + num_sample_per_cluster = int(np.ceil(n / len(node_clusters))) valid_relationships = [ rel for rel in knowledge_graph.relationships - if rel.type == "entities_overlap" + if rel.type == self.relation_type ] - - node_clusters = [] - for key_node, list_of_nodes in cluster_dict.items(): - for node in list_of_nodes: - node_clusters.append((key_node, node)) - - logger.info("found %d clusters", len(cluster_dict)) scenarios = [] - - if len(node_clusters) == 0: - raise ValueError("No clusters found in the knowledge graph. 
Try changing the relationship condition.") - - num_sample_per_cluster = int(np.ceil(n / len(node_clusters))) for cluster in node_clusters: if len(scenarios) < n: @@ -110,7 +118,7 @@ async def _generate_scenarios( overlapped_items, PersonaList(personas=persona_list), persona_concepts, - property_name="entities", + property_name=self.property_name, ) base_scenarios = self.sample_diverse_combinations( base_scenarios, num_sample_per_cluster diff --git a/src/ragas/testset/synthesizers/single_hop/specific.py b/src/ragas/testset/synthesizers/single_hop/specific.py index 283bca8d7..73bae0517 100644 --- a/src/ragas/testset/synthesizers/single_hop/specific.py +++ b/src/ragas/testset/synthesizers/single_hop/specific.py @@ -2,12 +2,13 @@ import logging import typing as t +from collections import defaultdict from dataclasses import dataclass import numpy as np from ragas.prompt import PydanticPrompt -from ragas.testset.graph import KnowledgeGraph +from ragas.testset.graph import KnowledgeGraph, Node from ragas.testset.persona import Persona, PersonaList from ragas.testset.synthesizers.base import BaseScenario from ragas.testset.synthesizers.prompts import ( @@ -40,6 +41,37 @@ class SingleHopScenario(BaseScenario): class SingleHopSpecificQuerySynthesizer(SingleHopQuerySynthesizer): name: str = "single_hop_specifc_query_synthesizer" theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt() + property_name: str = "entities" + + def get_nodes(self, knowledge_graph: KnowledgeGraph) -> t.List[Node]: + + node_type_dict = defaultdict(int) + for node in knowledge_graph.nodes: + if ( + node.type.name == "CHUNK" + and node.get_property(self.property_name) is not None + ): + node_type_dict["CHUNK"] += 1 + elif ( + node.type.name == "DOCUMENT" + and node.get_property(self.property_name) is not None + ): + node_type_dict["DOCUMENT"] += 1 + else: + pass + + node_filter = ( + "CHUNK" + if node_type_dict["CHUNK"] > node_type_dict["DOCUMENT"] + else "DOCUMENT" + ) + + nodes = [] + for node in knowledge_graph.nodes: + if node.type.name == node_filter: + nodes.append(node) + + return nodes async def _generate_scenarios( self, @@ -61,15 +93,7 @@ async def _generate_scenarios( 4. 
Return the list of scenarios """ - property_name = "entities" - nodes = [] - for node in knowledge_graph.nodes: - if ( - node.type.name == "CHUNK" - and node.get_property(property_name) is not None - ): - nodes.append(node) - + nodes = self.get_nodes(knowledge_graph) if len(nodes) == 0: raise ValueError("No nodes found with the `entities` property.") samples_per_node = int(np.ceil(n / len(nodes))) @@ -78,7 +102,7 @@ async def _generate_scenarios( for node in nodes: if len(scenarios) >= n: break - themes = node.get_property(property_name) + themes = node.properties.get(self.property_name, [""]) prompt_input = ThemesPersonasInput(themes=themes, personas=persona_list) persona_concepts = await self.theme_persona_matching_prompt.generate( data=prompt_input, llm=self.llm, callbacks=callbacks From b0e60e391e83c84df890f732315b4e97c1ea76ad Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 13:41:43 +0530 Subject: [PATCH 05/18] calling default filter --- src/ragas/testset/synthesizers/generate.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ragas/testset/synthesizers/generate.py b/src/ragas/testset/synthesizers/generate.py index c006e6c6a..b7723677e 100644 --- a/src/ragas/testset/synthesizers/generate.py +++ b/src/ragas/testset/synthesizers/generate.py @@ -10,10 +10,7 @@ from ragas._analytics import TestsetGenerationEvent, track from ragas.callbacks import new_group from ragas.cost import TokenUsageParser -from ragas.embeddings.base import ( - BaseRagasEmbeddings, - LlamaIndexEmbeddingsWrapper, -) +from ragas.embeddings.base import BaseRagasEmbeddings, LlamaIndexEmbeddingsWrapper from ragas.executor import Executor from ragas.llms import BaseRagasLLM, LangchainLLMWrapper, LlamaIndexLLMWrapper from ragas.run_config import RunConfig @@ -155,6 +152,7 @@ def generate_with_langchain_docs( if not transforms: transforms = default_transforms( + documents=list(documents), llm=transforms_llm or self.llm, embedding_model=transforms_embedding_model, ) @@ -224,6 +222,7 @@ def generate_with_llamaindex_docs( transforms_embedding_model ) transforms = default_transforms( + documents=[LCDocument(page_content=doc.text) for doc in documents], llm=llm_for_transforms, embedding_model=embedding_model_for_transforms, ) From 74dbf1762f6bb246bd4076751dcce70bf565119b Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 13:42:06 +0530 Subject: [PATCH 06/18] modify custom filter --- src/ragas/testset/transforms/filters.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/ragas/testset/transforms/filters.py b/src/ragas/testset/transforms/filters.py index 44370fde4..116da029c 100644 --- a/src/ragas/testset/transforms/filters.py +++ b/src/ragas/testset/transforms/filters.py @@ -62,15 +62,21 @@ class CustomNodeFilter(LLMBasedNodeFilter): rubrics: t.Dict[str, str] = field(default_factory=lambda: DEFAULT_RUBRICS) async def custom_filter(self, node: Node, kg: KnowledgeGraph) -> bool: - - parent_nodes = get_parent_nodes(node, kg) - if len(parent_nodes) > 0: - summary = parent_nodes[0].properties.get("summary", "") + + if node.type.name == "CHUNK": + parent_nodes = get_parent_nodes(node, kg) + if len(parent_nodes) > 0: + summary = parent_nodes[0].properties.get("summary", "") + else: + summary = "" else: - summary = "" - + summary = node.properties.get("summary", "") + if summary == "": - logger.warning(f"Node {node} has no parent node with a summary.") + logger.warning( + f"Node {node.id} does not have a summary. Skipping filtering." 
+ ) + return False prompt_input = QuestionPotentialInput( document_summary=summary, From 2578f8463b817684582948f234cc655613578935 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 16:15:27 +0530 Subject: [PATCH 07/18] modify query distribution using query --- src/ragas/testset/synthesizers/__init__.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/ragas/testset/synthesizers/__init__.py b/src/ragas/testset/synthesizers/__init__.py index 48679a179..f9132d9b3 100644 --- a/src/ragas/testset/synthesizers/__init__.py +++ b/src/ragas/testset/synthesizers/__init__.py @@ -1,6 +1,7 @@ import typing as t from ragas.llms import BaseRagasLLM +from ragas.testset.graph import KnowledgeGraph from ragas.testset.synthesizers.multi_hop import ( MultiHopAbstractQuerySynthesizer, MultiHopSpecificQuerySynthesizer, @@ -14,14 +15,23 @@ QueryDistribution = t.List[t.Tuple[BaseSynthesizer, float]] -def default_query_distribution(llm: BaseRagasLLM) -> QueryDistribution: +def default_query_distribution( + llm: BaseRagasLLM, kg: KnowledgeGraph +) -> QueryDistribution: """ """ - return [ - (SingleHopSpecificQuerySynthesizer(llm=llm), 0.5), - (MultiHopAbstractQuerySynthesizer(llm=llm), 0.25), - (MultiHopSpecificQuerySynthesizer(llm=llm), 0.25), + default_queries = [ + SingleHopSpecificQuerySynthesizer(llm=llm), + MultiHopAbstractQuerySynthesizer(llm=llm), + MultiHopSpecificQuerySynthesizer(llm=llm), ] + available_queries = [] + for query in default_queries: + if query.get_node_clusters(kg): + available_queries.append(query) + + return [(query, 1 / len(available_queries)) for query in available_queries] + __all__ = [ "BaseSynthesizer", From 11acb85eb33f8607d6b83c267003c4eacd11f4b5 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 16:15:44 +0530 Subject: [PATCH 08/18] simplify defaults --- src/ragas/testset/transforms/default.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/ragas/testset/transforms/default.py b/src/ragas/testset/transforms/default.py index b81909773..d92d62e32 100644 --- a/src/ragas/testset/transforms/default.py +++ b/src/ragas/testset/transforms/default.py @@ -51,9 +51,7 @@ def default_transforms( """ def count_doc_length_bins(documents, bin_ranges): - data = [ - num_tokens_from_string(doc.page_content) for doc in documents - ] + data = [num_tokens_from_string(doc.page_content) for doc in documents] bins = {f"{start}-{end}": 0 for start, end in bin_ranges} for num in data: @@ -76,7 +74,7 @@ def filter_docs(node): def filter_chunks(node): return node.type == NodeType.CHUNK - bin_ranges = [(0,100),(101, 500), (501, 100000)] + bin_ranges = [(0, 100), (101, 500), (501, 100000)] result = count_doc_length_bins(documents, bin_ranges) result = {k: v / len(documents) for k, v in result.items()} @@ -142,23 +140,22 @@ def filter_chunks(node): cosine_sim_builder = CosineSimilarityBuilder( property_name="summary_embedding", new_property_name="summary_similarity", - threshold=0.7, + threshold=0.5, filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100), ) - + ner_extractor = NERExtractor(llm=llm) ner_overlap_sim = OverlapScoreBuilder(threshold=0.01) theme_extractor = ThemesExtractor( llm=llm, filter_nodes=lambda node: filter_docs(node) ) - node_filter = CustomNodeFilter( - llm=llm - ) - + node_filter = CustomNodeFilter(llm=llm) + transforms = [ summary_extractor, node_filter, Parallel(summary_emb_extractor, theme_extractor, ner_extractor), - ner_overlap_sim] + ner_overlap_sim, + ] return 
transforms From 2214f6a91cebff783670774d5321c0267f83d712 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 16:16:05 +0530 Subject: [PATCH 09/18] rename method --- src/ragas/testset/synthesizers/single_hop/specific.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ragas/testset/synthesizers/single_hop/specific.py b/src/ragas/testset/synthesizers/single_hop/specific.py index 73bae0517..7561ac7b7 100644 --- a/src/ragas/testset/synthesizers/single_hop/specific.py +++ b/src/ragas/testset/synthesizers/single_hop/specific.py @@ -43,7 +43,7 @@ class SingleHopSpecificQuerySynthesizer(SingleHopQuerySynthesizer): theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt() property_name: str = "entities" - def get_nodes(self, knowledge_graph: KnowledgeGraph) -> t.List[Node]: + def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[Node]: node_type_dict = defaultdict(int) for node in knowledge_graph.nodes: @@ -93,7 +93,7 @@ async def _generate_scenarios( 4. Return the list of scenarios """ - nodes = self.get_nodes(knowledge_graph) + nodes = self.get_node_clusters(knowledge_graph) if len(nodes) == 0: raise ValueError("No nodes found with the `entities` property.") samples_per_node = int(np.ceil(n / len(nodes))) From 5fc823fc6817f546547260e79f4bda3e09ddf930 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 16:19:13 +0530 Subject: [PATCH 10/18] add better filter --- src/ragas/testset/persona.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ragas/testset/persona.py b/src/ragas/testset/persona.py index 89ac471bd..44021ba3d 100644 --- a/src/ragas/testset/persona.py +++ b/src/ragas/testset/persona.py @@ -93,10 +93,13 @@ def generate_personas_from_kg( nodes = [node for node in kg.nodes if filter_fn(node)] if len(nodes) == 0: - raise ValueError("No nodes satisfied the given filter. Try changing the filter.") - + raise ValueError( + "No nodes satisfied the given filter. Try changing the filter." 
+ ) + summaries = [node.properties.get("summary") for node in nodes] summaries = [summary for summary in summaries if isinstance(summary, str)] + num_personas = min(num_personas, len(summaries)) embeddings = [] for node in nodes: From 47e54968a6bae4b35bc8f357aa4e37f96fb9b575 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 16:19:58 +0530 Subject: [PATCH 11/18] call defaults from generate --- src/ragas/testset/synthesizers/generate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ragas/testset/synthesizers/generate.py b/src/ragas/testset/synthesizers/generate.py index b7723677e..65db77e3f 100644 --- a/src/ragas/testset/synthesizers/generate.py +++ b/src/ragas/testset/synthesizers/generate.py @@ -311,7 +311,9 @@ def generate( if run_config is not None: self.llm.set_run_config(run_config) - query_distribution = query_distribution or default_query_distribution(self.llm) + query_distribution = query_distribution or default_query_distribution( + self.llm, self.knowledge_graph + ) callbacks = callbacks or [] # dict to store any callbacks we define From 71d14388e1bae30451729d50302b92080be9da10 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 16:35:22 +0530 Subject: [PATCH 12/18] modify test --- tests/unit/test_analytics.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_analytics.py b/tests/unit/test_analytics.py index 4233ea7d3..7647649fd 100644 --- a/tests/unit/test_analytics.py +++ b/tests/unit/test_analytics.py @@ -2,6 +2,7 @@ import typing as t +import numpy as np import pytest from langchain_core.outputs import Generation, LLMResult from langchain_core.prompt_values import StringPromptValue as PromptValue @@ -139,11 +140,16 @@ def test_testset_generation_tracking(monkeypatch): "multi_hop_specific_query_synthesizer", ] - assert testset_event_payload.model_dump()["evolution_percentages"] == [ - 0.5, - 0.25, - 0.25, - ] + assert all( + np.isclose( + testset_event_payload.model_dump()["evolution_percentages"], + [ + 0.33, + 0.33, + 0.33, + ], atol=0.01 + ).tolist() + ) # just in the case you actually want to check if tracking is working in the # dashboard From 0841115c3edf44d01f4c102f8b2ca1cc514f039e Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 16:35:39 +0530 Subject: [PATCH 13/18] make kg optional --- src/ragas/testset/synthesizers/__init__.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/ragas/testset/synthesizers/__init__.py b/src/ragas/testset/synthesizers/__init__.py index 7e0c003a9..e31a86515 100644 --- a/src/ragas/testset/synthesizers/__init__.py +++ b/src/ragas/testset/synthesizers/__init__.py @@ -16,18 +16,21 @@ def default_query_distribution( - llm: BaseRagasLLM, kg: KnowledgeGraph + llm: BaseRagasLLM, kg: t.Optional[KnowledgeGraph] = None ) -> QueryDistribution: """ """ default_queries = [ SingleHopSpecificQuerySynthesizer(llm=llm), MultiHopAbstractQuerySynthesizer(llm=llm), MultiHopSpecificQuerySynthesizer(llm=llm), - - available_queries = [] - for query in default_queries: - if query.get_node_clusters(kg): - available_queries.append(query) + ] + if kg is not None: + available_queries = [] + for query in default_queries: + if query.get_node_clusters(kg): + available_queries.append(query) + else: + available_queries = default_queries return [(query, 1 / len(available_queries)) for query in available_queries] From 53c12def428d14862c178e5f9f59d36f35cfb856 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 
2024 16:45:00 +0530 Subject: [PATCH 14/18] add value error --- src/ragas/testset/transforms/default.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ragas/testset/transforms/default.py b/src/ragas/testset/transforms/default.py index d92d62e32..290415344 100644 --- a/src/ragas/testset/transforms/default.py +++ b/src/ragas/testset/transforms/default.py @@ -80,7 +80,7 @@ def filter_chunks(node): transforms = [] - if result["501-100000"] >= 0.5: + if result["501-100000"] >= 0.25: headline_extractor = HeadlinesExtractor( llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node) ) @@ -126,7 +126,7 @@ def filter_chunks(node): Parallel(summary_emb_extractor, theme_extractor, ner_extractor), Parallel(cosine_sim_builder, ner_overlap_sim), ] - elif result["101-500"] >= 0.5: + elif result["101-500"] >= 0.25: summary_extractor = SummaryExtractor( llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100) ) @@ -157,5 +157,7 @@ def filter_chunks(node): Parallel(summary_emb_extractor, theme_extractor, ner_extractor), ner_overlap_sim, ] + else: + raise ValueError("Documents appear to be too short (i.e., 100 tokens or less). Please provide longer documents.") return transforms From 22ce786064cbf416ce85a756365364b9246ff865 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 17:36:40 +0530 Subject: [PATCH 15/18] modify filter --- src/ragas/testset/persona.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/testset/persona.py b/src/ragas/testset/persona.py index 44021ba3d..b61b3c908 100644 --- a/src/ragas/testset/persona.py +++ b/src/ragas/testset/persona.py @@ -19,7 +19,7 @@ def default_filter(node: Node) -> bool: node.type.name == "DOCUMENT" and node.properties.get("summary_embedding") is not None ): - return random.random() < 0.25 + return True else: return False From 0756187893b41aed75ebf131282ca771af87a306 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 17:52:13 +0530 Subject: [PATCH 16/18] error handling --- src/ragas/testset/synthesizers/multi_hop/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/testset/synthesizers/multi_hop/base.py b/src/ragas/testset/synthesizers/multi_hop/base.py index 3b2e3010c..e51a14623 100644 --- a/src/ragas/testset/synthesizers/multi_hop/base.py +++ b/src/ragas/testset/synthesizers/multi_hop/base.py @@ -73,7 +73,7 @@ def prepare_combinations( valid_nodes = [] for node in nodes: node_themes = [ - theme.lower() for theme in node.get_property(property_name) + theme.lower() for theme in node.properties.get(property_name, []) ] if node.get_property(property_name) and any( concept.lower() in node_themes for concept in combination From 7b4c7ed8cdd8cd321d6248e05b23e02e61516651 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 17:56:24 +0530 Subject: [PATCH 17/18] format changes --- src/ragas/testset/persona.py | 1 - src/ragas/testset/transforms/default.py | 4 +++- src/ragas/testset/transforms/extractors/llm_based.py | 5 +++-- src/ragas/testset/transforms/filters.py | 4 ++-- tests/unit/test_analytics.py | 3 ++- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ragas/testset/persona.py b/src/ragas/testset/persona.py index b61b3c908..c3d4bc95e 100644 --- a/src/ragas/testset/persona.py +++ b/src/ragas/testset/persona.py @@ -1,5 +1,4 @@ import logging -import random import typing as t import numpy as np diff --git a/src/ragas/testset/transforms/default.py b/src/ragas/testset/transforms/default.py index
290415344..611174627 100644 --- a/src/ragas/testset/transforms/default.py +++ b/src/ragas/testset/transforms/default.py @@ -158,6 +158,8 @@ def filter_chunks(node): ner_overlap_sim, ] else: - raise ValueError("Documents appear to be too short (i.e., 100 tokens or less). Please provide longer documents.") + raise ValueError( + "Documents appear to be too short (i.e., 100 tokens or less). Please provide longer documents." + ) return transforms diff --git a/src/ragas/testset/transforms/extractors/llm_based.py b/src/ragas/testset/transforms/extractors/llm_based.py index c7480e926..04616daa1 100644 --- a/src/ragas/testset/transforms/extractors/llm_based.py +++ b/src/ragas/testset/transforms/extractors/llm_based.py @@ -71,7 +71,7 @@ class TitleExtractorPrompt(PydanticPrompt[StringIO, StringIO]): class Headlines(BaseModel): - headlines: t.List[str] + headlines: t.List[str] class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines]): @@ -115,8 +115,9 @@ class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines "Main Concepts", "Detailed Analysis", "Future Directions", - ],) + ], ), + ), ] diff --git a/src/ragas/testset/transforms/filters.py b/src/ragas/testset/transforms/filters.py index 116da029c..44add2758 100644 --- a/src/ragas/testset/transforms/filters.py +++ b/src/ragas/testset/transforms/filters.py @@ -62,7 +62,7 @@ class CustomNodeFilter(LLMBasedNodeFilter): rubrics: t.Dict[str, str] = field(default_factory=lambda: DEFAULT_RUBRICS) async def custom_filter(self, node: Node, kg: KnowledgeGraph) -> bool: - + if node.type.name == "CHUNK": parent_nodes = get_parent_nodes(node, kg) if len(parent_nodes) > 0: @@ -71,7 +71,7 @@ async def custom_filter(self, node: Node, kg: KnowledgeGraph) -> bool: summary = "" else: summary = node.properties.get("summary", "") - + if summary == "": logger.warning( f"Node {node.id} does not have a summary. Skipping filtering." diff --git a/tests/unit/test_analytics.py b/tests/unit/test_analytics.py index 7647649fd..7f263d51c 100644 --- a/tests/unit/test_analytics.py +++ b/tests/unit/test_analytics.py @@ -147,7 +147,8 @@ def test_testset_generation_tracking(monkeypatch): 0.33, 0.33, 0.33, - ], atol=0.01 + ], + atol=0.01, ).tolist() ) From 27c9a44e48047c11581b3acf1e95382360a05a59 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 14 Nov 2024 18:07:18 +0530 Subject: [PATCH 18/18] correct filter --- src/ragas/testset/transforms/default.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/testset/transforms/default.py b/src/ragas/testset/transforms/default.py index 611174627..11c0f84f9 100644 --- a/src/ragas/testset/transforms/default.py +++ b/src/ragas/testset/transforms/default.py @@ -90,7 +90,7 @@ def filter_chunks(node): theme_extractor = ThemesExtractor( - llm=llm, filter_nodes=lambda node: filter_docs(node) + llm=llm, filter_nodes=lambda node: filter_chunks(node) )
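
Taken together, these patches make the default test-set pipeline adaptive: default_transforms() now inspects the input documents before choosing a transform chain, and default_query_distribution() prunes synthesizers that cannot find clusters in the knowledge graph. Below is a minimal sketch of the resulting call pattern. It assumes the post-patch API; the import paths are inferred from the file layout in the diffs, and the llm / embedding_model arguments stand in for real ragas wrapper objects (e.g. LangchainLLMWrapper), so treat it as an illustration rather than a verbatim usage guide.

import typing as t

from langchain_core.documents import Document as LCDocument

from ragas.testset.graph import KnowledgeGraph
from ragas.testset.synthesizers import default_query_distribution
from ragas.testset.transforms import default_transforms


def build_defaults(
    documents: t.List[LCDocument],
    llm,              # a BaseRagasLLM wrapper (assumed, e.g. LangchainLLMWrapper)
    embedding_model,  # a BaseRagasEmbeddings wrapper (assumed)
    kg: t.Optional[KnowledgeGraph] = None,
):
    # Patches 02/03/08/14: the transform chain is chosen from a histogram of
    # document token counts: if >= 25% of documents exceed 500 tokens, the
    # headline-splitting chain is used; if >= 25% fall in the 101-500 range,
    # the summary-only chain is used; otherwise a ValueError asks for longer
    # documents.
    transforms = default_transforms(
        documents=documents, llm=llm, embedding_model=embedding_model
    )

    # Patches 07/13: each default synthesizer is kept only if its
    # get_node_clusters() finds material in the graph, and the survivors are
    # weighted uniformly. Passing kg=None (allowed since patch 13) keeps all
    # three synthesizers.
    distribution = default_query_distribution(llm, kg)
    return transforms, distribution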