279 changes: 120 additions & 159 deletions docs/howtos/integrations/_llamaindex.md

Large diffs are not rendered by default.

502 changes: 211 additions & 291 deletions docs/howtos/integrations/llamaindex.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions mkdocs.yml
@@ -87,6 +87,7 @@ nav:
- Cost Analysis: howtos/applications/_cost.md
- Integrations:
- howtos/integrations/index.md
- LlamaIndex: howtos/integrations/_llamaindex.md
- Migrations:
- From v0.1 to v0.2: howtos/migrations/migrate_from_v01_to_v02.md
- 📖 References:
6 changes: 6 additions & 0 deletions src/ragas/dataset_schema.py
@@ -316,6 +316,9 @@ def __getitem__(
else:
raise TypeError("Index must be int or slice")

def is_multi_turn(self) -> bool:
return self.get_sample_type() == MultiTurnSample

def to_list(self) -> t.List[t.Dict]:
rows = [sample.to_dict() for sample in self.samples]

@@ -341,6 +344,9 @@ def from_list(cls, data: t.List[t.Dict]) -> EvaluationDataset:
samples.extend(SingleTurnSample(**sample) for sample in data)
return cls(samples=samples)

def __repr__(self) -> str:
return f"EvaluationDataset(features={self.features()}, len={len(self.samples)})"


@dataclass
class EvaluationResult:
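The two helpers added above, `is_multi_turn()` and `__repr__`, are small conveniences on `EvaluationDataset`. A minimal sketch of how they might be used, assuming the `SingleTurnSample` field names that appear later in this diff (`user_input`, `retrieved_contexts`, `response`):

```python
# Sketch only: the sample content is made up; field names are taken from this diff.
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample

dataset = EvaluationDataset(
    samples=[
        SingleTurnSample(
            user_input="What is Ragas?",
            retrieved_contexts=["Ragas is a toolkit for evaluating LLM applications."],
            response="Ragas helps you evaluate LLM applications.",
        )
    ]
)

print(dataset.is_multi_turn())  # False: the samples are SingleTurnSample instances
print(repr(dataset))            # EvaluationDataset(features=[...], len=1)
```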
2 changes: 1 addition & 1 deletion src/ragas/evaluation.py
@@ -114,7 +114,7 @@ def evaluate(
Returns
-------
EvaluationResult
EvaluationResult object containing the scores of each metric.
EvaluationResult object containing the scores of each metric.
        You can use this to do analysis later.

Raises
81 changes: 41 additions & 40 deletions src/ragas/integrations/llama_index.py
@@ -2,23 +2,22 @@

import logging
import typing as t
from uuid import uuid4

from datasets import Dataset

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.embeddings import LlamaIndexEmbeddingsWrapper
from ragas.evaluation import evaluate as ragas_evaluate
from ragas.exceptions import ExceptionInRunner
from ragas.executor import Executor
from ragas.llms import LlamaIndexLLMWrapper
from ragas.run_config import RunConfig

if t.TYPE_CHECKING:
from langchain_core.callbacks import Callbacks
from llama_index.core.base.embeddings.base import (
BaseEmbedding as LlamaIndexEmbeddings,
)
from llama_index.core.base.llms.base import BaseLLM as LlamaindexLLM

from ragas.cost import TokenUsageParser
from ragas.evaluation import EvaluationResult
from ragas.metrics.base import Metric

@@ -28,74 +27,76 @@

def evaluate(
query_engine,
dataset: Dataset,
dataset: EvaluationDataset,
metrics: list[Metric],
llm: t.Optional[LlamaindexLLM] = None,
embeddings: t.Optional[LlamaIndexEmbeddings] = None,
callbacks: t.Optional[Callbacks] = None,
in_ci: bool = False,
run_config: t.Optional[RunConfig] = None,
token_usage_parser: t.Optional[TokenUsageParser] = None,
raise_exceptions: bool = False,
column_map: t.Optional[t.Dict[str, str]] = None,
run_config: t.Optional[RunConfig] = None,
show_progress: bool = True,
) -> EvaluationResult:
column_map = column_map or {}

# wrap llms and embeddings
li_llm = None
if llm is not None:
li_llm = LlamaIndexLLMWrapper(llm)
li_llm = LlamaIndexLLMWrapper(llm, run_config=run_config)
li_embeddings = None
if embeddings is not None:
li_embeddings = LlamaIndexEmbeddingsWrapper(embeddings)
li_embeddings = LlamaIndexEmbeddingsWrapper(embeddings, run_config=run_config)

# validate and transform dataset
if dataset is None:
raise ValueError("Provide dataset!")
if dataset is None or not isinstance(dataset, EvaluationDataset):
raise ValueError("Please provide a dataset that is of type EvaluationDataset")

exec = Executor(
desc="Running Query Engine",
keep_progress_bar=True,
show_progress=show_progress,
raise_exceptions=raise_exceptions,
run_config=run_config,
)

# get query
queries = dataset["question"]
# check if multi-turn
if dataset.is_multi_turn():
raise NotImplementedError(
"Multi-turn evaluation is not implemented yet. Please do raise an issue on GitHub if you need this feature and we will prioritize it"
)
samples = t.cast(t.List[SingleTurnSample], dataset.samples)

# get query and make jobs
queries = [sample.user_input for sample in samples]
for i, q in enumerate(queries):
exec.submit(query_engine.aquery, q, name=f"query-{i}")

answers: t.List[str] = []
contexts: t.List[t.List[str]] = []
try:
results = exec.results()
if results == []:
raise ExceptionInRunner()
except Exception as e:
raise e
else:
for r in results:
answers.append(r.response)
contexts.append([n.node.text for n in r.source_nodes])

# create HF dataset
hf_dataset = Dataset.from_dict(
{
"question": queries,
"contexts": contexts,
"answer": answers,
}
)
if "ground_truth" in dataset.column_names:
hf_dataset = hf_dataset.add_column(
name="ground_truth",
column=dataset["ground_truth"],
new_fingerprint=str(uuid4()),
)
# get responses and retrieved contexts
responses: t.List[str] = []
retrieved_contexts: t.List[t.List[str]] = []
results = exec.results()
for r in results:
responses.append(r.response)
retrieved_contexts.append([n.node.text for n in r.source_nodes])

# append the extra information to the dataset
for i, sample in enumerate(samples):
sample.response = responses[i]
sample.retrieved_contexts = retrieved_contexts[i]

results = ragas_evaluate(
dataset=hf_dataset,
dataset=dataset,
metrics=metrics,
llm=li_llm,
embeddings=li_embeddings,
raise_exceptions=raise_exceptions,
callbacks=callbacks,
show_progress=show_progress,
run_config=run_config or RunConfig(),
in_ci=in_ci,
token_usage_parser=token_usage_parser,
)

return results
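For orientation, here is a hedged end-to-end sketch of the reworked integration. The `evaluate` signature (query engine, `EvaluationDataset`, metrics, optional LlamaIndex LLM and embeddings) follows the code above; the index construction, model names, data path, and the chosen metric are illustrative assumptions.

```python
# Illustrative sketch; paths, model names and the metric choice are assumptions.
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.integrations.llama_index import evaluate
from ragas.metrics import Faithfulness

# Build a query engine over local documents (LlamaIndex side).
documents = SimpleDirectoryReader("./data").load_data()
query_engine = VectorStoreIndex.from_documents(documents).as_query_engine()

# Only user_input is required up front; the integration fills in
# response and retrieved_contexts by running the query engine.
dataset = EvaluationDataset(
    samples=[SingleTurnSample(user_input="What does the report say about revenue?")]
)

result = evaluate(
    query_engine=query_engine,
    dataset=dataset,
    metrics=[Faithfulness()],
    llm=OpenAI(model="gpt-4o-mini"),
    embeddings=OpenAIEmbedding(model="text-embedding-3-small"),
)
print(result)
```

Note that a multi-turn dataset raises `NotImplementedError`, and passing anything other than an `EvaluationDataset` raises `ValueError`, as enforced above.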
3 changes: 3 additions & 0 deletions src/ragas/llms/base.py
@@ -299,6 +299,9 @@ def check_args(
"stop": stop,
}

def is_finished(self, response: LLMResult) -> bool:
return True

def generate_text(
self,
prompt: PromptValue,
2 changes: 0 additions & 2 deletions src/ragas/metrics/_factual_correctness.py
@@ -271,7 +271,6 @@ async def _single_turn_ascore(
reference_response = await self.verify_claims(
premise=reference, hypothesis_list=response_claims, callbacks=callbacks
)


if self.mode != "precision":
response_reference = await self.verify_claims(
@@ -286,7 +285,6 @@
fn = sum(~response_reference)
else:
fn = 0


if self.mode == "precision":
score = tp / (tp + fp + 1e-8)
4 changes: 3 additions & 1 deletion src/ragas/metrics/_tool_call_accuracy.py
@@ -61,7 +61,9 @@ def is_sequence_aligned(
async def _multi_turn_ascore(
self, sample: MultiTurnSample, callbacks: Callbacks
) -> float:
assert sample.reference_tool_calls is not None, "Reference tool calls is not set"
assert (
sample.reference_tool_calls is not None
), "Reference tool calls is not set"

pred_tool_calls = []
for item in sample.user_input:
141 changes: 138 additions & 3 deletions src/ragas/testset/synthesizers/generate.py
@@ -9,9 +9,13 @@
from ragas._analytics import TestsetGenerationEvent, track
from ragas.callbacks import new_group
from ragas.cost import TokenUsageParser
from ragas.embeddings.base import BaseRagasEmbeddings, LangchainEmbeddingsWrapper
from ragas.embeddings.base import (
BaseRagasEmbeddings,
LangchainEmbeddingsWrapper,
LlamaIndexEmbeddingsWrapper,
)
from ragas.executor import Executor
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper, LlamaIndexLLMWrapper
from ragas.run_config import RunConfig
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
from ragas.testset.synthesizers import default_query_distribution
@@ -24,6 +28,11 @@
from langchain_core.documents import Document as LCDocument
from langchain_core.embeddings.embeddings import Embeddings as LangchainEmbeddings
from langchain_core.language_models import BaseLanguageModel as LangchainLLM
from llama_index.core.base.embeddings.base import (
BaseEmbedding as LlamaIndexEmbedding,
)
from llama_index.core.base.llms.base import BaseLLM as LlamaIndexLLM
from llama_index.core.schema import Document as LlamaIndexDocument

from ragas.embeddings.base import BaseRagasEmbeddings
from ragas.llms.base import BaseRagasLLM
@@ -71,6 +80,23 @@ def from_langchain(
knowledge_graph,
)

@classmethod
def from_llama_index(
cls,
llm: LlamaIndexLLM,
embedding_model: LlamaIndexEmbedding,
knowledge_graph: t.Optional[KnowledgeGraph] = None,
) -> TestsetGenerator:
"""
Creates a `TestsetGenerator` from a LlamaIndex LLM and embedding model.
"""
knowledge_graph = knowledge_graph or KnowledgeGraph()
return cls(
LlamaIndexLLMWrapper(llm),
LlamaIndexEmbeddingsWrapper(embedding_model),
knowledge_graph,
)

def generate_with_langchain_docs(
self,
documents: t.Sequence[LCDocument],
@@ -85,7 +111,40 @@ def generate_with_langchain_docs(
raise_exceptions: bool = True,
) -> Testset:
"""
Generates an evaluation dataset based on given scenarios and parameters.
Generates an evaluation dataset based on given Langchain documents and parameters.

Parameters
----------
documents : Sequence[LCDocument]
A sequence of Langchain documents to use as source material
testset_size : int
The number of test samples to generate
transforms : Optional[Transforms], optional
Custom transforms to apply to the documents, by default None
transforms_llm : Optional[BaseRagasLLM], optional
LLM to use for transforms if different from instance LLM, by default None
transforms_embedding_model : Optional[BaseRagasEmbeddings], optional
Embedding model to use for transforms if different from instance model, by default None
query_distribution : Optional[QueryDistribution], optional
Distribution of query types to generate, by default None
run_config : Optional[RunConfig], optional
Configuration for the generation run, by default None
callbacks : Optional[Callbacks], optional
Callbacks to use during generation, by default None
with_debugging_logs : bool, optional
Whether to include debug logs, by default False
raise_exceptions : bool, optional
Whether to raise exceptions during generation, by default True

Returns
-------
Testset
The generated evaluation dataset

Raises
------
ValueError
If no LLM or embedding model is provided either during initialization or as arguments
"""

# force the user to provide an llm and embedding client to prevent use of default LLMs
@@ -135,6 +194,79 @@ def generate_with_langchain_docs(
raise_exceptions=raise_exceptions,
)

def generate_with_llamaindex_docs(
self,
documents: t.Sequence[LlamaIndexDocument],
testset_size: int,
transforms: t.Optional[Transforms] = None,
transforms_llm: t.Optional[LlamaIndexLLM] = None,
transforms_embedding_model: t.Optional[LlamaIndexEmbedding] = None,
query_distribution: t.Optional[QueryDistribution] = None,
run_config: t.Optional[RunConfig] = None,
callbacks: t.Optional[Callbacks] = None,
with_debugging_logs=False,
raise_exceptions: bool = True,
):
"""
        Generates an evaluation dataset based on given LlamaIndex documents and parameters.
"""

run_config = run_config or RunConfig()

# force the user to provide an llm and embedding client to prevent use of default LLMs
if not self.llm and not transforms_llm:
raise ValueError(
"An llm client was not provided. Provide an LLM on TestsetGenerator instantiation or as an argument for transforms_llm parameter. Alternatively you can provide your own transforms through the `transforms` parameter."
)
if not self.embedding_model and not transforms_embedding_model:
raise ValueError(
"An embedding client was not provided. Provide an embedding model on TestsetGenerator instantiation or as an argument for transforms_llm parameter. Alternatively you can provide your own transforms through the `transforms` parameter."
)

if not transforms:
if transforms_llm is None:
llm_for_transforms = self.llm
else:
llm_for_transforms = LlamaIndexLLMWrapper(transforms_llm)
if transforms_embedding_model is None:
embedding_model_for_transforms = self.embedding_model
else:
embedding_model_for_transforms = LlamaIndexEmbeddingsWrapper(
transforms_embedding_model
)
transforms = default_transforms(
llm=llm_for_transforms,
embedding_model=embedding_model_for_transforms,
)

# convert the documents to Ragas nodes
nodes = []
for doc in documents:
if doc.text is not None and doc.text.strip() != "":
node = Node(
type=NodeType.DOCUMENT,
properties={
"page_content": doc.text,
"document_metadata": doc.metadata,
},
)
nodes.append(node)

kg = KnowledgeGraph(nodes=nodes)

# apply transforms and update the knowledge graph
apply_transforms(kg, transforms, run_config)
self.knowledge_graph = kg

return self.generate(
testset_size=testset_size,
query_distribution=query_distribution,
run_config=run_config,
callbacks=callbacks,
with_debugging_logs=with_debugging_logs,
raise_exceptions=raise_exceptions,
)

def generate(
self,
testset_size: int,
@@ -182,6 +314,9 @@ def generate(
4. Generate samples for each scenario.
5. Compile the results into an EvaluationDataset.
"""
if run_config is not None:
self.llm.set_run_config(run_config)

query_distribution = query_distribution or default_query_distribution(self.llm)
callbacks = callbacks or []

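Taken together, the new `from_llama_index` constructor and `generate_with_llamaindex_docs` let a test set be generated directly from LlamaIndex documents. A hedged sketch following the signatures above; the model names, data directory, and testset size are assumptions:

```python
# Illustrative sketch; model names and paths are assumptions, while the calls
# follow the signatures introduced in this diff.
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

from ragas.testset.synthesizers.generate import TestsetGenerator

documents = SimpleDirectoryReader("./data").load_data()

generator = TestsetGenerator.from_llama_index(
    llm=OpenAI(model="gpt-4o"),
    embedding_model=OpenAIEmbedding(model="text-embedding-3-small"),
)

# Documents with empty text are skipped; default transforms are built from the
# generator's wrapped LLM and embedding model unless `transforms` is provided.
testset = generator.generate_with_llamaindex_docs(
    documents=documents,
    testset_size=10,
)
print(testset.to_pandas())
```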