279 changes: 120 additions & 159 deletions docs/howtos/integrations/_llamaindex.md

Large diffs are not rendered by default.

502 changes: 211 additions & 291 deletions docs/howtos/integrations/llamaindex.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions mkdocs.yml
@@ -87,6 +87,7 @@ nav:
- Cost Analysis: howtos/applications/_cost.md
- Integrations:
- howtos/integrations/index.md
- LlamaIndex: howtos/integrations/_llamaindex.md
- Migrations:
- From v0.1 to v0.2: howtos/migrations/migrate_from_v01_to_v02.md
- 📖 References:
6 changes: 6 additions & 0 deletions src/ragas/dataset_schema.py
@@ -316,6 +316,9 @@ def __getitem__(
else:
raise TypeError("Index must be int or slice")

def is_multi_turn(self) -> bool:
return self.get_sample_type() == MultiTurnSample

def to_list(self) -> t.List[t.Dict]:
rows = [sample.to_dict() for sample in self.samples]

@@ -341,6 +344,9 @@ def from_list(cls, data: t.List[t.Dict]) -> EvaluationDataset:
samples.extend(SingleTurnSample(**sample) for sample in data)
return cls(samples=samples)

def __repr__(self) -> str:
return f"EvaluationDataset(features={self.features()}, len={len(self.samples)})"


@dataclass
class EvaluationResult:
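The two helpers added above, `is_multi_turn()` and `__repr__`, are small conveniences on `EvaluationDataset`. A minimal sketch of how they might be used, assuming the `SingleTurnSample` field names that appear later in this diff (`user_input`, `retrieved_contexts`, `response`):

```python
# Sketch only: the sample content is made up; field names are taken from this diff.
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample

dataset = EvaluationDataset(
    samples=[
        SingleTurnSample(
            user_input="What is Ragas?",
            retrieved_contexts=["Ragas is a toolkit for evaluating LLM applications."],
            response="Ragas helps you evaluate LLM applications.",
        )
    ]
)

print(dataset.is_multi_turn())  # False: the samples are SingleTurnSample instances
print(repr(dataset))            # EvaluationDataset(features=[...], len=1)
```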
2 changes: 1 addition & 1 deletion src/ragas/evaluation.py
@@ -114,7 +114,7 @@ def evaluate(
Returns
-------
EvaluationResult
EvaluationResult object containing the scores of each metric.
EvaluationResult object containing the scores of each metric.
        You can use this to do analysis later.

Raises
81 changes: 41 additions & 40 deletions src/ragas/integrations/llama_index.py
@@ -2,23 +2,22 @@

import logging
import typing as t
from uuid import uuid4

from datasets import Dataset

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.embeddings import LlamaIndexEmbeddingsWrapper
from ragas.evaluation import evaluate as ragas_evaluate
from ragas.exceptions import ExceptionInRunner
from ragas.executor import Executor
from ragas.llms import LlamaIndexLLMWrapper
from ragas.run_config import RunConfig

if t.TYPE_CHECKING:
from langchain_core.callbacks import Callbacks
from llama_index.core.base.embeddings.base import (
BaseEmbedding as LlamaIndexEmbeddings,
)
from llama_index.core.base.llms.base import BaseLLM as LlamaindexLLM

from ragas.cost import TokenUsageParser
from ragas.evaluation import EvaluationResult
from ragas.metrics.base import Metric

@@ -28,74 +27,76 @@

def evaluate(
query_engine,
dataset: Dataset,
dataset: EvaluationDataset,
metrics: list[Metric],
llm: t.Optional[LlamaindexLLM] = None,
embeddings: t.Optional[LlamaIndexEmbeddings] = None,
callbacks: t.Optional[Callbacks] = None,
in_ci: bool = False,
run_config: t.Optional[RunConfig] = None,
token_usage_parser: t.Optional[TokenUsageParser] = None,
raise_exceptions: bool = False,
column_map: t.Optional[t.Dict[str, str]] = None,
run_config: t.Optional[RunConfig] = None,
show_progress: bool = True,
) -> EvaluationResult:
column_map = column_map or {}

# wrap llms and embeddings
li_llm = None
if llm is not None:
li_llm = LlamaIndexLLMWrapper(llm)
li_llm = LlamaIndexLLMWrapper(llm, run_config=run_config)
li_embeddings = None
if embeddings is not None:
li_embeddings = LlamaIndexEmbeddingsWrapper(embeddings)
li_embeddings = LlamaIndexEmbeddingsWrapper(embeddings, run_config=run_config)

# validate and transform dataset
if dataset is None:
raise ValueError("Provide dataset!")
if dataset is None or not isinstance(dataset, EvaluationDataset):
raise ValueError("Please provide a dataset that is of type EvaluationDataset")

exec = Executor(
desc="Running Query Engine",
keep_progress_bar=True,
show_progress=show_progress,
raise_exceptions=raise_exceptions,
run_config=run_config,
)

# get query
queries = dataset["question"]
# check if multi-turn
if dataset.is_multi_turn():
raise NotImplementedError(
"Multi-turn evaluation is not implemented yet. Please do raise an issue on GitHub if you need this feature and we will prioritize it"
)
samples = t.cast(t.List[SingleTurnSample], dataset.samples)

# get query and make jobs
queries = [sample.user_input for sample in samples]
for i, q in enumerate(queries):
exec.submit(query_engine.aquery, q, name=f"query-{i}")

answers: t.List[str] = []
contexts: t.List[t.List[str]] = []
try:
results = exec.results()
if results == []:
raise ExceptionInRunner()
except Exception as e:
raise e
else:
for r in results:
answers.append(r.response)
contexts.append([n.node.text for n in r.source_nodes])

# create HF dataset
hf_dataset = Dataset.from_dict(
{
"question": queries,
"contexts": contexts,
"answer": answers,
}
)
if "ground_truth" in dataset.column_names:
hf_dataset = hf_dataset.add_column(
name="ground_truth",
column=dataset["ground_truth"],
new_fingerprint=str(uuid4()),
)
# get responses and retrieved contexts
responses: t.List[str] = []
retrieved_contexts: t.List[t.List[str]] = []
results = exec.results()
for r in results:
responses.append(r.response)
retrieved_contexts.append([n.node.text for n in r.source_nodes])

# append the extra information to the dataset
for i, sample in enumerate(samples):
sample.response = responses[i]
sample.retrieved_contexts = retrieved_contexts[i]

results = ragas_evaluate(
dataset=hf_dataset,
dataset=dataset,
metrics=metrics,
llm=li_llm,
embeddings=li_embeddings,
raise_exceptions=raise_exceptions,
callbacks=callbacks,
show_progress=show_progress,
run_config=run_config or RunConfig(),
in_ci=in_ci,
token_usage_parser=token_usage_parser,
)

return results
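For orientation, here is a hedged end-to-end sketch of the reworked integration. The `evaluate` signature (query engine, `EvaluationDataset`, metrics, optional LlamaIndex LLM and embeddings) follows the code above; the index construction, model names, data path, and the chosen metric are illustrative assumptions.

```python
# Illustrative sketch; paths, model names and the metric choice are assumptions.
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.integrations.llama_index import evaluate
from ragas.metrics import Faithfulness

# Build a query engine over local documents (LlamaIndex side).
documents = SimpleDirectoryReader("./data").load_data()
query_engine = VectorStoreIndex.from_documents(documents).as_query_engine()

# Only user_input is required up front; the integration fills in
# response and retrieved_contexts by running the query engine.
dataset = EvaluationDataset(
    samples=[SingleTurnSample(user_input="What does the report say about revenue?")]
)

result = evaluate(
    query_engine=query_engine,
    dataset=dataset,
    metrics=[Faithfulness()],
    llm=OpenAI(model="gpt-4o-mini"),
    embeddings=OpenAIEmbedding(model="text-embedding-3-small"),
)
print(result)
```

Note that a multi-turn dataset raises `NotImplementedError`, and passing anything other than an `EvaluationDataset` raises `ValueError`, as enforced above.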
3 changes: 3 additions & 0 deletions src/ragas/llms/base.py
@@ -299,6 +299,9 @@ def check_args(
"stop": stop,
}

def is_finished(self, response: LLMResult) -> bool:
return True

def generate_text(
self,
prompt: PromptValue,
2 changes: 0 additions & 2 deletions src/ragas/metrics/_factual_correctness.py
@@ -271,7 +271,6 @@ async def _single_turn_ascore(
reference_response = await self.verify_claims(
premise=reference, hypothesis_list=response_claims, callbacks=callbacks
)


if self.mode != "precision":
response_reference = await self.verify_claims(
@@ -286,7 +285,6 @@
fn = sum(~response_reference)
else:
fn = 0


if self.mode == "precision":
score = tp / (tp + fp + 1e-8)
4 changes: 3 additions & 1 deletion src/ragas/metrics/_tool_call_accuracy.py
@@ -61,7 +61,9 @@ def is_sequence_aligned(
async def _multi_turn_ascore(
self, sample: MultiTurnSample, callbacks: Callbacks
) -> float:
assert sample.reference_tool_calls is not None, "Reference tool calls is not set"
assert (
sample.reference_tool_calls is not None
), "Reference tool calls is not set"

pred_tool_calls = []
for item in sample.user_input:
141 changes: 138 additions & 3 deletions src/ragas/testset/synthesizers/generate.py
@@ -9,9 +9,13 @@
from ragas._analytics import TestsetGenerationEvent, track
from ragas.callbacks import new_group
from ragas.cost import TokenUsageParser
from ragas.embeddings.base import BaseRagasEmbeddings, LangchainEmbeddingsWrapper
from ragas.embeddings.base import (
BaseRagasEmbeddings,
LangchainEmbeddingsWrapper,
LlamaIndexEmbeddingsWrapper,
)
from ragas.executor import Executor
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper, LlamaIndexLLMWrapper
from ragas.run_config import RunConfig
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
from ragas.testset.synthesizers import default_query_distribution
@@ -24,6 +28,11 @@
from langchain_core.documents import Document as LCDocument
from langchain_core.embeddings.embeddings import Embeddings as LangchainEmbeddings
from langchain_core.language_models import BaseLanguageModel as LangchainLLM
from llama_index.core.base.embeddings.base import (
BaseEmbedding as LlamaIndexEmbedding,
)
from llama_index.core.base.llms.base import BaseLLM as LlamaIndexLLM
from llama_index.core.schema import Document as LlamaIndexDocument

from ragas.embeddings.base import BaseRagasEmbeddings
from ragas.llms.base import BaseRagasLLM
@@ -71,6 +80,23 @@ def from_langchain(
knowledge_graph,
)

@classmethod
def from_llama_index(
cls,
llm: LlamaIndexLLM,
embedding_model: LlamaIndexEmbedding,
knowledge_graph: t.Optional[KnowledgeGraph] = None,
) -> TestsetGenerator:
"""
Creates a `TestsetGenerator` from a LlamaIndex LLM and embedding model.
"""
knowledge_graph = knowledge_graph or KnowledgeGraph()
return cls(
LlamaIndexLLMWrapper(llm),
LlamaIndexEmbeddingsWrapper(embedding_model),
knowledge_graph,
)

def generate_with_langchain_docs(
self,
documents: t.Sequence[LCDocument],
@@ -85,7 +111,40 @@ def generate_with_langchain_docs(
raise_exceptions: bool = True,
) -> Testset:
"""
Generates an evaluation dataset based on given scenarios and parameters.
Generates an evaluation dataset based on given Langchain documents and parameters.

Parameters
----------
documents : Sequence[LCDocument]
A sequence of Langchain documents to use as source material
testset_size : int
The number of test samples to generate
transforms : Optional[Transforms], optional
Custom transforms to apply to the documents, by default None
transforms_llm : Optional[BaseRagasLLM], optional
LLM to use for transforms if different from instance LLM, by default None
transforms_embedding_model : Optional[BaseRagasEmbeddings], optional
Embedding model to use for transforms if different from instance model, by default None
query_distribution : Optional[QueryDistribution], optional
Distribution of query types to generate, by default None
run_config : Optional[RunConfig], optional
Configuration for the generation run, by default None
callbacks : Optional[Callbacks], optional
Callbacks to use during generation, by default None
with_debugging_logs : bool, optional
Whether to include debug logs, by default False
raise_exceptions : bool, optional
Whether to raise exceptions during generation, by default True

Returns
-------
Testset
The generated evaluation dataset

Raises
------
ValueError
If no LLM or embedding model is provided either during initialization or as arguments
"""

# force the user to provide an llm and embedding client to prevent use of default LLMs
@@ -135,6 +194,79 @@ def generate_with_langchain_docs(
raise_exceptions=raise_exceptions,
)

def generate_with_llamaindex_docs(
self,
documents: t.Sequence[LlamaIndexDocument],
testset_size: int,
transforms: t.Optional[Transforms] = None,
transforms_llm: t.Optional[LlamaIndexLLM] = None,
transforms_embedding_model: t.Optional[LlamaIndexEmbedding] = None,
query_distribution: t.Optional[QueryDistribution] = None,
run_config: t.Optional[RunConfig] = None,
callbacks: t.Optional[Callbacks] = None,
with_debugging_logs=False,
raise_exceptions: bool = True,
):
"""
        Generates an evaluation dataset based on given LlamaIndex documents and parameters.
"""

run_config = run_config or RunConfig()

# force the user to provide an llm and embedding client to prevent use of default LLMs
if not self.llm and not transforms_llm:
raise ValueError(
"An llm client was not provided. Provide an LLM on TestsetGenerator instantiation or as an argument for transforms_llm parameter. Alternatively you can provide your own transforms through the `transforms` parameter."
)
if not self.embedding_model and not transforms_embedding_model:
raise ValueError(
"An embedding client was not provided. Provide an embedding model on TestsetGenerator instantiation or as an argument for transforms_llm parameter. Alternatively you can provide your own transforms through the `transforms` parameter."
)

if not transforms:
if transforms_llm is None:
llm_for_transforms = self.llm
else:
llm_for_transforms = LlamaIndexLLMWrapper(transforms_llm)
if transforms_embedding_model is None:
embedding_model_for_transforms = self.embedding_model
else:
embedding_model_for_transforms = LlamaIndexEmbeddingsWrapper(
transforms_embedding_model
)
transforms = default_transforms(
llm=llm_for_transforms,
embedding_model=embedding_model_for_transforms,
)

# convert the documents to Ragas nodes
nodes = []
for doc in documents:
if doc.text is not None and doc.text.strip() != "":
node = Node(
type=NodeType.DOCUMENT,
properties={
"page_content": doc.text,
"document_metadata": doc.metadata,
},
)
nodes.append(node)

kg = KnowledgeGraph(nodes=nodes)

# apply transforms and update the knowledge graph
apply_transforms(kg, transforms, run_config)
self.knowledge_graph = kg

return self.generate(
testset_size=testset_size,
query_distribution=query_distribution,
run_config=run_config,
callbacks=callbacks,
with_debugging_logs=with_debugging_logs,
raise_exceptions=raise_exceptions,
)

def generate(
self,
testset_size: int,
@@ -182,6 +314,9 @@ def generate(
4. Generate samples for each scenario.
5. Compile the results into an EvaluationDataset.
"""
if run_config is not None:
self.llm.set_run_config(run_config)

query_distribution = query_distribution or default_query_distribution(self.llm)
callbacks = callbacks or []

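Taken together, the new `from_llama_index` constructor and `generate_with_llamaindex_docs` let a test set be generated directly from LlamaIndex documents. A hedged sketch following the signatures above; the model names, data directory, and testset size are assumptions:

```python
# Illustrative sketch; model names and paths are assumptions, while the calls
# follow the signatures introduced in this diff.
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

from ragas.testset.synthesizers.generate import TestsetGenerator

documents = SimpleDirectoryReader("./data").load_data()

generator = TestsetGenerator.from_llama_index(
    llm=OpenAI(model="gpt-4o"),
    embedding_model=OpenAIEmbedding(model="text-embedding-3-small"),
)

# Documents with empty text are skipped; default transforms are built from the
# generator's wrapped LLM and embedding model unless `transforms` is provided.
testset = generator.generate_with_llamaindex_docs(
    documents=documents,
    testset_size=10,
)
print(testset.to_pandas())
```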