11 changes: 6 additions & 5 deletions docs/alfred.py
@@ -1,14 +1,15 @@
from __future__ import annotations

import os
from collections import namedtuple
import argparse
import asyncio
from tqdm.asyncio import tqdm
import os
import typing as t
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.language_models.chat_models import BaseChatModel
from collections import namedtuple

from langchain.prompts import ChatPromptTemplate
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_openai.chat_models import ChatOpenAI
from tqdm.asyncio import tqdm

File = namedtuple("File", "name content")

4 changes: 1 addition & 3 deletions src/ragas/embeddings/base.py
@@ -44,9 +44,7 @@ def set_run_config(self, run_config: RunConfig):

class LangchainEmbeddingsWrapper(BaseRagasEmbeddings):
def __init__(
self,
embeddings: Embeddings,
run_config: t.Optional[RunConfig] = None
self, embeddings: Embeddings, run_config: t.Optional[RunConfig] = None
):
self.embeddings = embeddings
if run_config is None:
2 changes: 1 addition & 1 deletion src/ragas/evaluation.py
@@ -15,9 +15,9 @@
LangchainEmbeddingsWrapper,
embedding_factory,
)
from ragas.llms import llm_factory
from ragas.exceptions import ExceptionInRunner
from ragas.executor import Executor
from ragas.llms import llm_factory
from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper
from ragas.metrics._answer_correctness import AnswerCorrectness
from ragas.metrics.base import Metric, MetricWithEmbeddings, MetricWithLLM
13 changes: 8 additions & 5 deletions src/ragas/executor.py
@@ -1,8 +1,8 @@
from __future__ import annotations
import sys

import asyncio
import logging
import sys
import threading
import typing as t
from dataclasses import dataclass, field
@@ -24,28 +24,31 @@ def runner_exception_hook(args: threading.ExceptHookArgs):
# set a custom exception hook
# threading.excepthook = runner_exception_hook


def as_completed(loop, coros, max_workers):
loop_arg_dict = {"loop": loop} if sys.version_info[:2] < (3, 10) else {}
if max_workers == -1:
return asyncio.as_completed(coros, **loop_arg_dict)

# the loop argument was removed in Python 3.10
semaphore = asyncio.Semaphore(max_workers, **loop_arg_dict)

async def sema_coro(coro):
async with semaphore:
return await coro

sema_coros = [sema_coro(c) for c in coros]
return asyncio.as_completed(sema_coros, **loop_arg_dict)


class Runner(threading.Thread):
def __init__(
self,
jobs: t.List[t.Tuple[t.Coroutine, str]],
desc: str,
keep_progress_bar: bool = True,
raise_exceptions: bool = True,
run_config: t.Optional[RunConfig] = None
run_config: t.Optional[RunConfig] = None,
):
super().__init__()
self.jobs = jobs
@@ -59,7 +62,7 @@ def __init__(
self.futures = as_completed(
loop=self.loop,
coros=[coro for coro, _ in self.jobs],
max_workers=self.run_config.max_workers
max_workers=self.run_config.max_workers,
)

async def _aresults(self) -> t.List[t.Any]:
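Aside for readers unfamiliar with the pattern used in as_completed above: asyncio.as_completed has no built-in worker limit, so the PR bounds concurrency by wrapping every coroutine in an asyncio.Semaphore guard. A minimal, self-contained sketch of the same technique (illustrative only; the demo and job names are hypothetical and not part of the PR):

import asyncio

async def job(i: int) -> int:
    # stand-in for a real unit of work
    await asyncio.sleep(0.1)
    return i

async def demo(max_workers: int = 2) -> None:
    semaphore = asyncio.Semaphore(max_workers)

    async def sema_coro(coro):
        # acquire the semaphore before awaiting, so at most
        # `max_workers` wrapped coroutines run at any one time
        async with semaphore:
            return await coro

    sema_coros = [sema_coro(job(i)) for i in range(5)]
    for fut in asyncio.as_completed(sema_coros):
        print(await fut)  # results arrive in completion order

asyncio.run(demo())

On Python versions before 3.10 an explicit loop keyword is still accepted by these asyncio primitives; on 3.10+ it is gone, which is what the loop_arg_dict branch in the PR handles.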
Empty file.
216 changes: 216 additions & 0 deletions src/ragas/integrations/langchain.py
@@ -0,0 +1,216 @@
from __future__ import annotations

import typing as t

from langchain.chains.base import Chain
from langchain.schema import RUN_KEY
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics.base import (
EvaluationMode,
Metric,
MetricWithEmbeddings,
MetricWithLLM,
get_required_columns,
)
from ragas.run_config import RunConfig
from ragas.validation import EVALMODE_TO_COLUMNS

if t.TYPE_CHECKING:
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
)


class EvaluatorChain(Chain, RunEvaluator):
"""
Wrapper around ragas Metrics to use them with langsmith.
"""

metric: Metric

def __init__(self, metric: Metric, **kwargs: t.Any):
kwargs["metric"] = metric
super().__init__(**kwargs)
if "run_config" in kwargs:
run_config = kwargs["run_config"]
else:
run_config = RunConfig()
if isinstance(self.metric, MetricWithLLM):
llm = kwargs.get("llm", ChatOpenAI())
t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm)
if isinstance(self.metric, MetricWithEmbeddings):
embeddings = kwargs.get("embeddings", OpenAIEmbeddings())
t.cast(
MetricWithEmbeddings, self.metric
).embeddings = LangchainEmbeddingsWrapper(embeddings)
self.metric.init(run_config)

@property
def input_keys(self) -> list[str]:
return get_required_columns(self.metric.evaluation_mode)

@property
def output_keys(self) -> list[str]:
return [self.metric.name]

def _call(
self,
inputs: dict[str, t.Any],
run_manager: t.Optional[CallbackManagerForChainRun] = None,
) -> dict[str, t.Any]:
"""
Call the evaluation chain.
"""
self._validate(inputs)
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()

c = inputs.get("contexts", [""])
g = inputs.get("ground_truth", "")
q = inputs.get("question", "")
a = inputs.get("answer", "")
score = self.metric.score(
{
"question": q,
"answer": a,
"contexts": c,
"ground_truth": g,
},
callbacks=callbacks,
)
return {self.metric.name: score}

async def _acall(
self,
inputs: t.Dict[str, t.Any],
run_manager: t.Optional[AsyncCallbackManagerForChainRun] = None,
) -> t.Dict[str, t.Any]:
"""
Call the evaluation chain.
"""
self._validate(inputs)
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
# TODO: currently AsyncCallbacks are not supported in ragas
_run_manager.get_child()

c = inputs.get("contexts", [""])
g = inputs.get("ground_truth", "")
q = inputs.get("question", "")
a = inputs.get("answer", "")
score = await self.metric.ascore(
{
"question": q,
"answer": a,
"contexts": c,
"ground_truth": g,
},
callbacks=[],
)
return {self.metric.name: score}

def _validate(
self,
input: dict[str, t.Any],
question_key: str = "question",
prediction_key: str = "answer",
context_key: str = "contexts",
) -> None:
# validate each example
required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode]
if "question" in required_columns and question_key not in input:
raise ValueError(
f'"{question_key}" is required in each example'
f"for the metric[{self.metric.name}] you have chosen."
)
if "answer" in required_columns and prediction_key not in input:
raise ValueError(
f'"{prediction_key}" is required in each prediction'
f"for the metric[{self.metric.name}] you have chosen."
)
if "contexts" in required_columns and context_key not in input:
raise ValueError(
f'"{context_key}" is required in each prediction for the '
f"metric[{self.metric.name}] you have chosen."
)
if "ground_truth" in required_columns and "ground_truth" not in input:
raise ValueError(
f'"ground_truth" is required in each prediction for the '
f"metric[{self.metric.name}] you have chosen."
)

@staticmethod
def _keys_are_present(keys_to_check: list, dict_to_check: dict) -> list[str]:
return [k for k in keys_to_check if k not in dict_to_check]

def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None:
if example is None:
raise ValueError(
"expected example to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
)
if example.inputs is None:
raise ValueError(
"expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
)
if example.outputs is None:
raise ValueError(
"expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
)
if "question" not in example.inputs or "ground_truth" not in example.outputs:
raise ValueError(
"Expected 'question' and 'ground_truth' in example."
f"Got: {[k for k in example.inputs.keys()]}"
)
assert (
run.outputs is not None
), "the current run has no outputs. The chain should output 'answer' and 'contexts' keys."
output_keys = get_required_columns(
self.metric.evaluation_mode, ["question", "ground_truth"]
)
missing_keys = self._keys_are_present(output_keys, run.outputs)
if missing_keys:
raise ValueError(
"Expected 'answer' and 'contexts' in run.outputs."
f"Got: {[k for k in run.outputs.keys()]}"
)

def evaluate_run(
self, run: Run, example: t.Optional[Example] = None
) -> EvaluationResult:
"""
Evaluate a langsmith run
"""
self._validate_langsmith_eval(run, example)

# this is just to suppress the type checker error
# actual check and error message is in the _validate_langsmith_eval
assert run.outputs is not None
assert example is not None
assert example.inputs is not None
assert example.outputs is not None

chain_eval = run.outputs
chain_eval["question"] = example.inputs["question"]
if self.metric.evaluation_mode in [
EvaluationMode.gc,
EvaluationMode.ga,
EvaluationMode.qcg,
EvaluationMode.qga,
]:
if example.outputs is None or "ground_truth" not in example.outputs:
raise ValueError("expected `ground_truth` in example outputs.")
chain_eval["ground_truth"] = example.outputs["ground_truth"]
eval_output = self(chain_eval, include_run_info=True)

evaluation_result = EvaluationResult(
key=self.metric.name, score=eval_output[self.metric.name]
)
if RUN_KEY in eval_output:
evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY]
return evaluation_result
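For context, the new chain can be driven directly like any other LangChain chain; its input keys come from the wrapped metric's evaluation mode. A minimal usage sketch, assuming OPENAI_API_KEY is set and using the faithfulness metric shipped with ragas (the question/answer/context strings are made-up illustration data):

from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import faithfulness

# wrap a ragas metric; without explicit llm/embeddings kwargs the constructor
# falls back to ChatOpenAI() and OpenAIEmbeddings(), as shown in __init__ above
faithfulness_chain = EvaluatorChain(metric=faithfulness)

score = faithfulness_chain({
    "question": "Which city is the capital of France?",
    "answer": "Paris is the capital of France.",
    "contexts": ["Paris is the capital and most populous city of France."],
})
print(score[faithfulness.name])  # output is keyed by the metric's name

Because EvaluatorChain also implements RunEvaluator, the same object can be handed to langsmith's dataset evaluation tooling: evaluate_run pulls 'question' and 'ground_truth' from the dataset example and expects 'answer' and 'contexts' in the traced run's outputs.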