From 41a3c67fe757c00908c81cf153ef1c0b207c72ae Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 6 Mar 2024 12:41:32 -0800 Subject: [PATCH 1/7] fixed EvaluatorChain --- src/ragas/integrations/langchain.py | 186 ++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 src/ragas/integrations/langchain.py diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py new file mode 100644 index 000000000..081eb8d8b --- /dev/null +++ b/src/ragas/integrations/langchain.py @@ -0,0 +1,186 @@ +from __future__ import annotations + +import typing as t + +from langchain.callbacks.manager import CallbackManagerForChainRun +from langchain.chains.base import Chain +from langchain.schema import RUN_KEY +from langsmith.evaluation import EvaluationResult, RunEvaluator +from langsmith.schemas import Example, Run +from langchain_openai.chat_models import ChatOpenAI +from langchain_openai.embeddings import OpenAIEmbeddings + +from ragas.metrics.base import ( + EvaluationMode, + Metric, + MetricWithLLM, + MetricWithEmbeddings, +) +from ragas.validation import EVALMODE_TO_COLUMNS +from ragas.llms import LangchainLLMWrapper +from ragas.embeddings import LangchainEmbeddingsWrapper +from ragas.run_config import RunConfig + +__all__ = ["EvaluatorChain"] + + +class EvaluatorChain(Chain, RunEvaluator): + """ + Wrapper around ragas Metrics to use them with langsmith. + """ + + metric: Metric + + def __init__(self, metric: Metric, **kwargs: t.Any): + kwargs["metric"] = metric + super().__init__(**kwargs) + if "run_config" in kwargs: + run_config = kwargs["run_config"] + else: + run_config = RunConfig() + if isinstance(self.metric, MetricWithLLM): + llm = kwargs.get("llm", ChatOpenAI()) + t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm) + if isinstance(self.metric, MetricWithEmbeddings): + embeddings = kwargs.get("embeddings", OpenAIEmbeddings()) + t.cast( + MetricWithEmbeddings, self.metric + ).embeddings = LangchainEmbeddingsWrapper(embeddings) + self.metric.init(run_config) + + @property + def input_keys(self) -> list[str]: + keys = ["question", "answer"] + if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]: + keys += ["contexts"] + if self.metric.evaluation_mode in [EvaluationMode.gc]: + keys += ["ground_truth"] + return keys + + @property + def output_keys(self) -> list[str]: + return [self.metric.name] + + def _call( + self, + inputs: dict[str, t.Any], + run_manager: t.Optional[CallbackManagerForChainRun] = None, + ) -> dict[str, t.Any]: + """ + Call the evaluation chain. + """ + self._validate(inputs) + _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager() + callbacks = _run_manager.get_child() + + c = inputs.get("contexts", [""]) + g = inputs.get("ground_truth", "") + q = inputs.get("question", "") + a = inputs.get("answer", "") + score = self.metric.score( + { + "question": q, + "answer": a, + "contexts": c, + "ground_truth": g, + }, + callbacks=callbacks, + ) + return {self.metric.name: score} + + def _validate( + self, + input: dict[str, t.Any], + question_key: str = "question", + prediction_key: str = "answer", + context_key: str = "contexts", + ) -> None: + # validate each example + required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode] + if "question" in required_columns and question_key not in input: + raise ValueError( + f'"{question_key}" is required in each example' + f"for the metric[{self.metric.name}] you have chosen." 
+ ) + if "answer" in required_columns and prediction_key not in input: + raise ValueError( + f'"{prediction_key}" is required in each prediction' + f"for the metric[{self.metric.name}] you have chosen." + ) + if "contexts" in required_columns and context_key not in input: + raise ValueError( + f'"{context_key}" is required in each prediction for the ' + f"metric[{self.metric.name}] you have chosen." + ) + if "ground_truth" in required_columns and "ground_truth" not in input: + raise ValueError( + f'"ground_truth" is required in each prediction for the ' + f"metric[{self.metric.name}] you have chosen." + ) + + @staticmethod + def _keys_are_present(keys_to_check: list, dict_to_check: dict) -> list[str]: + return [k for k in keys_to_check if k not in dict_to_check] + + def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None: + if example is None: + raise ValueError( + "expected example to be provided. Please check langsmith dataset and ensure valid dataset is uploaded." + ) + if example.inputs is None: + raise ValueError( + "expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded." + ) + if example.outputs is None: + raise ValueError( + "expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded." + ) + if "question" not in example.inputs or "ground_truth" not in example.outputs: + raise ValueError( + "Expected 'question' and 'ground_truth' in example." + f"Got: {[k for k in example.inputs.keys()]}" + ) + assert ( + run.outputs is not None + ), "the current run has no outputs. The chain should output 'answer' and 'contexts' keys." + missing_keys = self._keys_are_present(["answer", "contexts"], run.outputs) + if missing_keys: + raise ValueError( + "Expected 'answer' and 'contexts' in run.outputs." 
+ f"Got: {[k for k in run.outputs.keys()]}" + ) + + def evaluate_run( + self, run: Run, example: t.Optional[Example] = None + ) -> EvaluationResult: + """ + Evaluate a langsmith run + """ + self._validate_langsmith_eval(run, example) + + # this is just to suppress the type checker error + # actual check and error message is in the _validate_langsmith_eval + assert run.outputs is not None + assert example is not None + assert example.inputs is not None + assert example.outputs is not None + + chain_eval = run.outputs + chain_eval["question"] = example.inputs["question"] + if self.metric.evaluation_mode in [ + EvaluationMode.gc, + EvaluationMode.ga, + EvaluationMode.qcg, + EvaluationMode.qga, + ]: + if example.outputs is None or "ground_truth" not in example.outputs: + raise ValueError("expected `ground_truth` in example outputs.") + chain_eval["ground_truth"] = example.outputs["ground_truth"] + eval_output = self(chain_eval, include_run_info=True) + + evaluation_result = EvaluationResult( + key=self.metric.name, score=eval_output[self.metric.name] + ) + if RUN_KEY in eval_output: + evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY] + return evaluation_result From 593c2125ccfb8e8bae7a4396ce626eb7cba9b70e Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 6 Mar 2024 20:17:24 -0800 Subject: [PATCH 2/7] async invoke --- src/ragas/integrations/langchain.py | 65 ++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py index 081eb8d8b..b449d9af0 100644 --- a/src/ragas/integrations/langchain.py +++ b/src/ragas/integrations/langchain.py @@ -2,7 +2,6 @@ import typing as t -from langchain.callbacks.manager import CallbackManagerForChainRun from langchain.chains.base import Chain from langchain.schema import RUN_KEY from langsmith.evaluation import EvaluationResult, RunEvaluator @@ -21,7 +20,11 @@ from ragas.embeddings import LangchainEmbeddingsWrapper from ragas.run_config import RunConfig -__all__ = ["EvaluatorChain"] +if t.TYPE_CHECKING: + from langchain.callbacks.manager import ( + CallbackManagerForChainRun, + AsyncCallbackManagerForChainRun, + ) class EvaluatorChain(Chain, RunEvaluator): @@ -50,10 +53,34 @@ def __init__(self, metric: Metric, **kwargs: t.Any): @property def input_keys(self) -> list[str]: - keys = ["question", "answer"] - if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]: + keys = [] + if self.metric.evaluation_mode in [ + EvaluationMode.qac, + EvaluationMode.qa, + EvaluationMode.qc, + EvaluationMode.qga, + EvaluationMode.qcg, + ]: + keys += ["question"] + if self.metric.evaluation_mode in [ + EvaluationMode.qac, + EvaluationMode.qa, + EvaluationMode.ga, + EvaluationMode.qga, + ]: + keys += ["answer"] + if self.metric.evaluation_mode in [ + EvaluationMode.qac, + EvaluationMode.gc, + EvaluationMode.gc, + EvaluationMode.qcg, + ]: keys += ["contexts"] - if self.metric.evaluation_mode in [EvaluationMode.gc]: + if self.metric.evaluation_mode in [ + EvaluationMode.gc, + EvaluationMode.qga, + EvaluationMode.qcg, + ]: keys += ["ground_truth"] return keys @@ -88,6 +115,34 @@ def _call( ) return {self.metric.name: score} + async def _acall( + self, + inputs: t.Dict[str, t.Any], + run_manager: t.Optional[AsyncCallbackManagerForChainRun] = None, + ) -> t.Dict[str, t.Any]: + """ + Call the evaluation chain. 
+ """ + self._validate(inputs) + _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager() + # TODO: currently AsyncCallbacks are not supported in ragas + callbacks = _run_manager.get_child() + + c = inputs.get("contexts", [""]) + g = inputs.get("ground_truth", "") + q = inputs.get("question", "") + a = inputs.get("answer", "") + score = await self.metric.ascore( + { + "question": q, + "answer": a, + "contexts": c, + "ground_truth": g, + }, + callbacks=[], + ) + return {self.metric.name: score} + def _validate( self, input: dict[str, t.Any], From 844cf05a00eea0c9842afa59889e2d1b428535d1 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Mon, 11 Mar 2024 20:16:14 -0700 Subject: [PATCH 3/7] langmsmith fixes --- src/ragas/integrations/__init__.py | 0 src/ragas/integrations/langchain.py | 37 ++++--------------- src/ragas/integrations/langsmith.py | 55 +++++++++++++++++++++++++++++ src/ragas/metrics/base.py | 21 +++++++++++ 4 files changed, 82 insertions(+), 31 deletions(-) create mode 100644 src/ragas/integrations/__init__.py create mode 100644 src/ragas/integrations/langsmith.py diff --git a/src/ragas/integrations/__init__.py b/src/ragas/integrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py index b449d9af0..ae8add433 100644 --- a/src/ragas/integrations/langchain.py +++ b/src/ragas/integrations/langchain.py @@ -14,6 +14,7 @@ Metric, MetricWithLLM, MetricWithEmbeddings, + get_required_columns, ) from ragas.validation import EVALMODE_TO_COLUMNS from ragas.llms import LangchainLLMWrapper @@ -53,36 +54,7 @@ def __init__(self, metric: Metric, **kwargs: t.Any): @property def input_keys(self) -> list[str]: - keys = [] - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.qa, - EvaluationMode.qc, - EvaluationMode.qga, - EvaluationMode.qcg, - ]: - keys += ["question"] - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.qa, - EvaluationMode.ga, - EvaluationMode.qga, - ]: - keys += ["answer"] - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.gc, - EvaluationMode.gc, - EvaluationMode.qcg, - ]: - keys += ["contexts"] - if self.metric.evaluation_mode in [ - EvaluationMode.gc, - EvaluationMode.qga, - EvaluationMode.qcg, - ]: - keys += ["ground_truth"] - return keys + return get_required_columns(self.metric.evaluation_mode) @property def output_keys(self) -> list[str]: @@ -198,7 +170,10 @@ def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> No assert ( run.outputs is not None ), "the current run has no outputs. The chain should output 'answer' and 'contexts' keys." - missing_keys = self._keys_are_present(["answer", "contexts"], run.outputs) + output_keys = get_required_columns( + self.metric.evaluation_mode, ["question", "ground_truth"] + ) + missing_keys = self._keys_are_present(output_keys, run.outputs) if missing_keys: raise ValueError( "Expected 'answer' and 'contexts' in run.outputs." 
diff --git a/src/ragas/integrations/langsmith.py b/src/ragas/integrations/langsmith.py new file mode 100644 index 000000000..edea4ebb0 --- /dev/null +++ b/src/ragas/integrations/langsmith.py @@ -0,0 +1,55 @@ +import typing as t + +from langchain.smith import RunEvalConfig +from langsmith import Client +from langsmith.utils import LangSmithNotFoundError + + +from ragas.integrations.langchain import EvaluatorChain + + +def evaluate( + dataset_name: str, + llm_or_chain_factory: t.Any, + run_name: str = "", + metrics: t.Optional[list] = None, + verbose: bool = False, +) -> t.Dict[str, t.Any]: + # get sensible run name + if not run_name: + run_name = llm_or_chain_factory.get_name() + # init client and validate dataset + client = Client() + try: + _ = client.read_dataset(dataset_name=dataset_name) + except LangSmithNotFoundError: + raise ValueError( + f"Dataset {dataset_name} not found in langsmith, make sure it exists in langsmith" + ) + + # make config + if metrics is None: + from ragas.metrics import ( + answer_relevancy, + context_precision, + context_recall, + faithfulness, + ) + + metrics = [answer_relevancy, context_precision, faithfulness, context_recall] + + metrics = [EvaluatorChain(m) for m in metrics] + eval_config = RunEvalConfig( + custom_evaluators=metrics, + ) + + # run evaluation with langsmith + run = client.run_on_dataset( + dataset_name=dataset_name, + llm_or_chain_factory=llm_or_chain_factory, + evaluation=eval_config, + verbose=verbose, + # Any experiment metadata can be specified here + ) + + return run diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index e0ed3fb7a..f4d05bf9e 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -25,6 +25,27 @@ EvaluationMode = Enum("EvaluationMode", "qac qa qc gc ga qga qcg") +def get_required_columns( + eval_mod: EvaluationMode, ignore_columns: t.List[str] = [] +) -> t.List[str]: + if eval_mod == EvaluationMode.qac: + keys = ["question", "answer", "contexts"] + elif eval_mod == EvaluationMode.qa: + keys = ["question", "answer"] + elif eval_mod == EvaluationMode.qc: + keys = ["question", "contexts"] + elif eval_mod == EvaluationMode.gc: + keys = ["contexts", "ground_truth"] + elif eval_mod == EvaluationMode.ga: + keys = ["answer", "ground_truth"] + elif eval_mod == EvaluationMode.qga: + keys = ["question", "contexts", "answer", "ground_truth"] + elif eval_mod == EvaluationMode.qcg: + keys = ["question", "contexts", "ground_truth"] + + return [k for k in keys if k not in ignore_columns] + + @dataclass class Metric(ABC): @property From 4871ad451908a6da8a91ba2dc56e81d5b5a76627 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 20 Mar 2024 07:49:19 -0700 Subject: [PATCH 4/7] upload dataset --- src/ragas/integrations/langsmith.py | 138 ++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 7 deletions(-) diff --git a/src/ragas/integrations/langsmith.py b/src/ragas/integrations/langsmith.py index edea4ebb0..933ebeb41 100644 --- a/src/ragas/integrations/langsmith.py +++ b/src/ragas/integrations/langsmith.py @@ -1,23 +1,146 @@ +from __future__ import annotations + import typing as t from langchain.smith import RunEvalConfig -from langsmith import Client -from langsmith.utils import LangSmithNotFoundError - from ragas.integrations.langchain import EvaluatorChain +if t.TYPE_CHECKING: + from ragas.testset.generator import TestDataset + from langsmith.schemas import Dataset as LangsmithDataset + +try: + from langsmith import Client + from langsmith.utils import LangSmithNotFoundError +except 
ImportError: + raise ImportError( + "Please install langsmith to use this feature. You can install it via pip install langsmith" + ) + + +def upload_dataset( + dataset: TestDataset, dataset_name: str, dataset_desc: str = "" +) -> LangsmithDataset: + """ + Uploads a new dataset to LangSmith, converting it from a TestDataset object to a + pandas DataFrame before upload. If a dataset with the specified name already + exists, the function raises an error. + + Parameters + ---------- + dataset : TestDataset + The dataset to be uploaded. + dataset_name : str + The name for the new dataset in LangSmith. + dataset_desc : str, optional + A description for the new dataset. The default is an empty string. + + Returns + ------- + LangsmithDataset + The dataset object as stored in LangSmith after upload. + + Raises + ------ + ValueError + If a dataset with the specified name already exists in LangSmith. + + Notes + ----- + The function attempts to read a dataset by the given name to check its existence. + If not found, it proceeds to upload the dataset after converting it to a pandas + DataFrame. This involves specifying input and output keys for the dataset being + uploaded. + """ + client = Client() + try: + # check if dataset exists + dataset = client.read_dataset(dataset_name=dataset_name) + raise ValueError( + f"Dataset {dataset_name} already exists in langsmith. [{dataset}]" + ) + except LangSmithNotFoundError: + # if not create a new one with the generated query examples + dataset = client.upload_dataframe( + df=dataset.to_pandas(), + name=dataset_name, + input_keys=["question"], + output_keys=["ground_truth"], + description=dataset_desc, + ) + + print( + f"Created a new dataset '{dataset.name}'. Dataset is accessible at {dataset.url}" + ) + return dataset + def evaluate( dataset_name: str, llm_or_chain_factory: t.Any, - run_name: str = "", + experiment_name: t.Optional[str] = None, metrics: t.Optional[list] = None, verbose: bool = False, ) -> t.Dict[str, t.Any]: - # get sensible run name - if not run_name: - run_name = llm_or_chain_factory.get_name() + """ + Evaluates a language model or a chain factory on a specified dataset using + LangSmith, with the option to customize metrics and verbosity. + + Parameters + ---------- + dataset_name : str + The name of the dataset to use for evaluation. This dataset must exist in + LangSmith. + llm_or_chain_factory : Any + The language model or chain factory to be evaluated. This parameter is + flexible and can accept a variety of objects depending on the implementation. + experiment_name : Optional[str], optional + The name of the experiment. This can be used to categorize or identify the + evaluation run within LangSmith. The default is None. + metrics : Optional[list], optional + A list of custom metrics (functions or evaluators) to be used for the + evaluation. If None, a default set of metrics (answer relevancy, context + precision, context recall, and faithfulness) are used. + The default is None. + verbose : bool, optional + If True, detailed progress and results will be printed during the evaluation + process. + The default is False. + + Returns + ------- + Dict[str, Any] + A dictionary containing the results of the evaluation. + + Raises + ------ + ValueError + If the specified dataset does not exist in LangSmith. + + See Also + -------- + Client.read_dataset : Method to read an existing dataset. + Client.run_on_dataset : Method to run the evaluation on the specified dataset. + + Examples + -------- + >>> results = evaluate( + ... 
dataset_name="MyDataset", + ... llm_or_chain_factory=my_llm, + ... experiment_name="experiment_1_with_vanila_rag", + ... verbose=True + ... ) + >>> print(results) + {'evaluation_result': ...} + + Notes + ----- + The function initializes a client to interact with LangSmith, validates the existence + of the specified dataset, prepares evaluation metrics, and runs the evaluation, + returning the results. Custom evaluation metrics can be specified, or a default set + will be used if none are provided. + """ # init client and validate dataset client = Client() try: @@ -50,6 +173,7 @@ def evaluate( evaluation=eval_config, verbose=verbose, # Any experiment metadata can be specified here + project_name=experiment_name, ) return run From 5b42ed51a63f9c91f82fee5c32e2ec9aaf4f3791 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 28 Mar 2024 08:43:24 -0700 Subject: [PATCH 5/7] fix linting and fmt --- docs/alfred.py | 11 ++++++----- src/ragas/embeddings/base.py | 4 +--- src/ragas/evaluation.py | 2 +- src/ragas/executor.py | 13 ++++++++----- src/ragas/integrations/langchain.py | 16 ++++++++-------- src/ragas/integrations/langsmith.py | 3 ++- src/ragas/llms/base.py | 7 ++----- src/ragas/testset/docstore.py | 2 +- src/ragas/testset/evolutions.py | 12 ++---------- src/ragas/testset/generator.py | 6 +++--- 10 files changed, 34 insertions(+), 42 deletions(-) diff --git a/docs/alfred.py b/docs/alfred.py index da966d2ab..c7f50e38d 100644 --- a/docs/alfred.py +++ b/docs/alfred.py @@ -1,14 +1,15 @@ from __future__ import annotations -import os -from collections import namedtuple import argparse import asyncio -from tqdm.asyncio import tqdm +import os import typing as t -from langchain_openai.chat_models import ChatOpenAI -from langchain_core.language_models.chat_models import BaseChatModel +from collections import namedtuple + from langchain.prompts import ChatPromptTemplate +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_openai.chat_models import ChatOpenAI +from tqdm.asyncio import tqdm File = namedtuple("File", "name content") diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index cd6e83d9c..38d5efd72 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -44,9 +44,7 @@ def set_run_config(self, run_config: RunConfig): class LangchainEmbeddingsWrapper(BaseRagasEmbeddings): def __init__( - self, - embeddings: Embeddings, - run_config: t.Optional[RunConfig] = None + self, embeddings: Embeddings, run_config: t.Optional[RunConfig] = None ): self.embeddings = embeddings if run_config is None: diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 9529c9984..37f2d2486 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -15,9 +15,9 @@ LangchainEmbeddingsWrapper, embedding_factory, ) -from ragas.llms import llm_factory from ragas.exceptions import ExceptionInRunner from ragas.executor import Executor +from ragas.llms import llm_factory from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper from ragas.metrics._answer_correctness import AnswerCorrectness from ragas.metrics.base import Metric, MetricWithEmbeddings, MetricWithLLM diff --git a/src/ragas/executor.py b/src/ragas/executor.py index 3d034a967..70c6e0a00 100644 --- a/src/ragas/executor.py +++ b/src/ragas/executor.py @@ -1,8 +1,8 @@ from __future__ import annotations -import sys import asyncio import logging +import sys import threading import typing as t from dataclasses import dataclass, field @@ -24,20 +24,23 @@ def 
runner_exception_hook(args: threading.ExceptHookArgs): # set a custom exception hook # threading.excepthook = runner_exception_hook + def as_completed(loop, coros, max_workers): loop_arg_dict = {"loop": loop} if sys.version_info[:2] < (3, 10) else {} if max_workers == -1: return asyncio.as_completed(coros, **loop_arg_dict) - + # loop argument is removed since Python 3.10 semaphore = asyncio.Semaphore(max_workers, **loop_arg_dict) + async def sema_coro(coro): async with semaphore: return await coro - + sema_coros = [sema_coro(c) for c in coros] return asyncio.as_completed(sema_coros, **loop_arg_dict) + class Runner(threading.Thread): def __init__( self, @@ -45,7 +48,7 @@ def __init__( desc: str, keep_progress_bar: bool = True, raise_exceptions: bool = True, - run_config: t.Optional[RunConfig] = None + run_config: t.Optional[RunConfig] = None, ): super().__init__() self.jobs = jobs @@ -59,7 +62,7 @@ def __init__( self.futures = as_completed( loop=self.loop, coros=[coro for coro, _ in self.jobs], - max_workers=self.run_config.max_workers + max_workers=self.run_config.max_workers, ) async def _aresults(self) -> t.List[t.Any]: diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py index ae8add433..064c9a7f5 100644 --- a/src/ragas/integrations/langchain.py +++ b/src/ragas/integrations/langchain.py @@ -4,27 +4,27 @@ from langchain.chains.base import Chain from langchain.schema import RUN_KEY -from langsmith.evaluation import EvaluationResult, RunEvaluator -from langsmith.schemas import Example, Run from langchain_openai.chat_models import ChatOpenAI from langchain_openai.embeddings import OpenAIEmbeddings +from langsmith.evaluation import EvaluationResult, RunEvaluator +from langsmith.schemas import Example, Run +from ragas.embeddings import LangchainEmbeddingsWrapper +from ragas.llms import LangchainLLMWrapper from ragas.metrics.base import ( EvaluationMode, Metric, - MetricWithLLM, MetricWithEmbeddings, + MetricWithLLM, get_required_columns, ) -from ragas.validation import EVALMODE_TO_COLUMNS -from ragas.llms import LangchainLLMWrapper -from ragas.embeddings import LangchainEmbeddingsWrapper from ragas.run_config import RunConfig +from ragas.validation import EVALMODE_TO_COLUMNS if t.TYPE_CHECKING: from langchain.callbacks.manager import ( - CallbackManagerForChainRun, AsyncCallbackManagerForChainRun, + CallbackManagerForChainRun, ) @@ -98,7 +98,7 @@ async def _acall( self._validate(inputs) _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager() # TODO: currently AsyncCallbacks are not supported in ragas - callbacks = _run_manager.get_child() + _run_manager.get_child() c = inputs.get("contexts", [""]) g = inputs.get("ground_truth", "") diff --git a/src/ragas/integrations/langsmith.py b/src/ragas/integrations/langsmith.py index 933ebeb41..32762878e 100644 --- a/src/ragas/integrations/langsmith.py +++ b/src/ragas/integrations/langsmith.py @@ -7,9 +7,10 @@ from ragas.integrations.langchain import EvaluatorChain if t.TYPE_CHECKING: - from ragas.testset.generator import TestDataset from langsmith.schemas import Dataset as LangsmithDataset + from ragas.testset.generator import TestDataset + try: from langsmith import Client from langsmith.utils import LangSmithNotFoundError diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index f2e0d7820..5979d336b 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -119,9 +119,7 @@ class LangchainLLMWrapper(BaseRagasLLM): """ def __init__( - self, - langchain_llm: BaseLanguageModel, 
- run_config: t.Optional[RunConfig] = None + self, langchain_llm: BaseLanguageModel, run_config: t.Optional[RunConfig] = None ): self.langchain_llm = langchain_llm if run_config is None: @@ -206,8 +204,7 @@ def set_run_config(self, run_config: RunConfig): def llm_factory( - model: str = "gpt-3.5-turbo-16k", - run_config: t.Optional[RunConfig] = None + model: str = "gpt-3.5-turbo-16k", run_config: t.Optional[RunConfig] = None ) -> BaseRagasLLM: timeout = None if run_config is not None: diff --git a/src/ragas/testset/docstore.py b/src/ragas/testset/docstore.py index 7ab3b3916..456ec6c40 100644 --- a/src/ragas/testset/docstore.py +++ b/src/ragas/testset/docstore.py @@ -78,7 +78,7 @@ class Direction(str, Enum): PREV = "prev" UP = "up" DOWN = "down" - + class Node(Document): keyphrases: t.List[str] = Field(default_factory=list, repr=False) diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py index 8167b3bb2..61e22b465 100644 --- a/src/ragas/testset/evolutions.py +++ b/src/ragas/testset/evolutions.py @@ -88,11 +88,7 @@ def merge_nodes(nodes: CurrentNodes) -> Node: new_node.embedding = np.average(node_embeddings, axis=0) return new_node - def init( - self, - is_async: bool = True, - run_config: t.Optional[RunConfig] = None - ): + def init(self, is_async: bool = True, run_config: t.Optional[RunConfig] = None): self.is_async = is_async if run_config is None: run_config = RunConfig() @@ -339,11 +335,7 @@ class ComplexEvolution(Evolution): default_factory=lambda: compress_question_prompt ) - def init( - self, - is_async: bool = True, - run_config: t.Optional[RunConfig] = None - ): + def init(self, is_async: bool = True, run_config: t.Optional[RunConfig] = None): if run_config is None: run_config = RunConfig() super().init(is_async=is_async, run_config=run_config) diff --git a/src/ragas/testset/generator.py b/src/ragas/testset/generator.py index 4febcc5e1..ebf7648a1 100644 --- a/src/ragas/testset/generator.py +++ b/src/ragas/testset/generator.py @@ -120,7 +120,7 @@ def generate_with_llamaindex_docs( with_debugging_logs=False, is_async: bool = True, raise_exceptions: bool = True, - run_config: t.Optional[RunConfig] = None + run_config: t.Optional[RunConfig] = None, ): # chunk documents and add to docstore self.docstore.add_documents( @@ -146,7 +146,7 @@ def generate_with_langchain_docs( with_debugging_logs=False, is_async: bool = True, raise_exceptions: bool = True, - run_config: t.Optional[RunConfig] = None + run_config: t.Optional[RunConfig] = None, ): # chunk documents and add to docstore self.docstore.add_documents( @@ -184,7 +184,7 @@ def generate( with_debugging_logs=False, is_async: bool = True, raise_exceptions: bool = True, - run_config: t.Optional[RunConfig] = None + run_config: t.Optional[RunConfig] = None, ): # validate distributions if not check_if_sum_is_close(list(distributions.values()), 1.0, 3): From 3eb6ae66130a948d7da5dbd9df15cff850d805eb Mon Sep 17 00:00:00 2001 From: Jithin James Date: Thu, 28 Mar 2024 08:50:01 -0700 Subject: [PATCH 6/7] Update src/ragas/metrics/base.py Co-authored-by: Massimiliano Pronesti --- src/ragas/metrics/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index f4d05bf9e..dd02204bf 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -26,7 +26,7 @@ def get_required_columns( - eval_mod: EvaluationMode, ignore_columns: t.List[str] = [] + eval_mod: EvaluationMode, ignore_columns: t.Oprional[t.List[str]] = None ) -> t.List[str]: if eval_mod 
== EvaluationMode.qac: keys = ["question", "answer", "contexts"] From aae083c419badee6322bf9eaa88f0d731e48413a Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 28 Mar 2024 08:56:53 -0700 Subject: [PATCH 7/7] complete change --- src/ragas/metrics/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index dd02204bf..d7cd39aac 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -4,6 +4,7 @@ C - contexts: context used for generation G - ground_truth: ground truth answer """ + from __future__ import annotations import asyncio @@ -26,7 +27,7 @@ def get_required_columns( - eval_mod: EvaluationMode, ignore_columns: t.Oprional[t.List[str]] = None + eval_mod: EvaluationMode, ignore_columns: t.Optional[t.List[str]] = None ) -> t.List[str]: if eval_mod == EvaluationMode.qac: keys = ["question", "answer", "contexts"] @@ -42,6 +43,7 @@ def get_required_columns( keys = ["question", "contexts", "answer", "ground_truth"] elif eval_mod == EvaluationMode.qcg: keys = ["question", "contexts", "ground_truth"] + ignore_columns = ignore_columns or [] return [k for k in keys if k not in ignore_columns]
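Usage sketch for the EvaluatorChain introduced in PATCH 1/7 and 2/7 (illustrative only, not part of the diffs above): it wraps a ragas Metric so it can be invoked like a LangChain Chain or used as a LangSmith RunEvaluator. The sample row below is made up, and an OpenAI key is assumed because the chain falls back to ChatOpenAI / OpenAIEmbeddings when no llm or embeddings kwargs are given.

# Sketch only: wrap a ragas metric as a LangChain chain.
# Assumes OPENAI_API_KEY is set, since EvaluatorChain defaults to
# ChatOpenAI / OpenAIEmbeddings when no llm/embeddings kwargs are passed.
from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import faithfulness

faithfulness_chain = EvaluatorChain(metric=faithfulness)

# Required input keys follow the metric's evaluation mode
# (question/answer/contexts[/ground_truth]); the values here are invented.
row = {
    "question": "Where is the Eiffel Tower?",
    "answer": "The Eiffel Tower is in Paris.",
    "contexts": ["The Eiffel Tower is located in Paris, France."],
    "ground_truth": "Paris",
}

result = faithfulness_chain.invoke(row)
print(result)  # the output dict carries a "faithfulness" score keyed by the metric name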
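Usage sketch for the LangSmith helpers introduced in PATCH 3/7 and 4/7 (illustrative only): upload_dataset pushes a generated TestDataset to LangSmith with "question" as the input key and "ground_truth" as the output key, and evaluate runs a chain factory over that dataset with ragas metrics wrapped as EvaluatorChain evaluators. The testset variable, the dataset name, and the build_rag_chain factory are placeholders, and LangSmith plus OpenAI credentials are assumed to be configured in the environment.

# Sketch only: end-to-end flow with the helpers added in
# src/ragas/integrations/langsmith.py.
# `testset` (a ragas TestDataset) and `build_rag_chain` are hypothetical
# placeholders; LangSmith and OpenAI credentials are assumed to be set.
from ragas.integrations.langsmith import evaluate, upload_dataset
from ragas.metrics import answer_relevancy, faithfulness

# One-time: push the generated test set to LangSmith as a dataset.
upload_dataset(
    testset,
    dataset_name="basic_rag_eval",
    dataset_desc="ragas generated test set",
)

# Run the chain over the dataset and score it with ragas metrics; when
# metrics is omitted, the defaults are answer_relevancy, context_precision,
# faithfulness and context_recall.
results = evaluate(
    dataset_name="basic_rag_eval",
    llm_or_chain_factory=build_rag_chain,  # factory for a chain that outputs "answer" and "contexts"
    experiment_name="experiment_1_with_vanilla_rag",
    metrics=[faithfulness, answer_relevancy],
    verbose=True,
)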