11 changes: 6 additions & 5 deletions docs/alfred.py
@@ -1,14 +1,15 @@
from __future__ import annotations

import os
from collections import namedtuple
import argparse
import asyncio
from tqdm.asyncio import tqdm
import os
import typing as t
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.language_models.chat_models import BaseChatModel
from collections import namedtuple

from langchain.prompts import ChatPromptTemplate
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_openai.chat_models import ChatOpenAI
from tqdm.asyncio import tqdm

File = namedtuple("File", "name content")

4 changes: 1 addition & 3 deletions src/ragas/embeddings/base.py
@@ -44,9 +44,7 @@ def set_run_config(self, run_config: RunConfig):

class LangchainEmbeddingsWrapper(BaseRagasEmbeddings):
def __init__(
self,
embeddings: Embeddings,
run_config: t.Optional[RunConfig] = None
self, embeddings: Embeddings, run_config: t.Optional[RunConfig] = None
):
self.embeddings = embeddings
if run_config is None:
2 changes: 1 addition & 1 deletion src/ragas/evaluation.py
@@ -15,9 +15,9 @@
LangchainEmbeddingsWrapper,
embedding_factory,
)
from ragas.llms import llm_factory
from ragas.exceptions import ExceptionInRunner
from ragas.executor import Executor
from ragas.llms import llm_factory
from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper
from ragas.metrics._answer_correctness import AnswerCorrectness
from ragas.metrics.base import Metric, MetricWithEmbeddings, MetricWithLLM
13 changes: 8 additions & 5 deletions src/ragas/executor.py
@@ -1,8 +1,8 @@
from __future__ import annotations
import sys

import asyncio
import logging
import sys
import threading
import typing as t
from dataclasses import dataclass, field
@@ -24,28 +24,31 @@ def runner_exception_hook(args: threading.ExceptHookArgs):
# set a custom exception hook
# threading.excepthook = runner_exception_hook


def as_completed(loop, coros, max_workers):
loop_arg_dict = {"loop": loop} if sys.version_info[:2] < (3, 10) else {}
if max_workers == -1:
return asyncio.as_completed(coros, **loop_arg_dict)

# the loop argument was removed in Python 3.10
semaphore = asyncio.Semaphore(max_workers, **loop_arg_dict)

async def sema_coro(coro):
async with semaphore:
return await coro

sema_coros = [sema_coro(c) for c in coros]
return asyncio.as_completed(sema_coros, **loop_arg_dict)


class Runner(threading.Thread):
def __init__(
self,
jobs: t.List[t.Tuple[t.Coroutine, str]],
desc: str,
keep_progress_bar: bool = True,
raise_exceptions: bool = True,
run_config: t.Optional[RunConfig] = None
run_config: t.Optional[RunConfig] = None,
):
super().__init__()
self.jobs = jobs
@@ -59,7 +62,7 @@ def __init__(
self.futures = as_completed(
loop=self.loop,
coros=[coro for coro, _ in self.jobs],
max_workers=self.run_config.max_workers
max_workers=self.run_config.max_workers,
)

async def _aresults(self) -> t.List[t.Any]:
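Aside for readers unfamiliar with the pattern used in as_completed above: asyncio.as_completed has no built-in worker limit, so the PR bounds concurrency by wrapping every coroutine in an asyncio.Semaphore guard. A minimal, self-contained sketch of the same technique (illustrative only; the demo and job names are hypothetical and not part of the PR):

import asyncio

async def job(i: int) -> int:
    # stand-in for a real unit of work
    await asyncio.sleep(0.1)
    return i

async def demo(max_workers: int = 2) -> None:
    semaphore = asyncio.Semaphore(max_workers)

    async def sema_coro(coro):
        # acquire the semaphore before awaiting, so at most
        # `max_workers` wrapped coroutines run at any one time
        async with semaphore:
            return await coro

    sema_coros = [sema_coro(job(i)) for i in range(5)]
    for fut in asyncio.as_completed(sema_coros):
        print(await fut)  # results arrive in completion order

asyncio.run(demo())

On Python versions before 3.10 an explicit loop keyword is still accepted by these asyncio primitives; on 3.10+ it is gone, which is what the loop_arg_dict branch in the PR handles.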
Empty file.
216 changes: 216 additions & 0 deletions src/ragas/integrations/langchain.py
@@ -0,0 +1,216 @@
from __future__ import annotations

import typing as t

from langchain.chains.base import Chain
from langchain.schema import RUN_KEY
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics.base import (
EvaluationMode,
Metric,
MetricWithEmbeddings,
MetricWithLLM,
get_required_columns,
)
from ragas.run_config import RunConfig
from ragas.validation import EVALMODE_TO_COLUMNS

if t.TYPE_CHECKING:
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
)


class EvaluatorChain(Chain, RunEvaluator):
"""
Wrapper around ragas Metrics to use them with langsmith.
"""

metric: Metric

def __init__(self, metric: Metric, **kwargs: t.Any):
kwargs["metric"] = metric
super().__init__(**kwargs)
if "run_config" in kwargs:
run_config = kwargs["run_config"]
else:
run_config = RunConfig()
if isinstance(self.metric, MetricWithLLM):
llm = kwargs.get("llm", ChatOpenAI())
t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm)
if isinstance(self.metric, MetricWithEmbeddings):
embeddings = kwargs.get("embeddings", OpenAIEmbeddings())
t.cast(
MetricWithEmbeddings, self.metric
).embeddings = LangchainEmbeddingsWrapper(embeddings)
self.metric.init(run_config)

@property
def input_keys(self) -> list[str]:
return get_required_columns(self.metric.evaluation_mode)

@property
def output_keys(self) -> list[str]:
return [self.metric.name]

def _call(
self,
inputs: dict[str, t.Any],
run_manager: t.Optional[CallbackManagerForChainRun] = None,
) -> dict[str, t.Any]:
"""
Call the evaluation chain.
"""
self._validate(inputs)
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()

c = inputs.get("contexts", [""])
g = inputs.get("ground_truth", "")
q = inputs.get("question", "")
a = inputs.get("answer", "")
score = self.metric.score(
{
"question": q,
"answer": a,
"contexts": c,
"ground_truth": g,
},
callbacks=callbacks,
)
return {self.metric.name: score}

async def _acall(
self,
inputs: t.Dict[str, t.Any],
run_manager: t.Optional[AsyncCallbackManagerForChainRun] = None,
) -> t.Dict[str, t.Any]:
"""
Call the evaluation chain.
"""
self._validate(inputs)
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
# TODO: currently AsyncCallbacks are not supported in ragas
_run_manager.get_child()

c = inputs.get("contexts", [""])
g = inputs.get("ground_truth", "")
q = inputs.get("question", "")
a = inputs.get("answer", "")
score = await self.metric.ascore(
{
"question": q,
"answer": a,
"contexts": c,
"ground_truth": g,
},
callbacks=[],
)
return {self.metric.name: score}

def _validate(
self,
input: dict[str, t.Any],
question_key: str = "question",
prediction_key: str = "answer",
context_key: str = "contexts",
) -> None:
# validate each example
required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode]
if "question" in required_columns and question_key not in input:
raise ValueError(
f'"{question_key}" is required in each example'
f"for the metric[{self.metric.name}] you have chosen."
)
if "answer" in required_columns and prediction_key not in input:
raise ValueError(
f'"{prediction_key}" is required in each prediction'
f"for the metric[{self.metric.name}] you have chosen."
)
if "contexts" in required_columns and context_key not in input:
raise ValueError(
f'"{context_key}" is required in each prediction for the '
f"metric[{self.metric.name}] you have chosen."
)
if "ground_truth" in required_columns and "ground_truth" not in input:
raise ValueError(
f'"ground_truth" is required in each prediction for the '
f"metric[{self.metric.name}] you have chosen."
)

@staticmethod
def _keys_are_present(keys_to_check: list, dict_to_check: dict) -> list[str]:
return [k for k in keys_to_check if k not in dict_to_check]

def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None:
if example is None:
raise ValueError(
"expected example to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
)
if example.inputs is None:
raise ValueError(
"expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
)
if example.outputs is None:
raise ValueError(
"expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
)
if "question" not in example.inputs or "ground_truth" not in example.outputs:
raise ValueError(
"Expected 'question' and 'ground_truth' in example."
f"Got: {[k for k in example.inputs.keys()]}"
)
assert (
run.outputs is not None
), "the current run has no outputs. The chain should output 'answer' and 'contexts' keys."
output_keys = get_required_columns(
self.metric.evaluation_mode, ["question", "ground_truth"]
)
missing_keys = self._keys_are_present(output_keys, run.outputs)
if missing_keys:
raise ValueError(
"Expected 'answer' and 'contexts' in run.outputs."
f"Got: {[k for k in run.outputs.keys()]}"
)

def evaluate_run(
self, run: Run, example: t.Optional[Example] = None
) -> EvaluationResult:
"""
Evaluate a langsmith run
"""
self._validate_langsmith_eval(run, example)

# this is just to suppress the type checker error
# actual check and error message is in the _validate_langsmith_eval
assert run.outputs is not None
assert example is not None
assert example.inputs is not None
assert example.outputs is not None

chain_eval = run.outputs
chain_eval["question"] = example.inputs["question"]
if self.metric.evaluation_mode in [
EvaluationMode.gc,
EvaluationMode.ga,
EvaluationMode.qcg,
EvaluationMode.qga,
]:
if example.outputs is None or "ground_truth" not in example.outputs:
raise ValueError("expected `ground_truth` in example outputs.")
chain_eval["ground_truth"] = example.outputs["ground_truth"]
eval_output = self(chain_eval, include_run_info=True)

evaluation_result = EvaluationResult(
key=self.metric.name, score=eval_output[self.metric.name]
)
if RUN_KEY in eval_output:
evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY]
return evaluation_result
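For context, the new chain can be driven directly like any other LangChain chain; its input keys come from the wrapped metric's evaluation mode. A minimal usage sketch, assuming OPENAI_API_KEY is set and using the faithfulness metric shipped with ragas (the question/answer/context strings are made-up illustration data):

from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import faithfulness

# wrap a ragas metric; without explicit llm/embeddings kwargs the constructor
# falls back to ChatOpenAI() and OpenAIEmbeddings(), as shown in __init__ above
faithfulness_chain = EvaluatorChain(metric=faithfulness)

score = faithfulness_chain({
    "question": "Which city is the capital of France?",
    "answer": "Paris is the capital of France.",
    "contexts": ["Paris is the capital and most populous city of France."],
})
print(score[faithfulness.name])  # output is keyed by the metric's name

Because EvaluatorChain also implements RunEvaluator, the same object can be handed to langsmith's dataset evaluation tooling: evaluate_run pulls 'question' and 'ground_truth' from the dataset example and expects 'answer' and 'contexts' in the traced run's outputs.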