34 changes: 32 additions & 2 deletions src/ragas/embeddings/base.py
@@ -1,17 +1,47 @@
from __future__ import annotations

import asyncio
import typing as t
from abc import ABC
from dataclasses import field
from typing import List

import numpy as np
from langchain_core.embeddings import Embeddings as BaseRagasEmbeddings
from langchain_core.embeddings import Embeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from pydantic.dataclasses import dataclass

DEFAULT_MODEL_NAME = "BAAI/bge-small-en-v1.5"


class BaseRagasEmbeddings(Embeddings, ABC):
async def embed_texts(
self, texts: List[str], is_async: bool = True
) -> t.List[t.List[float]]:
if is_async:
return await self.aembed_documents(texts)
else:
loop = asyncio.get_event_loop()
return await loop.run_in_executor(None, self.embed_documents, texts)


class LangchainEmbeddingsWrapper(BaseRagasEmbeddings):
def __init__(self, embeddings: Embeddings):
self.embeddings = embeddings

def embed_query(self, text: str) -> List[float]:
return self.embeddings.embed_query(text)

def embed_documents(self, texts: List[str]) -> List[List[float]]:
return self.embeddings.embed_documents(texts)

async def aembed_query(self, text: str) -> List[float]:
return await self.embeddings.aembed_query(text)

async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
return await self.embeddings.aembed_documents(texts)


@dataclass
class HuggingfaceEmbeddings(BaseRagasEmbeddings):
model_name: str = DEFAULT_MODEL_NAME
@@ -89,4 +119,4 @@ def predict(self, texts: List[List[str]]) -> List[List[float]]:

def embedding_factory() -> BaseRagasEmbeddings:
openai_embeddings = OpenAIEmbeddings()
return openai_embeddings
return LangchainEmbeddingsWrapper(openai_embeddings)
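
A minimal usage sketch of the new unified entry point (not part of the diff; assumes an OpenAI API key is configured): embed_texts awaits the native async path when is_async=True and otherwise offloads the blocking embed_documents call to the default executor.

import asyncio
from langchain_openai.embeddings import OpenAIEmbeddings
from ragas.embeddings.base import LangchainEmbeddingsWrapper

embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

async def main():
    # is_async=True awaits aembed_documents; is_async=False pushes the
    # blocking embed_documents call onto the default thread-pool executor.
    vectors = await embeddings.embed_texts(["hello", "world"], is_async=False)
    print(len(vectors), len(vectors[0]))

asyncio.run(main())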
19 changes: 8 additions & 11 deletions src/ragas/evaluation.py
@@ -156,7 +156,7 @@ def evaluate(
executor = Executor(
desc="Evaluating",
keep_progress_bar=True,
is_async=is_async,
is_async=True,
max_workers=max_workers,
raise_exceptions=raise_exceptions,
)
@@ -174,21 +174,18 @@
is_async=is_async,
)
row_run_managers.append((row_rm, row_group_cm))

if is_async:
[
executor.submit(
metric.ascore, row, row_group_cm, name=f"{metric.name}-{i}"
)
for metric in metrics
]
else:
[executor.submit(metric.score, row, row_group_cm) for metric in metrics]
[
executor.submit(
metric.ascore, row, row_group_cm, is_async, name=f"{metric.name}-{i}"
)
for metric in metrics
]

scores = []
try:
# get the results
results = executor.results()

# convert results to dataset_like
for i, _ in enumerate(dataset):
s = {}
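
The change above collapses the sync/async branch at the submission site: every metric is now submitted through ascore, with is_async threaded down into the metric instead. A self-contained toy of that pattern, with a stub metric and plain asyncio standing in for the Executor:

import asyncio

class EchoMetric:
    name = "echo"
    async def ascore(self, row, callbacks, is_async):
        # A real metric would await its LLM when is_async=True and fall
        # back to a thread-pool call otherwise; this stub just scores.
        return float(len(row["answer"]))

async def run(dataset, metrics, is_async=True):
    tasks = [
        asyncio.create_task(m.ascore(row, None, is_async))
        for row in dataset
        for m in metrics
    ]
    return await asyncio.gather(*tasks)

print(asyncio.run(run([{"answer": "abc"}], [EchoMetric()])))  # [3.0]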
38 changes: 36 additions & 2 deletions src/ragas/llms/base.py
@@ -1,8 +1,10 @@
from __future__ import annotations

import asyncio
import typing as t
from abc import ABC, abstractmethod
from dataclasses import dataclass
from functools import partial

from langchain_community.chat_models import AzureChatOpenAI, ChatOpenAI, ChatVertexAI
from langchain_community.llms import AzureOpenAI, OpenAI, VertexAI
@@ -63,6 +65,38 @@ async def agenerate_text(
) -> LLMResult:
...

async def generate(
self,
prompt: PromptValue,
n: int = 1,
temperature: float = 1e-8,
stop: t.Optional[t.List[str]] = None,
callbacks: Callbacks = [],
is_async: bool = True,
) -> LLMResult:
"""Generate text using the given event loop."""
loop = asyncio.get_event_loop()

if is_async:
return await self.agenerate_text(
prompt=prompt,
n=n,
temperature=temperature,
stop=stop,
callbacks=callbacks,
)
else:
loop = asyncio.get_event_loop()
generate_text = partial(
self.generate_text,
prompt=prompt,
n=n,
temperature=temperature,
stop=stop,
callbacks=callbacks,
)
return await loop.run_in_executor(None, generate_text)


@dataclass
class LangchainLLMWrapper(BaseRagasLLM):
@@ -75,7 +109,7 @@ class LangchainLLMWrapper(BaseRagasLLM):

langchain_llm: BaseLanguageModel

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(15))
def generate_text(
self,
prompt: PromptValue,
@@ -106,7 +140,7 @@ def generate_text(
result.generations = generations
return result

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(15))
async def agenerate_text(
self,
prompt: PromptValue,
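
A stand-alone sketch of the dispatch pattern generate() introduces (toy functions, not the ragas API): one async front door that awaits the native coroutine, or offloads the blocking variant via run_in_executor.

import asyncio
from functools import partial

def generate_text(prompt: str, n: int = 1) -> str:
    return f"{prompt} x{n}"  # stands in for a blocking LLM call

async def agenerate_text(prompt: str, n: int = 1) -> str:
    return f"{prompt} x{n}"  # stands in for a native async LLM call

async def generate(prompt: str, n: int = 1, is_async: bool = True) -> str:
    if is_async:
        return await agenerate_text(prompt, n=n)
    loop = asyncio.get_event_loop()
    # partial() is needed because run_in_executor forwards only positional args.
    return await loop.run_in_executor(None, partial(generate_text, prompt, n=n))

print(asyncio.run(generate("hi", n=2, is_async=False)))  # hi x2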
25 changes: 23 additions & 2 deletions src/ragas/llms/json_load.py
@@ -1,9 +1,11 @@
from __future__ import annotations

import asyncio
import json
import logging
import typing as t
from dataclasses import dataclass
from functools import partial

logger = logging.getLogger(__name__)

@@ -75,7 +77,7 @@ def load_as_json(text) -> t.Dict:
class JsonLoader:
max_retries: int = 2

def safe_load(self, text: str, llm: BaseRagasLLM, callbacks: Callbacks = None):
def _safe_load(self, text: str, llm: BaseRagasLLM, callbacks: Callbacks = None):
retry = 0
while retry <= self.max_retries:
try:
@@ -94,7 +96,7 @@ def safe_load(self, text: str, llm: BaseRagasLLM, callbacks: Callbacks = None):

return {}

async def asafe_load(
async def _asafe_load(
self, text: str, llm: BaseRagasLLM, callbacks: Callbacks = None
):
retry = 0
@@ -115,6 +117,25 @@ async def asafe_load(

return {}

async def safe_load(
self,
text: str,
llm: BaseRagasLLM,
callbacks: Callbacks = None,
is_async: bool = True,
):
if is_async:
return await self._asafe_load(text=text, llm=llm, callbacks=callbacks)
else:
loop = asyncio.get_event_loop()
safe_load = partial(
self._safe_load, text=text, llm=llm, callbacks=callbacks
)
return await loop.run_in_executor(
None,
safe_load,
)

def _find_outermost_json(self, text):
stack = []
start_index = -1
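
safe_load is now a coroutine in both modes, so existing callers must await it. A self-contained toy of the retry loop that _safe_load implements, with a lambda standing in for the LLM-based JSON repair:

import json

def toy_safe_load(text: str, fix, max_retries: int = 2) -> dict:
    for _ in range(max_retries + 1):
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            text = fix(text)  # an LLM rewrite in the real implementation
    return {}

print(toy_safe_load("{'a': 1}", fix=lambda s: s.replace("'", '"')))  # {'a': 1}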
2 changes: 1 addition & 1 deletion src/ragas/llms/prompt.py
@@ -190,7 +190,7 @@ def adapt(
{k: v for k, v in zip(self.input_keys, example[: len(self.input_keys)])}
)
example_dict[self.output_key] = (
json_loader.safe_load(example[-1], llm)
json_loader._safe_load(example[-1], llm)
if self.output_type.lower() == "json"
else example[-1]
)
35 changes: 6 additions & 29 deletions src/ragas/metrics/_answer_correctness.py
@@ -139,40 +139,17 @@ def _compute_statement_presence(self, prediction: t.Any) -> float:

return score

def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
assert self.llm is not None, "LLM must be set"
q, a, g = row["question"], row["answer"], row["ground_truth"]
p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a)
is_statement_present = self.llm.generate_text(p_value, callbacks=callbacks)

prediction = json_loader.safe_load(
is_statement_present.generations[0][0].text, self.llm
)
f1_score = self._compute_statement_presence(prediction)

if self.weights[1] == 0:
similarity_score = 0
else:
similarity_score = self.answer_similarity.score(row, callbacks=callbacks) # type: ignore

score = np.average(
[f1_score, similarity_score],
weights=self.weights,
)

return float(score)

async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
async def _ascore(self, row: t.Dict, callbacks: Callbacks, is_async: bool) -> float:
assert self.llm is not None, "LLM must be set"

q, a, g = row["question"], row["answer"], row["ground_truth"]
p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a)
is_statement_present = await self.llm.agenerate_text(
p_value, callbacks=callbacks
is_statement_present = await self.llm.generate(
p_value, callbacks=callbacks, is_async=is_async
)

prediction = await json_loader.asafe_load(
is_statement_present.generations[0][0].text, self.llm
prediction = await json_loader.safe_load(
is_statement_present.generations[0][0].text, self.llm, is_async=is_async
)
f1_score = self._compute_statement_presence(prediction)

@@ -182,7 +159,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
assert self.answer_similarity is not None, "AnswerSimilarity must be set"

similarity_score = await self.answer_similarity.ascore(
row, callbacks=callbacks
row, callbacks=callbacks, is_async=is_async
)

score = np.average(
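
A worked example of the final blend (made-up scores; the 0.75/0.25 weights are an assumption for illustration, not taken from the diff): statement-presence F1 and embedding similarity are combined by a plain weighted average.

import numpy as np

f1_score, similarity_score = 0.8, 0.6
weights = [0.75, 0.25]  # assumed defaults, for illustration only
print(float(np.average([f1_score, similarity_score], weights=weights)))  # 0.75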
22 changes: 4 additions & 18 deletions src/ragas/metrics/_answer_relevance.py
@@ -117,32 +117,18 @@ def _create_question_gen_prompt(self, row: t.Dict) -> PromptValue:
ans, ctx = row["answer"], row["contexts"]
return self.question_generation.format(answer=ans, context="\n".join(ctx))

def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
async def _ascore(self, row: t.Dict, callbacks: Callbacks, is_async: bool) -> float:
assert self.llm is not None, "LLM is not set"

prompt = self._create_question_gen_prompt(row)
result = self.llm.generate_text(
result = await self.llm.generate(
prompt,
n=self.strictness,
callbacks=callbacks,
is_async=is_async,
)
response = [
json_loader.safe_load(r.text, self.llm) for r in result.generations[0]
]

return self._calculate_score(response, row)

async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
assert self.llm is not None, "LLM is not set"

prompt = self._create_question_gen_prompt(row)
result = await self.llm.agenerate_text(
prompt,
n=self.strictness,
callbacks=callbacks,
)
response = [
await json_loader.asafe_load(r.text, self.llm)
await json_loader.safe_load(r.text, self.llm, is_async=is_async)
for r in result.generations[0]
]

37 changes: 5 additions & 32 deletions src/ragas/metrics/_answer_similarity.py
@@ -56,7 +56,9 @@ def __post_init__(self: t.Self):
def init_model(self):
super().init_model()

def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
async def _ascore(
self: t.Self, row: t.Dict, callbacks: Callbacks, is_async: bool
) -> float:
assert self.embeddings is not None, "embeddings must be set"

ground_truth, answers = row["ground_truth"], row["answer"]
@@ -67,37 +69,8 @@ def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
"async score [ascore()] not implemented for HuggingFace embeddings"
)
else:
embeddings_1 = np.array(self.embeddings.embed_documents(ground_truth))
embeddings_2 = np.array(self.embeddings.embed_documents(answers))
similarity = embeddings_1 @ embeddings_2.T
if similarity.size == 1:
# If similarity has only one value, directly use this value as scores
scores = similarity.flatten()
else:
# If similarity contains multiple values, extract the diagonal as scores
scores = np.diagonal(similarity)

assert isinstance(scores, np.ndarray), "Expects ndarray"
if self.threshold:
scores = scores >= self.threshold

return scores.tolist()[0]

async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float:
assert self.embeddings is not None, "embeddings must be set"

ground_truth: t.List[str] = row["ground_truth"]
answer: t.List[str] = row["answer"]

if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings):
raise NotImplementedError(
"async score [ascore()] not implemented for HuggingFace embeddings"
)
else:
embeddings_1 = np.array(
await self.embeddings.aembed_documents(ground_truth)
)
embeddings_2 = np.array(await self.embeddings.aembed_documents(answer))
embeddings_1 = np.array(await self.embeddings.embed_texts(ground_truth))
embeddings_2 = np.array(await self.embeddings.embed_texts(answers))
similarity = embeddings_1 @ embeddings_2.T
if similarity.size == 1:
scores = similarity.flatten()
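
A sketch of the similarity computation kept above (random vectors, illustrative only): with L2-normalised embeddings the matrix product yields cosine similarities, and the diagonal pairs ground_truth[i] with answers[i].

import numpy as np

rng = np.random.default_rng(0)
e1 = rng.normal(size=(3, 4)); e1 /= np.linalg.norm(e1, axis=1, keepdims=True)
e2 = rng.normal(size=(3, 4)); e2 /= np.linalg.norm(e2, axis=1, keepdims=True)

similarity = e1 @ e2.T
scores = similarity.flatten() if similarity.size == 1 else np.diagonal(similarity)
print(scores.round(3))  # three per-pair cosine scores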