Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: MVP for the new TestsetGenerator - SimpleEvolution #464

Merged
merged 10 commits into from
Jan 18, 2024
Merged
13 changes: 7 additions & 6 deletions src/ragas/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field

import numpy as np
from tqdm.auto import tqdm


@dataclass
class Executor:
desc: str = "Evaluating"
is_async: bool = True
max_workers: t.Optional[int] = None
futures: t.List[t.Any] = field(default_factory=list, repr=False)
Expand Down Expand Up @@ -71,10 +71,10 @@ async def _aresults(self) -> t.List[t.Any]:
results = []
for future in tqdm(
asyncio.as_completed(self.futures),
desc="Evaluating",
desc=self.desc,
total=len(self.futures),
):
r = np.nan
r = (-1, None)
try:
r = await future
except Exception as e:
Expand Down Expand Up @@ -106,20 +106,21 @@ def results(self) -> t.List[t.Any]:
try:
for future in tqdm(
as_completed(self.futures),
desc="Evaluating",
desc=self.desc,
total=len(self.futures),
):
r = np.nan
r = (-1, None)
try:
r = future.result()
except Exception as e:
r = np.nan
r = (-1, None)
if self.raise_exceptions:
raise e
finally:
results.append(r)
finally:
self.executor.shutdown(wait=False)

print(results)
sorted_results = sorted(results, key=lambda x: x[0])
return [r[1] for r in sorted_results]
3 changes: 2 additions & 1 deletion src/ragas/llms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from langchain.chat_models import ChatOpenAI
from langchain_community.chat_models import ChatOpenAI

from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper

__all__ = [
"BaseRagasLLM",
"LangchainLLMWrapper",
"llm_factory",
]

Expand Down
3 changes: 1 addition & 2 deletions src/ragas/llms/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,10 @@ def is_multiple_completion_supported(llm: BaseLanguageModel) -> bool:

@dataclass
class BaseRagasLLM(ABC):

def get_temperature(self, n: int) -> float:
"""Return the temperature to use for completion based on n."""
return 0.3 if n > 1 else 1e-8

@abstractmethod
def generate_text(
self,
Expand Down
186 changes: 131 additions & 55 deletions src/ragas/testset/docstore.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,112 @@
import heapq
import logging
import typing as t
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from random import choices

import numpy as np
import numpy.typing as npt
from langchain.text_splitter import TextSplitter
from langchain_core.documents import Document as LCDocument
from pydantic import Field
from langchain_core.pydantic_v1 import Field
from llama_index.readers.schema import Document as LlamaindexDocument

from ragas.async_utils import run_async_tasks
from ragas.embeddings.base import BaseRagasEmbeddings, embedding_factory

Embedding = t.Union[t.List[float], npt.NDArray[np.float64]]
logger = logging.getLogger(__name__)
rng = np.random.default_rng()


class Document(LCDocument):
    """LangChain ``Document`` extended with an id, a filename and an embedding.

    Attributes are documented inline below. Use the ``from_*`` classmethods to
    convert documents coming from LangChain or LlamaIndex.
    """

    # Unique identifier; a random UUID4 string when not supplied.
    doc_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    # Source filename; the alternate constructors fall back to ``doc_id``.
    filename: t.Optional[str] = None
    # Embedding vector, ``None`` until one is assigned (declared repr=False).
    embedding: t.Optional[t.List[float]] = Field(default=None, repr=False)

    @classmethod
    def _from_content(cls, page_content: str, metadata: t.Dict[str, t.Any]):
        """Create an instance with a fresh id from raw text and metadata.

        Shared by the public ``from_*`` constructors so the filename fallback
        lives in one place. ``metadata["filename"]`` is used when present and
        truthy; otherwise the newly generated ``doc_id`` doubles as filename.
        """
        doc_id = str(uuid.uuid4())
        filename = metadata.get("filename")
        if not filename:
            logger.info(
                "Document [ID: %s] has no filename. Using doc_id as filename.", doc_id
            )
            filename = doc_id
        return cls(
            page_content=page_content,
            metadata=metadata,
            doc_id=doc_id,
            filename=filename,
        )

    @classmethod
    def from_langchain_document(cls, doc: LCDocument):
        """Build an instance of ``cls`` from a LangChain document.

        Copies ``doc.page_content`` and reuses ``doc.metadata`` (same object,
        not a copy); a new ``doc_id`` is always generated.
        """
        return cls._from_content(doc.page_content, doc.metadata)

    @classmethod
    def from_llamaindex_document(cls, doc: LlamaindexDocument):
        """Build an instance of ``cls`` from a LlamaIndex document.

        Uses ``doc.text`` as the page content and reuses ``doc.metadata``
        (same object, not a copy); a new ``doc_id`` is always generated.
        """
        return cls._from_content(doc.text, doc.metadata)


class Node(Document):
    """A ``Document`` subclass that adds no fields or methods of its own."""


class Direction(str, Enum):
    """Direction in which to look for an adjacent node.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value (e.g. ``Direction.NEXT == "next"``).
    """

    # Sibling directions.
    NEXT = "next"
    PREV = "prev"
    # NOTE(review): how UP/DOWN are interpreted is not shown here -- confirm
    # against the store implementation.
    UP = "up"
    DOWN = "down"


class DocumentStore(ABC):
def __init__(self):
self.documents = {}

@abstractmethod
def add(self, doc: t.Union[Document, t.Sequence[Document]], show_progress=True):
def add_documents(self, docs: t.Sequence[Document], show_progress=True):
...

@abstractmethod
def add_nodes(self, nodes: t.Sequence[Node], show_progress=True):
...

@abstractmethod
def get(self, doc_id: str) -> Document:
def get_node(self, node_id: str) -> Node:
...

@abstractmethod
def get_random_nodes(self, k=1) -> t.List[Node]:
...

@abstractmethod
def get_similar(
self, doc: Document, threshold: float = 0.7, top_k: int = 3
) -> t.List[Document]:
self, node: Node, threshold: float = 0.7, top_k: int = 3
) -> t.Union[t.List[Document], t.List[Node]]:
...

@abstractmethod
def get_adjascent(
self, doc: Document, direction: str = "next"
) -> t.Optional[Document]:
def get_adjacent(
self, node: Node, direction: Direction = Direction.NEXT
) -> t.Optional[Node]:
...


Expand Down Expand Up @@ -117,84 +179,98 @@ class InMemoryDocumentStore(DocumentStore):
embeddings: BaseRagasEmbeddings = field(
default_factory=embedding_factory, repr=False
)
documents_list: t.List[Document] = field(default_factory=list)
embeddings_list: t.List[Embedding] = field(default_factory=list)
documents_map: t.Dict[str, Document] = field(default_factory=dict)
nodes: t.List[Node] = field(default_factory=list)
node_embeddings_list: t.List[Embedding] = field(default_factory=list)
node_map: t.Dict[str, Node] = field(default_factory=dict)

def _embed_items(self, items: t.Union[t.Sequence[Document], t.Sequence[Node]]):
...

def _add_documents_batch(self, docs: t.Sequence[Document], show_progress=True):
def add_documents(self, docs: t.Sequence[Document], show_progress=True):
"""
Add documents in batch mode.
"""
# split documents with self.splitter into smaller nodes
nodes = [
Node.from_langchain_document(d)
for d in self.splitter.transform_documents(docs)
]

self.add_nodes(nodes, show_progress=show_progress)

def add_nodes(
self, nodes: t.Sequence[Node], show_progress=True, desc: str = "embedding nodes"
):
# NOTE: Adds everything in async mode for now.
embed_tasks = []
docs_to_embed = []
for doc in docs:
if doc.embedding is None:
embed_tasks.append(self.embeddings.aembed_query(doc.page_content))
docs_to_embed.append(doc)
# get embeddings for the docs
for n in nodes:
if n.embedding is None:
embed_tasks.append(self.embeddings.aembed_query(n.page_content))
docs_to_embed.append(n)
else:
self.documents_list.append(doc)
self.documents_map[doc.doc_id] = doc
self.embeddings_list.append(doc.embedding)

embeddings = run_async_tasks(embed_tasks, show_progress=show_progress)
for doc, embedding in zip(docs_to_embed, embeddings):
doc.embedding = embedding
self.documents_list.append(doc)
self.documents_map[doc.doc_id] = doc
self.embeddings_list.append(doc.embedding)

def add(self, doc: t.Union[Document, t.Sequence[Document]], show_progress=True):
if isinstance(doc, list) or isinstance(doc, tuple):
self._add_documents_batch(doc)
elif isinstance(doc, Document):
self.documents_list.append(doc)
self.documents_map[doc.doc_id] = doc
if doc.embedding is None:
doc.embedding = self.embeddings.embed_query(doc.page_content)
self.embeddings_list.append(doc.embedding)
else:
raise ValueError("add() method only supports Document or List[Document]")
self.nodes.append(n)
self.node_map[n.doc_id] = n
self.node_embeddings_list.append(n.embedding)

embeddings = run_async_tasks(
embed_tasks, show_progress=show_progress, progress_bar_desc=desc
)
for n, embedding in zip(docs_to_embed, embeddings):
n.embedding = embedding
self.nodes.append(n)
self.node_map[n.doc_id] = n
self.node_embeddings_list.append(n.embedding)

def get_node(self, node_id: str) -> Node:
return self.node_map[node_id]

def get_document(self, doc_id: str) -> Node:
raise NotImplementedError

def get(self, doc_id: str) -> Document:
return self.documents_map[doc_id]
def get_random_nodes(self, k=1) -> t.List[Node]:
return choices(self.nodes, k=k)

def get_similar(
self, doc: Document, threshold: float = 0.7, top_k: int = 3
) -> t.List[Document]:
self, node: Node, threshold: float = 0.7, top_k: int = 3
) -> t.Union[t.List[Document], t.List[Node]]:
items = []
doc = node
if doc.embedding is None:
raise ValueError("Document has no embedding.")
scores, doc_ids = get_top_k_embeddings(
query_embedding=doc.embedding,
embeddings=self.embeddings_list,
embeddings=self.node_embeddings_list,
similarity_fn=similarity,
similarity_cutoff=threshold,
# we need to return k+1 docs here as the top result is the input doc itself
similarity_top_k=top_k + 1,
)
# remove the query doc itself from results
scores, doc_ids = scores[1:], doc_ids[1:]
return [self.documents_list[doc_id] for doc_id in doc_ids]
items = [self.nodes[doc_id] for doc_id in doc_ids]
return items

def get_adjascent(
self, doc: Document, direction: str = "next"
) -> t.Optional[Document]:
def get_adjacent(
self, node: Node, direction: Direction = Direction.NEXT
) -> t.Optional[Node]:
# linear search for the position of node in self.nodes
index = self.documents_list.index(doc)
index = self.nodes.index(node)

if direction == "next":
if len(self.documents_list) > index + 1:
next_doc = self.documents_list[index + 1]
if next_doc.filename == doc.filename:
if direction == Direction.NEXT:
if len(self.nodes) > index + 1:
next_doc = self.nodes[index + 1]
if next_doc.filename == node.filename:
return next_doc
else:
return None
else:
return None
if direction == "prev":
if direction == Direction.PREV:
if index > 0:
prev_doc = self.documents_list[index - 1]
if prev_doc.filename == doc.filename:
prev_doc = self.nodes[index - 1]
if prev_doc.filename == node.filename:
return prev_doc
else:
return None
Expand Down