# `Dataset`

> A Python list-like object that contains your evaluation data.

In [1]:
# | default_exp dataset

In [2]:
# | hide

from unittest.mock import MagicMock
from fastcore.test import *

In [3]:
# | export
import typing as t

from fastcore.utils import patch

from pydantic import BaseModel

from ragas_annotator.backends.ragas_api_client import RagasApiClient

In [4]:
# | export
# Type variable for the pydantic model class whose instances a Dataset holds.
PydanticModelType = t.TypeVar("PydanticModelType", bound=BaseModel)


class Dataset(t.Generic[PydanticModelType]):
    """A list-like container of pydantic model instances tied to a remote dataset.

    Entries are cached locally in ``_entries``; indexing, slicing, ``len()`` and
    iteration read that cache only and never contact the backend.

    NOTE(review): ``__setitem__`` still goes through Notion-era helpers
    (``update_page``, ``to_notion``, ``from_notion``, ``_page_id``) — confirm
    they exist on the current client and model classes.
    """

    def __init__(
        self,
        name: str,
        model: t.Type[PydanticModelType],
        project_id: str,
        dataset_id: str,
        ragas_api_client: RagasApiClient,
    ):
        """Store the dataset's identifiers and client; start with no cached entries.

        Args:
            name: Display name of the dataset.
            model: Pydantic model class every entry must be an instance of.
            project_id: Identifier of the project that owns the dataset.
            dataset_id: Identifier of the dataset itself.
            ragas_api_client: Client used for all backend calls.
        """
        self.name = name
        self.model = model
        self.project_id = project_id
        self.dataset_id = dataset_id
        self._ragas_api_client = ragas_api_client
        self._entries: t.List[PydanticModelType] = []

    def __getitem__(
        self, key: t.Union[int, slice]
    ) -> t.Union[PydanticModelType, "Dataset[PydanticModelType]"]:
        """Return the cached entry at an index, or a new dataset for a slice.

        A slice yields a new instance of the same class that shares this
        dataset's name, model, ids and client, and whose cache is the sliced
        list of entries. An integer follows normal list indexing rules.
        """
        if not isinstance(key, slice):
            return self._entries[key]

        new_dataset = type(self)(
            name=self.name,
            model=self.model,
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            ragas_api_client=self._ragas_api_client,
        )
        new_dataset._entries = self._entries[key]
        return new_dataset

    def __setitem__(self, index: int, entry: PydanticModelType) -> None:
        """Replace the entry at ``index`` remotely, then refresh the cached copy.

        Raises:
            TypeError: If ``entry`` is not an instance of ``self.model``.
            IndexError: If ``index`` is outside the cached entries.
            ValueError: If the existing entry has no usable ``_page_id``.
        """
        if not isinstance(entry, self.model):
            raise TypeError(f"Entry must be an instance of {self.model.__name__}")

        # The existing entry carries the remote page id the update is sent to.
        existing = self._entries[index]
        # Explicit check instead of `assert`: asserts vanish under `python -O`,
        # which would let a missing/None id through to the backend call.
        page_id = getattr(existing, "_page_id", None)
        if page_id is None:
            raise ValueError("Existing entry has no page_id")

        response = self._ragas_api_client.update_page(
            page_id=page_id, properties=entry.to_notion()["properties"]
        )

        # Cache what the backend returned rather than the caller's object.
        self._entries[index] = self.model.from_notion(response)

    def __repr__(self) -> str:
        """Return a short summary with the name, model class name and entry count."""
        return (
            f"Dataset(name={self.name}, model={self.model.__name__}, len={len(self)})"
        )

    def __len__(self) -> int:
        """Return the number of locally cached entries."""
        return len(self._entries)

    def __iter__(self) -> t.Iterator[PydanticModelType]:
        """Iterate over the locally cached entries in order."""
        return iter(self._entries)

In [5]:
# | hide
import ragas_annotator.typing as rt
from pydantic import BaseModel
from ragas_annotator.backends.factory import RagasApiClientFactory

In [6]:
# test model
class TestModel(BaseModel):
    """Minimal pydantic model used as the entry type in the examples below."""

    # Integer identifier; `Dataset.pop` and `Dataset.get` refer to it as `id`.
    id: int
    name: str
    description: str


# Sample instance reused by the append/save examples; the bare name displays it.
test_model = TestModel(id=0, name="test", description="test description")
test_model

TestModel(id=0, name='test', description='test description')

In [7]:
import os

In [8]:
# NOTE(review): a literal app token is committed here — presumably a real dev
# credential; confirm, rotate it, and read it from the environment or a secrets
# store instead of hard-coding it in the notebook.
RAGAS_APP_TOKEN = "apt.47bd-c55e4a45b27c-02f8-8446-1441f09b-651a8"
RAGAS_API_BASE_URL = "https://api.dev.app.ragas.io"

# Publish both values as process environment variables for the cells below.
os.environ["RAGAS_APP_TOKEN"] = RAGAS_APP_TOKEN
os.environ["RAGAS_API_BASE_URL"] = RAGAS_API_BASE_URL

In [9]:
# Build the API client with the factory's defaults.
# NOTE(review): presumably picks up the RAGAS_* environment variables set above — confirm.
ragas_api_client = RagasApiClientFactory.create()

In [11]:
# Dashboard page of the project/dataset used throughout these examples:
# https://dev.app.ragas.io/dashboard/projects/0a7c4ecb-b313-4bb0-81c0-852c9634ce03/datasets/a4f0d169-ebce-4a2b-b758-0ff49c0c4312
TEST_PROJECT_ID = "0a7c4ecb-b313-4bb0-81c0-852c9634ce03"
TEST_DATASET_ID = "a4f0d169-ebce-4a2b-b758-0ff49c0c4312"
# The client methods are coroutines; top-level `await` works here because the
# cell runs inside the notebook's event loop.
test_project = await ragas_api_client.get_project(project_id=TEST_PROJECT_ID)
test_dataset = await ragas_api_client.get_dataset(project_id=TEST_PROJECT_ID, dataset_id=TEST_DATASET_ID)
# Display the raw dataset record returned by the API.
test_dataset

{'id': 'a4f0d169-ebce-4a2b-b758-0ff49c0c4312',
 'name': 'TestModel',
 'description': None,
 'created_at': '2025-04-10T02:41:25.077552+00:00',
 'updated_at': '2025-04-10T02:41:25.077552+00:00',
 'version_counter': 0,
 'project_id': '0a7c4ecb-b313-4bb0-81c0-852c9634ce03'}

In [12]:
# Local handle for the remote dataset fetched above; its entry cache starts empty.
dataset = Dataset(
    name="TestModel",
    model=TestModel,
    project_id=TEST_PROJECT_ID,
    dataset_id=TEST_DATASET_ID,
    ragas_api_client=ragas_api_client,
)

In [None]:
from ragas_annotator.project.core import async_to_sync

In [35]:
@patch
def get_column_id_map(self: Dataset, dataset_id: str) -> dict:
    """Return a dict mapping each column's name to its id for ``dataset_id``.

    The columns are listed through the client's ``list_dataset_columns``
    coroutine (wrapped with ``async_to_sync``) using this dataset's
    ``project_id``; the mapping is built from the response's ``"items"``.
    """
    # NOTE(review): this cell carries no "# | export" directive although the
    # exported `append` calls this method — confirm it should be exported too.
    list_columns = async_to_sync(self._ragas_api_client.list_dataset_columns)
    listing = list_columns(project_id=self.project_id, dataset_id=dataset_id)

    name_to_id = {}
    for col in listing["items"]:
        name_to_id[col["name"]] = col["id"]
    return name_to_id


In [38]:
# | export
@patch
def append(self: Dataset, entry: PydanticModelType) -> None:
    """Create a remote row for ``entry`` and add it to the local cache.

    The entry's dumped fields are re-keyed by backend column id; fields with no
    matching column are left out of the row. The row is created under a freshly
    generated id, and the entry object itself (not the response) is cached.

    Raises:
        TypeError: If ``entry`` is not an instance of ``self.model`` (the same
            rule ``__setitem__`` and ``save`` enforce).
    """
    if not isinstance(entry, self.model):
        raise TypeError(f"Entry must be an instance of {self.model.__name__}")

    # TODO: this is a hack to get the columns for the dataset
    column_id_map = self.get_column_id_map(dataset_id=self.dataset_id)

    # The backend addresses cells by column id, not by field name.
    row_data = {
        column_id_map[field]: value
        for field, value in entry.model_dump().items()
        if field in column_id_map
    }

    # NOTE(review): `create_nano_id` is not imported in any visible cell — confirm
    # where it comes from.
    create_row = async_to_sync(self._ragas_api_client.create_dataset_row)
    create_row(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        id=create_nano_id(),
        data=row_data,
    )

    # The generated row id is not stored on the entry, so the cache cannot map
    # this entry back to its remote row later.
    self._entries.append(entry)

In [39]:
# Append the sample entry, then display the number of locally cached entries.
dataset.append(test_model)
len(dataset)

4

In [None]:
# | hide
# NOTE(review): the recorded output of the previous cell is 4, so this holds
# only when the dataset starts empty and the append cell ran exactly once.
test_eq(len(dataset), 1)

In [20]:
# | export
@patch
def pop(self: Dataset, index: int = -1) -> PydanticModelType:
    """Intended to remove and return the entry at ``index``; currently a debug stub.

    As written, the function reads ``self._entries[index]``, lists the dataset's
    rows through ``list_dataset_rows``, prints every row next to ``entry.id``
    and then returns ``None``. Nothing is removed locally or remotely.

    NOTE(review): everything after the bare ``return`` is unreachable and still
    uses Notion-style calls (``update_page``, ``_page_id``); the return
    annotation does not match the ``None`` that is actually returned.
    """
    entry = self._entries[index]
    # get the row id
    # TODO: this is a hack to get the row id
    sync_func = async_to_sync(self._ragas_api_client.list_dataset_rows)
    rows = sync_func(project_id=self.project_id, dataset_id=self.dataset_id)
    for row in rows["items"]:
        # Debug output: each remote row alongside the local entry's id.
        print(row, entry.id)
    # NOTE(review): early return left in place — the deletion code below never runs.
    return
    # Archive in Notion (soft delete)
    assert entry._page_id is not None  # mypy fails to infer that we check for it above
    self._ragas_api_client.update_page(page_id=entry._page_id, archived=True)

    # Remove from local cache
    return self._entries.pop(index)

In [21]:
# As the recorded output shows, this only prints the listed row and the length
# stays 1: `pop` currently returns before removing anything.
dataset.pop()
len(dataset)

{'id': 'bvthgi7gc7Kj', 'data': {'id': 'bvthgi7gc7Kj', 'bhPF1NFCoxrL': 'test description', 'crNbVl176nwv': 0, 'djZ0STd4j6zA': 'test'}, 'created_at': '2025-04-10T02:43:20.64121+00:00', 'updated_at': '2025-04-10T02:43:20.64121+00:00', 'datatable_id': 'a4f0d169-ebce-4a2b-b758-0ff49c0c4312'} 0


1

In [None]:
# | hide
# NOTE(review): cannot pass while `pop` returns before removing the entry
# (the recorded length after the pop call is 1).
test_eq(len(dataset), 0)

In [None]:
# | export
@patch
def load(self: Dataset) -> None:
    """Replace the cached entries with the results of a backend query.

    Calls the client's ``query_database`` with ``database_id=self.dataset_id``
    and ``archived=False``, empties the cache, then converts each item of the
    response's ``"results"`` list (empty list if the key is missing) with
    ``self.model.from_notion`` and stores it.

    NOTE(review): ``query_database`` / ``from_notion`` look like Notion-era
    names and the call is not wrapped in ``async_to_sync`` like the other
    client calls in this file — confirm against the current client.
    """
    query_result = self._ragas_api_client.query_database(
        database_id=self.dataset_id, archived=False
    )

    # The cache is emptied only after the query itself has returned.
    self._entries.clear()

    pages = query_result.get("results", [])
    self._entries.extend(self.model.from_notion(page) for page in pages)

In [None]:
# Refresh the local cache from the backend.
dataset.load()

In [None]:
# Append the same model three times; every call issues its own row-creation request.
for i in range(3):
    dataset.append(test_model)
len(dataset)

3

In [None]:
# Create a new instance of the dataset; its local cache starts empty until `load()`.
# Uses the ids and client defined earlier in this notebook: the previous version
# omitted the required `project_id` and referenced the undefined names
# `datasets_id` and `backend`.
dataset = Dataset(
    name="TestModel",
    model=TestModel,
    project_id=TEST_PROJECT_ID,
    dataset_id=TEST_DATASET_ID,
    ragas_api_client=ragas_api_client,
)
len(dataset)

0

In [None]:
# Reload into the fresh instance; expects the three rows appended earlier to come back.
dataset.load()
test_eq(len(dataset), 3)

In [None]:
# | export
@patch
def get(self: Dataset, id: int) -> t.Optional[PydanticModelType]:
    """Fetch the first entry whose ``id`` property equals ``id``.

    Returns ``None`` when the dataset has no client or when the query's
    ``"results"`` are missing or empty; otherwise the first result is converted
    with ``self.model.from_notion``. The local cache is neither read nor updated.

    NOTE(review): the filter shape and ``query_database`` look like Notion-era
    API usage — confirm against the current client.
    """
    client = self._ragas_api_client
    if not client:
        return None

    # Restrict the query to the row whose unique id matches.
    id_filter = {"property": "id", "unique_id": {"equals": id}}
    response = client.query_database(database_id=self.dataset_id, filter=id_filter)

    if response.get("results"):
        return self.model.from_notion(response["results"][0])
    return None

In [None]:
# Look up the entry with id 0; this rebinds `test_model` to the fetched instance.
test_model = dataset.get(0)
test_model

TestModel(name='test' description='test description')

In [None]:
# | hide
# The fetched entry should carry the description it was created with.
test_eq(test_model.description, "test description")

In [None]:
# | export
@patch
def save(self: Dataset, item: PydanticModelType) -> None:
    """Push ``item`` to the backend and refresh its cached copy.

    The item is sent through the client's ``update_page`` using the item's
    ``_page_id``; the first cached entry with the same ``_page_id`` is then
    replaced by ``self.model.from_notion(response)``. If no cached entry
    matches, the cache is left unchanged.

    Raises:
        TypeError: If ``item`` is not an instance of ``self.model``.
        ValueError: If ``item`` has no usable ``_page_id``.

    NOTE(review): ``update_page`` / ``to_notion`` / ``from_notion`` look like
    Notion-era helpers — confirm they exist on the current client and models.
    """
    if not isinstance(item, self.model):
        raise TypeError(f"Item must be an instance of {self.model.__name__}")

    # Explicit check instead of `assert`: asserts vanish under `python -O`,
    # which would let a missing/None id through to the backend call.
    page_id = getattr(item, "_page_id", None)
    if page_id is None:
        raise ValueError("Item has no page_id")

    response = self._ragas_api_client.update_page(
        page_id=page_id, properties=item.to_notion()["properties"]
    )

    # Entries without a `_page_id` (e.g. ones added via `append`) are skipped
    # rather than raising after the remote update has already succeeded.
    for i, existing in enumerate(self._entries):
        if getattr(existing, "_page_id", None) == page_id:
            self._entries[i] = self.model.from_notion(response)
            break

In [None]:
# Edit the fetched entry locally, then push the change through `save`.
test_model.description = "updated description"
dataset.save(test_model)

In [None]:
# Fetch the entry again to display the saved description.
dataset.get(0)

TestModel(name='test' description='updated description')

In [None]:
# | hide
# A fresh fetch should reflect the description written by `save`.
test_eq(dataset.get(0).description, "updated description")