# `Dataset`

> A Python list-like object that contains your evaluation data.

In [1]:
# | default_exp dataset

In [2]:
# | hide

from unittest.mock import MagicMock
from fastcore.test import *

In [3]:
# | export
import typing as t

from fastcore.utils import patch

from ragas_annotator.model.notion_model import NotionModel
from ragas_annotator.backends.notion_backend import NotionBackend

In [4]:
# | export
NotionModelType = t.TypeVar("NotionModelType", bound=NotionModel)


class Dataset(t.Generic[NotionModelType]):
    """A list-like interface for managing NotionModel instances in a Notion database.

    Entries are held in a local list (``_entries``). Operations that modify an
    entry call the supplied ``NotionBackend`` first and then store the model
    instance rebuilt from the backend's response.
    """

    def __init__(
        self,
        name: str,
        model: t.Type[NotionModelType],
        database_id: str,
        notion_backend: NotionBackend,
    ):
        """Create an empty dataset bound to one Notion database.

        Args:
            name: Human-readable dataset name (used in ``__repr__``).
            model: The model class entries must be instances of.
            database_id: ID of the Notion database backing this dataset.
            notion_backend: Backend used for all calls to Notion.
        """
        self.name = name
        self.model = model
        self.database_id = database_id
        self._notion_backend = notion_backend
        # Local cache of entries; starts empty, nothing is fetched here.
        self._entries: t.List[NotionModelType] = []

    def __getitem__(
        self, key: t.Union[int, slice]
    ) -> t.Union[NotionModelType, "Dataset[NotionModelType]"]:
        """Return the entry at an integer index, or a new dataset for a slice.

        A slice yields a new instance of the same class sharing this dataset's
        name, model, database id and backend, whose cache is the sliced list.
        """
        if isinstance(key, slice):
            new_dataset = type(self)(
                name=self.name,
                model=self.model,
                database_id=self.database_id,
                notion_backend=self._notion_backend,
            )
            new_dataset._entries = self._entries[key]
            return new_dataset
        else:
            return self._entries[key]

    def __setitem__(self, index: int, entry: NotionModelType) -> None:
        """Update the entry at ``index`` in Notion and refresh the local cache.

        Raises:
            TypeError: If ``entry`` is not an instance of ``self.model``.
            IndexError: If ``index`` is out of range for the local cache.
            ValueError: If the existing entry has no page id (attribute
                missing or ``None``).
        """
        if not isinstance(entry, self.model):
            raise TypeError(f"Entry must be an instance of {self.model.__name__}")

        # The page to update is identified by the *existing* entry's page id.
        existing = self._entries[index]
        # Explicit check instead of `assert`, which is stripped under `python -O`;
        # it also narrows Optional[str] -> str for type checkers.
        page_id = getattr(existing, "_page_id", None)
        if page_id is None:
            raise ValueError("Existing entry has no page_id")

        # Update in Notion
        response = self._notion_backend.update_page(
            page_id=page_id, properties=entry.to_notion()["properties"]
        )

        # Update local cache with response data
        self._entries[index] = self.model.from_notion(response)

    def __repr__(self) -> str:
        """Return a short summary with the name, model class name and length."""
        return f"Dataset(name={self.name}, model={self.model.__name__}, len={len(self)})"

    def __len__(self) -> int:
        """Return the number of locally cached entries."""
        return len(self._entries)

    def __iter__(self) -> t.Iterator[NotionModelType]:
        """Iterate over the locally cached entries."""
        return iter(self._entries)

In [5]:
#| hide
import ragas_annotator.model.notion_typing as nmt
from ragas_annotator.backends.mock_notion import MockNotionClient
from ragas_annotator.backends.factory import NotionClientFactory
from ragas_annotator.backends.notion_backend import NotionBackend

In [6]:
# Minimal NotionModel subclass used to exercise Dataset in the cells below.
class TestModel(NotionModel):
    # NOTE(review): field semantics come from `nmt`, which is not shown here —
    # presumably ID/Title/Text map to the matching Notion property types; confirm.
    id: int = nmt.ID()
    name: str = nmt.Title()
    description: str = nmt.Text()


# Build an instance locally; `id` is not supplied at construction.
test_model = TestModel(name="test", description="test description")
test_model

TestModel(name='test' description='test description')

In [7]:
#| hide
# Set up a test environment with a mock Notion client and a test database.
# root page id
root_page_id = "test-root-id"

# Create a mock client
mock_client = NotionClientFactory.create(
    use_mock=True, initialize_project=True, root_page_id=root_page_id
)

# Create NotionBackend with mock client
backend = NotionBackend(root_page_id=root_page_id, notion_client=mock_client)

# get the page id of the datasets page
dataset_page_id = backend.get_page_id(parent_id=root_page_id, page_name="Datasets")

# create a new database in the datasets page, merging the Notion property
# definition of every model field into one schema dict
properties = {}
# Iterate values only: binding a throwaway `_` at notebook top level would
# shadow IPython's `_` (last output) variable.
for field in TestModel._fields.values():
    properties.update(field._to_notion_property())
datasets_id = backend.create_new_database(
    parent_page_id=dataset_page_id, title="TestModel", properties=properties
)

In [8]:
# Bind a Dataset to the test database; its local entry cache starts empty.
dataset = Dataset(
    name="TestModel",
    model=TestModel,
    database_id=datasets_id,
    notion_backend=backend
)

In [9]:
# | export
@patch
def append(self: Dataset, entry: NotionModelType) -> None:
    """Create a page for `entry` in the Notion database and cache the result.

    The cached object is the instance rebuilt from the backend's response,
    not the `entry` object that was passed in.
    """
    # NOTE(review): unlike `__setitem__` and `save`, `entry` is not checked
    # against `self.model` here (the isinstance check was commented out) —
    # confirm whether that is intentional.
    notion_properties = entry.to_notion()["properties"]
    created_page = self._notion_backend.create_page_in_database(
        database_id=self.database_id, properties=notion_properties
    )

    # Rebuild the entry from the response so it carries Notion data (like ID).
    self._entries.append(self.model.from_notion(created_page))

In [10]:
dataset.append(test_model)
len(dataset)

1

In [11]:
#| hide
test_eq(len(dataset), 1)

In [12]:
# | export
@patch
def pop(self: Dataset, index: int = -1) -> NotionModelType:
    """Remove and return the entry at `index`, archiving its page in Notion.

    Args:
        index: Position in the local cache; defaults to the last entry.

    Returns:
        The entry removed from the local cache.

    Raises:
        IndexError: If `index` is out of range (including an empty dataset).
        ValueError: If the entry has no page id (attribute missing or `None`).
    """
    entry = self._entries[index]
    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would otherwise let a `None` page id reach the backend.
    page_id = getattr(entry, "_page_id", None)
    if page_id is None:
        raise ValueError("Entry has no page_id")

    # Archive in Notion (soft delete) before touching the local cache, so a
    # backend failure leaves the cache unchanged.
    self._notion_backend.update_page(page_id=page_id, archived=True)

    # Remove from local cache
    return self._entries.pop(index)

In [13]:
dataset.pop()
len(dataset)

0

In [14]:
#| hide
test_eq(len(dataset), 0)

In [15]:
# | export
@patch
def load(self: Dataset) -> None:
    """Refill the local cache from the Notion database.

    Queries the database with ``archived=False``, empties the local cache,
    and appends one model instance per page in the response's ``"results"``.
    """
    query_result = self._notion_backend.query_database(
        database_id=self.database_id, archived=False
    )

    # Drop the stale cache first, then convert and add pages one at a time.
    self._entries.clear()
    self._entries.extend(
        self.model.from_notion(page) for page in query_result.get("results", [])
    )

In [16]:
dataset.load()

In [17]:
# Append the same model three times; each call creates a page through the
# backend and caches the instance built from the response.
for i in range(3):
    dataset.append(test_model)
len(dataset)

3

In [18]:
# create a new instance of the dataset
dataset = Dataset(
    name="TestModel",
    model=TestModel,
    database_id=datasets_id,
    notion_backend=backend,
)
len(dataset)

0

In [19]:
dataset.load()
test_eq(len(dataset), 3)

In [20]:
# | export
@patch
def get(self: Dataset, id: int) -> t.Optional[NotionModelType]:
    """Look up a single entry in Notion by the value of its ``id`` property.

    The backend is queried directly; the local cache is neither read nor
    updated. Returns ``None`` when no backend is set or the query has no
    results, otherwise the model built from the first result.
    """
    backend = self._notion_backend
    if not backend:
        return None

    # Filter on the property named "id" using a unique_id equality match.
    id_filter = {"property": "id", "unique_id": {"equals": id}}
    response = backend.query_database(
        database_id=self.database_id,
        filter=id_filter,
    )

    if response.get("results"):
        return self.model.from_notion(response["results"][0])
    return None

In [21]:
test_model = dataset.get(0)
test_model

TestModel(name='test' description='test description')

In [22]:
#| hide
test_eq(test_model.description, "test description")

In [23]:
# | export
@patch
def save(self: Dataset, item: NotionModelType) -> None:
    """Push changes made to `item` to Notion and refresh its cached copy.

    If a cached entry has the same page id as `item`, it is replaced by the
    instance rebuilt from the backend's response; if none matches, only
    Notion is updated.

    Raises:
        TypeError: If `item` is not an instance of `self.model`.
        ValueError: If `item` has no page id (attribute missing or `None`).
    """
    if not isinstance(item, self.model):
        raise TypeError(f"Item must be an instance of {self.model.__name__}")

    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would otherwise let a `None` page id reach the backend.
    page_id = getattr(item, "_page_id", None)
    if page_id is None:
        raise ValueError("Item has no page_id")

    # Update in Notion
    response = self._notion_backend.update_page(
        page_id=page_id, properties=item.to_notion()["properties"]
    )

    # Update local cache: replace the first entry that shares this page id.
    for i, existing in enumerate(self._entries):
        if existing._page_id == page_id:
            self._entries[i] = self.model.from_notion(response)
            break

In [24]:
test_model.description = "updated description"
dataset.save(test_model)

In [25]:
dataset.get(0)

TestModel(name='test' description='updated description')

In [26]:
#| hide
test_eq(dataset.get(0).description, "updated description")