# `Comparison` 

> Create Comparison views with different experiments

In [1]:
#| default_exp project.comparison

In [22]:
#| hide
import pytest
from unittest.mock import MagicMock
from fastcore.test import *

from ragas_annotator.backends.notion_backend import NotionBackend

In [127]:
# | export
import typing as t
import logging

from fastcore.utils import patch
from tqdm import tqdm

from ragas_annotator.project.core import Project
from ragas_annotator.model.notion_model import NotionModel
import ragas_annotator.model.notion_typing as nmt
from ragas_annotator.experiment import Experiment
from ragas_annotator.dataset import Dataset

In [20]:
#| export
logger = logging.getLogger(__name__)

In [124]:
#| export
# utility function to check if a model has a title property and get the name of the title property
@t.overload
def _get_title_property(model: NotionModel|t.Type[NotionModel], raise_exception: t.Literal[True] = True) -> str:
    ...
@t.overload
def _get_title_property(model: NotionModel|t.Type[NotionModel], raise_exception: t.Literal[False] = False) -> t.Optional[str]:
    ...
def _get_title_property(model: NotionModel|t.Type[NotionModel], raise_exception: bool = True) -> t.Optional[str]:
    has_title = False
    for field in model._fields.keys():
        if isinstance(model._fields[field], nmt.Title):
            has_title = True
            title_property = field
            return title_property

    if not has_title:
        if raise_exception:
            raise ValueError("Model has no title property")
        else:
            return None

In [37]:
#| hide
class ModelWithoutTitle(NotionModel):
    id: int = nmt.ID()
    select: str = nmt.Select()

class ModelWithTitle(ModelWithoutTitle):
    some_title: str = nmt.Title()


In [38]:
#| hide
test_eq(_get_title_property(ModelWithoutTitle, raise_exception=False), None)
pytest.raises(ValueError, _get_title_property, ModelWithoutTitle)
test_eq(_get_title_property(ModelWithTitle), "some_title")

In [39]:
#| export
def _validate_experiments(experiments: t.Sequence[Experiment]):
    # validate we have more than 2 experiments
    if len(experiments) < 2:
        raise ValueError("We need at least 2 experiments to compare")

    # validate that all experiments are of the same model
    top_exp = experiments[0]
    title_property = _get_title_property(top_exp.model)
    for exp in experiments:
        if not isinstance(exp, Experiment):
            raise ValueError("All experiments must be of type Experiment")
        if top_exp != exp.model:
            logger.warning(f"Experiments have different models: {top_exp.model} and {exp.model}")
        if title_property != _get_title_property(exp.model):
            raise ValueError("All experiments must have the same title property.")

In [40]:
#| hide

example_notion_backend = MagicMock(spec=NotionBackend)

# test the validation logics
with pytest.raises(ValueError):
    _validate_experiments([
        Experiment("test_experiment_1", ModelWithTitle, "test_database_id", example_notion_backend), 
        Experiment("test_experiment_1", ModelWithoutTitle, "test_database_id", example_notion_backend), 
        Experiment("test_experiment_2", ModelWithTitle, "test_database_id", example_notion_backend),
    ])


# with should pass
_validate_experiments([
    Experiment("test_experiment_1", ModelWithTitle, "test_database_id", example_notion_backend), 
    Experiment("test_experiment_2", ModelWithTitle, "test_database_id", example_notion_backend)
])

# throw a warning if the models are different
class DifferentTitleModel(ModelWithoutTitle):
    some_title: str = nmt.Title()

_validate_experiments([
    Experiment("test_experiment_1", ModelWithTitle, "test_database_id", example_notion_backend), 
    Experiment("test_experiment_2", ModelWithTitle, "test_database_id", example_notion_backend),
    Experiment("test_experiment_3", DifferentTitleModel, "test_database_id", example_notion_backend)
])

# throw an error if the title properties are different
class DifferentTitleNameModel(ModelWithoutTitle):
    some_title_other: str = nmt.Title()
with pytest.raises(ValueError):
    _validate_experiments([
        Experiment("test_experiment_1", ModelWithTitle, "test_database_id", example_notion_backend), 
        Experiment("test_experiment_2", DifferentTitleNameModel, "test_database_id", example_notion_backend),
    ])

Experiments have different models: <class '__main__.ModelWithTitle'> and <class '__main__.ModelWithTitle'>
Experiments have different models: <class '__main__.ModelWithTitle'> and <class '__main__.ModelWithoutTitle'>
Experiments have different models: <class '__main__.ModelWithTitle'> and <class '__main__.ModelWithTitle'>
Experiments have different models: <class '__main__.ModelWithTitle'> and <class '__main__.ModelWithTitle'>
Experiments have different models: <class '__main__.ModelWithTitle'> and <class '__main__.ModelWithTitle'>
Experiments have different models: <class '__main__.ModelWithTitle'> and <class '__main__.ModelWithTitle'>
Experiments have different models: <class '__main__.ModelWithTitle'> and <class '__main__.DifferentTitleModel'>
Experiments have different models: <class '__main__.ModelWithTitle'> and <class '__main__.ModelWithTitle'>
Experiments have different models: <class '__main__.ModelWithTitle'> and <class '__main__.DifferentTitleNameModel'>


In [123]:
#| hide
test_eq(_get_title_property(ModelWithTitle), "some_title")
test_eq(_get_title_property(DifferentTitleNameModel), "some_title_other")
with pytest.raises(ValueError):
    _get_title_property(ModelWithoutTitle)

In [89]:
#| hide
# a test for grouping experiments
NUM_EXPS = 3
# dummy experiments
exp1 = Experiment("test_experiment_1", ModelWithTitle, "test_database_id", example_notion_backend)
exp2 = Experiment("test_experiment_2", ModelWithTitle, "test_database_id", example_notion_backend)
exp3 = Experiment("test_experiment_3", ModelWithTitle, "test_database_id", example_notion_backend)
# fill the experiments with dummy data
for i in range(NUM_EXPS):
    exp1._entries.append(ModelWithTitle(some_title=f"test_{i}", id=i, select=f"test_exp_1_{i}"))
for i in range(NUM_EXPS):
    exp2._entries.append(ModelWithTitle(some_title=f"test_{i}", id=i, select=f"test_exp_2_{i}"))
for i in range(NUM_EXPS):
    exp3._entries.append(ModelWithTitle(some_title=f"test_{i}", id=i, select=f"test_exp_3_{i}"))


# manually create the combined fields
combined_experiments_fields = []
for i in range(NUM_EXPS):
    exp1_as_field = {
        "id_str": str(i),
        "experiment_name": "test_experiment_1",
        "some_title": f"test_{i}",
        "select": f"test_exp_1_{i}",
    }
    exp2_as_field = exp1_as_field.copy()
    exp2_as_field["experiment_name"] = "test_experiment_2"
    exp2_as_field["some_title"] = f"test_{i}"
    exp2_as_field["select"] = f"test_exp_2_{i}"
    exp3_as_field = exp1_as_field.copy()
    exp3_as_field["experiment_name"] = "test_experiment_3"
    exp3_as_field["some_title"] = f"test_{i}"
    exp3_as_field["select"] = f"test_exp_3_{i}"
    combined_experiments_fields.append([exp1_as_field, exp2_as_field, exp3_as_field])

combined_experiments_fields[1]


[{'id_str': '1',
  'experiment_name': 'test_experiment_1',
  'some_title': 'test_1',
  'select': 'test_exp_1_1'},
 {'id_str': '1',
  'experiment_name': 'test_experiment_2',
  'some_title': 'test_1',
  'select': 'test_exp_2_1'},
 {'id_str': '1',
  'experiment_name': 'test_experiment_3',
  'some_title': 'test_1',
  'select': 'test_exp_3_1'}]

In [131]:
#| export
def _model_to_dict(model: NotionModel) -> dict:
    # drop ID filed
    data = {}
    for field_name in model._fields.keys():
        if isinstance(model._fields[field_name], nmt.ID):
            continue
        data[field_name] = model.__getattribute__(field_name)
    return data


In [132]:
#| hide
# test it
for i, grouped_row in enumerate(combined_experiments_fields):
    # add the missing fields to exp1
    exp1_dict = _model_to_dict(exp1._entries[i])
    exp1_dict["id_str"] = str(i)
    exp1_dict["experiment_name"] = "test_experiment_1"
    test_eq(grouped_row[0], exp1_dict)

In [125]:
#| export
def _combine_experiments(experiments: t.Sequence[Experiment]):
    """Group experiment rows by their title property value."""
    if not experiments:
        return []
    
    title_property: str = _get_title_property(experiments[0].model)
    
    # Create a dictionary to group rows by title value
    grouped_by_title = {}
    
    # Process each experiment
    for exp in experiments:
        for row in exp:
            title_value = getattr(row, title_property)
            
            # Create key if it doesn't exist
            if title_value not in grouped_by_title:
                grouped_by_title[title_value] = []
            
            # Add this row to the appropriate group
            row_dict = _model_to_dict(row)
            row_dict["experiment_name"] = exp.name
            grouped_by_title[title_value].append(row_dict)
    
    # Convert dictionary to list and add id_str
    result = []
    for i, (_, rows) in enumerate(grouped_by_title.items()):
        for row in rows:
            row["id_str"] = str(i)
        result.append(rows)
    
    return result

In [129]:
#| hide
# lets see if the asserts pass though
test_eq(_combine_experiments([exp1, exp2, exp3]), combined_experiments_fields)

TODO:
- leverage the `Dataset` object here to reduce duplicate code.

In [133]:
#| export
@patch
def compare_experiments(
    self: Project,
    *experiments: Experiment,
):
    _validate_experiments(experiments)

    # create a combined Model with all the fields of the experiments
    class CombinedModel(NotionModel):
        id_str: str = nmt.Text()
        experiment_name: str = nmt.Text()
    for exp in experiments:
        for field in exp.model._fields.keys():
            if field not in CombinedModel._fields:
                CombinedModel._fields[field] = exp.model._fields[field]

    # create a new database with the combined model
    properties = {}
    for field in CombinedModel._fields.keys():
        properties.update(CombinedModel._fields[field]._to_notion_property())
    comparison_database_id = self._notion_backend.create_new_database(
        parent_page_id=self.comparisons_page_id,
        title=f"{' and '.join([exp.name for exp in experiments])}",
        properties=properties,
    )

    # make sure all experiments are synced to upstream
    for exp in experiments:
        exp.load()

    # group together by title property
    grouped_experiments = _combine_experiments(experiments)

    # append these to database
    for grouped_row in tqdm(grouped_experiments, desc="Uploading to Notion"):
        for row in grouped_row:
            combined_model_instance = CombinedModel(**row)
            self._notion_backend.create_page_in_database(
                database_id=comparison_database_id,
                properties=combined_model_instance.to_notion()["properties"]
            )
    # Get the URL for the created database
    # The format for Notion URLs is: https://www.notion.so/{database_id}
    notion_url = f"https://www.notion.so/{comparison_database_id.replace('-', '')}"
    
    return notion_url