# Experiments

> How to run experiments

In [None]:
# | default_exp project.experiments

In [None]:
# | export
from tqdm import tqdm
from functools import wraps
import asyncio
from tqdm import tqdm

import typing as t

from fastcore.utils import patch

from ragas_experimental.project.core import Project
from ragas_experimental.model.pydantic_model import ExtendedPydanticBaseModel as BaseModel
from ragas_experimental.utils import async_to_sync, create_nano_id
from ragas_experimental.dataset import Dataset, BaseModelType
from ragas_experimental.experiment import Experiment
import ragas_experimental.typing as rt

## Basics

In [None]:
# | export
@patch
def create_experiment(
    self: Project, name: str, model: t.Type[BaseModel]
) -> Experiment:
    """Create a new experiment in this project and return a handle to it.

    The experiment record is created first; its columns, derived from
    ``model``, are created afterwards through ``create_experiment_columns``.

    Args:
        name: Name of the experiment
        model: Model class defining the experiment structure

    Returns:
        Experiment: An experiment object for managing results
    """
    # Step 1: create the experiment record itself.
    create_remote_experiment = async_to_sync(self._ragas_api_client.create_experiment)
    experiment_info = create_remote_experiment(
        project_id=self.project_id,
        name=name,
    )

    # Step 2: create the columns described by the model.
    model_columns = rt.ModelConverter.model_to_columns(model)
    create_columns = async_to_sync(create_experiment_columns)
    create_columns(
        project_id=self.project_id,
        experiment_id=experiment_info["id"],
        columns=model_columns,
        create_experiment_column_func=self._ragas_api_client.create_experiment_column,
    )

    # Step 3: hand back a local object bound to the new experiment.
    return Experiment(
        name=name,
        model=model,
        project_id=self.project_id,
        experiment_id=experiment_info["id"],
        ragas_api_client=self._ragas_api_client,
    )

# Helper for create_experiment, similar to create_dataset_columns in core.ipynb
async def create_experiment_columns(project_id, experiment_id, columns, create_experiment_column_func):
    """Create all columns of an experiment concurrently.

    Args:
        project_id: ID of the project that owns the experiment.
        experiment_id: ID of the experiment receiving the columns.
        columns: Iterable of mappings, each providing a "name" and a "type".
        create_experiment_column_func: Callable returning an awaitable that
            creates a single column; it is called once per entry in ``columns``.

    Returns:
        The list produced by ``asyncio.gather`` over all column-creation
        awaitables (empty when ``columns`` is empty).
    """
    # Every column gets a freshly generated ID and the same fixed settings.
    pending = [
        create_experiment_column_func(
            project_id=project_id,
            experiment_id=experiment_id,
            id=create_nano_id(),
            name=column["name"],
            type=column["type"],
            settings={
                "max_length": 255,
                "is_required": True,
            },
        )
        for column in columns
    ]
    return await asyncio.gather(*pending)

In [None]:
import os

# Placeholder token and dev API endpoint used by the demo cells below.
RAGAS_APP_TOKEN = "app-token"
RAGAS_API_BASE_URL = "https://api.dev.app.ragas.io"

# Export both values as environment variables before the Project is built
# (presumably the API client reads them from the environment -- confirm).
os.environ["RAGAS_APP_TOKEN"] = RAGAS_APP_TOKEN
os.environ["RAGAS_API_BASE_URL"] = RAGAS_API_BASE_URL

# Open a project by its ID; the bare `p` on the last line displays its repr.
PROJECT_ID = "919a4d42-aaf2-45cd-badd-152249788bfa"
p = Project(project_id=PROJECT_ID)
p

Project(name='yann-lecun-wisdom')

In [None]:
# Minimal three-field model used by the demo cells, both as the experiment
# model and as the row type of the test dataset.
class TestModel(BaseModel):
    name: str
    description: str
    price: float


In [None]:
# NOTE(review): this ID is not passed to the call below -- confirm whether it is still needed.
experiment_id = "5d7752ab-17bf-46bc-a302-afe04ce1a763"
# Create a new experiment named "test-exp" with TestModel as its model.
exp = p.create_experiment(name="test-exp", model=TestModel)
#exp = p.create_dataset(name="just name and desc 2", model=TestModel)

exp

Experiment(name=test-exp, model=TestModel)

In [None]:
# | export
@patch
def get_experiment_by_id(self: Project, experiment_id: str, model: t.Type[BaseModel]) -> Experiment:
    """Look up an existing experiment by its ID.

    Args:
        experiment_id: ID of the experiment to fetch
        model: Model class defining the experiment structure

    Returns:
        Experiment: An experiment object bound to the fetched experiment
    """
    # Only the name is taken from the fetched info; the ID is the one passed in.
    fetch_experiment = async_to_sync(self._ragas_api_client.get_experiment)
    fetched_info = fetch_experiment(
        project_id=self.project_id,
        experiment_id=experiment_id,
    )
    experiment_name = fetched_info["name"]

    return Experiment(
        name=experiment_name,
        model=model,
        project_id=self.project_id,
        experiment_id=experiment_id,
        ragas_api_client=self._ragas_api_client,
    )

In [None]:
# Display the ID of the experiment created above.
exp.experiment_id

'effe0e10-916d-4530-b974-91d5115f5dc2'

In [None]:
# Fetch the same experiment again, this time by its ID.
p.get_experiment_by_id(exp.experiment_id, TestModel)

Experiment(name=test-exp, model=TestModel)

In [None]:
# | export
@patch
def get_experiment(self: Project, dataset_name: str, model) -> Experiment:
    """Get an existing experiment by name.

    Args:
        dataset_name: Name of the experiment to look up. The parameter keeps
            its historical name so existing keyword callers continue to work.
        model: Model class defining the experiment structure

    Returns:
        Experiment: An experiment object bound to the experiment that was found
    """
    # Search for the experiment with the given name
    sync_version = async_to_sync(self._ragas_api_client.get_experiment_by_name)
    exp_info = sync_version(
        project_id=self.project_id,
        experiment_name=dataset_name
    )

    # Return an Experiment instance built from the looked-up name and ID
    return Experiment(
        name=exp_info["name"],
        model=model,
        project_id=self.project_id,
        experiment_id=exp_info["id"],
        ragas_api_client=self._ragas_api_client,
    )

In [None]:
# Fetch the experiment by its name instead of its ID.
p.get_experiment("test-exp", TestModel)

Experiment(name=test-exp, model=TestModel)

## Experiment Wrapper

In [None]:
# | export
@t.runtime_checkable
class ExperimentProtocol(t.Protocol):
    """Structural type of a function decorated as an experiment.

    Such an object is an async callable that also exposes a ``run_async``
    coroutine for running it over a dataset.
    """

    async def __call__(self, *args, **kwargs): ...

    # Matches the `run_async(dataset, name=None)` functions that the experiment
    # decorators in this module attach: the dataset comes first, the name is optional.
    async def run_async(self, dataset: Dataset, name: t.Optional[str] = None): ...

In [None]:
# | export

# TODO: clean this up -- importing here makes langfuse an import-time dependency of this module
from langfuse.decorators import observe

In [None]:
# | export
from ragas_experimental.project.naming import MemorableNames

In [None]:
# | export
# Module-level name generator; `run_async` uses it to name an experiment run
# when the caller does not pass a name.
memorable_names = MemorableNames()

In [None]:
# | export
@patch
def experiment(
    self: Project, experiment_model, name_prefix: str = ""
):
    """Decorator factory that turns an async function into a runnable experiment.

    The decorated function stays directly awaitable and additionally gains a
    ``run_async(dataset, name=None)`` coroutine. No Langfuse observation is
    applied by this decorator.

    Args:
        experiment_model: The model type to use for experiment results
        name_prefix: Optional prefix for experiment names

    Returns:
        Decorator function that wraps experiment functions
    """

    def decorator(func: t.Callable) -> ExperimentProtocol:
        @wraps(func)
        async def wrapped_experiment(*args, **kwargs):
            # Plain pass-through to the user's function.
            return await func(*args, **kwargs)

        async def run_async(dataset: Dataset, name: t.Optional[str] = None):
            """Run the experiment on every dataset item and store the results."""
            # Fall back to a generated memorable name, then apply the prefix.
            run_name = name if name is not None else memorable_names.generate_unique_name()
            if name_prefix:
                run_name = f"{name_prefix}-{run_name}"

            # One coroutine per dataset item.
            pending = [wrapped_experiment(item) for item in dataset]

            # Collect results in completion order, showing progress with tqdm;
            # items for which the function returned None are dropped.
            collected = []
            for finished in tqdm(asyncio.as_completed(pending), total=len(pending)):
                outcome = await finished
                if outcome is not None:
                    collected.append(outcome)

            # The experiment is only created once every item has finished.
            experiment_view = self.create_experiment(name=run_name, model=experiment_model)
            for outcome in collected:
                experiment_view.append(outcome)

            return experiment_view

        # Expose the runner as an attribute of the wrapped function.
        setattr(wrapped_experiment, "run_async", run_async)
        return t.cast(ExperimentProtocol, wrapped_experiment)

    return decorator

In [None]:
# Create a small test dataset (three TestModel rows) to run the experiment on
test_dataset = p.create_dataset(name="test dataset for experiment", model=TestModel)
test_dataset.append(TestModel(name="test item 1", description="test item 1 description", price=100))
test_dataset.append(TestModel(name="test item 2", description="test item 2 description", price=200))
test_dataset.append(TestModel(name="test item 3", description="test item 3 description", price=300))

In [None]:
# Experiment result model: the TestModel fields plus the experiment's
# response and a "yes"/"no" correctness label.
class TextExperimentModel(TestModel):
    response: str
    is_correct: t.Literal["yes", "no"]

# Test experiment function, registered on project `p` with TextExperimentModel
# as the result model. It prints each item and returns a result that copies
# the item's fields and adds a fixed response and label.
@p.experiment(TextExperimentModel)
async def test_experiment(item: TestModel):
    print(item)
    return TextExperimentModel(**item.model_dump(), response="test response", is_correct="yes")


In [None]:
# Run the experiment over the test dataset; no name is passed, so a
# memorable name is generated for the run.
await test_experiment.run_async(test_dataset)

100%|██████████| 3/3 [00:00<00:00, 7752.87it/s]


name='test item 2' description='test item 2 description' price=200.0
name='test item 1' description='test item 1 description' price=100.0
name='test item 3' description='test item 3 description' price=300.0


Experiment(name=keen_backus, model=TextExperimentModel)

In [None]:
# | export
@patch
def langfuse_experiment(
    self: Project, experiment_model, name_prefix: str = ""
):
    """Decorator for creating experiment functions with Langfuse integration.

    Works like ``Project.experiment``, except that every invocation of the
    decorated function -- whether called directly or through ``run_async`` --
    goes through Langfuse's ``observe`` decorator.

    Args:
        experiment_model: The model type to use for experiment results
        name_prefix: Optional prefix for experiment names; when non-empty it
            is also prepended to the Langfuse observation name

    Returns:
        Decorator function that wraps experiment functions with Langfuse observation
    """

    def decorator(func: t.Callable) -> ExperimentProtocol:
        # Only prepend the prefix when there is one, so an empty prefix does
        # not yield a name with a leading "-".
        observation_name = (
            f"{name_prefix}-{func.__name__}" if name_prefix else func.__name__
        )

        # Build the observed function once rather than on every call.
        observed_func = observe(name=observation_name)(func)

        @wraps(func)
        async def wrapped_with_langfuse(*args, **kwargs):
            return await observed_func(*args, **kwargs)

        # Pass the observed wrapper into the base decorator so its `run_async`
        # really calls it. Assigning `__wrapped__` on the base experiment
        # afterwards has no effect: `run_async` calls the inner wrapper through
        # its closure, and that wrapper calls the undecorated function.
        return self.experiment(experiment_model, name_prefix)(wrapped_with_langfuse)

    return decorator

In [None]:

import logging
from ragas_experimental.utils import plot_experiments_as_subplots

@patch
def compare_and_plot(self: Project, experiment_ids: t.List[str], model: t.Type[BaseModel], metric_names: t.List[str]):
    """Compare multiple experiments and generate a plot.

    For each experiment, the values of every requested metric are collected
    row by row. A row lacking a metric contributes ``None`` and a warning is
    logged. The collected values are passed to
    ``plot_experiments_as_subplots`` and the resulting figure is shown.

    Args:
        experiment_ids: List of experiment IDs to compare
        model: Model class defining the experiment structure
        metric_names: Names of the row attributes to collect for each experiment
    """
    results = {}
    for experiment_id in tqdm(experiment_ids, desc="Fetching experiments"):
        # The inputs are IDs, so use the by-ID lookup; `get_experiment`
        # searches by name.
        experiment = self.get_experiment_by_id(experiment_id, model)
        experiment.load()
        experiment_metrics = results[experiment_id] = {}
        for row in experiment:
            for metric in metric_names:
                # Metric lists are created lazily, on the first row seen.
                values = experiment_metrics.setdefault(metric, [])
                if hasattr(row, metric):
                    values.append(getattr(row, metric))
                else:
                    # Record the gap under this experiment's metric; the
                    # previous code indexed `results[metric]`, which raised
                    # KeyError.
                    values.append(None)
                    logging.warning("Metric %s not found in row: %s", metric, row)

    fig = plot_experiments_as_subplots(results, experiment_ids=experiment_ids)
    fig.show()
        
        
        
        
    

In [None]:
from ragas_experimental import BaseModel

# Row model for the demo: a question, a list of citation strings and
# free-text grading notes.
class TestDataset(BaseModel):
    question: str
    citations: list[str]
    grading_notes: str
    

# Experiment result model for the demo: the TestDataset fields plus the
# response, a score and the reason for that score (all strings).
class ExperimentModel(TestDataset):
    response: str
    score: str
    score_reason: str


In [None]:
# Compare two experiments (given by ID) on their "score" field and show the plot.
p.compare_and_plot(
    experiment_ids=["21e2eae0-d21f-465a-adf8-b8026db33d54","6dcf6d24-6982-4098-b430-5976bd8da237"],
    model=ExperimentModel,
    metric_names=["score"]
)

Fetching experiments: 100%|██████████| 2/2 [00:03<00:00,  1.85s/it]
