In [None]:
import pandas as pd
import numpy as np

from typing import Optional
from typing import List

from evidently import Dataset
from evidently import DataDefinition
from evidently import Recsys
from evidently import Report

from evidently.tests import lte, gte, lt, gt, is_in, not_in, eq, not_eq
from evidently.tests import Reference

# Import all recsys metrics
from evidently.metrics import PrecisionTopK
from evidently.metrics import RecallTopK
from evidently.metrics import FBetaTopK
from evidently.metrics import MAP
from evidently.metrics import MRR
from evidently.metrics import NDCG
from evidently.metrics import HitRate
from evidently.metrics import ScoreDistribution
from evidently.metrics import Personalization
from evidently.metrics import Diversity
from evidently.metrics import RecCasesTable
from evidently.metrics import ItemBias
from evidently.metrics import UserBias
from evidently.metrics import PopularityBiasMetric
from evidently.metrics import Serendipity
from evidently.metrics import Novelty

# Import recsys preset
from evidently.presets import RecsysPreset


# Recommendation Systems Metrics

This notebook demonstrates how to use Evidently's recommendation systems metrics to evaluate the performance of recommendation models.

## Overview

Evidently provides comprehensive metrics for evaluating recommendation systems:

### Top-K Metrics (DataframeValue)
- **Precision@K**: Precision at different K values
- **Recall@K**: Recall at different K values  
- **F-Beta@K**: F-Beta score at different K values
- **MAP@K**: Mean Average Precision at different K values
- **MRR@K**: Mean Reciprocal Rank at different K values
- **NDCG@K**: Normalized Discounted Cumulative Gain at different K values
- **Hit Rate@K**: Hit rate at different K values

### Additional Metrics
- **Score Distribution**: Distribution of recommendation scores
- **Personalization**: How diverse recommendations are across users
- **Diversity**: How diverse recommendations are within each user's list
- **Item Bias**: Analysis of item popularity bias
- **User Bias**: Analysis of user-specific bias
- **RecCasesTable**: Detailed recommendation cases for analysis

### Preset
- **RecsysPreset**: Complete set of recommendation metrics


## Sample Data

Let's create sample recommendation data to demonstrate the metrics. We'll simulate a movie recommendation system with:
- Users and items (movies)
- Recommendation scores/predictions
- User interactions (ratings)
- Item features (genres)
- User/item bias features

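**Note**: The target ratings are included directly in the main dataset by merging recommendations with interactions, rather than being passed separately. Later in this notebook, `additional_data` is used only to supply training data to the bias, novelty, and serendipity metrics.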


In [None]:
# Build a synthetic movie-recommendation dataset.
# The global NumPy RNG is seeded so every run yields the same frames.
np.random.seed(42)

# Catalogue sizes
n_users = 100
n_items = 50
n_interactions = 1000

# Identifier pools
user_ids = [f"user_{idx}" for idx in range(n_users)]
item_ids = [f"movie_{idx}" for idx in range(n_items)]

# Observed interactions: random (user, item) pairs with a 1-5 rating whose
# weights favour 4s and 5s. Dict values are evaluated left to right, so the
# RNG is consumed in user -> item -> rating order for every row.
interactions = [
    {
        'user_id': np.random.choice(user_ids),
        'item_id': np.random.choice(item_ids),
        'rating': np.random.choice([1, 2, 3, 4, 5], p=[0.1, 0.1, 0.2, 0.3, 0.3]),
    }
    for _ in range(n_interactions)
]
interactions_df = pd.DataFrame(interactions)

# Model output: one uniformly random score per (user, item) pair, so every
# user is scored against the full catalogue (higher = ranked earlier).
recommendations = [
    {
        'user_id': user_id,
        'item_id': item_id,
        'prediction': np.random.uniform(0, 1),
    }
    for user_id in user_ids
    for item_id in item_ids
]
recommendations_df = pd.DataFrame(recommendations)

# Summary of what was generated
print(f"Interactions: {len(interactions_df)}")
print(f"Recommendations: {len(recommendations_df)}")
print(f"Users: {len(user_ids)}")
print(f"Items: {len(item_ids)}")

# Peek at the first rows of each frame
print("\nSample interactions:")
print(interactions_df.head())
print("\nSample recommendations:")
print(recommendations_df.head())


In [None]:
# Numerical item features for the diversity-style metrics, plus categorical
# user/item attributes for the bias metrics.
# Each item gets three structured features derived from its position in
# `item_ids`, then a little random jitter is added on top.

item_features = []
for position, item_id in enumerate(item_ids):
    # Deterministic base values
    base_genre = position % 6                          # six genre clusters, 0-5
    base_decade = (1990 + (position % 4) * 10) % 100   # 90, 0, 10, 20
    base_tier = 1.0 + (position % 5) * 0.8             # 1.0, 1.8, 2.6, 3.4, 4.2

    # Jitter is drawn in a fixed order: genre, decade, tier
    genre_jitter = np.random.normal(0, 0.1)
    decade_jitter = np.random.randint(-2, 3)
    tier_jitter = np.random.normal(0, 0.1)

    item_features.append({
        'item_id': item_id,
        'genre_cluster': base_genre + genre_jitter,
        'release_decade': base_decade + decade_jitter,
        'rating_tier': base_tier + tier_jitter,
    })

item_features_df = pd.DataFrame(item_features)

# Categorical attributes used by the bias metrics: a random age group for
# every user, then a random popularity bucket for every item.
user_bias_features = [
    {
        'user_id': user_id,
        'age_group': np.random.choice(['18-25', '26-35', '36-45', '46-55', '55+']),
    }
    for user_id in user_ids
]

item_bias_features = [
    {
        'item_id': item_id,
        'popularity': np.random.choice(['Low', 'Medium', 'High']),
    }
    for item_id in item_ids
]

user_bias_df = pd.DataFrame(user_bias_features)
item_bias_df = pd.DataFrame(item_bias_features)

# Report what was generated
genre_col = item_features_df['genre_cluster']
decade_col = item_features_df['release_decade']
tier_col = item_features_df['rating_tier']

print("Item features sample (numerical for diversity):")
print(item_features_df.head())
print(f"\nFeature ranges:")
print(f"Genre cluster: {genre_col.min():.2f} - {genre_col.max():.2f}")
print(f"Release decade: {decade_col.min()} - {decade_col.max()}")
print(f"Rating tier: {tier_col.min():.2f} - {tier_col.max():.2f}")
print("\nUser bias features sample:")
print(user_bias_df.head())
print("\nItem bias features sample:")
print(item_bias_df.head())


In [None]:
# Split the data into a "current" and a "reference" period.
# Interactions are halved by row position: first half -> current,
# remainder -> reference.
split_point = len(interactions_df) // 2
current_interactions = interactions_df.head(split_point).copy()
reference_interactions = interactions_df.iloc[split_point:].copy()

# Both periods start from the same recommendation scores ...
current_recommendations = recommendations_df.copy()
reference_recommendations = recommendations_df.copy()

# ... and the reference scores are then perturbed with Gaussian noise
# (sigma 0.1) and clipped back into [0, 1] to imitate a changed model.
score_noise = np.random.normal(0, 0.1, len(reference_recommendations))
reference_recommendations['prediction'] = np.clip(
    reference_recommendations['prediction'] + score_noise, 0, 1
)

# Row counts per period
print(f"Current interactions: {len(current_interactions)}")
print(f"Reference interactions: {len(reference_interactions)}")
print(f"Current recommendations: {len(current_recommendations)}")
print(f"Reference recommendations: {len(reference_recommendations)}")

# Peek at the first rows of each interaction frame
print("\nCurrent interactions sample:")
print(current_interactions.head())
print("\nReference interactions sample:")
print(reference_interactions.head())


In [None]:
# Create the evaluation datasets with the target ratings included in the
# main data: every recommended (user, item) pair is labelled with the rating
# observed in the matching period.


def _attach_ratings(recs, interactions):
    """Return a copy of `recs` with a `rating` column taken from `interactions`.

    The interaction log is sampled at random, so the same (user, item) pair
    can occur more than once. A plain left join would then emit one
    recommendation row per duplicate and distort the ranking. Duplicates are
    therefore collapsed first, keeping the last rating recorded for a pair,
    and `validate='many_to_one'` makes pandas raise if the right-hand side
    is still not unique. Pairs without any interaction get rating 0.
    """
    ratings = interactions[['user_id', 'item_id', 'rating']].drop_duplicates(
        subset=['user_id', 'item_id'], keep='last'
    )
    labelled = recs.merge(
        ratings,
        on=['user_id', 'item_id'],
        how='left',
        validate='many_to_one',
    )
    # Fill missing ratings with 0 (no interaction)
    labelled['rating'] = labelled['rating'].fillna(0)
    return labelled


current_data = _attach_ratings(current_recommendations, current_interactions)
reference_data = _attach_ratings(reference_recommendations, reference_interactions)

# Create data definition
data_definition = DataDefinition(
    numerical_columns=["rating", "prediction"],
    categorical_columns=["user_id", "item_id"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

# Create current dataset
current_dataset = Dataset.from_pandas(
    current_data,
    data_definition=data_definition
)

# Create reference dataset
reference_dataset = Dataset.from_pandas(
    reference_data,
    data_definition=data_definition
)

print("Datasets created successfully!")
print(f"Current dataset shape: {current_dataset.as_dataframe().shape}")
print(f"Reference dataset shape: {reference_dataset.as_dataframe().shape}")
# Ratings were filled with 0 above, so "has a rating" means rating > 0
# (real ratings are 1-5); counting non-null values would always report
# every row.
print(f"Current dataset with ratings: {(current_data['rating'] > 0).sum()} out of {len(current_data)}")
print(f"Reference dataset with ratings: {(reference_data['rating'] > 0).sum()} out of {len(reference_data)}")


## Individual Metrics

Let's explore each metric individually to understand their outputs and use cases.


### Top-K Metrics

Each of these metrics returns a `DataframeValue` with `rank` and `value` columns, showing performance at different values of K.


In [None]:
# Precision@K - measures accuracy of recommendations.
# Evaluated at k=10; ratings >= 3 are treated as relevant (min_rel_score=3).
precision_report = Report([PrecisionTopK(k=10, min_rel_score=3)])

# Run on the current data with the reference data for comparison; the bare
# name on the last line makes the notebook render the snapshot.
precision_snapshot = precision_report.run(current_dataset, reference_dataset)
precision_snapshot


In [None]:
# Recall@K - measures coverage of relevant items.
# Evaluated at k=10; ratings >= 3 are treated as relevant (min_rel_score=3).
recall_report = Report([RecallTopK(k=10, min_rel_score=3)])

# Run on the current data with the reference data for comparison, then
# display the snapshot.
recall_snapshot = recall_report.run(current_dataset, reference_dataset)
recall_snapshot


In [None]:
# F-Beta@K - harmonic mean of precision and recall.
# beta=1.0 gives the F1 score; k=10 and min_rel_score=3 as in the other
# top-K cells.
fbeta_report = Report([FBetaTopK(k=10, min_rel_score=3, beta=1.0)])

# Run on the current data with the reference data for comparison, then
# display the snapshot.
fbeta_snapshot = fbeta_report.run(current_dataset, reference_dataset)
fbeta_snapshot


In [None]:
# MAP@K - Mean Average Precision.
# Evaluated at k=10 with min_rel_score=3 as the relevance threshold.
map_report = Report([MAP(k=10, min_rel_score=3)])

# Run on the current data with the reference data for comparison, then
# display the snapshot.
map_snapshot = map_report.run(current_dataset, reference_dataset)
map_snapshot


In [None]:
# MRR@K - Mean Reciprocal Rank.
# Evaluated at k=10 with min_rel_score=3 as the relevance threshold.
mrr_report = Report([MRR(k=10, min_rel_score=3)])

# Run on the current data with the reference data for comparison, then
# display the snapshot.
mrr_snapshot = mrr_report.run(current_dataset, reference_dataset)
mrr_snapshot


In [None]:
# NDCG@K - Normalized Discounted Cumulative Gain.
# Evaluated at k=10 with min_rel_score=3 as the relevance threshold.
ndcg_report = Report([NDCG(k=10, min_rel_score=3)])

# Run on the current data with the reference data for comparison, then
# display the snapshot.
ndcg_snapshot = ndcg_report.run(current_dataset, reference_dataset)
ndcg_snapshot


In [None]:
# Hit Rate@K - fraction of users with at least one relevant recommendation.
# Evaluated at k=10 with min_rel_score=3 as the relevance threshold.
hitrate_report = Report([HitRate(k=10, min_rel_score=3)])

# Run on the current data with the reference data for comparison, then
# display the snapshot.
hitrate_snapshot = hitrate_report.run(current_dataset, reference_dataset)
hitrate_snapshot


### Additional Metrics


In [None]:
# Score Distribution - distribution of recommendation scores (k=10).
score_dist_report = Report([ScoreDistribution(k=10)])

# Run on the current data with the reference data for comparison, then
# display the snapshot.
score_dist_snapshot = score_dist_report.run(current_dataset, reference_dataset)
score_dist_snapshot


In [None]:
# Personalization - how diverse recommendations are across users (k=10).
personalization_report = Report([Personalization(k=10)])

# Run on the current data with the reference data for comparison, then
# display the snapshot.
personalization_snapshot = personalization_report.run(current_dataset, reference_dataset)
personalization_snapshot


In [None]:
# Diversity - how diverse recommendations are within each user's list.
# Note: This requires numerical item features for distance calculations
# (the Diversity metric uses cosine distance between item features).

# Numerical item features that are joined onto the data and handed to the metric
diversity_features = ["genre_cluster", "release_decade", "rating_tier"]

# Attach the item features to the rated recommendation frames of both
# periods. `merge` returns a new frame, so the source frames stay untouched.
diversity_current_data = current_data.merge(item_features_df, on='item_id', how='left')
diversity_reference_data = reference_data.merge(item_features_df, on='item_id', how='left')

# Any feature value missing after the join is replaced by the median of the
# CURRENT period, in both frames.
for feature in diversity_features:
    fill_value = diversity_current_data[feature].median()
    for frame in (diversity_current_data, diversity_reference_data):
        frame[feature] = frame[feature].fillna(fill_value)

# Data definition: the base ranking columns plus the numerical item features
diversity_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction", "genre_cluster", "release_decade", "rating_tier"],
    categorical_columns=["user_id", "item_id"],
    ranking=[
        Recsys(
            user_id="user_id",
            item_id="item_id",
            prediction="prediction",
            target="rating",
        )
    ],
)

# Wrap both periods as Evidently datasets
diversity_current_dataset = Dataset.from_pandas(
    diversity_current_data, data_definition=diversity_data_definition
)
diversity_reference_dataset = Dataset.from_pandas(
    diversity_reference_data, data_definition=diversity_data_definition
)

# Diversity at k=10 over the numerical features
diversity_report = Report([Diversity(k=10, item_features=diversity_features)])

diversity_snapshot = diversity_report.run(diversity_current_dataset, diversity_reference_dataset)
diversity_snapshot


In [None]:
# RecCasesTable - detailed recommendation cases for analysis.
# The table is restricted to three users and shows the numerical item
# features next to each recommendation.
rec_cases_table = RecCasesTable(
    user_ids=["user_0", "user_1", "user_2"],
    display_features=["genre_cluster", "release_decade", "rating_tier"],
)
rec_cases_report = Report([rec_cases_table])

# The feature-enriched datasets are used because they carry the columns
# listed in display_features.
rec_cases_snapshot = rec_cases_report.run(diversity_current_dataset, diversity_reference_dataset)
rec_cases_snapshot


## Additional Recommendation Metrics

The following metrics provide additional insights into recommendation system performance:

### Bias Metrics (DataframeValue)
- **ItemBias**: Analyzes bias in item recommendations
- **UserBias**: Analyzes bias in user recommendations  
- **PopularityBiasMetric**: Measures popularity bias in recommendations

### Novelty and Serendipity Metrics (SingleValue)
- **Novelty**: Measures how novel the recommendations are
- **Serendipity**: Measures how surprising the recommendations are

These metrics help identify potential biases and assess the diversity and surprise factor of recommendations.


In [None]:
# ItemBias - analyzes bias in item recommendations
# Note: This requires training data and item bias columns
# The training data should contain historical item features, not current recommendations

# Numeric encoding of the categorical popularity buckets
popularity_mapping = {'Low': 1, 'Medium': 2, 'High': 3}

# Create training data with item features (this represents historical data used to train the model).
# `item_bias_df` already carries the real `item_id` values ("movie_N"); they
# are kept as-is so the training IDs match the IDs in the evaluated data.
# (Overwriting them with the integer row index would silently break that link.)
item_bias_train_data = item_bias_df.copy()
item_bias_train_data['user_id'] = 'train_user'  # Dummy user_id for training data
item_bias_train_data['prediction'] = 0.5  # Dummy prediction for training data
item_bias_train_data['rating'] = 3.0  # Dummy rating for training data

# Convert categorical popularity to numeric; anything unmapped falls back to the median
item_bias_train_data['popularity'] = item_bias_train_data['popularity'].map(popularity_mapping)
item_bias_train_data['popularity'] = item_bias_train_data['popularity'].fillna(item_bias_train_data['popularity'].median())

# Create training data definition
item_bias_train_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction", "popularity"],
    categorical_columns=["user_id", "item_id"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

# Create training dataset
item_bias_train_dataset = Dataset.from_pandas(
    item_bias_train_data,
    data_definition=item_bias_train_data_definition
)

# Create current/reference data with the item popularity attached
item_bias_current_data = current_data.copy()
item_bias_current_data = item_bias_current_data.merge(item_bias_df, on='item_id', how='left')

item_bias_reference_data = reference_data.copy()
item_bias_reference_data = item_bias_reference_data.merge(item_bias_df, on='item_id', how='left')

# Convert categorical popularity to numeric for current and reference data
item_bias_current_data['popularity'] = item_bias_current_data['popularity'].map(popularity_mapping)
item_bias_reference_data['popularity'] = item_bias_reference_data['popularity'].map(popularity_mapping)

# Fill missing popularity values with each frame's own median
item_bias_current_data['popularity'] = item_bias_current_data['popularity'].fillna(item_bias_current_data['popularity'].median())
item_bias_reference_data['popularity'] = item_bias_reference_data['popularity'].fillna(item_bias_reference_data['popularity'].median())

# Create data definition with item bias columns
item_bias_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction", "popularity"],
    categorical_columns=["user_id", "item_id"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

# Create datasets
item_bias_current_dataset = Dataset.from_pandas(
    item_bias_current_data,
    data_definition=item_bias_data_definition
)

item_bias_reference_dataset = Dataset.from_pandas(
    item_bias_reference_data,
    data_definition=item_bias_data_definition
)

# ItemBias metric - note: uses column_name, not item_bias_columns
item_bias_report = Report([
    ItemBias(k=10, column_name="popularity", distribution="default")
])

item_bias_snapshot = item_bias_report.run(
    item_bias_current_dataset,
    item_bias_reference_dataset,
    additional_data={
        "current_train_data": item_bias_train_dataset,  # Historical training data
        "reference_train_data": item_bias_train_dataset  # Same training data for reference
    }
)
item_bias_snapshot


In [None]:
# UserBias - analyzes bias in user recommendations
# Note: This requires training data and user bias columns
# The training data should contain historical user features, not current recommendations

# Create training data with user features (this represents historical data used to train the model).
# `user_bias_df` already carries the real `user_id` values ("user_N"); they
# are kept as-is so the training IDs match the IDs in the evaluated data.
# (Overwriting them with the integer row index would silently break that link.)
user_bias_train_data = user_bias_df.copy()
user_bias_train_data['item_id'] = 'train_item'  # Dummy item_id for training data
user_bias_train_data['prediction'] = 0.5  # Dummy prediction for training data
user_bias_train_data['rating'] = 3.0  # Dummy rating for training data

# Fill missing age_group values
user_bias_train_data['age_group'] = user_bias_train_data['age_group'].fillna('Unknown')

# Create training data definition
user_bias_train_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction"],
    categorical_columns=["user_id", "item_id", "age_group"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

# Create training dataset
user_bias_train_dataset = Dataset.from_pandas(
    user_bias_train_data,
    data_definition=user_bias_train_data_definition
)

# Create current/reference data with the user age group attached
user_bias_current_data = current_data.copy()
user_bias_current_data = user_bias_current_data.merge(user_bias_df, on='user_id', how='left')

user_bias_reference_data = reference_data.copy()
user_bias_reference_data = user_bias_reference_data.merge(user_bias_df, on='user_id', how='left')

# Fill missing age_group values
user_bias_current_data['age_group'] = user_bias_current_data['age_group'].fillna('Unknown')
user_bias_reference_data['age_group'] = user_bias_reference_data['age_group'].fillna('Unknown')

# Create data definition with user bias columns
user_bias_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction"],
    categorical_columns=["user_id", "item_id", "age_group"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

# Create datasets
user_bias_current_dataset = Dataset.from_pandas(
    user_bias_current_data,
    data_definition=user_bias_data_definition
)

user_bias_reference_dataset = Dataset.from_pandas(
    user_bias_reference_data,
    data_definition=user_bias_data_definition
)

# UserBias metric - note: uses column_name, not user_bias_columns
user_bias_report = Report([
    UserBias(column_name="age_group", distribution="default")
])

user_bias_snapshot = user_bias_report.run(
    user_bias_current_dataset,
    user_bias_reference_dataset,
    additional_data={
        "current_train_data": user_bias_train_dataset,  # Historical training data
        "reference_train_data": user_bias_train_dataset  # Same training data for reference
    }
)
user_bias_snapshot


In [None]:
# PopularityBiasMetric - measures popularity bias in recommendations
# The training interactions are sampled so that popular items really do
# appear more often than niche ones.

np.random.seed(42)  # For reproducibility
n_items = len(current_data['item_id'].unique())
n_users = len(current_data['user_id'].unique())

# Popularity weight per item, drawn from a Pareto distribution so that a few
# items are much more popular than the rest.
item_popularity = {
    item_id: np.random.pareto(1.5) + 1
    for item_id in current_data['item_id'].unique()
}

# Turn the weights into sampling probabilities. Without `p=` the training
# items would be drawn uniformly and the popularity weights would have no
# effect on the training interactions.
catalog_items = list(item_popularity)
sampling_weights = np.array([item_popularity[item_id] for item_id in catalog_items])
sampling_weights = sampling_weights / sampling_weights.sum()

# Create training data following this popularity distribution
popularity_train_data = []
for user_id in current_data['user_id'].unique():
    # Each user interacts with a different number of items (some users more active)
    n_interactions = np.random.poisson(5) + 1  # Average 6 interactions per user
    user_items = np.random.choice(
        catalog_items,
        size=min(n_interactions, n_items),  # never ask for more items than exist
        replace=False,
        p=sampling_weights,
    )
    for item_id in user_items:
        popularity_train_data.append({
            'user_id': user_id,
            'item_id': item_id,
            'prediction': 0.5,  # Dummy prediction
            'rating': 3.0,  # Dummy rating
            'popularity': item_popularity[item_id]
        })

popularity_train_df = pd.DataFrame(popularity_train_data)

# Create training dataset
popularity_train_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction", "popularity"],
    categorical_columns=["user_id", "item_id"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

popularity_train_dataset = Dataset.from_pandas(
    popularity_train_df,
    data_definition=popularity_train_data_definition
)

# Create current data with popularity features
popularity_current_data = current_data.copy()
popularity_current_data['popularity'] = popularity_current_data['item_id'].map(item_popularity)
popularity_current_data['popularity'] = popularity_current_data['popularity'].fillna(
    popularity_current_data['popularity'].median()
)

popularity_reference_data = reference_data.copy()
popularity_reference_data['popularity'] = popularity_reference_data['item_id'].map(item_popularity)
popularity_reference_data['popularity'] = popularity_reference_data['popularity'].fillna(
    popularity_reference_data['popularity'].median()
)

# Create data definition with popularity column
popularity_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction", "popularity"],
    categorical_columns=["user_id", "item_id"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

popularity_current_dataset = Dataset.from_pandas(
    popularity_current_data,
    data_definition=popularity_data_definition
)

popularity_reference_dataset = Dataset.from_pandas(
    popularity_reference_data,
    data_definition=popularity_data_definition
)

# PopularityBiasMetric with realistic data
popularity_bias_report = Report([
    PopularityBiasMetric(k=10, normalize_arp=True)
])

popularity_bias_snapshot = popularity_bias_report.run(
    popularity_current_dataset,
    popularity_reference_dataset,
    additional_data={
        "current_train_data": popularity_train_dataset,  # Training data with realistic popularity
        "reference_train_data": popularity_train_dataset
    }
)
popularity_bias_snapshot


In [None]:
# Novelty - measures how novel the recommendations are
# Note: This requires training data with historical item interactions.
# The training interactions are sampled so that popular items really do
# appear more often than niche ones.

np.random.seed(42)  # For reproducibility
n_items = len(current_data['item_id'].unique())
n_users = len(current_data['user_id'].unique())

# Historical popularity weight per item, drawn from a Pareto distribution so
# that a few items are much more popular than the rest.
item_popularity = {
    item_id: np.random.pareto(1.5) + 1
    for item_id in current_data['item_id'].unique()
}

# Turn the weights into sampling probabilities. Without `p=` the training
# items would be drawn uniformly and `item_popularity` would be dead code.
catalog_items = list(item_popularity)
sampling_weights = np.array([item_popularity[item_id] for item_id in catalog_items])
sampling_weights = sampling_weights / sampling_weights.sum()

# Create training data following this popularity distribution
novelty_train_data = []
for user_id in current_data['user_id'].unique():
    # Each user interacts with a different number of items (some users more active)
    n_interactions = np.random.poisson(5) + 1  # Average 6 interactions per user
    user_items = np.random.choice(
        catalog_items,
        size=min(n_interactions, n_items),  # never ask for more items than exist
        replace=False,
        p=sampling_weights,
    )
    for item_id in user_items:
        novelty_train_data.append({
            'user_id': user_id,
            'item_id': item_id,
            'prediction': 0.5,  # Dummy prediction
            'rating': 3.0,  # Dummy rating
        })

novelty_train_df = pd.DataFrame(novelty_train_data)

# Create training dataset
novelty_train_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction"],
    categorical_columns=["user_id", "item_id"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

novelty_train_dataset = Dataset.from_pandas(
    novelty_train_df,
    data_definition=novelty_train_data_definition
)

# Novelty metric with proper training data
novelty_report = Report([
    Novelty(k=10)
])

novelty_snapshot = novelty_report.run(
    current_dataset,
    reference_dataset,
    additional_data={
        "current_train_data": novelty_train_dataset,  # Historical training data for novelty calculation
        "reference_train_data": novelty_train_dataset
    }
)
novelty_snapshot


In [None]:
# Serendipity - measures how surprising the recommendations are.
# Note: This requires item features plus training data, which is handed over
# through `additional_data`.
# NOTE(review): the feature-enriched evaluation datasets are reused here as
# the "training" data - confirm whether a separate interaction history
# should be supplied instead.
serendipity_metric = Serendipity(
    k=10,
    item_features=["genre_cluster", "release_decade", "rating_tier"],
)
serendipity_report = Report([serendipity_metric])

serendipity_train_sets = {
    "current_train_data": diversity_current_dataset,
    "reference_train_data": diversity_reference_dataset,
}
serendipity_snapshot = serendipity_report.run(
    diversity_current_dataset,
    diversity_reference_dataset,
    additional_data=serendipity_train_sets,
)
serendipity_snapshot


## Using the RecsysPreset

The RecsysPreset provides a comprehensive set of recommendation metrics in one go. It automatically includes all relevant metrics based on the available data.


In [None]:
# Basic RecsysPreset with minimal configuration:
# cut-off k=10, relevance threshold min_rel_score=3, ranking_name "default".
basic_preset = RecsysPreset(k=10, min_rel_score=3, ranking_name="default")
basic_preset_report = Report([basic_preset])

# Run on the base datasets (no extra feature columns), then display the
# snapshot.
basic_preset_snapshot = basic_preset_report.run(current_dataset, reference_dataset)
basic_preset_snapshot


In [None]:
# Comprehensive RecsysPreset with all features
# Create comprehensive training data that includes all required features:
# item features, numeric item popularity and the user's age group.

# Recreate the per-item popularity weights (Pareto: a few very popular items)
np.random.seed(42)  # For reproducibility
item_popularity = {
    item_id: np.random.pareto(1.5) + 1
    for item_id in current_data['item_id'].unique()
}

# Turn the weights into sampling probabilities. Without `p=` the training
# items would be drawn uniformly and the popularity weights would have no
# effect on the training interactions.
catalog_items = list(item_popularity)
sampling_weights = np.array([item_popularity[item_id] for item_id in catalog_items])
sampling_weights = sampling_weights / sampling_weights.sum()

# Numeric encoding of the categorical popularity buckets (loop-invariant)
popularity_mapping = {'Low': 1, 'Medium': 2, 'High': 3}

# Sample the (user, item) training interactions first ...
comprehensive_train_data = []
for user_id in current_data['user_id'].unique():
    # Each user interacts with a different number of items (some users more active)
    n_interactions = np.random.poisson(5) + 1  # Average 6 interactions per user
    user_items = np.random.choice(
        catalog_items,
        size=min(n_interactions, len(catalog_items)),  # never ask for more items than exist
        replace=False,
        p=sampling_weights,
    )
    comprehensive_train_data.extend(
        {
            'user_id': user_id,
            'item_id': item_id,
            'prediction': 0.5,  # Dummy prediction
            'rating': 3.0,  # Dummy rating
        }
        for item_id in user_items
    )

# ... then attach the features with joins instead of filtering the feature
# frames once per row. `validate='many_to_one'` makes pandas raise if a
# feature frame ever holds more than one row per item/user. Building the rows
# this way also avoids rebinding the notebook-level `item_features` list.
item_bias_numeric = item_bias_df.assign(
    popularity=item_bias_df['popularity'].map(popularity_mapping)
)
comprehensive_train_df = (
    pd.DataFrame(comprehensive_train_data)
    .merge(item_features_df, on='item_id', how='left', validate='many_to_one')
    .merge(item_bias_numeric, on='item_id', how='left', validate='many_to_one')
    .merge(user_bias_df, on='user_id', how='left', validate='many_to_one')
)

# Create comprehensive training dataset
comprehensive_train_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction", "genre_cluster", "release_decade", "rating_tier", "popularity"],
    categorical_columns=["user_id", "item_id", "age_group"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

comprehensive_train_dataset = Dataset.from_pandas(
    comprehensive_train_df,
    data_definition=comprehensive_train_data_definition
)

# Configure the preset with every optional argument spelled out explicitly
full_recsys_preset = RecsysPreset(
    k=10,
    min_rel_score=3,
    ranking_name="default",
    # Specific users for RecCasesTable
    user_ids=["user_0", "user_1", "user_2"],
    # Features to display in RecCasesTable
    display_features=["genre_cluster", "release_decade", "rating_tier"],
    # Item features for diversity metrics
    item_features=["genre_cluster", "release_decade", "rating_tier"],
    # Item bias analysis
    item_bias_columns=["popularity"],
    # User bias analysis
    user_bias_columns=["age_group"],
    # Normalize ARP in popularity bias
    normalize_arp=True,
    # Beta parameter for F-Beta score
    beta=1.0,
)

# The report consists of this single preset
comprehensive_preset_report = Report([full_recsys_preset])

# Build the comprehensive current and reference frames: the diversity frames
# extended with the item-bias and user-bias columns that the comprehensive
# preset is configured to use.

# Numeric encoding for the categorical popularity labels
popularity_mapping = {'Low': 1, 'Medium': 2, 'High': 3}


def _with_bias_features(interactions):
    """Return a copy of *interactions* enriched with bias columns.

    Left-joins ``item_bias_df`` on ``item_id`` and ``user_bias_df`` on
    ``user_id``, replaces the ``popularity`` labels with their numeric codes
    from ``popularity_mapping`` (filling gaps with the median of the mapped
    codes), and fills missing ``age_group`` values with ``'Unknown'``.
    """
    enriched = interactions.copy()
    enriched = enriched.merge(item_bias_df, on='item_id', how='left')
    enriched = enriched.merge(user_bias_df, on='user_id', how='left')

    # Labels absent from the mapping (and rows without a match) become NaN,
    # which is then replaced by the median of the mapped values.
    numeric_popularity = enriched['popularity'].map(popularity_mapping)
    enriched['popularity'] = numeric_popularity.fillna(numeric_popularity.median())

    enriched['age_group'] = enriched['age_group'].fillna('Unknown')
    return enriched


# Same enrichment for both periods
comprehensive_current_data = _with_bias_features(diversity_current_data)
comprehensive_reference_data = _with_bias_features(diversity_reference_data)

# Data definition shared by the comprehensive current and reference datasets
comprehensive_numerical_columns = [
    "rating",
    "prediction",
    "genre_cluster",
    "release_decade",
    "rating_tier",
    "popularity",
]
comprehensive_categorical_columns = ["user_id", "item_id", "age_group"]
comprehensive_ranking = Recsys(
    user_id="user_id",
    item_id="item_id",
    prediction="prediction",
    target="rating",
)

comprehensive_data_definition = DataDefinition(
    numerical_columns=comprehensive_numerical_columns,
    categorical_columns=comprehensive_categorical_columns,
    ranking=[comprehensive_ranking],
)

# Wrap both enriched frames with that one definition (current first, then reference)
comprehensive_current_dataset = Dataset.from_pandas(
    comprehensive_current_data, data_definition=comprehensive_data_definition
)
comprehensive_reference_dataset = Dataset.from_pandas(
    comprehensive_reference_data, data_definition=comprehensive_data_definition
)

# The same comprehensive training dataset is supplied for both the current
# and the reference period.
preset_additional_data = {
    "current_train_data": comprehensive_train_dataset,
    "reference_train_data": comprehensive_train_dataset,
}

# Run the preset report on current vs. reference data
comprehensive_preset_snapshot = comprehensive_preset_report.run(
    comprehensive_current_dataset,
    comprehensive_reference_dataset,
    additional_data=preset_additional_data,
)

# Last expression of the cell: renders the snapshot in the notebook
comprehensive_preset_snapshot

## Metric Results Analysis

Let's examine the structure of metric results to understand how to access the data.


In [None]:
# Access metric results for a Top-K metric.
# Build the metric once so the current and the reference lookups are
# guaranteed to use the same configuration.
precision_metric = PrecisionTopK(k=10, min_rel_score=3)
precision_result = precision_snapshot.context.get_metric_result(precision_metric)

# Look the reference result up exactly once. The lookup is best-effort: if it
# fails, the reference value is reported as None. `except Exception` (instead
# of a bare `except:`) lets KeyboardInterrupt/SystemExit propagate, and the
# failure reason is printed rather than silently discarded.
precision_reference = None
precision_reference_value = None
try:
    precision_reference = precision_snapshot.context.get_reference_metric_result(precision_metric.metric_id)
    precision_reference_value = precision_reference.value
except Exception as exc:
    print(f"Reference lookup failed: {exc!r}")

print("Precision@K Result Structure:")
print(f"Type: {type(precision_result)}")
print(f"Current value: {precision_result.value}")
print(f"Reference value: {precision_reference_value}")

# Access the DataFrame values
print("\nCurrent Precision@K DataFrame:")
print(f"Current value: {precision_result.value}")

# This header is printed unconditionally, so the reference section is
# labelled whether or not a reference result exists.
print("\nReference Precision@K DataFrame:")
print(f"Reference value: {precision_reference_value}")


In [None]:
# Access single value results (like Personalization).
# Build the metric once so the current and the reference lookups are
# guaranteed to use the same configuration.
personalization_metric = Personalization(k=10)
personalization_result = personalization_snapshot.context.get_metric_result(personalization_metric)

# Look the reference result up exactly once. The lookup is best-effort: if it
# fails, the reference value is reported as None. `except Exception` (instead
# of a bare `except:`) lets KeyboardInterrupt/SystemExit propagate, and the
# failure reason is printed rather than silently discarded.
personalization_reference = None
personalization_reference_value = None
try:
    personalization_reference = personalization_snapshot.context.get_reference_metric_result(personalization_metric.metric_id)
    personalization_reference_value = personalization_reference.value
except Exception as exc:
    print(f"Reference lookup failed: {exc!r}")

print("Personalization Result Structure:")
print(f"Type: {type(personalization_result)}")
print(f"Current value: {personalization_result.value}")
print(f"Reference value: {personalization_reference_value}")

# Access the single values (each printed once)
print(f"Current value: {personalization_result.value}")
print(f"Reference value: {personalization_reference_value}")


In [None]:
# Access RecCasesTable results (DataframeValue with user_id column).
# Build the metric once: the configuration literal is long, and repeating it
# for every lookup makes it easy for the copies to drift apart.
rec_cases_metric = RecCasesTable(
    user_ids=["user_0", "user_1", "user_2"],
    display_features=["genre_cluster", "release_decade", "rating_tier"],
)
rec_cases_result = rec_cases_snapshot.context.get_metric_result(rec_cases_metric)

# Look the reference result up exactly once. The lookup is best-effort: if it
# fails, the reference value is reported as None. `except Exception` (instead
# of a bare `except:`) lets KeyboardInterrupt/SystemExit propagate, and the
# failure reason is printed rather than silently discarded.
rec_cases_reference = None
rec_cases_reference_value = None
try:
    rec_cases_reference = rec_cases_snapshot.context.get_reference_metric_result(rec_cases_metric.metric_id)
    rec_cases_reference_value = rec_cases_reference.value
except Exception as exc:
    print(f"Reference lookup failed: {exc!r}")

print("RecCasesTable Result Structure:")
print(f"Type: {type(rec_cases_result)}")
print(f"Current value: {rec_cases_result.value}")
print(f"Reference value: {rec_cases_reference_value}")

# Access the DataFrame values
print("\nCurrent RecCasesTable DataFrame:")
print(f"Current value: {rec_cases_result.value}")

# This header is printed unconditionally, so the reference section is
# labelled whether or not a reference result exists.
print("\nReference RecCasesTable DataFrame:")
print(f"Reference value: {rec_cases_reference_value}")


## Summary

This notebook demonstrated how to use Evidently's recommendation systems metrics:

### Key Features:
1. **Top-K Metrics**: Return DataframeValue with rank and value columns showing performance at different K values
2. **Single Value Metrics**: Return SingleValue for metrics like Personalization, Diversity, etc.
3. **Bias Analysis**: Return DataframeValue with x, y columns for distribution analysis
4. **RecCasesTable**: Return DataframeValue with user_id column for detailed case analysis

### Data Requirements:
- **Basic metrics**: User ID, Item ID, Prediction scores, Target ratings
- **Diversity metrics**: Additional item features (e.g., genres)
- **Bias metrics**: User/item bias features and training data
- **RecCasesTable**: Optional user_ids and display_features

### Preset Usage:
- **RecsysPreset**: Automatically includes relevant metrics based on available data
- **Conditional inclusion**: Metrics are added based on data availability (training data, features, etc.)

### Result Access:
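- Use `snapshot.context.get_metric_result(metric)` to get the current result and `snapshot.context.get_reference_metric_result(...)` to get the reference result, as in the cells above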
- Use `.value` to access the actual data (DataFrame for DataframeValue, number for SingleValue)
- Results can be used for further analysis, visualization, or monitoring
