In [None]:
import pandas as pd
import numpy as np

from typing import Optional
from typing import List

from evidently import Dataset
from evidently import DataDefinition
from evidently import Recsys
from evidently import Report

from evidently.tests import lte, gte, lt, gt, is_in, not_in, eq, not_eq
from evidently.tests import Reference

# Import all recsys metrics
from evidently.metrics import PrecisionTopK
from evidently.metrics import RecallTopK
from evidently.metrics import FBetaTopK
from evidently.metrics import MAP
from evidently.metrics import MRR
from evidently.metrics import NDCG
from evidently.metrics import HitRate
from evidently.metrics import ScoreDistribution
from evidently.metrics import Personalization
from evidently.metrics import Diversity
from evidently.metrics import RecCasesTable

# Import recsys preset
from evidently.presets import RecsysPreset


# Recommendation Systems Metrics

This notebook demonstrates how to use Evidently's recommendation systems metrics to evaluate the performance of recommendation models.

## Overview

Evidently provides comprehensive metrics for evaluating recommendation systems:

### Top-K Metrics (DataframeValue)
- **Precision@K**: Precision at different K values
- **Recall@K**: Recall at different K values  
- **F-Beta@K**: F-Beta score at different K values
- **MAP@K**: Mean Average Precision at different K values
- **MRR@K**: Mean Reciprocal Rank at different K values
- **NDCG@K**: Normalized Discounted Cumulative Gain at different K values
- **Hit Rate@K**: Hit rate at different K values

### Additional Metrics
- **Score Distribution**: Distribution of recommendation scores
- **Personalization**: How diverse recommendations are across users
- **Diversity**: How diverse recommendations are within each user's list
- **Item Bias**: Analysis of item popularity bias
- **User Bias**: Analysis of user-specific bias
- **RecCasesTable**: Detailed recommendation cases for analysis

### Preset
- **RecsysPreset**: Complete set of recommendation metrics


## Sample Data

Let's create sample recommendation data to demonstrate the metrics. We'll simulate a movie recommendation system with:
- Users and items (movies)
- Recommendation scores/predictions
- User interactions (ratings)
- Item features (genres)
- User/item bias features

**Note**: Since the new API doesn't support `additional_data` yet, we need to include the target ratings directly in the main dataset by merging recommendations with interactions.


In [None]:
# Build a synthetic movie-recommendation data set.
# Seeding NumPy's global generator makes every run produce the same data.
np.random.seed(42)

# Size of the simulated world
n_users, n_items, n_interactions = 100, 50, 1000

# Identifiers: user_0 ... user_99 and movie_0 ... movie_49
user_ids = ["user_" + str(idx) for idx in range(n_users)]
item_ids = ["movie_" + str(idx) for idx in range(n_items)]

# Interaction log: one random (user, item, rating) triple per entry.
# The rating weights favour 4 and 5. Dict values are evaluated top to bottom,
# so the draws are made in the order user, item, rating for every entry.
interactions = [
    {
        'user_id': np.random.choice(user_ids),
        'item_id': np.random.choice(item_ids),
        'rating': np.random.choice([1, 2, 3, 4, 5], p=[0.1, 0.1, 0.2, 0.3, 0.3]),
    }
    for _ in range(n_interactions)
]
interactions_df = pd.DataFrame(interactions)

# Recommendation scores: every user gets a uniform random score in [0, 1)
# for every item (users in the outer loop, items in the inner loop).
recommendations = [
    {'user_id': uid, 'item_id': iid, 'prediction': np.random.uniform(0, 1)}
    for uid in user_ids
    for iid in item_ids
]
recommendations_df = pd.DataFrame(recommendations)

# Report the size of everything that was generated
for label, count in (
    ("Interactions", len(interactions_df)),
    ("Recommendations", len(recommendations_df)),
    ("Users", len(user_ids)),
    ("Items", len(item_ids)),
):
    print(f"{label}: {count}")

# Preview the first rows of both tables
for title, frame in (
    ("\nSample interactions:", interactions_df),
    ("\nSample recommendations:", recommendations_df),
):
    print(title)
    print(frame.head())


In [None]:
# Numerical item features for the Diversity metric, which needs numbers to
# compute distances between items. The values follow repeating patterns so
# that items fall into recognisable groups.
# Dict values are evaluated top to bottom, so the noise for each item is drawn
# in the order genre_cluster, release_decade, rating_tier.
item_features = [
    {
        'item_id': movie,
        # Cluster id 0-5 assigned round-robin, plus a little Gaussian noise
        'genre_cluster': pos % 6 + np.random.normal(0, 0.1),
        # Two-digit decade (90, 0, 10, 20) shifted by an integer in [-2, 2]
        'release_decade': (1990 + (pos % 4) * 10) % 100 + np.random.randint(-2, 3),
        # Tier base 1.0, 1.8, 2.6, 3.4 or 4.2, plus a little Gaussian noise
        'rating_tier': 1.0 + (pos % 5) * 0.8 + np.random.normal(0, 0.1),
    }
    for pos, movie in enumerate(item_ids)
]
item_features_df = pd.DataFrame(item_features)

# Categorical attributes intended for bias analysis:
# one random age group per user ...
user_bias_features = [
    {
        'user_id': uid,
        'age_group': np.random.choice(['18-25', '26-35', '36-45', '46-55', '55+']),
    }
    for uid in user_ids
]
# ... and one random popularity level per item.
item_bias_features = [
    {
        'item_id': iid,
        'popularity': np.random.choice(['Low', 'Medium', 'High']),
    }
    for iid in item_ids
]

user_bias_df = pd.DataFrame(user_bias_features)
item_bias_df = pd.DataFrame(item_bias_features)

# Preview the item features and the range each one covers
print("Item features sample (numerical for diversity):")
print(item_features_df.head())
print("\nFeature ranges:")
print(f"Genre cluster: {item_features_df['genre_cluster'].min():.2f} - {item_features_df['genre_cluster'].max():.2f}")
print(f"Release decade: {item_features_df['release_decade'].min()} - {item_features_df['release_decade'].max()}")
print(f"Rating tier: {item_features_df['rating_tier'].min():.2f} - {item_features_df['rating_tier'].max():.2f}")

# Preview the bias tables
for title, frame in (
    ("\nUser bias features sample:", user_bias_df),
    ("\nItem bias features sample:", item_bias_df),
):
    print(title)
    print(frame.head())


In [None]:
# Split the generated data into a "current" and a "reference" period.
# The interaction log is cut in half by row position: the first half is
# current, the second half is reference.
split_point = len(interactions_df) // 2
current_interactions = interactions_df.iloc[:split_point].copy()
reference_interactions = interactions_df.iloc[split_point:].copy()

# Both periods start from the same recommendation scores ...
current_recommendations = recommendations_df.copy()
reference_recommendations = recommendations_df.copy()

# ... then the reference scores get Gaussian noise (std 0.1) to simulate a
# changed model, and are clipped back into the [0, 1] range.
prediction_noise = np.random.normal(0, 0.1, len(reference_recommendations))
reference_recommendations['prediction'] = np.clip(
    reference_recommendations['prediction'] + prediction_noise, 0, 1
)

# Report the size of each part
for label, frame in (
    ("Current interactions", current_interactions),
    ("Reference interactions", reference_interactions),
    ("Current recommendations", current_recommendations),
    ("Reference recommendations", reference_recommendations),
):
    print(f"{label}: {len(frame)}")

# Preview both halves of the interaction log
for title, frame in (
    ("\nCurrent interactions sample:", current_interactions),
    ("\nReference interactions sample:", reference_interactions),
):
    print(title)
    print(frame.head())


In [None]:
# Create datasets with ratings included in the main data.
# Recommendations are left-joined with the interactions of the same period so
# that every recommended (user, item) pair carries its target rating.


def _attach_ratings(recs_df, interactions):
    """Return a copy of `recs_df` with a `rating` column from `interactions`.

    Args:
        recs_df: DataFrame with `user_id`, `item_id` and `prediction` columns.
        interactions: DataFrame with `user_id`, `item_id` and `rating` columns.

    Returns:
        A new DataFrame with exactly one row per row of `recs_df`; pairs
        without any interaction get a rating of 0.
    """
    # The interaction log is sampled at random, so the same (user, item) pair
    # can occur several times. Merging it as-is would duplicate the matching
    # recommendation rows, so only the last rating per pair is kept.
    unique_ratings = interactions[['user_id', 'item_id', 'rating']].drop_duplicates(
        subset=['user_id', 'item_id'], keep='last'
    )
    merged = recs_df.merge(
        unique_ratings,
        on=['user_id', 'item_id'],
        how='left',
        validate='many_to_one',  # fail loudly if the rating keys are not unique
    )
    # Fill missing ratings with 0 (no interaction)
    merged['rating'] = merged['rating'].fillna(0)
    return merged


current_data = _attach_ratings(current_recommendations, current_interactions)
reference_data = _attach_ratings(reference_recommendations, reference_interactions)

# Create data definition
data_definition = DataDefinition(
    numerical_columns=["rating", "prediction"],
    categorical_columns=["user_id", "item_id"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

# Create current dataset
current_dataset = Dataset.from_pandas(
    current_data,
    data_definition=data_definition
)

# Create reference dataset
reference_dataset = Dataset.from_pandas(
    reference_data,
    data_definition=data_definition
)

print("Datasets created successfully!")
print(f"Current dataset shape: {current_dataset.as_dataframe().shape}")
print(f"Reference dataset shape: {reference_dataset.as_dataframe().shape}")
# Missing ratings were replaced by 0 above, so rows with a real rating are the
# ones with a positive value (counting non-null values would count every row).
print(f"Current dataset with ratings: {(current_data['rating'] > 0).sum()} out of {len(current_data)}")
print(f"Reference dataset with ratings: {(reference_data['rating'] > 0).sum()} out of {len(reference_data)}")


## Individual Metrics

Let's explore each metric individually to understand their outputs and use cases.


### Top-K Metrics

These metrics return DataframeValue with rank and value columns, showing performance at different K values.


In [None]:
# Precision@K - measures accuracy of recommendations.
# min_rel_score=3 means ratings >= 3 are treated as relevant.
precision_report = Report([PrecisionTopK(k=10, min_rel_score=3)])
precision_snapshot = precision_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
precision_snapshot


In [None]:
# Recall@K - measures coverage of relevant items.
# Same relevance threshold as above: ratings >= 3 count as relevant.
recall_report = Report([RecallTopK(k=10, min_rel_score=3)])
recall_snapshot = recall_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
recall_snapshot


In [None]:
# F-Beta@K - weighted harmonic mean of precision and recall.
# beta=1.0 weights both equally, which is the F1 score.
fbeta_report = Report([FBetaTopK(k=10, min_rel_score=3, beta=1.0)])
fbeta_snapshot = fbeta_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
fbeta_snapshot


In [None]:
# MAP@K - Mean Average Precision over the top 10 recommendations,
# with ratings >= 3 treated as relevant.
map_report = Report([MAP(k=10, min_rel_score=3)])
map_snapshot = map_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
map_snapshot


In [None]:
# MRR@K - Mean Reciprocal Rank over the top 10 recommendations,
# with ratings >= 3 treated as relevant.
mrr_report = Report([MRR(k=10, min_rel_score=3)])
mrr_snapshot = mrr_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
mrr_snapshot


In [None]:
# NDCG@K - Normalized Discounted Cumulative Gain over the top 10
# recommendations, with ratings >= 3 treated as relevant.
ndcg_report = Report([NDCG(k=10, min_rel_score=3)])
ndcg_snapshot = ndcg_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
ndcg_snapshot


In [None]:
# Hit Rate@K - fraction of users with at least one relevant recommendation
# in their top 10 (ratings >= 3 count as relevant).
hitrate_report = Report([HitRate(k=10, min_rel_score=3)])
hitrate_snapshot = hitrate_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
hitrate_snapshot


### Additional Metrics


In [None]:
# Score Distribution - distribution of recommendation scores (configured with k=10).
score_dist_report = Report([ScoreDistribution(k=10)])
score_dist_snapshot = score_dist_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
score_dist_snapshot


In [None]:
# Personalization - how diverse recommendations are across users (configured with k=10).
personalization_report = Report([Personalization(k=10)])
personalization_snapshot = personalization_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
personalization_snapshot


In [None]:
# Diversity - how diverse recommendations are within each user's list.
# Note: This requires numerical item features for distance calculations.
# The Diversity metric uses cosine distance between item features.

# Attach the numerical item features to the rated recommendation tables of
# both periods. item_features_df holds one row per item, so the left join adds
# the feature columns to every recommendation row.
diversity_current_data = current_data.merge(item_features_df, on='item_id', how='left')
diversity_reference_data = reference_data.merge(item_features_df, on='item_id', how='left')

# Any feature value still missing after the join is replaced by that feature's
# median in the current period; the same value is used for both periods.
for feature in ('genre_cluster', 'release_decade', 'rating_tier'):
    fill_value = diversity_current_data[feature].median()
    for frame in (diversity_current_data, diversity_reference_data):
        frame[feature] = frame[feature].fillna(fill_value)

# Data definition: same ranking setup as before, with the item features
# declared as numerical columns.
diversity_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction", "genre_cluster", "release_decade", "rating_tier"],
    categorical_columns=["user_id", "item_id"],
    ranking=[
        Recsys(user_id="user_id", item_id="item_id", prediction="prediction", target="rating")
    ],
)

# Wrap both periods as Evidently datasets
diversity_current_dataset = Dataset.from_pandas(
    diversity_current_data, data_definition=diversity_data_definition
)
diversity_reference_dataset = Dataset.from_pandas(
    diversity_reference_data, data_definition=diversity_data_definition
)

# Run the Diversity metric on the three numerical features
diversity_report = Report([
    Diversity(k=10, item_features=["genre_cluster", "release_decade", "rating_tier"]),
])
diversity_snapshot = diversity_report.run(diversity_current_dataset, diversity_reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
diversity_snapshot


In [None]:
# RecCasesTable - detailed recommendation cases for analysis.
# Restricted to three users; the numerical item features are shown alongside.
# The feature columns only exist in the diversity datasets, so those are used.
rec_cases_report = Report(
    [
        RecCasesTable(
            user_ids=["user_0", "user_1", "user_2"],
            display_features=["genre_cluster", "release_decade", "rating_tier"],
        ),
    ]
)
rec_cases_snapshot = rec_cases_report.run(diversity_current_dataset, diversity_reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
rec_cases_snapshot


## Using the RecsysPreset

The RecsysPreset provides a comprehensive set of recommendation metrics in one go. It automatically includes all relevant metrics based on the available data.


In [None]:
# Basic RecsysPreset with minimal configuration: top-10 lists, ratings >= 3
# treated as relevant, and the ranking named "default".
basic_preset_report = Report(
    [RecsysPreset(k=10, min_rel_score=3, ranking_name="default")]
)
basic_preset_snapshot = basic_preset_report.run(current_dataset, reference_dataset)

# Keep the snapshot as the cell's last expression so the notebook displays it.
basic_preset_snapshot


In [None]:
# Comprehensive RecsysPreset with all features.
# The preset refers to the bias columns "popularity" and "age_group". Those
# columns only exist in item_bias_df / user_bias_df, so they are joined onto
# the diversity data here and declared in a matching data definition.


def _with_bias_features(data):
    """Return `data` with the user `age_group` and item `popularity` columns added."""
    # Both bias tables hold one row per user / item; validate guards against
    # accidental row duplication if that ever stops being true.
    return (
        data
        .merge(user_bias_df, on='user_id', how='left', validate='many_to_one')
        .merge(item_bias_df, on='item_id', how='left', validate='many_to_one')
    )


preset_current_data = _with_bias_features(diversity_current_data)
preset_reference_data = _with_bias_features(diversity_reference_data)

preset_data_definition = DataDefinition(
    numerical_columns=["rating", "prediction", "genre_cluster", "release_decade", "rating_tier"],
    categorical_columns=["user_id", "item_id", "popularity", "age_group"],
    ranking=[Recsys(
        user_id="user_id",
        item_id="item_id",
        prediction="prediction",
        target="rating"
    )]
)

preset_current_dataset = Dataset.from_pandas(
    preset_current_data,
    data_definition=preset_data_definition
)
preset_reference_dataset = Dataset.from_pandas(
    preset_reference_data,
    data_definition=preset_data_definition
)

comprehensive_preset_report = Report([
    RecsysPreset(
        k=10,
        min_rel_score=3,
        ranking_name="default",
        user_ids=["user_0", "user_1", "user_2"],  # Specific users for RecCasesTable
        display_features=["genre_cluster", "release_decade", "rating_tier"],  # Features to display in RecCasesTable
        item_features=["genre_cluster", "release_decade", "rating_tier"],  # Item features for diversity metrics
        item_bias_columns=["popularity"],  # Item bias analysis
        user_bias_columns=["age_group"],  # User bias analysis
        normalize_arp=True,  # Normalize ARP in popularity bias
        beta=1.0  # Beta parameter for F-Beta score
    )
])

comprehensive_preset_snapshot = comprehensive_preset_report.run(preset_current_dataset, preset_reference_dataset)
comprehensive_preset_snapshot


## Metric Results Analysis

Let's examine the structure of metric results to understand how to access the data.


In [None]:
# Access metric results.
# Results are looked up on the snapshot's context: the current result by
# metric object, the reference result by the metric's id.
precision_metric = PrecisionTopK(k=10, min_rel_score=3)
precision_result = precision_snapshot.context.get_metric_result(precision_metric)

print("Precision@K Result Structure:")
print(f"Type: {type(precision_result)}")

# Access the DataFrame values
print("\nCurrent Precision@K DataFrame:")
print(f"Current value: {precision_result.value}")

print("\nReference Precision@K DataFrame:")
# Best effort: if the reference result cannot be retrieved, report "None"
# instead of failing the cell. Only ordinary errors are caught, so a keyboard
# interrupt still stops the notebook.
try:
    precision_reference = precision_snapshot.context.get_reference_metric_result(precision_metric.metric_id)
except Exception:
    precision_reference = None
precision_reference_value = None if precision_reference is None else precision_reference.value
print(f"Reference value: {precision_reference_value}")


In [None]:
# Access single value results (like Personalization).
# Results are looked up on the snapshot's context: the current result by
# metric object, the reference result by the metric's id.
personalization_metric = Personalization(k=10)
personalization_result = personalization_snapshot.context.get_metric_result(personalization_metric)

print("Personalization Result Structure:")
print(f"Type: {type(personalization_result)}")

# Access the single values
print(f"Current value: {personalization_result.value}")

# Best effort: if the reference result cannot be retrieved, report "None"
# instead of failing the cell. Only ordinary errors are caught, so a keyboard
# interrupt still stops the notebook.
try:
    personalization_reference = personalization_snapshot.context.get_reference_metric_result(personalization_metric.metric_id)
except Exception:
    personalization_reference = None
personalization_reference_value = None if personalization_reference is None else personalization_reference.value
print(f"Reference value: {personalization_reference_value}")


In [None]:
# Access RecCasesTable results (DataframeValue with user_id column).
# The metric is rebuilt with the same arguments used in the report so that it
# identifies the same result; the current result is looked up by metric
# object, the reference result by the metric's id.
rec_cases_metric = RecCasesTable(
    user_ids=["user_0", "user_1", "user_2"],
    display_features=["genre_cluster", "release_decade", "rating_tier"],
)
rec_cases_result = rec_cases_snapshot.context.get_metric_result(rec_cases_metric)

print("RecCasesTable Result Structure:")
print(f"Type: {type(rec_cases_result)}")

# Access the DataFrame values
print("\nCurrent RecCasesTable DataFrame:")
print(f"Current value: {rec_cases_result.value}")

print("\nReference RecCasesTable DataFrame:")
# Best effort: if the reference result cannot be retrieved, report "None"
# instead of failing the cell. Only ordinary errors are caught, so a keyboard
# interrupt still stops the notebook.
try:
    rec_cases_reference = rec_cases_snapshot.context.get_reference_metric_result(rec_cases_metric.metric_id)
except Exception:
    rec_cases_reference = None
rec_cases_reference_value = None if rec_cases_reference is None else rec_cases_reference.value
print(f"Reference value: {rec_cases_reference_value}")


## Summary

This notebook demonstrated how to use Evidently's recommendation systems metrics:

### Key Features:
1. **Top-K Metrics**: Return DataframeValue with rank and value columns showing performance at different K values
2. **Single Value Metrics**: Return SingleValue for metrics like Personalization, Diversity, etc.
3. **Bias Analysis**: Return DataframeValue with x, y columns for distribution analysis
4. **RecCasesTable**: Return DataframeValue with user_id column for detailed case analysis

### Data Requirements:
- **Basic metrics**: User ID, Item ID, Prediction scores, Target ratings
- **Diversity metrics**: Additional numerical item features (e.g., genre cluster, release decade, rating tier)
- **Bias metrics**: User/item bias features and training data
- **RecCasesTable**: Optional user_ids and display_features

### Preset Usage:
- **RecsysPreset**: Automatically includes relevant metrics based on available data
- **Conditional inclusion**: Metrics are added based on data availability (training data, features, etc.)

### Result Access:
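- Use `snapshot.context.get_metric_result(metric)` to access the current period result and `snapshot.context.get_reference_metric_result(metric.metric_id)` to access the reference period result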
- Use `.value` to access the actual data (DataFrame for DataframeValue, number for SingleValue)
- Results can be used for further analysis, visualization, or monitoring
