# Fairness Evaluation Tutorial

This notebook demonstrates comprehensive fairness evaluation using multiple metrics and interactive visualizations.


### Key Metrics Explained:

1. **Demographic Parity**: Equal positive prediction rates across groups
2. **Equalized Odds**: Equal true positive and false positive rates across groups
3. **Predictive Parity**: Equal positive predictive values across groups
4. **Calibration**: Predicted probabilities should reflect actual outcomes equally across group


## 1. Setup and Imports

In [2]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, precision_recall_curve

from modules.fairness_evaluation import FairnessEvaluator
from modules.data_loader import DataLoader
from modules.utils import validate_fairness_metrics, create_bias_mitigation_recommendations


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/py

ImportError: numpy.core.multiarray failed to import

## 2. Load and Prepare Data

In [None]:
# Load German Credit dataset
data_loader = DataLoader()
data = data_loader.load_german_credit_dataset()

print(f"Dataset shape: {data.shape}")
print(f"\nTarget distribution:")
print(data['credit_risk'].value_counts())
print(f"\nProtected attribute distribution:")
print(data['protected_age'].value_counts())

In [None]:
# Prepare data for ML
X_train, X_test, y_train, y_test, protected_train, protected_test = data_loader.prepare_ml_data(
    data, 'credit_risk', 'protected_age'
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Number of features: {X_train.shape[1]}")

## 3. Train Multiple Models

In [None]:
# Train different models for comparison
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

model_results = {}

for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Get predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    # Store results
    model_results[name] = {
        'model': model,
        'y_pred': y_pred,
        'y_proba': y_proba,
        'accuracy': model.score(X_test, y_test)
    }
    
    print(f"{name} accuracy: {model_results[name]['accuracy']:.3f}")

## 4. Comprehensive Fairness Evaluation

In [None]:
# Initialize fairness evaluator
fairness_evaluator = FairnessEvaluator()

# Evaluate fairness for each model
fairness_results = {}

for model_name, results in model_results.items():
    print(f"\n{model_name} Fairness Analysis:")
    print("=" * 50)
    
    # Calculate all fairness metrics
    metrics = fairness_evaluator.calculate_all_metrics(
        y_test, results['y_pred'], results['y_proba'], protected_test
    )
    
    fairness_results[model_name] = metrics
    
    # Display metrics
    for category, cat_metrics in metrics.items():
        print(f"\n{category.upper()}:")
        for metric_name, value in cat_metrics.items():
            if not np.isnan(value):
                print(f"  {metric_name}: {value:.3f}")
            else:
                print(f"  {metric_name}: N/A")

## 5. Fairness Validation and Recommendations

In [None]:
# Validate fairness metrics for each model
for model_name, metrics in fairness_results.items():
    print(f"\n{model_name} Fairness Validation:")
    print("=" * 50)
    
    # Validate metrics
    validation = validate_fairness_metrics(metrics)
    
    # Display validation results
    passed = sum(1 for v in validation.values() if v['status'] == 'PASS')
    total = len(validation)
    
    print(f"Fairness Score: {passed}/{total} metrics passed")
    
    for metric_name, result in validation.items():
        status_emoji = "✅" if result['status'] == 'PASS' else "❌"
        print(f"{status_emoji} {metric_name}: {result['value']:.3f} (threshold: {result['threshold']})")
    
    # Generate recommendations
    recommendations = create_bias_mitigation_recommendations(validation)
    print("\nRecommendations:")
    for rec in recommendations:
        print(f"• {rec}")

## 6. Comparative Analysis

In [None]:
# Create comparative analysis of models
comparison_data = []

for model_name, metrics in fairness_results.items():
    row = {'Model': model_name}
    
    # Add accuracy
    row['Accuracy'] = model_results[model_name]['accuracy']
    
    # Add key fairness metrics
    row['Demographic Parity'] = metrics['group_fairness']['demographic_parity']
    row['Disparate Impact'] = metrics['group_fairness']['disparate_impact']
    row['Equalized Odds (TPR)'] = metrics['group_fairness']['equalized_odds_tpr']
    row['Calibration Diff'] = metrics['calibration']['calibration_difference']
    
    comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data)
print("Model Comparison:")
print(comparison_df.round(3))

In [None]:
# Visualize model comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Model Fairness Comparison', fontsize=16)

# Demographic Parity
axes[0, 0].bar(comparison_df['Model'], comparison_df['Demographic Parity'])
axes[0, 0].axhline(y=0.1, color='r', linestyle='--', label='Threshold (0.1)')
axes[0, 0].set_title('Demographic Parity')
axes[0, 0].set_ylabel('Difference')
axes[0, 0].legend()

# Disparate Impact
axes[0, 1].bar(comparison_df['Model'], comparison_df['Disparate Impact'])
axes[0, 1].axhline(y=0.8, color='r', linestyle='--', label='Threshold (0.8)')
axes[0, 1].set_title('Disparate Impact')
axes[0, 1].set_ylabel('Ratio')
axes[0, 1].legend()

# Equalized Odds
axes[1, 0].bar(comparison_df['Model'], comparison_df['Equalized Odds (TPR)'])
axes[1, 0].axhline(y=0.1, color='r', linestyle='--', label='Threshold (0.1)')
axes[1, 0].set_title('Equalized Odds (TPR)')
axes[1, 0].set_ylabel('Difference')
axes[1, 0].legend()

# Accuracy vs Fairness
axes[1, 1].scatter(comparison_df['Accuracy'], comparison_df['Demographic Parity'], s=100)
for i, model in enumerate(comparison_df['Model']):
    axes[1, 1].annotate(model, (comparison_df['Accuracy'].iloc[i], comparison_df['Demographic Parity'].iloc[i]))
axes[1, 1].set_xlabel('Accuracy')
axes[1, 1].set_ylabel('Demographic Parity')
axes[1, 1].set_title('Accuracy vs Fairness Trade-off')

plt.tight_layout()
plt.show()

## 7. Interactive Dashboard Creation

In [None]:
# Create fairness dashboard for the best performing model
best_model_name = 'Random Forest'  # Choose based on your criteria
best_results = model_results[best_model_name]

# Generate interactive dashboard
dashboard_fig = fairness_evaluator.create_fairness_dashboard(
    y_test, best_results['y_pred'], best_results['y_proba'], protected_test
)

dashboard_fig.show()

## 8. Group-Specific Analysis

In [None]:
# Analyze performance by group
best_results = model_results['Random Forest']
y_pred = best_results['y_pred']
y_proba = best_results['y_proba']

from sklearn.metrics import classification_report, confusion_matrix

groups = np.unique(protected_test)
print("Group-Specific Performance Analysis:")
print("=" * 50)

for group in groups:
    group_mask = protected_test == group
    group_name = f"Group {group} ({'Younger' if group == 0 else 'Older'})"
    
    print(f"\n{group_name}:")
    print(f"Sample size: {np.sum(group_mask)}")
    print(f"Positive rate: {np.mean(y_test[group_mask]):.3f}")
    print(f"Predicted positive rate: {np.mean(y_pred[group_mask]):.3f}")
    print(f"Average prediction probability: {np.mean(y_proba[group_mask]):.3f}")
    
    # Confusion matrix
    cm = confusion_matrix(y_test[group_mask], y_pred[group_mask])
    print(f"Confusion Matrix:")
    print(cm)
    
    # Classification report
    if len(np.unique(y_test[group_mask])) > 1:
        report = classification_report(y_test[group_mask], y_pred[group_mask], output_dict=True)
        print(f"Precision: {report['1']['precision']:.3f}")
        print(f"Recall: {report['1']['recall']:.3f}")
        print(f"F1-Score: {report['1']['f1-score']:.3f}")

## 9. Calibration Analysis

In [None]:
# Analyze model calibration by group
def plot_calibration_curve(y_true, y_proba, protected_attr, n_bins=10):
    fig, ax = plt.subplots(figsize=(10, 6))
    
    groups = np.unique(protected_attr)
    colors = ['blue', 'red']
    
    for i, group in enumerate(groups):
        group_mask = protected_attr == group
        
        if np.sum(group_mask) > 0:
            group_true = y_true[group_mask]
            group_proba = y_proba[group_mask]
            
            # Create bins
            bin_boundaries = np.linspace(0, 1, n_bins + 1)
            bin_lowers = bin_boundaries[:-1]
            bin_uppers = bin_boundaries[1:]
            
            bin_centers = []
            bin_accuracies = []
            
            for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
                in_bin = (group_proba > bin_lower) & (group_proba <= bin_upper)
                prop_in_bin = in_bin.mean()
                
                if prop_in_bin > 0:
                    accuracy_in_bin = group_true[in_bin].mean()
                    avg_confidence_in_bin = group_proba[in_bin].mean()
                    
                    bin_centers.append(avg_confidence_in_bin)
                    bin_accuracies.append(accuracy_in_bin)
            
            # Plot calibration curve
            ax.plot(bin_centers, bin_accuracies, 'o-', color=colors[i], 
                   label=f'Group {group} ({'Younger' if group == 0 else 'Older'})', linewidth=2, markersize=8)
    
    # Perfect calibration line
    ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
    
    ax.set_xlabel('Mean Predicted Probability')
    ax.set_ylabel('Fraction of Positives')
    ax.set_title('Calibration Plot by Group')
    ax.legend()
    ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

# Create calibration plot
plot_calibration_curve(y_test, y_proba, protected_test)

SyntaxError: f-string: expecting '}' (3785501601.py, line 36)