# Model Transparency Tutorial

This notebook demonstrates how to use the AI Safety Toolkit's transparency module to explain model decisions using LIME, SHAP, and other interpretability techniques.

### Interpretability Techniques Comparison:

| Technique | Best For | Pros | Cons |
|-----------|----------|------|------|
| **Feature Importance** | Global understanding | Fast, built-in for tree models | Only available for models that expose `feature_importances_` (e.g. tree ensembles) |
| **LIME** | Individual predictions | Model-agnostic, intuitive | Can be unstable, local only |
| **SHAP** | Both global and local | Theoretically grounded, comprehensive | Computationally expensive |
| **Permutation Importance** | Global understanding | Model-agnostic | Computationally expensive |

## 1. Setup and Imports

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from modules.transparency import TransparencyAnalyzer
from modules.data_loader import DataLoader
from modules.utils import load_sample_model

## 2. Load and Prepare Data

In [None]:
# Pull the Adult Income dataset through the toolkit's DataLoader.
data_loader = DataLoader()
data = data_loader.load_adult_dataset()

# Every column except 'income' (the target) and 'protected_race' is reported as a feature.
excluded_columns = ['income', 'protected_race']
feature_columns = [col for col in data.columns if col not in excluded_columns]

print(f"Dataset shape: {data.shape}")
print(f"\nFeatures: {feature_columns}")
print("\nTarget variable distribution:")
print(data['income'].value_counts())

In [None]:
# Build the modelling splits. prepare_ml_data returns six objects, unpacked here
# as train/test features, train/test targets and train/test protected values
# (roles inferred from the names; confirm against DataLoader.prepare_ml_data).
ml_splits = data_loader.prepare_ml_data(data, 'income', 'protected_race')
X_train, X_test, y_train, y_test, protected_train, protected_test = ml_splits

train_shape, test_shape = X_train.shape, X_test.shape
print(f"Training set shape: {train_shape}")
print(f"Test set shape: {test_shape}")
print(f"Feature names: {X_train.columns.tolist()}")

## 3. Train Multiple Models for Comparison

In [None]:
# Candidate classifiers: two tree ensembles and one linear model, each with a
# fixed random_state so repeated runs give the same fit.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# Fitted estimators keyed by display name; later cells look models up here.
trained_models = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    trained_models[name] = model

    # Accuracy on the training split versus the held-out test split
    train_acc, test_acc = model.score(X_train, y_train), model.score(X_test, y_test)

    print(f"  Train Accuracy: {train_acc:.3f}")
    print(f"  Test Accuracy: {test_acc:.3f}")
    print()

## 4. Feature Importance Analysis

In [None]:
# Analyzer instance reused by the later explanation cells
transparency_analyzer = TransparencyAnalyzer()

# Tree-based models whose feature_importances_ attribute is plotted below
tree_models = ['Random Forest', 'Gradient Boosting']
feature_names = X_train.columns.tolist()

# squeeze=False always returns a 2-D array of Axes, so taking row 0 gives an
# iterable of Axes even when only one model is listed.
fig, axes = plt.subplots(1, len(tree_models), figsize=(15, 6), squeeze=False)
axes = axes[0]

for ax, model_name in zip(axes, tree_models):
    model = trained_models[model_name]
    importances = model.feature_importances_

    # Positions of the ten largest importances, highest first
    indices = np.argsort(importances)[::-1][:10]
    bar_positions = range(len(indices))

    ax.bar(bar_positions, importances[indices])
    ax.set_title(f'{model_name} - Feature Importance')
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    # Fix the tick positions first so each label lines up with its bar
    ax.set_xticks(bar_positions)
    ax.set_xticklabels([feature_names[j] for j in indices], rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Interactive feature-importance figure for the Random Forest, produced by the
# analyzer's plot_feature_importance helper.
model = trained_models['Random Forest']
feature_names = X_train.columns.tolist()

importance_fig = transparency_analyzer.plot_feature_importance(
    model,
    feature_names,
)
importance_fig.show()

## 5. LIME Explanations

In [None]:
# Pick three test instances worth explaining, based on the Random Forest's
# predicted probability in column 1 of predict_proba.
model = trained_models['Random Forest']
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Positional indices into X_test:
high_pos_idx = y_proba.argmax()                  # highest probability
high_neg_idx = y_proba.argmin()                  # lowest probability
uncertain_idx = np.abs(y_proba - 0.5).argmin()   # probability closest to 0.5

interesting_cases = {
    'High Positive Confidence': high_pos_idx,
    'High Negative Confidence': high_neg_idx,
    'Uncertain Prediction': uncertain_idx
}

print("Interesting cases for explanation:")
for case_name, idx in interesting_cases.items():
    print(f"{case_name}: Index {idx}, Probability: {y_proba[idx]:.3f}, Prediction: {y_pred[idx]}")

In [None]:
# Explain each selected test instance with LIME, then print the instance's raw
# feature values so the explanation can be read in context.
for case_name, instance_idx in interesting_cases.items():
    print(f"\nGenerating LIME explanation for {case_name} (Index: {instance_idx})")

    # Best-effort: a failure for one case is reported and the loop moves on.
    try:
        explanation_fig = transparency_analyzer.explain_with_lime(
            model, X_test, instance_idx, num_features=8
        )
        explanation_fig.update_layout(title=f"LIME Explanation - {case_name}")
        explanation_fig.show()

        # instance_idx is positional, hence iloc
        feature_values = X_test.iloc[instance_idx]
        print("\nActual feature values:")
        for feature, value in feature_values.items():
            print(f"  {feature}: {value}")

    except Exception as err:
        print(f"Error generating LIME explanation: {err}")

## 6. SHAP Explanations

In [None]:
# SHAP analysis on the first 100 test rows
print("Generating SHAP analysis...")
print("This may take a few moments for complex models.")

try:
    # Only the first 100 rows are passed in, to keep the run time manageable
    shap_sample = X_test.head(100)
    shap_fig = transparency_analyzer.explain_with_shap(model, shap_sample, max_samples=100)
    shap_fig.show()

except Exception as err:
    print(f"Error generating SHAP explanation: {err}")
    print("Note: SHAP explanations can be computationally intensive for some models.")

## 7. Model Behavior Analysis

In [None]:
# Run the analyzer's behaviour analysis and print whichever sections it returned.
behavior_analysis = transparency_analyzer.analyze_model_behavior(
    model, X_test, feature_names=X_test.columns.tolist()
)

print("Model Behavior Analysis:")
print("=" * 50)

# Both importance sections are feature -> score mappings printed the same way:
# ten highest scores, largest first.
importance_sections = (
    ('feature_importance', "\nBuilt-in Feature Importance (Top 10):"),
    ('permutation_importance', "\nPermutation Importance (Top 10):"),
)
for section_key, heading in importance_sections:
    if section_key not in behavior_analysis:
        continue
    print(heading)
    ranked = sorted(behavior_analysis[section_key].items(),
                    key=lambda item: item[1], reverse=True)
    for feature, importance in ranked[:10]:
        print(f"  {feature}: {importance:.4f}")

# Summary statistics of the predictions, when provided
if 'prediction_stats' in behavior_analysis:
    stats = behavior_analysis['prediction_stats']
    print("\nPrediction Statistics:")
    print(f"  Mean: {stats['mean']:.3f}")
    print(f"  Std: {stats['std']:.3f}")
    print(f"  Min: {stats['min']:.3f}")
    print(f"  Max: {stats['max']:.3f}")
    quartile_text = [f'{q:.3f}' for q in stats['quartiles']]
    print(f"  Quartiles: {quartile_text}")

## 8. Decision Boundary Visualization

In [None]:
# Plot the decision boundary over the two features with the highest built-in
# importance (skipped when the analysis returned no 'feature_importance').
if 'feature_importance' in behavior_analysis:
    sorted_features = sorted(behavior_analysis['feature_importance'].items(),
                             key=lambda item: item[1], reverse=True)

    # Names of the two top-ranked features, then their column positions in X_test
    feature1_name, feature2_name = sorted_features[0][0], sorted_features[1][0]
    feature1_idx, feature2_idx = (
        X_test.columns.get_loc(feature_name)
        for feature_name in (feature1_name, feature2_name)
    )

    print(f"Visualizing decision boundary for {feature1_name} vs {feature2_name}")

    # Best-effort: report a plotting failure instead of stopping the notebook
    try:
        boundary_fig = transparency_analyzer.plot_decision_boundary_2d(
            model, X_test, y_test, feature1_idx, feature2_idx
        )
        boundary_fig.show()
    except Exception as err:
        print(f"Error creating decision boundary plot: {err}")

## 9. Model Comparison: Interpretability vs Performance

In [None]:
# Compare interpretability-related properties of every trained model.
#
# The loop variables are deliberately NOT named `model`: earlier cells bind
# `model` to the Random Forest and later cells keep using that name. Reusing it
# here would leave `model` pointing at the last entry of `trained_models` once
# the loop finishes, so later cells would silently analyse a different model.
comparison_results = []

for candidate_name, candidate in trained_models.items():
    # Column 1 of predict_proba; its standard deviation is used as a rough
    # proxy for how spread out the model's confidence is.
    proba = candidate.predict_proba(X_test)[:, 1]

    comparison_results.append({
        'Model': candidate_name,
        'Test Accuracy': candidate.score(X_test, y_test),
        'Has Feature Importance': hasattr(candidate, 'feature_importances_'),
        # Complexity is a name-based heuristic: ensembles are labelled 'High'
        'Model Complexity': 'High' if 'Forest' in candidate_name or 'Boosting' in candidate_name else 'Low',
        'Confidence Spread': np.std(proba),
    })

comparison_df = pd.DataFrame(comparison_results)
print("Model Interpretability Comparison:")
print(comparison_df)

In [None]:
# Side-by-side bars: test accuracy (left) and prediction-confidence spread (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Accuracy comparison
ax1.bar(comparison_df['Model'], comparison_df['Test Accuracy'])
ax1.set_title('Model Accuracy Comparison')
ax1.set_ylabel('Test Accuracy')

# Confidence spread, coloured by whether the model exposes feature_importances_
colors = ['green' if has_importance else 'red'
          for has_importance in comparison_df['Has Feature Importance']]
ax2.bar(comparison_df['Model'], comparison_df['Confidence Spread'], color=colors)
ax2.set_title('Model Confidence Spread\n(Green = Has Built-in Feature Importance)')
ax2.set_ylabel('Prediction Confidence Spread')

# Rotate the labels the categorical axis already produced. Calling
# set_xticklabels() without fixing the tick positions first makes Matplotlib
# emit a UserWarning and can misalign labels; tick_params only changes styling.
for ax in (ax1, ax2):
    ax.tick_params(axis='x', labelrotation=45)

plt.tight_layout()
plt.show()

## 10. Interactive Interpretability Dashboard

In [None]:
# Create comprehensive interpretability dashboard.
#
# The model is looked up explicitly instead of relying on the ambient `model`
# variable: the comparison loop earlier in the notebook rebinds `model` to the
# last entry of `trained_models`, whereas the 'Uncertain Prediction' index was
# chosen from the Random Forest's probabilities. Model and instance must match.
dashboard_model = trained_models['Random Forest']

# Best-effort: report a dashboard failure instead of stopping the notebook
try:
    dashboard_fig = transparency_analyzer.create_interpretability_dashboard(
        dashboard_model, X_test, y_test, instance_idx=interesting_cases['Uncertain Prediction']
    )
    dashboard_fig.show()
except Exception as e:
    print(f"Error creating interpretability dashboard: {e}")

## 11. Practical Guidelines for Model Interpretability

In [None]:
# Summarise the analysis as practical recommendations
print("Model Interpretability Guidelines:")
print("=" * 50)

# best_accuracy: the comparison row with the highest test accuracy.
# most_interpretable: the row(s) for Logistic Regression, which this notebook
# hard-codes as the most interpretable model.
best_accuracy = comparison_df.loc[comparison_df['Test Accuracy'].idxmax()]
is_logistic = comparison_df['Model'] == 'Logistic Regression'
most_interpretable = comparison_df.loc[is_logistic]

print(f"\n📊 Best Performance: {best_accuracy['Model']} (Accuracy: {best_accuracy['Test Accuracy']:.3f})")
print(f"🔍 Most Interpretable: {most_interpretable['Model'].iloc[0]} (Accuracy: {most_interpretable['Test Accuracy'].iloc[0]:.3f})")

# Three highest built-in importances, when the behaviour analysis provided them
if 'feature_importance' in behavior_analysis:
    ranked_features = sorted(behavior_analysis['feature_importance'].items(),
                             key=lambda item: item[1], reverse=True)
    top_features = ranked_features[:3]
    print("\n🎯 Top 3 Most Important Features:")
    for i, (feature, importance) in enumerate(top_features, 1):
        print(f"  {i}. {feature} (importance: {importance:.3f})")

# Static recommendations, printed one per line
print("\n💡 Recommendations:")
recommendation_lines = (
    "  • Use LIME for individual prediction explanations",
    "  • Use SHAP for global model understanding",
    "  • Monitor feature importance drift over time",
    "  • Consider simpler models for high-stakes decisions",
    "  • Validate explanations with domain experts",
    "  • Document model decisions for audit trails",
)
for line in recommendation_lines:
    print(line)