# 5.5 Model Evaluation Tutorial

This notebook covers key concepts in model evaluation including:
- Cross-validation
- Hyperparameter Tuning
- Performance Metrics
- Model Selection
- Model Diagnostics

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, auc, precision_recall_curve
)
from sklearn.model_selection import (
    train_test_split, cross_val_score, KFold,
    GridSearchCV, learning_curve, validation_curve
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random number generator so that anything drawing from
# `np.random` produces the same values on every Restart & Run All.
np.random.seed(42)

## 1. Cross-validation

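Let's estimate generalization performance with 5-fold cross-validation, then use learning curves to see how the training and validation scores change as the training set grows.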

In [None]:
# Generate a synthetic binary-classification dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                          n_redundant=5, random_state=42)

# Globally scaled copy of the features. Later cells in this notebook read
# `X_scaled`, so it is still produced here, but it is deliberately NOT used
# for the cross-validation below: a scaler fitted on all rows has already
# seen the validation folds, which leaks information into training.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-Fold cross-validation.
# Scaling lives inside the pipeline, so it is re-fitted on the training part
# of every fold and only applied to the held-out part.
model = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
cv_scores = cross_val_score(model, X, y, cv=5)

print("Cross-validation scores:")
print(f"Individual scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Visualize cross-validation; fold numbers are derived from the scores so the
# plot stays correct if the number of folds is changed above.
fold_ids = range(1, len(cv_scores) + 1)
plt.figure(figsize=(10, 6))
plt.plot(fold_ids, cv_scores, 'o-')
plt.axhline(y=cv_scores.mean(), color='r', linestyle='--', label='Mean CV score')
plt.xticks(fold_ids)
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Cross-validation Scores Across Folds')
plt.legend()
plt.show()

# Learning curves: score as a function of training-set size, using the same
# leakage-free pipeline and the raw features.
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10)
)

# Each row of the score arrays holds the per-fold scores for one training size
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, label='Training score')
plt.plot(train_sizes, test_mean, label='Cross-validation score')
# Shaded bands show +/- one standard deviation across folds
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)
plt.xlabel('Training Examples')
plt.ylabel('Score')
plt.title('Learning Curves')
plt.legend(loc='best')
plt.grid(True)
plt.show()

## 2. Hyperparameter Tuning

Let's explore grid search for hyperparameter optimization.

In [None]:
# Create Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define parameter grid (3 * 4 * 3 = 36 candidate settings)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive search: every combination is scored by 5-fold CV accuracy
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_scaled, y)

# Print results
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# One row per parameter combination
results = pd.DataFrame(grid_search.cv_results_)


def _depth_label(depth):
    """Return a plottable string label for a `max_depth` value.

    `max_depth=None` is a valid setting (unlimited depth), but seaborn treats
    None/NaN category values as missing data and drops them from the plot.
    """
    return 'None' if pd.isna(depth) else str(int(depth))


# Copy of the results whose `max_depth` column is safe to use as a category
plot_data = results.assign(
    param_max_depth=results['param_max_depth'].map(_depth_label)
)
depth_order = [_depth_label(depth) for depth in param_grid['max_depth']]

# Plot the distribution of mean CV scores for each value of each parameter
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    ('n_estimators', None),
    ('max_depth', depth_order),
    ('min_samples_split', None),
]
for ax, (param, order) in zip(axes, panels):
    sns.boxplot(x=f'param_{param}', y='mean_test_score', data=plot_data,
                order=order, ax=ax)
    ax.set_title(f'Scores vs {param}')

plt.tight_layout()
plt.show()

## 3. Performance Metrics

Let's analyze different performance metrics for classification.

In [None]:
# Hold out 20% of the samples for the metrics below.
# NOTE: the grid search above was fitted on all of X_scaled, so these test
# rows already influenced the hyperparameter choice; treat the numbers below
# as an optimistic illustration rather than an unbiased estimate.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a fresh model with the best parameters.
# `clone` copies the hyperparameters into an unfitted estimator, so
# `grid_search.best_estimator_` (refit on the full data) is not overwritten.
best_model = clone(grid_search.best_estimator_)
best_model.fit(X_train, y_train)

# Hard class predictions and the predicted probability of the positive class
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

# Calculate metrics
print("Performance Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

# Plot confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Plot ROC curve; the thresholds returned by roc_curve are not needed here
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line: the ROC of a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Plot Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_prob)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

## 4. Model Selection

Let's compare different models and their performance.

In [None]:
# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Compare models using 5-fold cross-validated accuracy.
# A comprehension with its own variable names is used so the notebook-level
# `model` variable is not silently rebound to the last estimator in `models`.
model_scores = {
    name: cross_val_score(estimator, X_scaled, y, cv=5, scoring='accuracy')
    for name, estimator in models.items()
}

# Plot model comparison.
# Tick labels are set with plt.xticks instead of boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate; boxes sit at positions 1..N.
model_names = list(model_scores.keys())
plt.figure(figsize=(10, 6))
plt.boxplot(list(model_scores.values()))
plt.xticks(range(1, len(model_names) + 1), model_names)
plt.title('Model Comparison using Cross-validation')
plt.ylabel('Accuracy')
plt.show()

# Print summary statistics
for name, scores in model_scores.items():
    print(f"\n{name}:")
    print(f"Mean accuracy: {scores.mean():.4f}")
    print(f"Standard deviation: {scores.std():.4f}")

## 5. Model Diagnostics

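Let's diagnose model behavior with two tools: a validation curve over the regularization parameter `C`, and a histogram of the predicted probabilities.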

In [None]:
# Validation curve: sweep the logistic-regression `C` parameter over ten
# log-spaced values and record 5-fold CV accuracy for each one.
param_range = np.logspace(-4, 4, 10)
train_scores, test_scores = validation_curve(
    LogisticRegression(),
    X_scaled,
    y,
    param_name="C",
    param_range=param_range,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

# Aggregate the per-fold scores (one row per value of C)
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Draw both curves, each with a +/- one standard deviation band
fig, ax = plt.subplots(figsize=(10, 6))
curves = (
    (train_mean, train_std, "Training score", "darkorange"),
    (test_mean, test_std, "Cross-validation score", "navy"),
)
for curve_mean, curve_std, curve_label, curve_color in curves:
    ax.semilogx(param_range, curve_mean, label=curve_label, color=curve_color)
    ax.fill_between(param_range, curve_mean - curve_std, curve_mean + curve_std,
                    alpha=0.2, color=curve_color)
ax.set_xlabel("C parameter")
ax.set_ylabel("Score")
ax.set_title("Validation Curve for Logistic Regression")
ax.legend(loc="best")
plt.show()

# Fit a logistic regression on the training split and look at how its
# predicted probabilities are distributed on the test split.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
probabilities = model.predict_proba(X_test)

# Column 1 of predict_proba is the second entry of model.classes_
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(probabilities[:, 1], bins=50)
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Count')
ax.set_title('Distribution of Prediction Probabilities')
plt.show()

## Practice Exercises

1. Implement stratified k-fold cross-validation and compare with regular k-fold.

2. Use RandomizedSearchCV for hyperparameter tuning and compare with GridSearchCV.

3. Implement custom scoring metrics for model evaluation.

4. Create a pipeline that includes preprocessing and model training.

5. Analyze feature importance and its impact on model performance.