In [None]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import loguniform
from sklearn.datasets import fetch_california_housing, load_wine, make_classification
from sklearn.model_selection import (
    train_test_split, cross_val_score, KFold, StratifiedKFold,
    GridSearchCV, RandomizedSearchCV, validation_curve, learning_curve
)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [None]:
# Global plotting defaults for every figure in the notebook.
# The bundled seaborn style is called 'seaborn-v0_8' on newer Matplotlib and
# 'seaborn' on older releases; asking for a name that is not installed raises
# OSError, so pick whichever one this installation actually provides.
_style_name = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(_style_name)
sns.set_palette("husl")

In [None]:
print("🚀 Machine Learning Model Selection Tutorial")
print("=" * 50)

## Part 1: Understanding Cross-Validation
print("\n📊 PART 1: CROSS-VALIDATION FUNDAMENTALS")
print("-" * 40)

# Load a real dataset - California Housing (regression problem).
# fetch_california_housing comes from the notebook's import cell, so it is not
# re-imported here.
california = fetch_california_housing()
X_california, y_california = california.data, california.target
feature_names = california.feature_names

print("Dataset: California Housing Prices")
# Only the first five feature names are listed to keep the line short.
print(f"Features: {len(feature_names)} ({', '.join(feature_names[:5])}...)")
print(f"Samples: {len(X_california)}")
print("Target: Median house value in hundreds of thousands of dollars")

# Baseline regressor reused by the cross-validation demos below; the fixed
# seed keeps the forest reproducible between runs.
rf_basic = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# 1. Single hold-out evaluation (what NOT to rely on alone)
# 80% of the rows train the forest, the remaining 20% score it.
X_train, X_test, y_train, y_test = train_test_split(
    X_california,
    y_california,
    random_state=42,
    test_size=0.2,
)

# fit() returns the estimator itself, so training and scoring can be chained.
single_score = rf_basic.fit(X_train, y_train).score(X_test, y_test)
print(f"\n❌ Single train-test split R² score: {single_score:.4f}")
print("Problem: This gives us only ONE estimate of performance!")

In [None]:
# 2. K-Fold Cross-Validation - The Better Way
# An integer `cv` gives a regressor contiguous, unshuffled folds, while the
# hold-out split above draws its test rows at random. Shuffling the folds with
# a fixed seed makes the two estimates comparable and stops any ordering of the
# rows from turning each fold into a different sub-population.
print(f"\n✅ K-Fold Cross-Validation (k=5):")
shuffled_folds = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    rf_basic, X_california, y_california, cv=shuffled_folds, scoring='r2'
)
print(f"Individual fold scores: {[f'{score:.4f}' for score in cv_scores]}")
# The ± value is the standard deviation across folds (a spread estimate).
print(f"Mean CV score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
print("Benefit: More robust estimate with confidence interval!")

In [None]:
# 3. Different CV Strategies Visualization
# One panel per k: a bar for every fold's R² plus a dashed line at the mean.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
fig.suptitle('Cross-Validation Strategies Comparison', fontsize=16)

# Compare different k values
k_values = [3, 5, 10, 20]
cv_means = []
cv_stds = []

# axes.flat walks the 2x2 grid row by row, pairing each k with its own panel.
for k, ax in zip(k_values, axes.flat):
    # Shuffled folds with a fixed seed: an integer `cv` would slice the rows
    # into contiguous blocks, making the spread depend on row order.
    folds = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = cross_val_score(rf_basic, X_california, y_california, cv=folds, scoring='r2')
    fold_mean, fold_std = scores.mean(), scores.std()
    cv_means.append(fold_mean)
    cv_stds.append(fold_std)

    ax.bar(range(len(scores)), scores, alpha=0.7)
    ax.axhline(y=fold_mean, color='red', linestyle='--',
               label=f'Mean: {fold_mean:.3f}')
    ax.set_title(f'{k}-Fold CV (std: {fold_std:.3f})')
    ax.set_xlabel('Fold')
    ax.set_ylabel('R² Score')
    ax.legend()

plt.tight_layout()
plt.show()

# Summary of CV strategy impact.
# 'CI_Width' is simply twice the fold standard deviation (mean ± 1 std).
cv_comparison = pd.DataFrame({
    'K_Folds': k_values,
    'Mean_Score': cv_means,
    'Std_Dev': cv_stds,
    'CI_Width': [2 * std for std in cv_stds]
})
print("\n📈 Cross-Validation Strategy Comparison:")
print(cv_comparison.round(4))

In [None]:
## Part 2: Hyperparameter Tuning
print("\n\n🔧 PART 2: HYPERPARAMETER TUNING")
print("-" * 40)

# Classification dataset used throughout the tuning section.
wine = load_wine()
X_wine = wine.data
y_wine = wine.target

n_wine_samples, n_wine_features = X_wine.shape
n_wine_classes = len(np.unique(y_wine))
print("Dataset: Wine Classification")
print(f"Features: {n_wine_features}, Samples: {n_wine_samples}, Classes: {n_wine_classes}")

# Standardise features for SVM (important!).
# Note: the scaler is fitted on every sample, so cross-validating directly on
# X_wine_scaled lets each held-out fold influence the scaling statistics;
# putting the scaler inside a Pipeline avoids that.
scaler = StandardScaler().fit(X_wine)
X_wine_scaled = scaler.transform(X_wine)


In [None]:
# 1. Manual Hyperparameter Testing (inefficient way)
print("\n❌ Manual Hyperparameter Testing:")
manual_results = []
C_values = [0.1, 1, 10, 100]
gamma_values = ['scale', 'auto', 0.01, 0.1]

# Only the first two values of each list are tried to keep the demo short.
for C in C_values[:2]:
    for gamma in gamma_values[:2]:
        # Scaling lives inside the pipeline so each CV split fits its scaler on
        # the training folds only; scoring a pre-scaled matrix would leak the
        # held-out fold's statistics into training.
        svm = make_pipeline(StandardScaler(), SVC(C=C, gamma=gamma, random_state=42))
        scores = cross_val_score(svm, X_wine, y_wine, cv=5)
        manual_results.append({
            'C': C, 'gamma': gamma, 'mean_score': scores.mean(), 'std': scores.std()
        })
        print(f"C={C}, gamma={gamma}: {scores.mean():.4f} (±{scores.std():.4f})")

In [None]:
# 2. Grid Search - Systematic Approach
print(f"\n✅ Grid Search Cross-Validation:")

# The estimator is a scaler + SVC pipeline, so the scaler is re-fitted on the
# training folds of every split (no leakage from the held-out fold).
# make_pipeline names each step after its lowercased class, hence the
# 'svc__' prefix on the parameter names.
param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': ['scale', 'auto', 0.01, 0.1, 1],
    'svc__kernel': ['rbf', 'poly']
}

# Using GridSearchCV: every combination in param_grid is scored with 5-fold CV.
grid_search = GridSearchCV(
    make_pipeline(StandardScaler(), SVC(random_state=42)),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # Use all CPU cores
    verbose=1
)

grid_search.fit(X_wine, y_wine)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")
print(f"Total combinations tested: {len(grid_search.cv_results_['params'])}")

In [None]:
# 3. Randomized Search - Efficient Alternative
print(f"\n⚡ Randomized Search (faster alternative):")

# C and gamma are scale parameters spanning several orders of magnitude, so
# they are sampled log-uniformly: a plain uniform draw over [0.1, 100] would
# put roughly 90% of the candidates above C=10 and barely explore small values.
# The 'svc__' prefix targets the SVC step of the pipeline below.
param_dist = {
    'svc__C': loguniform(1e-1, 1e2),      # continuous, log-scaled
    'svc__gamma': loguniform(1e-3, 1e0),  # continuous, log-scaled
    'svc__kernel': ['rbf', 'poly', 'sigmoid']
}

# The scaler sits inside the pipeline so it is fitted per training fold rather
# than on the full dataset before cross-validation.
random_search = RandomizedSearchCV(
    make_pipeline(StandardScaler(), SVC(random_state=42)),
    param_dist,
    n_iter=50,  # Number of parameter combinations to try
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_wine, y_wine)

print(f"Best parameters: {random_search.best_params_}")
print(f"Best CV score: {random_search.best_score_:.4f}")
print(f"Combinations tested: {random_search.n_iter} (vs {len(grid_search.cv_results_['params'])} in Grid Search)")

# Comparison of methods.
# The "combinations tested" figures are read from the objects that ran the
# searches, so the table stays correct if the grids or n_iter are edited.
tuning_comparison = pd.DataFrame({
    'Method': ['Manual', 'Grid Search', 'Random Search'],
    'Best_Score': [
        max(result['mean_score'] for result in manual_results),
        grid_search.best_score_,
        random_search.best_score_
    ],
    'Combinations_Tested': [
        len(manual_results),
        len(grid_search.cv_results_['params']),
        random_search.n_iter
    ],
    'Time_Efficiency': ['Low', 'Low', 'High']
})
print(f"\n📊 Hyperparameter Tuning Comparison:")
print(tuning_comparison.round(4))

In [None]:
## Part 3: Model Selection
print("\n\n🏆 PART 3: MODEL SELECTION")
print("-" * 40)

# Synthetic 3-class problem: 20 features, of which 15 are informative and
# 5 are redundant.
X_complex, y_complex = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=3,
    random_state=42,
)

n_complex_rows, n_complex_cols = X_complex.shape
print("Dataset: Complex Multi-class Classification")
print(f"Features: {n_complex_cols}, Samples: {n_complex_rows}, Classes: 3")

# Candidate models, keyed by display name (insertion order is preserved and
# drives the order of every later report).
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
])

In [None]:
# 1. Basic Model Comparison
print(f"\n🔍 Basic Model Comparison (5-Fold CV):")
model_results = {}

for name, model in models.items():
    # SVM needs standardised inputs. The scaler is placed inside a pipeline so
    # it is fitted on the training folds of each split only, and so this loop
    # does not overwrite the notebook-level `scaler` fitted on the wine data.
    estimator = make_pipeline(StandardScaler(), model) if name == 'SVM' else model
    scores = cross_val_score(estimator, X_complex, y_complex, cv=5)

    # Keep the per-fold scores; later cells take their mean and std.
    model_results[name] = scores
    print(f"{name:20}: {scores.mean():.4f} (±{scores.std():.4f})")

# Visualize model comparison: one bar per model, error bar = std across folds.
fig, ax = plt.subplots(figsize=(10, 6))
model_names = list(model_results.keys())
means = [model_results[name].mean() for name in model_names]
stds = [model_results[name].std() for name in model_names]

bars = ax.bar(model_names, means, yerr=stds, capsize=5, alpha=0.7)
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Accuracy Score')
ax.tick_params(axis='x', rotation=45)

# Add value labels on bars, placed just above the top of each error bar.
for bar, mean, std in zip(bars, means, stds):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + std + 0.01,
            f'{mean:.3f}', ha='center', va='bottom')

fig.tight_layout()
plt.show()

In [None]:
# 2. Advanced Model Selection with Hyperparameter Tuning
print(f"\n🚀 Advanced Model Selection with Hyperparameter Tuning:")

# Define parameter grids for each model.
# Models without an entry here (Logistic Regression) are not tuned.
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5, 10]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.1],
        'kernel': ['rbf', 'poly']
    },
    'Decision Tree': {
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
}

best_models = {}    # display name -> refitted best estimator
tuned_results = {}  # display name -> best mean CV accuracy

for name, model in models.items():
    if name not in param_grids:
        continue

    print(f"\nTuning {name}...")

    estimator, search_space = model, param_grids[name]
    if name == 'SVM':
        # Scale inside the pipeline so every CV split fits the scaler on its
        # own training folds; pre-scaling the full matrix would leak held-out
        # statistics into the search. make_pipeline names the SVC step 'svc',
        # so the grid keys get the matching 'svc__' prefix.
        estimator = make_pipeline(StandardScaler(), model)
        search_space = {
            f'svc__{param}': values for param, values in search_space.items()
        }

    grid = GridSearchCV(
        estimator, search_space, cv=5,
        scoring='accuracy', n_jobs=-1
    )
    grid.fit(X_complex, y_complex)

    best_models[name] = grid.best_estimator_
    tuned_results[name] = grid.best_score_

    print(f"Best params: {grid.best_params_}")
    print(f"Best score: {grid.best_score_:.4f}")

In [None]:
print(f"\n🏅 FINAL MODEL RANKINGS:")
print("-" * 30)

# One score per model: the tuned CV score when the model was tuned, otherwise
# the mean of its untuned 5-fold scores.
final_scores = {
    name: tuned_results[name] if name in tuned_results else model_results[name].mean()
    for name in models
}

# Best score first.
sorted_models = sorted(final_scores.items(), key=lambda entry: entry[1], reverse=True)

for rank, (name, score) in enumerate(sorted_models, start=1):
    status = f"#{rank}" if rank != 1 else "⭐ WINNER!"
    tuned_status = "(Basic)" if name not in tuned_results else "(Tuned)"
    print(f"{status:12} {name:20} {tuned_status:8}: {score:.4f}")

In [None]:
## Part 4: Practical Validation Strategies
print(f"\n\n✅ PART 4: PRACTICAL VALIDATION STRATEGIES")
print("-" * 40)

# 1. Learning Curves - Diagnose overfitting/underfitting
# Analyse the top-ranked model; use its tuned version when one exists.
best_model_name = sorted_models[0][0]
best_model = best_models.get(best_model_name, models[best_model_name])

print(f"Analyzing learning curves for: {best_model_name}")

if best_model_name == 'SVM':
    # X_analysis stays a pre-scaled copy because later cells read it.
    X_analysis = StandardScaler().fit_transform(X_complex)
    # For the learning curve itself, scaling happens inside each CV split so
    # validation folds never influence the scaler.
    curve_estimator = make_pipeline(StandardScaler(), best_model)
else:
    X_analysis = X_complex
    curve_estimator = best_model

# Ten training-set sizes from 10% to 100% of each split's training portion.
train_sizes, train_scores, val_scores = learning_curve(
    curve_estimator, X_complex, y_complex, cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy', n_jobs=-1
)

# Plot learning curves: mean across CV folds with a ±1 std band.
fig, ax = plt.subplots(figsize=(10, 6))

# Rows are training-set sizes and columns are folds, so aggregate over axis 1.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

curves = [
    (train_mean, train_std, 'blue', 'Training Score'),
    (val_mean, val_std, 'red', 'Validation Score'),
]
for curve_mean, curve_std, colour, curve_label in curves:
    ax.plot(train_sizes, curve_mean, 'o-', color=colour, label=curve_label)
    ax.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std,
                    alpha=0.1, color=colour)

ax.set_xlabel('Training Set Size')
ax.set_ylabel('Accuracy Score')
ax.set_title(f'Learning Curves - {best_model_name}')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Interpretation, based on the scores at the largest training-set size.
final_val_score = val_mean[-1]
gap = train_mean[-1] - final_val_score

looks_overfit = gap > 0.05             # training far ahead of validation
looks_underfit = final_val_score < 0.8  # validation accuracy itself is low

if looks_overfit:
    print(f"⚠️  Large gap ({gap:.3f}) suggests overfitting")
    print("   Recommendation: Regularization, more data, or simpler model")
elif looks_underfit:
    print(f"📈 Low validation score suggests underfitting")
    print("   Recommendation: More complex model or feature engineering")
else:
    print(f"✅ Good balance - training/validation gap: {gap:.3f}")

In [None]:
# 2. Validation Curve - Analyze single hyperparameter
print(f"\n📈 Validation Curve Analysis:")

# Pick one hyperparameter that exists on the winning model.
if best_model_name == 'Random Forest':
    param_name = 'n_estimators'
    param_range = [10, 50, 100, 200, 500]
elif best_model_name in ('SVM', 'Logistic Regression'):
    # Both estimators expose the regularisation strength `C`; neither has a
    # `max_depth` parameter.
    param_name = 'C'
    param_range = [0.01, 0.1, 1, 10, 100]
else:
    param_name = 'max_depth'
    param_range = [1, 3, 5, 10, 15, None]

# None ("unlimited") cannot be placed on a numeric x-axis, so drop only that
# entry. Every real value is kept, and the curve is always drawn.
plot_range = [value for value in param_range if value is not None]

# The untuned base estimator is used so that only `param_name` varies.
train_scores, val_scores = validation_curve(
    models[best_model_name], X_analysis, y_complex,
    param_name=param_name, param_range=plot_range,
    cv=5, scoring='accuracy'
)

plt.figure(figsize=(10, 6))
# Average over the CV folds (axis 1) for each parameter value.
train_mean = np.mean(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)

plt.plot(plot_range, train_mean, 'o-', label='Training Score')
plt.plot(plot_range, val_mean, 'o-', label='Validation Score')
plt.xlabel(param_name)
plt.ylabel('Accuracy Score')
plt.title(f'Validation Curve - {param_name}')
plt.legend()
plt.grid(True, alpha=0.3)
if param_name == 'C':
    # C is explored over powers of ten, so a log axis spaces the points evenly.
    plt.xscale('log')
plt.show()

In [None]:
## Part 5: Real-World Application Checklist
print(f"\n\n📋 PART 5: REAL-WORLD APPLICATION CHECKLIST")
print("-" * 50)

# Checklist content kept as data: section title -> items, in display order.
checklist_sections = {
    "DATA PREPARATION": [
        "Handle missing values appropriately",
        "Scale features when needed (SVM, Neural Networks)",
        "Check for data leakage",
        "Ensure proper train/validation/test splits",
    ],
    "CROSS-VALIDATION STRATEGY": [
        "Use StratifiedKFold for imbalanced classification",
        "Use TimeSeriesSplit for temporal data",
        "Choose appropriate k (5-10 typically good)",
        "Consider computational constraints",
    ],
    "HYPERPARAMETER TUNING": [
        "Start with RandomizedSearchCV for efficiency",
        "Use GridSearchCV for final fine-tuning",
        "Set up proper parameter ranges",
        "Use nested CV for unbiased evaluation",
    ],
    "MODEL SELECTION": [
        "Compare multiple algorithm families",
        "Consider interpretability requirements",
        "Evaluate computational constraints",
        "Check learning curves for overfitting",
    ],
    "VALIDATION": [
        "Use holdout test set for final evaluation",
        "Analyze confusion matrices for classification",
        "Check residual plots for regression",
        "Validate on similar but different datasets if possible",
    ],
    "PRODUCTION CONSIDERATIONS": [
        "Model versioning and reproducibility",
        "Performance monitoring setup",
        "Retrain schedule planning",
        "Feature drift detection",
    ],
}

# Render as: leading newline, sections separated by a blank line, each section
# a "✅ TITLE:" header followed by indented "□" items, then a trailing newline.
checklist = "\n" + "\n\n".join(
    f"✅ {title}:\n" + "\n".join(f"   □ {item}" for item in items)
    for title, items in checklist_sections.items()
) + "\n"

print(checklist)

# Final Summary: numbered takeaways followed by suggested next steps.
takeaways = [
    "Always use cross-validation - never trust a single train-test split",
    "Start with RandomizedSearchCV, then fine-tune with GridSearchCV",
    "Compare multiple model types before settling on one",
    "Use learning curves to diagnose overfitting/underfitting",
    "Keep a holdout test set that you NEVER touch during development",
    "Consider real-world constraints (speed, interpretability, maintenance)",
]
next_steps = [
    "Apply these techniques to your own dataset",
    "Experiment with ensemble methods (Voting, Stacking)",
    "Learn about advanced CV strategies (Nested CV, Group CV)",
    "Explore automated ML libraries (Auto-sklearn, TPOT)",
]

print(f"\n🎯 KEY TAKEAWAYS:")
print("-" * 20)
for number, takeaway in enumerate(takeaways, start=1):
    print(f"{number}. {takeaway}")

print(f"\n🚀 Next Steps:")
for step in next_steps:
    print(f"- {step}")

# Closing banner: a blank line, a 50-character rule, then the sign-off.
print("\n" + "=" * 50)
print("Tutorial Complete! Happy Machine Learning! 🤖")