# Machine Learning Specialization - Advanced Learning Algorithms
## Week 4: Decision Trees and Ensemble Methods

### Learning Objectives:
- Understand how decision trees work and their advantages
- Learn ensemble methods like Random Forests and Boosting
- Implement XGBoost for high-performance machine learning
- Compare tree-based methods with neural networks
- Handle both regression and classification tasks

### Key Concepts:
- **Decision Trees**: Hierarchical models that make decisions by splitting data
- **Ensemble Methods**: Combining multiple models for better performance
- **Random Forests**: Bagging ensemble of decision trees
- **Boosting**: Sequential ensemble methods (AdaBoost, Gradient Boosting)
- **XGBoost**: Optimized gradient boosting implementation

Tree-based methods offer interpretability, handle mixed data types, and often perform exceptionally well without extensive preprocessing.

### 1. Import Required Libraries

Let's import the necessary libraries for our tree-based exercises.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_regression, load_breast_cancer
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
import seaborn as sns
from IPython.display import Image
import graphviz

# Seed NumPy's global random generator so NumPy-driven randomness is repeatable
np.random.seed(42)

# Report that the imports worked and which XGBoost build is installed
print(
    "Libraries imported successfully!",
    f"XGBoost version: {xgb.__version__}",
    sep="\n",
)

### 2. Generate Datasets for Tree-Based Methods

We'll create both regression and classification datasets to demonstrate tree-based methods.

In [None]:
# Synthetic classification data: 10 features, 6 informative and 2 redundant
X_clf, y_clf = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=6,
    n_redundant=2,
    n_clusters_per_class=2,
    random_state=42,
)

# Synthetic regression data: 8 features, generated with noise=10
X_reg, y_reg = make_regression(n_samples=1000, n_features=8, noise=10, random_state=42)

print(f"Classification dataset: X = {X_clf.shape}, y = {y_clf.shape}")
print(f"Regression dataset: X = {X_reg.shape}, y = {y_reg.shape}")
print(f"Classification classes: {np.unique(y_clf)}")

# Hold out 20% of each dataset for testing; the fixed seed keeps the split repeatable
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(
    X_clf,
    y_clf,
    test_size=0.2,
    random_state=42,
)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

print(f"\nClassification - Train: {len(X_clf_train)}, Test: {len(X_clf_test)}")
print(f"Regression - Train: {len(X_reg_train)}, Test: {len(X_reg_test)}")

### 3. Decision Trees Fundamentals

Decision trees work by recursively splitting data based on feature values to minimize impurity:

- **Root Node**: Starting point with all data
- **Internal Nodes**: Decision points that split data
- **Leaf Nodes**: Final predictions
- **Splitting Criteria**: Gini impurity (classification) or MSE (regression)

Key advantages: decision trees are interpretable, handle mixed data types, and need no feature scaling.

In [None]:
# Build and visualize a simple decision tree
def build_simple_tree(max_depth=3):
    """Fit a small decision tree on a 2-feature slice of the training data and plot it.

    The first 100 rows and first two columns of the module-level
    ``X_clf_train`` / ``y_clf_train`` are used so that both the tree diagram
    and its decision regions can be drawn in two dimensions.

    Args:
        max_depth: Depth limit handed to ``DecisionTreeClassifier``.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    # Small 2-D subset: few enough points and features to visualize
    features = X_clf_train[:100, :2]
    labels = y_clf_train[:100]

    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    clf.fit(features, labels)

    # Draw the tree structure itself
    plt.figure(figsize=(20, 10))
    plot_tree(
        clf,
        feature_names=['Feature 1', 'Feature 2'],
        class_names=['Class 0', 'Class 1'],
        filled=True,
        rounded=True,
    )
    plt.title(f'Decision Tree (max_depth={max_depth})')
    plt.show()

    # Draw the regions of feature space assigned to each class
    boundary_title = f'Decision Tree (depth={max_depth})'
    plot_decision_boundaries_tree(features, labels, clf, boundary_title)

    return clf

def plot_decision_boundaries_tree(X, y, model, title):
    """Shade a model's predicted class over a 2-D grid and overlay the data points.

    Args:
        X: Feature array; only columns 0 and 1 are read.
        y: Values used to color the scattered points.
        model: Fitted estimator whose ``predict`` accepts an (n, 2) array.
        title: Title placed on the figure.
    """
    first, second = X[:, 0], X[:, 1]

    # 100 x 100 grid covering the data range padded by 1 unit on every side
    grid_x = np.linspace(first.min() - 1, first.max() + 1, 100)
    grid_y = np.linspace(second.min() - 1, second.max() + 1, 100)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict once for every grid point, then fold back into the grid shape
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    regions = model.predict(grid_points).reshape(xx.shape)

    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, regions, alpha=0.3, cmap='RdYlBu')
    plt.scatter(first, second, c=y, cmap='RdYlBu', edgecolors='black', alpha=0.8)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.show()

# Build and visualize trees of increasing depth on the 2-feature subset
depths = [1, 2, 3, 5, 8]
trees = []

for depth in depths:
    print(f"\nBuilding decision tree with max_depth = {depth}")
    tree = build_simple_tree(depth)
    trees.append(tree)

    # Score on the same 100-row, 2-feature slice the tree was fitted on
    train_acc = tree.score(X_clf_train[:100, :2], y_clf_train[:100])
    print(f"Training accuracy: {train_acc:.4f}")

# Repeat the depth sweep on the full training set and compare train vs test accuracy
train_scores, test_scores = [], []

for depth in depths:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    tree.fit(X_clf_train, y_clf_train)

    train_acc = tree.score(X_clf_train, y_clf_train)
    train_scores.append(train_acc)

    test_acc = tree.score(X_clf_test, y_clf_test)
    test_scores.append(test_acc)

    print(f"Depth {depth}: Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

# Accuracy vs depth: the gap between the two curves shows how much deeper trees overfit
plt.figure(figsize=(10, 6))
for scores, fmt, series in (
    (train_scores, 'b-o', 'Training Accuracy'),
    (test_scores, 'r-o', 'Test Accuracy'),
):
    plt.plot(depths, scores, fmt, label=series)
plt.xlabel('Tree Depth')
plt.ylabel('Accuracy')
plt.title('Bias-Variance Tradeoff: Tree Depth vs Performance')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

### 4. Random Forests - Bagging Ensemble

Random Forests combine multiple decision trees trained on random subsets of data:

- **Bootstrap Sampling**: Random sampling with replacement
- **Feature Randomness**: Random feature subset at each split
- **Voting/Averaging**: Combine predictions from all trees

Advantages: Reduced overfitting, better generalization, feature importance.

In [None]:
# Build and evaluate Random Forest
def evaluate_random_forest(n_estimators_list, max_depth_list):
    """Score every (n_estimators, max_depth) pair with 3-fold cross-validation.

    Each Random Forest is cross-validated on the module-level ``X_clf_train``
    / ``y_clf_train`` and a one-line summary is printed per configuration.

    Args:
        n_estimators_list: Tree counts to try (outer loop).
        max_depth_list: Depth limits to try (inner loop).

    Returns:
        A list of dicts, one per configuration in evaluation order, with keys
        ``'n_estimators'``, ``'max_depth'``, ``'cv_accuracy'`` and ``'cv_std'``.
    """
    grid = [(n, d) for n in n_estimators_list for d in max_depth_list]
    summary = []

    for n_trees, depth_limit in grid:
        forest = RandomForestClassifier(
            n_estimators=n_trees,
            max_depth=depth_limit,
            random_state=42,
            n_jobs=-1,
        )

        # Mean and spread of accuracy across the three folds
        fold_scores = cross_val_score(forest, X_clf_train, y_clf_train, cv=3, scoring='accuracy')
        mean_acc = np.mean(fold_scores)
        spread = np.std(fold_scores)

        summary.append({
            'n_estimators': n_trees,
            'max_depth': depth_limit,
            'cv_accuracy': mean_acc,
            'cv_std': spread,
        })

        print(f"RF(n_est={n_trees}, depth={depth_limit}): CV Acc = {mean_acc:.4f} (+/- {spread:.4f})")

    return summary

# Candidate hyperparameters for the Random Forest sweep
n_estimators_list = [10, 50, 100, 200]
max_depth_list = [3, 5, 10, None]

rf_results = evaluate_random_forest(n_estimators_list, max_depth_list)

# Refit the configuration with the highest mean CV accuracy on the full training set
best_rf_result = max(rf_results, key=lambda result: result['cv_accuracy'])
best_rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    **{key: best_rf_result[key] for key in ('n_estimators', 'max_depth')},
)
best_rf.fit(X_clf_train, y_clf_train)

# Final evaluation on the training data and the held-out test data
rf_train_acc = best_rf.score(X_clf_train, y_clf_train)
rf_test_acc = best_rf.score(X_clf_test, y_clf_test)

print("\nBest Random Forest Configuration:")
print(f"n_estimators: {best_rf_result['n_estimators']}")
print(f"max_depth: {best_rf_result['max_depth']}")
print(f"Training Accuracy: {rf_train_acc:.4f}")
print(f"Test Accuracy: {rf_test_acc:.4f}")

# Bar chart of the fitted forest's per-feature importance
feature_importance_rf = best_rf.feature_importances_
feature_positions = range(len(feature_importance_rf))

plt.figure(figsize=(10, 6))
plt.bar(feature_positions, feature_importance_rf)
plt.xlabel('Feature Index')
plt.ylabel('Importance')
plt.title('Random Forest Feature Importance')
plt.grid(True, alpha=0.3)
plt.show()

### 5. Boosting Methods

Boosting builds models sequentially, where each new model corrects errors of previous ones:

- **AdaBoost**: Adapts by changing sample weights
- **Gradient Boosting**: Fits residuals (gradients) of previous models
- **XGBoost**: Optimized gradient boosting with regularization

In [None]:
# Compare boosting methods, each with 100 estimators and a fixed seed
boosting_models = {
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'),
}

boosting_results = {}

for name, model in boosting_models.items():
    print(f"\nTraining {name}...")

    # Fit on the full training set, then score on train and test
    model.fit(X_clf_train, y_clf_train)
    train_acc = model.score(X_clf_train, y_clf_train)
    test_acc = model.score(X_clf_test, y_clf_test)

    # 3-fold cross-validation on the training set
    cv_scores = cross_val_score(model, X_clf_train, y_clf_train, cv=3, scoring='accuracy')

    boosting_results[name] = {
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'model': model,
    }

    print(f"{name} Results:")
    print(f"  Training Accuracy: {train_acc:.4f}")
    print(f"  Test Accuracy: {test_acc:.4f}")
    print(f"  CV Accuracy: {boosting_results[name]['cv_mean']:.4f} (+/- {boosting_results[name]['cv_std']:.4f})")

# Collect one list per metric, ordered like model_names, for plotting
model_names = list(boosting_results)
train_accs, test_accs, cv_means, cv_stds = (
    [boosting_results[method][metric] for method in model_names]
    for metric in ('train_accuracy', 'test_accuracy', 'cv_mean', 'cv_std')
)

plt.figure(figsize=(12, 6))

# Left panel: grouped bars, three metrics per method
plt.subplot(1, 2, 1)
x = np.arange(len(model_names))
for offset, heights, series in (
    (-0.2, train_accs, 'Training'),
    (0.0, test_accs, 'Test'),
    (0.2, cv_means, 'CV Mean'),
):
    plt.bar(x + offset, heights, 0.2, label=series, alpha=0.8)
plt.xlabel('Boosting Method')
plt.ylabel('Accuracy')
plt.title('Boosting Methods Comparison')
plt.xticks(x, model_names, rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: mean CV accuracy with one standard deviation as error bars
plt.subplot(1, 2, 2)
plt.errorbar(model_names, cv_means, yerr=cv_stds, fmt='o-', capsize=5)
plt.ylabel('Cross-Validation Accuracy')
plt.title('CV Accuracy with Error Bars')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 6. XGBoost Deep Dive

XGBoost is an optimized implementation of gradient boosting with several advantages:

- **Regularization**: Built-in L1 and L2 regularization
- **Parallel Processing**: Fast training on multi-core CPUs
- **Tree Pruning**: Removes splits whose loss reduction falls below a threshold (the `gamma` parameter), which limits overfitting
- **Handling Missing Values**: Built-in missing value handling
- **Early Stopping**: Automatic stopping when validation performance stops improving

In [None]:
# XGBoost hyperparameter tuning
def tune_xgboost(X_train, y_train, X_val, y_val):
    """Pick the best of a few hand-chosen XGBoost configurations on a validation set.

    Each candidate is trained with early stopping (10 rounds, log loss measured
    on the validation set) and ranked by validation accuracy. Progress and the
    winning configuration are printed.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features, used for early stopping and model selection.
        y_val: Validation labels.

    Returns:
        A ``(best_model, best_params)`` tuple: the fitted ``XGBClassifier`` with
        the highest validation accuracy (the earliest candidate wins ties) and
        the parameter dict it was built from.
    """
    # A full grid over max_depth, learning_rate, n_estimators, subsample,
    # colsample_bytree, reg_alpha (L1) and reg_lambda (L2) would be slow here,
    # so only a small hand-picked subset of combinations is tried. In practice,
    # use a more sophisticated search.
    test_params = [
        {'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 100},
        {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 100},
        {'max_depth': 3, 'learning_rate': 0.2, 'n_estimators': 100},
        {'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 200},
        {'max_depth': 4, 'learning_rate': 0.1, 'n_estimators': 150, 'reg_alpha': 0.1}
    ]

    # Start below any attainable accuracy so the first candidate is always
    # accepted; starting at 0 would return (None, None) if every candidate
    # scored exactly 0.0.
    best_score = float('-inf')
    best_params = None
    best_model = None

    print("Tuning XGBoost hyperparameters...")

    for i, params in enumerate(test_params):
        model = XGBClassifier(
            **params,
            random_state=42,
            eval_metric='logloss',
            early_stopping_rounds=10
        )

        # The validation set drives early stopping, so it must be passed as eval_set
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        val_score = model.score(X_val, y_val)

        print(f"Config {i+1}: {params} -> Val Acc = {val_score:.4f}")

        # Strict comparison keeps the earliest candidate on ties
        if val_score > best_score:
            best_score = val_score
            best_params = params
            best_model = model

    print(f"\nBest XGBoost configuration: {best_params}")
    print(f"Best validation accuracy: {best_score:.4f}")

    return best_model, best_params

# Tune XGBoost
# Split a validation set off the training data so the test set stays untouched while tuning
X_clf_train_tune, X_clf_val_tune, y_clf_train_tune, y_clf_val_tune = train_test_split(
    X_clf_train, y_clf_train, test_size=0.2, random_state=42
)

best_xgb, best_xgb_params = tune_xgboost(X_clf_train_tune, y_clf_train_tune, 
                                         X_clf_val_tune, y_clf_val_tune)

# Final evaluation
# NOTE: X_clf_train also contains the validation rows split off above, which
# best_xgb was not fitted on, so this "training" score mixes fitted and
# validation rows.
xgb_train_acc = best_xgb.score(X_clf_train, y_clf_train)
xgb_test_acc = best_xgb.score(X_clf_test, y_clf_test)

print(f"\nXGBoost Final Results:")
print(f"Training Accuracy: {xgb_train_acc:.4f}")
print(f"Test Accuracy: {xgb_test_acc:.4f}")

# XGBoost feature importance
xgb.plot_importance(best_xgb, max_num_features=10)
plt.title('XGBoost Feature Importance')
plt.show()

# Plot the learning curve recorded during fitting.
# 'validation_0' is the first (and only) eval_set entry given to fit() in
# tune_xgboost, i.e. the validation split, so the curve is validation loss,
# not training loss.
plt.figure(figsize=(10, 6))
plt.plot(best_xgb.evals_result()['validation_0']['logloss'], label='Validation Loss')
plt.xlabel('Boosting Round')
plt.ylabel('Log Loss')
plt.title('XGBoost Training Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

### 7. Tree-Based Regression

Tree-based methods work for regression too, using MSE as the splitting criterion.

In [None]:
# Compare tree-based regression methods
regression_models = {
    'Decision Tree': DecisionTreeRegressor(max_depth=5, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
}

regression_results = {}

for name, model in regression_models.items():
    print(f"\nTraining {name} for regression...")

    model.fit(X_reg_train, y_reg_train)

    # Error on the data the model was fitted on
    train_pred = model.predict(X_reg_train)
    train_mse = mean_squared_error(y_reg_train, train_pred)
    train_rmse = np.sqrt(train_mse)

    # Error on the held-out test data
    test_pred = model.predict(X_reg_test)
    test_mse = mean_squared_error(y_reg_test, test_pred)
    test_rmse = np.sqrt(test_mse)

    regression_results[name] = dict(
        train_mse=train_mse,
        test_mse=test_mse,
        train_rmse=train_rmse,
        test_rmse=test_rmse,
    )

    print(f"{name} Results:")
    print(f"  Training MSE: {train_mse:.4f}, RMSE: {train_rmse:.4f}")
    print(f"  Test MSE: {test_mse:.4f}, RMSE: {test_rmse:.4f}")

# Collect the MSE values per model, ordered like model_names_reg, for plotting
model_names_reg = list(regression_results)
train_mses, test_mses = (
    [regression_results[method][metric] for method in model_names_reg]
    for metric in ('train_mse', 'test_mse')
)

plt.figure(figsize=(12, 6))

# Left panel: training vs test MSE side by side for each method
plt.subplot(1, 2, 1)
x_reg = np.arange(len(model_names_reg))
for offset, heights, series in (
    (-0.2, train_mses, 'Training MSE'),
    (0.2, test_mses, 'Test MSE'),
):
    plt.bar(x_reg + offset, heights, 0.4, label=series, alpha=0.8)
plt.xlabel('Regression Method')
plt.ylabel('Mean Squared Error')
plt.title('Tree-Based Regression Comparison')
plt.xticks(x_reg, model_names_reg, rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)

# Pick the model with the lowest test MSE (best_reg_model holds its name)
best_reg_model = min(regression_results, key=lambda method: regression_results[method]['test_mse'])
best_reg = regression_models[best_reg_model]
y_pred_best = best_reg.predict(X_reg_test)

# Right panel: predictions against actual values; the dashed diagonal marks perfect predictions
plt.subplot(1, 2, 2)
plt.scatter(y_reg_test, y_pred_best, alpha=0.6, color='blue')
diag_min, diag_max = y_reg_test.min(), y_reg_test.max()
plt.plot([diag_min, diag_max], [diag_min, diag_max], 'r--', linewidth=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title(f'{best_reg_model} Predictions vs Actual')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nBest regression model: {best_reg_model}")
print(f"Test RMSE: {regression_results[best_reg_model]['test_rmse']:.4f}")

### 8. Model Comparison: Trees vs Neural Networks

Let's compare tree-based methods with neural networks on our datasets.

In [None]:
# Compare best models from each category
import time

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Neural network for classification: two hidden ReLU layers, sigmoid output
nn_clf = Sequential([
    Dense(64, activation='relu', input_shape=(X_clf_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
nn_clf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_clf.fit(X_clf_train, y_clf_train, epochs=50, batch_size=32, verbose=0)

# Neural network for regression: same hidden layers, single linear output
nn_reg = Sequential([
    Dense(64, activation='relu', input_shape=(X_reg_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])
nn_reg.compile(optimizer='adam', loss='mse')
nn_reg.fit(X_reg_train, y_reg_train, epochs=50, batch_size=32, verbose=0)

# Evaluate neural networks
# evaluate() returns [loss, accuracy] for the classifier (one compiled metric)
# and a single loss value (MSE) for the regressor (no metrics).
nn_clf_acc = nn_clf.evaluate(X_clf_test, y_clf_test, verbose=0)[1]
nn_reg_mse = nn_reg.evaluate(X_reg_test, y_reg_test, verbose=0)
nn_reg_rmse = np.sqrt(nn_reg_mse)

# Final comparison
print("\n" + "="*60)
print("FINAL MODEL COMPARISON")
print("="*60)

print("\nCLASSIFICATION:")
print("-" * 30)
# Only the decision tree is new here, so it is fitted inline. best_rf and
# best_xgb are already fitted and are scored as they are. Refitting best_xgb
# would fail: it was built with early_stopping_rounds, and calling fit()
# without an eval_set raises an error in XGBoost.
models_clf = {
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_clf_train, y_clf_train),
    'Random Forest': best_rf,
    'XGBoost': best_xgb,
    'Neural Network': nn_clf
}

for name, model in models_clf.items():
    if name == 'Neural Network':
        # Keras models have no score(); reuse the accuracy computed above
        acc = nn_clf_acc
    else:
        acc = model.score(X_clf_test, y_clf_test)
    print(f"{name:15}: Test Accuracy = {acc:.4f}")

print("\nREGRESSION:")
print("-" * 30)
models_reg = {
    'Decision Tree': DecisionTreeRegressor(max_depth=5, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
    'Neural Network': nn_reg
}

for name, model in models_reg.items():
    if name == 'Neural Network':
        rmse = nn_reg_rmse
    else:
        # These regressors are freshly constructed, so they must be fitted here
        model.fit(X_reg_train, y_reg_train)
        pred = model.predict(X_reg_test)
        rmse = np.sqrt(mean_squared_error(y_reg_test, pred))
    print(f"{name:15}: Test RMSE = {rmse:.4f}")

# Training time comparison (rough estimate)
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can be coarse or jump with clock adjustments.
print("\nTRAINING TIME COMPARISON (approximate):")
print("-" * 40)

# Time XGBoost training
start_time = time.perf_counter()
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_clf_train, y_clf_train)
xgb_time = time.perf_counter() - start_time

# Time Random Forest training
start_time = time.perf_counter()
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_clf_train, y_clf_train)
rf_time = time.perf_counter() - start_time

print(f"Random Forest: {rf_time:.2f} seconds")
print(f"XGBoost:       {xgb_time:.2f} seconds")
# Ratio > 1 means XGBoost trained faster than the Random Forest
print(f"Speedup:       {rf_time/xgb_time:.1f}x")

### 9. Experimentation and Questions

Now let's explore some advanced tree-based concepts:

1. **Tree Pruning**: Experiment with different pruning strategies and minimum samples per leaf.

2. **Feature Engineering**: How do tree-based methods handle categorical features vs neural networks?

3. **Scalability**: How do different methods scale with dataset size and feature count?

4. **Interpretability**: Compare the interpretability of decision trees vs complex ensembles.

5. **Missing Values**: How do different methods handle missing data?

**Challenge**: Implement a simple gradient boosting algorithm from scratch.

In [None]:
# Experiment: Tree hyperparameters and their effects
def experiment_tree_params():
    """Fit decision trees under four hyperparameter settings and report each one.

    Every tree is trained on the module-level ``X_clf_train`` / ``y_clf_train``
    and scored on both the training and the test split. A short report is
    printed per configuration.

    Returns:
        A list of dicts, one per configuration, with keys ``'config'``,
        ``'train_acc'``, ``'test_acc'``, ``'n_leaves'`` and ``'depth'``.
    """
    settings = [
        dict(max_depth=3, min_samples_split=2, min_samples_leaf=1),
        dict(max_depth=10, min_samples_split=2, min_samples_leaf=1),
        dict(max_depth=10, min_samples_split=10, min_samples_leaf=5),
        # No depth limit and the smallest split/leaf sizes: effectively no pruning
        dict(max_depth=None, min_samples_split=2, min_samples_leaf=1),
    ]

    outcomes = []

    for number, config in enumerate(settings, start=1):
        clf = DecisionTreeClassifier(random_state=42, **config)
        clf.fit(X_clf_train, y_clf_train)

        record = {
            'config': config,
            'train_acc': clf.score(X_clf_train, y_clf_train),
            'test_acc': clf.score(X_clf_test, y_clf_test),
            'n_leaves': clf.get_n_leaves(),
            'depth': clf.get_depth(),
        }
        outcomes.append(record)

        print(f"Config {number}: {config}")
        print(f"  Train Acc: {record['train_acc']:.4f}, Test Acc: {record['test_acc']:.4f}")
        print(f"  Tree Depth: {record['depth']}, Leaves: {record['n_leaves']}")
        print()

    return outcomes

# Run the hyperparameter experiment
param_results = experiment_tree_params()

# Pull the plotted quantities out of the result records, one list per field
configs_labels = [f"Config {number}" for number in range(1, len(param_results) + 1)]
train_accs, test_accs, complexities = (
    [record[field] for record in param_results]
    for field in ('train_acc', 'test_acc', 'n_leaves')
)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: accuracy against leaf count, each test point tagged with its config
for accs, series in ((train_accs, 'Training'), (test_accs, 'Test')):
    axes[0].scatter(complexities, accs, label=series, s=100, alpha=0.7)
for label, leaves, acc in zip(configs_labels, complexities, test_accs):
    axes[0].annotate(label, (leaves, acc), xytext=(5, 5), textcoords='offset points')
axes[0].set_xlabel('Number of Leaves (Complexity)')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Model Complexity vs Performance')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right panel: training vs test accuracy side by side for each configuration
x_pos = np.arange(len(configs_labels))
for offset, accs, series in ((-0.2, train_accs, 'Training'), (0.2, test_accs, 'Test')):
    axes[1].bar(x_pos + offset, accs, 0.4, label=series, alpha=0.7)
axes[1].set_xlabel('Configuration')
axes[1].set_ylabel('Accuracy')
axes[1].set_title('Bias-Variance Analysis')
axes[1].set_xticks(x_pos)
axes[1].set_xticklabels(configs_labels, rotation=45)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Report the configuration with the highest test accuracy and its train/test gap
best_config = max(param_results, key=lambda record: record['test_acc'])
generalization_gap = best_config['train_acc'] - best_config['test_acc']
print(f"Best configuration: {best_config['config']}")
print(f"Best test accuracy: {best_config['test_acc']:.4f}")
print(f"Gap between train and test: {generalization_gap:.4f}")

### Key Takeaways

1. **Decision Trees** are interpretable but prone to overfitting without proper constraints.

2. **Random Forests** reduce overfitting through bagging and provide feature importance.

3. **Boosting Methods** (especially XGBoost) often achieve state-of-the-art performance.

4. **Tree-based methods** handle mixed data types and missing values, and they don't require feature scaling.

5. **Model Selection** depends on the problem: trees for interpretability, ensembles for performance.

### Next Steps

In the Unsupervised Learning section, we'll explore clustering and anomaly detection techniques that work without labeled data.