# 5.3 Supervised Learning Part 2 Tutorial

This notebook covers advanced supervised learning algorithms including:
- Random Forests
- Gradient Boosting
- Neural Networks
- Ensemble Methods

In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Set random seed for reproducibility.
# This seeds NumPy's global random number generator; the scikit-learn calls in
# this notebook additionally pass random_state=42 explicitly.
np.random.seed(42)

## 1. Random Forests

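Let's train a Random Forest on a synthetic classification dataset, inspect which features it considers most important, and see how the number of trees affects cross-validated performance.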

In [None]:
# Generate a synthetic dataset: 1000 samples, 20 features
# (15 informative + 5 redundant).
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42,
)

# Split the data: 80% for training, 20% held out for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = rf.predict(X_test)

# Print performance metrics
print("Random Forest Performance:")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot feature importance (features are labelled 1-based, sorted most important first)
feature_importance = pd.DataFrame({
    'feature': [f'Feature {i+1}' for i in range(X.shape[1])],
    'importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
plt.title('Top 10 Feature Importance in Random Forest')
plt.show()

# Analyze number of trees using 5-fold cross-validation on the training split.
# The candidate forests get their own name so that `rf` keeps referring to the
# fitted 100-tree model above instead of being replaced by an unfitted one.
n_trees = [10, 50, 100, 200]
scores = []

for n in n_trees:
    rf_candidate = RandomForestClassifier(n_estimators=n, random_state=42)
    score = cross_val_score(rf_candidate, X_train, y_train, cv=5).mean()
    scores.append(score)

plt.figure(figsize=(10, 6))
plt.plot(n_trees, scores, marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('Cross-validation Score')
plt.title('Impact of Number of Trees on Performance')
plt.grid(True)
plt.show()

## 2. Gradient Boosting

Let's implement Gradient Boosting and analyze its behavior.

In [None]:
# Create and train Gradient Boosting model
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred_gb = gb.predict(X_test)

# Print performance metrics
print("Gradient Boosting Performance:")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))

# Plot feature importance (features are labelled 1-based, sorted most important first)
feature_importance_gb = pd.DataFrame({
    'feature': [f'Feature {i+1}' for i in range(X.shape[1])],
    'importance': gb.feature_importances_
})
feature_importance_gb = feature_importance_gb.sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance_gb.head(10))
plt.title('Top 10 Feature Importance in Gradient Boosting')
plt.show()

# Analyze learning rate impact using 5-fold cross-validation on the training split.
# The candidate models get their own name so that `gb` keeps referring to the
# fitted model above instead of being replaced by an unfitted one.
learning_rates = [0.01, 0.1, 0.5, 1.0]
scores_lr = []

for lr in learning_rates:
    gb_candidate = GradientBoostingClassifier(
        n_estimators=100, learning_rate=lr, max_depth=3, random_state=42
    )
    score = cross_val_score(gb_candidate, X_train, y_train, cv=5).mean()
    scores_lr.append(score)

# The learning rates span two orders of magnitude, hence the logarithmic x-axis
plt.figure(figsize=(10, 6))
plt.semilogx(learning_rates, scores_lr, marker='o')
plt.xlabel('Learning Rate')
plt.ylabel('Cross-validation Score')
plt.title('Impact of Learning Rate on Performance')
plt.grid(True)
plt.show()

## 3. Neural Networks

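Let's explore neural networks for classification. Because multi-layer perceptrons are sensitive to the scale of their inputs, we standardize the features before training.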

In [None]:
# Scale features for neural network.
# The scaler is fit on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train neural network (two hidden layers: 100 and 50 units)
nn = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
nn.fit(X_train_scaled, y_train)

# Make predictions on the held-out test set
y_pred_nn = nn.predict(X_test_scaled)

# Print performance metrics
print("Neural Network Performance:")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nn))

# Plot learning curve (training loss recorded by the fitted model)
plt.figure(figsize=(10, 6))
plt.plot(nn.loss_curve_)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Neural Network Learning Curve')
plt.grid(True)
plt.show()

# Compare different architectures with 5-fold cross-validation.
# Scaling is placed inside a pipeline and cross-validation runs on the unscaled
# training data, so each fold fits its scaler on that fold's training part only.
# Cross-validating on X_train_scaled would let validation-fold statistics leak
# into the preprocessing. The candidates also get their own name so that `nn`
# keeps referring to the fitted model above.
architectures = [(50,), (100,), (100, 50), (100, 100)]
scores_nn = []

for arch in architectures:
    nn_candidate = make_pipeline(
        StandardScaler(),
        MLPClassifier(hidden_layer_sizes=arch, max_iter=1000, random_state=42),
    )
    score = cross_val_score(nn_candidate, X_train, y_train, cv=5).mean()
    scores_nn.append(score)

plt.figure(figsize=(10, 6))
plt.bar(range(len(architectures)), scores_nn)
plt.xticks(range(len(architectures)), [str(arch) for arch in architectures])
plt.xlabel('Network Architecture')
plt.ylabel('Cross-validation Score')
plt.title('Performance of Different Neural Network Architectures')
plt.show()

## 4. Ensemble Methods

Let's combine multiple models using ensemble techniques.

In [None]:
# Create base models (fresh, unfitted estimators for the comparison)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
nn = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

# Create voting classifier that combines the three base models with hard voting
voting_clf = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('nn', nn)],
    voting='hard'
)

# Train and evaluate all models
models = {
    'Random Forest': rf,
    'Gradient Boosting': gb,
    'Neural Network': nn,
    'Ensemble': voting_clf
}

# Compare model performances.
# Every model is trained and evaluated on the same scaled features so the
# comparison shares one input representation.
model_scores = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    # Use a dedicated name so the Random Forest predictions stored in `y_pred`
    # by the first section are not overwritten.
    y_pred_model = model.predict(X_test_scaled)
    model_scores[name] = accuracy_score(y_test, y_pred_model)

    print(f"\n{name} Performance:")
    print(classification_report(y_test, y_pred_model))

# Plot model comparison (dict views are materialised into lists for matplotlib)
plt.figure(figsize=(10, 6))
plt.bar(list(model_scores.keys()), list(model_scores.values()))
plt.ylabel('Accuracy')
plt.title('Model Comparison')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Practice Exercises

1. Implement a bagging classifier and compare it with the random forest.

2. Experiment with different neural network activation functions and optimizers.

3. Create a stacking ensemble using different base models.

4. Analyze the trade-off between model complexity and performance.

5. Implement early stopping in gradient boosting and neural networks.