# Boosting Machine Learning Models


---

## Boosting

Boosting is an ensemble technique that combines multiple weak learners into a single strong learner. The models are trained sequentially: each new model is trained to correct the errors made by the models before it, so the combined ensemble performs better than any individual weak learner.


### Adaptive Boosting Overview

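Adaptive Boosting, or AdaBoost, is one of the earliest boosting algorithms. It combines multiple weak classifiers (often decision stumps, i.e. decision trees of depth 1) into a strong classifier. After each round, the training samples that were misclassified receive more weight, so the next weak classifier concentrates on the harder cases.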


In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Describe the dataset: where it lives, what its columns are called, and
# which column is the prediction target
path_to_data = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "accep"]
target_column = "accep"
raw_feature_columns = [name for name in column_names if name != target_column]

# Read the data into a pandas DataFrame, supplying the column names explicitly
df = pd.read_csv(path_to_data, names=column_names)

# Encode the feature columns as dummy variables (first level of each dropped)
X = pd.get_dummies(df[raw_feature_columns], drop_first=True)

# Binarize the target in place: 'unacc' becomes 0, every other label becomes 1
df[target_column] = (df[target_column] != "unacc").astype(int)
y = df[target_column]

# Hold out 30% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# 1. Base model: a decision stump, i.e. a decision tree limited to depth 1
decision_stump = DecisionTreeClassifier(max_depth=1)
print(decision_stump.get_params())

# 2. AdaBoost ensemble made of 5 copies of the decision stump
ada_classifier = AdaBoostClassifier(n_estimators=5, estimator=decision_stump)
print(ada_classifier.get_params())

# 3. Train on the training split, then predict the held-out test split
#    (fit returns the fitted estimator, so the calls can be chained)
y_pred = ada_classifier.fit(X_train, y_train).predict(X_test)

# 4. Score the test-set predictions with each metric, in the order listed
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Test set accuracy:\t{accuracy}",
    f"Test set precision:\t{precision}",
    f"Test set recall:\t{recall}",
    f"Test set f1-score:\t{f1}",
    sep="\n",
)

# 5. Confusion matrix with the positive class (1) listed first, labelled
#    for readability
test_conf_matrix = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred, labels=[1, 0]),
    index=["actual yes", "actual no"],
    columns=["predicted yes", "predicted no"],
)

print(f"Confusion Matrix:\n{test_conf_matrix.to_string()}")


{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}
{'algorithm': 'SAMME.R', 'estimator__ccp_alpha': 0.0, 'estimator__class_weight': None, 'estimator__criterion': 'gini', 'estimator__max_depth': 1, 'estimator__max_features': None, 'estimator__max_leaf_nodes': None, 'estimator__min_impurity_decrease': 0.0, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__monotonic_cst': None, 'estimator__random_state': None, 'estimator__splitter': 'best', 'estimator': DecisionTreeClassifier(max_depth=1), 'learning_rate': 1.0, 'n_estimators': 5, 'random_state': None}
Test set accuracy:	0.8574181117533719
Test set precision:	0.7247191011235955
Test set recall:	0.8376623376623377
Test set 



---

### Gradient Boosting Overview

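Gradient Boosting is another powerful boosting technique. It builds an additive model in a stage-wise fashion, where each new model is fit to reduce the remaining error of the current ensemble, and it generalizes boosting by allowing the optimization of an arbitrary differentiable loss function.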


In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

# Describe the dataset: where it lives, what its columns are called, and
# which column is the prediction target
path_to_data = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "accep"]
target_column = "accep"
raw_feature_columns = [name for name in column_names if name != target_column]

# Read the data into a pandas DataFrame, supplying the column names explicitly
df = pd.read_csv(path_to_data, names=column_names)

# Encode the feature columns as dummy variables (first level of each dropped)
X = pd.get_dummies(df[raw_feature_columns], drop_first=True)

# Binarize the target in place: 'unacc' becomes 0, every other label becomes 1
df[target_column] = (df[target_column] != "unacc").astype(int)
y = df[target_column]

# Hold out 30% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# 1. Gradient Boosting classifier with 15 boosting stages; show its settings
grad_classifier = GradientBoostingClassifier(n_estimators=15)
print(grad_classifier.get_params())

# 2. Train on the training split, then predict the held-out test split
#    (fit returns the fitted estimator, so the calls can be chained)
y_pred = grad_classifier.fit(X_train, y_train).predict(X_test)

# 3. Score the test-set predictions with each metric, in the order listed
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Test set accuracy:\t{accuracy}",
    f"Test set precision:\t{precision}",
    f"Test set recall:\t{recall}",
    f"Test set f1-score:\t{f1}",
    sep="\n",
)

# 4. Confusion matrix with the positive class (1) listed first, labelled
#    for readability
test_conf_matrix = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred, labels=[1, 0]),
    index=["actual yes", "actual no"],
    columns=["predicted yes", "predicted no"],
)

print(f"Confusion Matrix:\n{test_conf_matrix.to_string()}")

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 15, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
Test set accuracy:	0.8978805394990366
Test set precision:	0.7885714285714286
Test set recall:	0.8961038961038961
Test set f1-score:	0.8389057750759878
Confusion Matrix:
            predicted yes  predicted no
actual yes            138            16
actual no              37           328
