### Adaboost (Adaptive Boosting)

In [21]:
# load packages
import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse



In [2]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, shuffle=True, random_state=42)


In [3]:
# instantiate a decision tree classifier
dtc = DecisionTreeClassifier(max_depth=1, random_state=42)

# instantiate an adaboost classifier
adaboost = AdaBoostClassifier(base_estimator=dtc, n_estimators=100).fit(X_train, y_train)
y_pred_proba = adaboost.predict_proba(X_test)[:, 1]  # check why it's [:, 1]
# print(y_pred_proba)

In [4]:
# evaluate test set roc_auc_score
roc_score = roc_auc_score(y_test, y_pred_proba)
print(f'ADABOOST ROC AUC score: {round(roc_score, 2)}')

ADABOOST ROC AUC score: 0.99


In [5]:
# initial Decision Tree's score with max-depth=1
dtc_prob = DecisionTreeClassifier(max_depth=1, random_state=42)
dtc_prob.fit(X_train, y_train)
dtc_roc_auc = roc_auc_score(y_test, dtc_prob.predict_proba(X_test)[:, 1])
print(f'DECISION TREE ROC AUC score: {round(dtc_roc_auc, 2)}')

DECISION TREE ROC AUC score: 0.9


### How to find estimators?


In [18]:
params = {
    'max_depth': [2, 3, 4, 5, 5],
    'min_samples_leaf': [.04, .06, .08],
    'max_features': [.2, .4, .6, .8]
}

dt = DecisionTreeClassifier()
print(dt.get_params())

grid_search = GridSearchCV(dt, param_grid=params, scoring='accuracy', cv=10, n_jobs=-1).fit(X_train, y_train)
print(f'Best params: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')

best_estimator = grid_search.best_estimator_
print(f'Tuned DT Classifier\'s score: {best_estimator.score(X_test, y_test)}')

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
Best params: {'max_depth': 2, 'max_features': 0.8, 'min_samples_leaf': 0.06}
Best CV score: 0.9423076923076923
Tuned DT Classifier's score: 0.9181286549707602


In [20]:
# roc-auc version
params_dt = {'max_depth': [2, 3, 4], 'min_samples_leaf': [.12, .14, .16, .18]}

# Instantiate grid_dt
grid_dt = GridSearchCV(estimator=dt,
                       param_grid=params_dt,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1).fit(X_train, y_train)


# Extract the best estimator
best_model = grid_dt.best_estimator_

# Predict the test set probabilities of the positive class
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Compute test_roc_auc
test_roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print test_roc_auc
print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))

Test set ROC AUC score: 0.965


In [22]:
# tuning Forest Hyperparameters
params = {
    'n_estimators': [300, 400, 500],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [.1, .2],
    'max_features': ['log2', 'sqrt']
}

grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=3,
                              scoring='neg_mean_squared_error', verbose=1, n_jobs=-1).fit(X_train, y_train)
print(f'Best parameters: {grid_search_rf.best_params_}')
best_model = grid_search_rf.best_estimator_
y_pred = best_estimator.predict(X_test)
rmse = mse(y_test, y_pred)**(1/2)
print(f'RMSE: {rmse}')

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters: {'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'n_estimators': 400}
RMSE: 0.2861316917596507


### Gradient Boosting


In [9]:
X_b, y_b = load_boston(return_X_y=True)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b, y_b, test_size=.3,
                                                            shuffle=True, random_state=32)

# instantiate GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=300, max_depth=1, random_state=15).fit(X_train_b, y_train_b)
rmse_gradient = mse(y_test_b, gbr.predict(X_test_b))**(1/2)
print(f'RMSE score with Gradient Boosting Regressor: {round(rmse_gradient, 2)}')

# compare with LinearRegression
lin_reg = LinearRegression().fit(X_train_b, y_train_b)
rmse_linear = mse(y_test_b, lin_reg.predict(X_test_b))**(1/2)
print(f'RMSE score with Linear Regression : {round(rmse_linear, 2)}')

# compare with AdaBoostRegressor
ada_reg = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=1),
                            n_estimators=200, random_state=32).fit(X_train_b, y_train_b)
rmse_adaboost = mse(y_test_b, ada_reg.predict(X_test_b))**(1/2)
print(f'RMSE score with AdaBoost : {round(rmse_adaboost, 2)}')

RMSE score with Gradient Boosting Regressor: 4.0
RMSE score with Linear Regression : 5.09
RMSE score with AdaBoost : 6.88


### Stochastic Gradient Boosting


In [7]:
# instantiate a stochastic regressor
stochastic_boosting = GradientBoostingRegressor(max_depth=1, subsample=0.8, max_features=.2,
                                                n_estimators=300, random_state=42)
stochastic_boosting.fit(X_train_b, y_train_b)
y_pred = stochastic_boosting.predict(X_test_b)
rmse_stochastic = mse(y_test_b, y_pred)**(1/2)
print(f'RMSE score with Stochastic Gradient Boosting : {round(rmse_stochastic, 2)}')

RMSE score with Stochastic Gradient Boosting : 3.76
