Tech Project 2 - Comparing Ensemble Methods - Stacking Regressor

Diogo Pessoa


In [53]:
%run data_loader.ipynb

Data loaded:
- x_train_sc: Scaled training features.
- x_test_sc: Scaled testing features.
- x_train - Training features.
- x_test - Testing features.
- y_train - Training labels.
- y_test - Testing labels.


## Training models to generate the baseline scores

* [Stacking Regressor](https://scikit-learn.org/stable/modules/ensemble.html#stacking)
* [GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn.ensemble.GradientBoostingRegressor) as Final estimator
* [RidgeCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html#sklearn.linear_model.RidgeCV)
* [LassoCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV)
* [KNeighborsRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html#sklearn.neighbors.KNeighborsRegressor)

In [7]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor

# default values
n_estimators=140
rd_state=42
estimators = [('ridge', RidgeCV()),('lasso', LassoCV(random_state=rd_state)), ('knr', KNeighborsRegressor(n_neighbors=20,metric='euclidean'))]


In [8]:
final_estimator = GradientBoostingRegressor(random_state=rd_state, n_estimators=n_estimators, max_depth=40, min_samples_leaf=25)
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=final_estimator) 

In [15]:
"""Traing models with unscaled data"""
stacking_regressor.fit(x_train, y_train)

In [17]:
stacking_regressor_pred = stacking_regressor.predict(x_test)

In [22]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

print('|Ensemble|mean squared error|', '|r2 score|', '|explained variance score|')
print(f'|Stacking Regressor|{mean_squared_error(y_test, stacking_regressor_pred)} | {r2_score(y_test, stacking_regressor_pred)} | {explained_variance_score(y_test, stacking_regressor_pred)}|')
"""Estimators: RidgeCV, LassoCV, KNeighborsRegressor, GradientBoostingRegressor"""

|Ensemble|mean squared error| |r2 score| |explained variance score|
|Stacking Regressor|0.4227565962173764 | 0.8718890758241176 | 0.8719155515438354|


'Estimators: RidgeCV, LassoCV, KNeighborsRegressor, GradientBoostingRegressor'

In [23]:
stacking_regressor.transform(x_test[:5])

array([[22.40572766, 23.37070596, 14.50793681],
       [22.60511809, 22.46346369, 13.61999483],
       [13.15150847, 15.09706885, 14.51991039],
       [31.96042596, 31.90347318, 13.82580957],
       [20.96778725, 21.88939949, 14.52554169]])

## Training models with scaled data

In [None]:
stacking_regressor.fit(x_train_sc, y_train)
stacking_regressor_pred_sc = stacking_regressor.predict(x_test_sc)

print('|Ensemble|mean squared error|', '|r2 score|', '|explained variance score|')
print(f'|Stacking Regressor (scaled set)|{mean_squared_error(y_test, stacking_regressor_pred)} | {r2_score(y_test, stacking_regressor_pred)} | {explained_variance_score(y_test, stacking_regressor_pred)}|')
"""Estimators: RidgeCV, LassoCV, KNeighborsRegressor, GradientBoostingRegressor"""

## Tuning with PCA and GridSearchCV

In [9]:
# code here
x_train, x_test, y_train, y_test = train_test_split(features.values, label.values, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler

# PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=34)
x_train_Trans = pca.fit_transform(x_train)
x_test_Trans = pca.transform(x_test)

In [31]:
import time
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor

# Define the base models
# Record the start time
start_time = time.time()

base_models = [
    ('ridgecv', RidgeCV()),
    ('lasso', LassoCV(random_state=42)),
    ('knr', KNeighborsRegressor(metric='euclidean'))
]

# Define the meta-model
meta_model = GradientBoostingRegressor(random_state=42)

# Create the stacking model
stacked_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)


param_grid = {
    'knr__n_neighbors': [20, 30, 40],
    'final_estimator__n_estimators': [50, 100, 200],
    'final_estimator__learning_rate': [0.01, 0.2]
}
stacked_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Only tune the max depth of the trees in the RF hyperparameter.
grid = GridSearchCV(estimator=stacked_model,
                    param_grid=param_grid, cv=5, scoring=['neg_mean_squared_error','r2','explained_variance'], n_jobs=-1)
grid.fit(x_train_Trans, y_train)
# Record the end time
end_time = time.time()

# Calculate the duration
duration = end_time - start_time
grid.best_params_


{'final_estimator__learning_rate': 0.2,
 'final_estimator__n_estimators': 200,
 'knr__n_neighbors': 20}

In [33]:
print(f"GridSearchCV took {duration:.2f} seconds.")
print("Best parameters found: ", grid.best_params_)
print("Best score found: ", -grid.best_score_)
print("Best estimator found: ", grid.best_estimator_)

Best parameters found:  {'final_estimator__learning_rate': 0.2, 'final_estimator__n_estimators': 200, 'knr__n_neighbors': 20}
Best score found:  0.40120837512676627
Best estimator found:  StackingRegressor(estimators=[('ridgecv', RidgeCV()),
                              ('lasso', LassoCV(random_state=42)),
                              ('knr',
                               KNeighborsRegressor(metric='euclidean',
                                                   n_neighbors=20))],
                  final_estimator=GradientBoostingRegressor(learning_rate=0.2,
                                                            n_estimators=200,
                                                            random_state=42))


In [36]:
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
y_pred = grid.predict(x_test_Trans)
mse = mean_squared_error(y_test, y_pred)
r_two_score = r2_score(y_test, y_pred)
ex_variance_score = explained_variance_score(y_test, y_pred)


Mean Squared Error on Test Set: 0.39391360028603656
r2 score: 0.8810951160621501
explained variance score: 0.8811403306632517


In [37]:
print(f'Mean Squared Error on Test Set: {mse}\nr2 score: {r_two_score}\nexplained variance score: {ex_variance_score}')

Mean Squared Error on Test Set: 0.39391360028603656
r2 score: 0.8810951160621501
explained variance score: 0.8811403306632517


Running Kfold Validation for Stacking with optmized parameters
---

In [39]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
import time
# Define the base models, with opmitized parameters

base_models = [
    ('ridge', RidgeCV()),
    ('lasso', LassoCV(random_state=42)),
    ('knr', KNeighborsRegressor(n_neighbors=20, metric='euclidean'))
]
# Define the meta-model
meta_model = GradientBoostingRegressor(random_state=42, n_estimators=200, learning_rate=0.2)
# Create the stacking model
stacked_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Define the KFold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation
# Record the start time
start_time = time.time()
scores = cross_val_score(stacked_model, x_train_Trans, y_train,  scoring='neg_root_mean_squared_error', cv=kf)

# Convert scores to positive (since they are negative mean squared errors)
mse_scores = -scores

# Calculate RMSE for each fold
rmse_scores = np.sqrt(mse_scores)
end_time = time.time()

# Calculate the duration
duration = end_time - start_time

In [44]:
print(f"KFold run took {duration:.2f} seconds.")
print(f"RMSE scores for each fold: {rmse_scores}")
print(f"Average RMSE: {np.mean(rmse_scores)}")

KFold run took 47.79 seconds.
RMSE scores for each fold: [0.79531567 0.79746593 0.79502865 0.79649287 0.79416668]
Average RMSE: 0.7956939598410445


StackingRegressor execution
---

In [43]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
import time

# Define the base models, with opmitized parameters
base_models = [
    ('ridge', RidgeCV()),
    ('lasso', LassoCV(random_state=42)),
    ('knr', KNeighborsRegressor(n_neighbors=20, metric='euclidean'))
]
# Define the meta-model
meta_model = GradientBoostingRegressor(random_state=42, n_estimators=200, learning_rate=0.2)
# Create the stacking model
stacked_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Record the start time
start_time = time.time()
stacked_model.fit(x_train_Trans, y_train)
# Calculate the duration
end_time = time.time()
duration = end_time - start_time

In [45]:
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

gbr_pred = stacked_model.predict(x_test_Trans)

mse = mean_squared_error(y_test, gbr_pred)
r_two_score = r2_score(y_test, gbr_pred)
ex_variance_score = explained_variance_score(y_test, gbr_pred)

print(f"Stacking Model training took {duration:.2f} seconds.")
print(f'Mean Squared Error on Test Set: {mse}')
print(f'r2 score: {r_two_score}')
print(f'explained variance score: {ex_variance_score}')

Stacking Model training took 47.79 seconds.
Mean Squared Error on Test Set: 0.39391360028603656
r2 score: 0.8810951160621501
explained variance score: 0.8811403306632517
