Tech Project 2 - Comparing Ensemble Methods - Stacking Regressor

Diogo Pessoa


In [1]:
import pandas as pd

In [2]:
combined_data=pd.read_csv('combined_data.csv').set_index('video_id')
label = combined_data['label']
features = combined_data.drop(['label'],axis=1)

## Scaling Dataset features

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features.values, label.values, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train)
x_test_sc = sc.transform(x_test)

In [10]:
print('check_data_split',[x_train_sc.shape,x_test_sc.shape,y_train.shape,y_test.shape])

check_data_split [(123653, 38), (30914, 38), (123653,), (30914,)]


## Training models to generate the baseline scores

* [Stacking Regressor](https://scikit-learn.org/stable/modules/ensemble.html#stacking)
* [GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn.ensemble.GradientBoostingRegressor) as Final estimator
* [RidgeCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html#sklearn.linear_model.RidgeCV)
* [LassoCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV)
* [KNeighborsRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html#sklearn.neighbors.KNeighborsRegressor)

In [7]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor

# default values
n_estimators=140
rd_state=42
estimators = [('ridge', RidgeCV()),('lasso', LassoCV(random_state=rd_state)), ('knr', KNeighborsRegressor(n_neighbors=20,metric='euclidean'))]


In [14]:
final_estimator = GradientBoostingRegressor(random_state=rd_state, n_estimators=n_estimators, max_depth=40, min_samples_leaf=25)
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=final_estimator) 

In [15]:
"""Traing models with unscaled data"""
stacking_regressor.fit(x_train, y_train)

In [17]:
stacking_regressor_pred = stacking_regressor.predict(x_test)

In [22]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

print('|Ensemble|mean squared error|', '|r2 score|', '|explained variance score|')
print(f'|Stacking Regressor|{mean_squared_error(y_test, stacking_regressor_pred)} | {r2_score(y_test, stacking_regressor_pred)} | {explained_variance_score(y_test, stacking_regressor_pred)}|')
"""Estimators: RidgeCV, LassoCV, KNeighborsRegressor, GradientBoostingRegressor"""

|Ensemble|mean squared error| |r2 score| |explained variance score|
|Stacking Regressor|0.4227565962173764 | 0.8718890758241176 | 0.8719155515438354|


'Estimators: RidgeCV, LassoCV, KNeighborsRegressor, GradientBoostingRegressor'

In [23]:
stacking_regressor.transform(x_test[:5])

array([[22.40572766, 23.37070596, 14.50793681],
       [22.60511809, 22.46346369, 13.61999483],
       [13.15150847, 15.09706885, 14.51991039],
       [31.96042596, 31.90347318, 13.82580957],
       [20.96778725, 21.88939949, 14.52554169]])

## Training models with scaled data

In [20]:
stacking_regressor.fit(x_train_sc, y_train)
stacking_regressor_pred_sc = stacking_regressor.predict(x_test_sc)

print('|Ensemble|mean squared error|', '|r2 score|', '|explained variance score|')
print(f'|Stacking Regressor (scaled set)|{mean_squared_error(y_test, stacking_regressor_pred)} | {r2_score(y_test, stacking_regressor_pred)} | {explained_variance_score(y_test, stacking_regressor_pred)}|')
"""Estimators: RidgeCV, LassoCV, KNeighborsRegressor, GradientBoostingRegressor"""

|Ensemble|mean squared error| |r2 score| |explained variance score|
|Stacking Regressor (scaled set)|0.4227565962173764 | 0.8718890758241176 | 0.8719155515438354|


'Estimators: RidgeCV, LassoCV, KNeighborsRegressor, GradientBoostingRegressor'