Tech Project 2 - Comparing Ensemble Methods - Collecting Baseline Scores

Diogo Pessoa


In [19]:
# Execute the data-loading notebook before anything else in this notebook.
# The cells below use x_train, x_test, x_train_sc, x_test_sc, y_train and
# y_test without defining them, so they are presumably created by this
# loader — TODO confirm against data_loader.ipynb.
%run data_loader.ipynb

check_data_split [(123653, 38), (30914, 38), (123653,), (30914,)]


## Training models to generate the baseline scores

* [Linear Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)
* [Voting Regressor](https://scikit-learn.org/stable/modules/ensemble.html#voting-regressor)
* [Random Forest Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
* [Gradient Boosting Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
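* [AdaBoost Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html)
* [Bagging Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)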

In [20]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression

# Settings passed to the ensemble constructors in the cells below.
n_estimators = 140  # number of base estimators per ensemble
rd_state = 42  # value passed as random_state

In [21]:
"""Training classifiers"""

grad_boosting_regressor = GradientBoostingRegressor(random_state=rd_state, n_estimators=n_estimators)
bagging_regressor = BaggingRegressor(random_state=rd_state, n_estimators=n_estimators)
linear_regr = LinearRegression()
voting_reg = VotingRegressor(estimators=[('gb', grad_boosting_regressor), ('rf', random_forest_regressor), ('lr', linear_regr), ('ada_b', ada_boosting_regressor), ('bagging_r', bagging_regressor)]) 

In [22]:
"""Traing models with scaled data"""

grad_boosting_regressor.fit(x_train_sc, y_train)
ada_boosting_regressor.fit(x_train_sc, y_train)
bagging_regressor.fit(x_train_sc, y_train)
random_forest_regressor.fit(x_train_sc, y_train)
linear_regr.fit(x_train_sc, y_train)
voting_reg.fit(x_train_sc, y_train)

In [23]:
# Predict on the scaled test set with each fitted model, in the original order:
# gradient boosting, random forest, linear regression, AdaBoost, bagging, voting.
(
    gdb_prediction,
    rf_prediction,
    lg_prediction,
    ada_b_prediction,
    bagging_r_prediction,
    voting_reg_prediction,
) = (
    fitted_model.predict(x_test_sc)
    for fitted_model in (
        grad_boosting_regressor,
        random_forest_regressor,
        linear_regr,
        ada_boosting_regressor,
        bagging_regressor,
        voting_reg,
    )
)


In [24]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

# Report MSE, R^2 and explained variance for each model's predictions on the
# scaled test set, one line per model, preceded by a header line.
print('mean squared error', 'r2 score', 'explained variance score')
for label, prediction in (
    ('GradientBoostingRegressor', gdb_prediction),
    ('RandomForestRegressor', rf_prediction),
    ('LinearRegression', lg_prediction),
    ('AdaBoostRegressor', ada_b_prediction),
    ('BaggingRegressor', bagging_r_prediction),
    ('VotingRegressor', voting_reg_prediction),
):
    print(
        label,
        mean_squared_error(y_test, prediction),
        r2_score(y_test, prediction),
        explained_variance_score(y_test, prediction),
    )

mean squared error r2 score explained variance score
GradientBoostingRegressor 0.3799829950579041 0.8853001422319302 0.885301770354857
RandomForestRegressor 0.19906876900824988 0.9399100491646171 0.9399541017739272
LinearRegression 0.446259142254775 0.8652943399834946 0.8653391206880249
AdaBoostRegressor 0.7022505612825702 0.7880219890251048 0.8031254897239358
BaggingRegressor 0.1989198958725586 0.9399549873005636 0.9400026998167074
VotingRegressor 0.2962409649534499 0.9105781127388121 0.9114122683042218


Baseline run without scaling the dataset
---


In [27]:
"""Traing models with scaled data"""

grad_boosting_regressor.fit(x_train, y_train)
ada_boosting_regressor.fit(x_train, y_train)
bagging_regressor.fit(x_train, y_train)
random_forest_regressor.fit(x_train, y_train)
linear_regr.fit(x_train, y_train)
voting_reg.fit(x_train, y_train)

gdb_prediction = grad_boosting_regressor.predict(x_test)
rf_prediction = random_forest_regressor.predict(x_test)
lg_prediction = linear_regr.predict(x_test)
ada_b_prediction = ada_boosting_regressor.predict(x_test)
bagging_r_prediction = bagging_regressor.predict(x_test)
voting_reg_prediction = voting_reg.predict(x_test)


print('mean squared error', 'r2 score', 'explained variance score')
print('GradientBoostingRegressor',mean_squared_error(y_test, gdb_prediction),r2_score(y_test, gdb_prediction), explained_variance_score(y_test, gdb_prediction))
print('RandomForestRegressor',mean_squared_error(y_test, rf_prediction),r2_score(y_test, rf_prediction), explained_variance_score(y_test, rf_prediction))
print('LinearRegression',mean_squared_error(y_test, lg_prediction),r2_score(y_test, lg_prediction), explained_variance_score(y_test, lg_prediction))
print('AdaBoostRegressor',mean_squared_error(y_test, ada_b_prediction),r2_score(y_test, ada_b_prediction), explained_variance_score(y_test, ada_b_prediction))
print('BaggingRegressor',mean_squared_error(y_test, bagging_r_prediction),r2_score(y_test, bagging_r_prediction), explained_variance_score(y_test, bagging_r_prediction))
print('VotingRegressor',mean_squared_error(y_test, voting_reg_prediction),r2_score(y_test, voting_reg_prediction), explained_variance_score(y_test, voting_reg_prediction))

mean squared error r2 score explained variance score
GradientBoostingRegressor 0.3799829950579041 0.8853001422319302 0.885301770354857
RandomForestRegressor 0.1991333798990529 0.9398905460287353 0.939935318622876
LinearRegression 0.44612348891543074 0.865335287654656 0.8653361695326518
AdaBoostRegressor 0.6742065545504626 0.7964872193781877 0.8077946072986688
BaggingRegressor 0.19904752299768105 0.939916462384214 0.9399615996543444
VotingRegressor 0.2951647086331412 0.9109029863475444 0.9114979650403896
