Tuning the best performing models
---

After an initial review of Bagging, Boosting and Stacking, I'll now focus on the three best-performing models and apply PCA and GridSearchCV to try to improve their predictive accuracy.

Diogo Pessoa

In [None]:
import pandas as pd

# Read the combined dataset and key each row by its video id.
combined_data = (
    pd.read_csv('combined_data.csv')
    .set_index('video_id')
)

# Everything except the 'label' column is a feature; 'label' is the target.
features = combined_data.drop(columns=['label'])
label = combined_data['label']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features.values,
    label.values,
    test_size=0.2,
    random_state=0,
)

# Learn the scaling statistics from the training split only, then apply them
# to both splits so no test-set information leaks into the scaler.
sc = StandardScaler()
sc.fit(x_train)
x_train_sc = sc.transform(x_train)
x_test_sc = sc.transform(x_test)

In [None]:
# TODO - List the best models
# Scratch notes: scores recorded for each model family (Bagging, Boosting,
# Stacking) plus the RandomForest / LinearRegression comparison rows.
# Each score row reads: model name, mean squared error, r2 score,
# explained variance score.
# The notes are kept as a bare string literal, so they have no effect on any
# variable used by the other cells.
"""

RandomForestRegressor 0.19906876900824988 0.9399100491646171 0.9399541017739272
- Performed quite well, however we'll skip this model since it was used on notebook provided.
  - Keeping it in table for comparison.
Baseline: LinearRegression 0.446259142254775 0.8652943399834946 0.8653391206880249


Bagging performing well. 
Ensemble Method 'mean squared error', 'r2 score', 'explained variance score'
BaggingRegressor 0.1989198958725586 0.9399549873005636 0.9400026998167074
VotingRegressor 0.2962409649534499 0.9105781127388121 0.9114122683042218 (Interesting as we can combine the other models under review. Yet, it's not the best performing model.)


Boosting -  keeing at least one for comparison

Ensemble Method 'mean squared error', 'r2 score', 'explained variance score'
GradientBoostingRegressor 0.3799829950579041 0.8853001422319302 0.885301770354857

Stacking - In its own notebook

|Ensemble|mean squared error| |r2 score| |explained variance score|
|Stacking Regressor|0.4227565962173764 | 0.8718890758241176 | 0.8719155515438354|

'Estimators: RidgeCV, LassoCV, KNeighborsRegressor, GradientBoostingRegressor'

"""


Reference: [scikit-learn `GridSearchCV` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [None]:
# PCA
from sklearn.decomposition import PCA

# PCA picks the directions of largest variance, so it must see the
# standardized features; on the raw values, the columns with the largest
# numeric range would dominate the components. The scaled arrays come from the
# StandardScaler fitted on the training split earlier in the notebook.
# random_state only matters if scikit-learn selects a randomized SVD solver;
# pinning it keeps the projection reproducible in that case.
pca = PCA(n_components=34, random_state=0)

# Fit on the training split only, then reuse the same projection for the test split.
x_train_Trans = pca.fit_transform(x_train_sc)
x_test_Trans = pca.transform(x_test_sc)


In [ ]:


import time

from sklearn.ensemble import (
    AdaBoostRegressor,
    BaggingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    VotingRegressor,
)
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# default values
n_estimators = 140
rd_state = 42

# Base learners that the VotingRegressor averages over.
gbr = GradientBoostingRegressor(random_state=rd_state)
grad_boosting_regressor = gbr
random_forest_regressor = RandomForestRegressor(random_state=rd_state)
linear_regr = LinearRegression()
ada_boosting_regressor = AdaBoostRegressor(random_state=rd_state)

bagging_regressor = BaggingRegressor(random_state=rd_state, n_estimators=n_estimators)
voting_reg = VotingRegressor(estimators=[
    ('gb', grad_boosting_regressor),
    ('rf', random_forest_regressor),
    ('lr', linear_regr),
    ('ada_b', ada_boosting_regressor),
    ('bagging_r', bagging_regressor),
])

# GridSearchCV accepts exactly one estimator, not a list. To compare the two
# candidates in a single search, wrap them in a one-step pipeline and let each
# param_grid entry swap the 'model' step and tune that candidate's own
# hyperparameters.
model_pipeline = Pipeline([('model', bagging_regressor)])
param_grid = [
    {
        'model': [bagging_regressor],
        'model__n_estimators': [70, n_estimators],
        'model__max_samples': [0.5, 1.0],
    },
    {
        'model': [voting_reg],
        'model__gb__n_estimators': [100, 200],
    },
]

# With several scorers, `refit` must name the metric used to pick the winner;
# without it best_params_ and predict() are not available after fitting.
grid = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring=['neg_mean_squared_error', 'r2', 'explained_variance'],
    refit='neg_mean_squared_error',
    n_jobs=-1,
)

# Record the start time
start_time = time.time()
grid.fit(x_train_Trans, y_train)
# Record the end time
end_time = time.time()
duration = end_time - start_time
print(f"GridSearchCV took {duration:.2f} seconds.")

# Last expression of the cell so the notebook displays the winning configuration.
grid.best_params_

In [ ]:
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

# Predict the held-out split with the fitted grid-search object and compute
# the three regression metrics in one pass.
y_pred = grid.predict(x_test_Trans)
mse, r_two_score, ex_variance_score = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, r2_score, explained_variance_score)
)

print(f'Mean Squared Error on Test Set: {mse}')
header_cells = ['|Ensemble|mean squared error|', '|r2 score|', '|explained variance score|']
print(' '.join(header_cells))
print(f'|Bagging & Voting Regressor|{mse} | {r_two_score} | {ex_variance_score}|')

# Reference scores before optimization (MSE, r2, explained variance):
# BaggingRegressor 0.1989198958725586 0.9399549873005636 0.9400026998167074
# VotingRegressor 0.2962409649534499 0.9105781127388121 0.9114122683042218

In [ ]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsRegressor

# Define the base models, with optimized parameters
base_models = [
    ('ridge', RidgeCV()),
    ('lasso', LassoCV(random_state=42)),
    ('knr', KNeighborsRegressor(n_neighbors=20, metric='euclidean'))
]
# Define the meta-model
meta_model = GradientBoostingRegressor(random_state=42)
# Create the stacking model
stacked_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Define the KFold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation.
# cross_val_score accepts a single scorer only; cross_validate is the
# multi-metric variant and returns one 'test_<metric>' array per scorer.
cv_results = cross_validate(
    stacked_model,
    x_train_Trans,
    y_train,
    scoring=['neg_mean_squared_error', 'r2', 'explained_variance'],
    cv=kf,
)

# Convert scores to positive (since they are negative mean squared errors)
scores = cv_results['test_neg_mean_squared_error']
mse_scores = -scores

# Calculate RMSE for each fold
rmse_scores = np.sqrt(mse_scores)

print(f"RMSE scores for each fold: {rmse_scores}")
print(f"Average RMSE: {np.mean(rmse_scores)}")
print(f"Average r2: {np.mean(cv_results['test_r2'])}")
print(f"Average explained variance: {np.mean(cv_results['test_explained_variance'])}")

In [ ]:
# Predict with the stacking model and print its test-set metrics

from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

# Cross-validation only fits clones of the estimator, so the stacking model
# itself has to be fitted on the training split before it can predict.
# Predicting with `grid` here would report the Bagging/Voting search result
# under the "Stacking Regressor" label.
stacked_model.fit(x_train_Trans, y_train)
y_pred = stacked_model.predict(x_test_Trans)

mse = mean_squared_error(y_test, y_pred)
r_two_score = r2_score(y_test, y_pred)
ex_variance_score = explained_variance_score(y_test, y_pred)
print(f'Mean Squared Error on Test Set: {mse}')
print('|Ensemble|mean squared error|', '|r2 score|', '|explained variance score|')
print(f'|Stacking Regressor|{mse} | {r_two_score} | {ex_variance_score}|')