Using KFold to evaluate the performance of ensembles
---
* [sklearn KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html)

In [1]:
import pandas as pd

# Load the combined dataset and key every row by its video id
combined_data = (
    pd.read_csv('combined_data.csv')
    .set_index('video_id')
)

# Everything except the target column is a feature; the target is 'label'
features = combined_data.drop(columns='label')
label = combined_data.loc[:, 'label']

In [13]:
from sklearn.preprocessing import StandardScaler
# Use one scaler per array. Re-fitting a single StandardScaler on the label
# discards the feature statistics it learned, so the features could no longer
# be transformed or inverse-transformed consistently afterwards.
feature_scaler = StandardScaler()
label_scaler = StandardScaler()

features_sc = feature_scaler.fit_transform(features.values)

# StandardScaler expects a 2-D array, so reshape the label to a single column
# and flatten the result back to 1-D.
label_sc = label_scaler.fit_transform(label.values.reshape(-1, 1)).flatten()

# Backward compatibility: `sc` used to end this cell fitted on the label,
# so keep the name bound to the label scaler.
sc = label_scaler

In [ ]:
# TODO: load the ensemble models, then use KFold to evaluate their performance

In [ ]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression

# 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Estimator to cross-validate
model = LinearRegression()

# The 'neg_mean_squared_error' scorer returns negated MSE, one value per fold
scores = cross_val_score(
    estimator=model,
    X=features,
    y=label,
    scoring='neg_mean_squared_error',
    cv=kf,
)

# Undo the sign flip to get per-fold MSE, then take the root for RMSE
mse_scores = np.negative(scores)
rmse_scores = np.sqrt(mse_scores)

print(f"RMSE scores for each fold: {rmse_scores}")
print(f"Average RMSE: {np.mean(rmse_scores)}")

In [ ]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

# Number of base learners shared by every ensemble below
n_estimators = 140

# Build the (still unfitted) regressors
grad_boosting_regressor = GradientBoostingRegressor(
    n_estimators=n_estimators, random_state=42
)
ada_boosting_regressor = AdaBoostRegressor(
    n_estimators=n_estimators, random_state=42
)
random_forest_regressor = RandomForestRegressor(
    n_estimators=n_estimators, random_state=42
)
# NOTE(review): seed is 1 here but 42 for the other ensembles - confirm this is intentional
bagging_regressor = BaggingRegressor(
    n_estimators=n_estimators, random_state=1
)
linear_regr = LinearRegression()

# Voting ensemble over all five regressors above
voting_reg = VotingRegressor(
    estimators=[
        ('gb', grad_boosting_regressor),
        ('rf', random_forest_regressor),
        ('lr', linear_regr),
        ('ada_b', ada_boosting_regressor),
        ('bagging_r', bagging_regressor),
    ]
)

In [ ]:
# Build the hold-out split that this cell and the predict / metrics cells rely on.
# x_train_Trans, x_test_Trans, y_train and y_test were referenced but never defined
# in this notebook, so a fresh-kernel "Run All" stopped here with a NameError.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# NOTE(review): the 80/20 split and seed are chosen here because the original
# split settings are not visible in this notebook - adjust if they differed.
x_train, x_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only so test-set statistics do not leak in
holdout_scaler = StandardScaler().fit(x_train)
x_train_Trans = holdout_scaler.transform(x_train)
x_test_Trans = holdout_scaler.transform(x_test)

# Fit every regressor on the same transformed training data
for estimator in (
    grad_boosting_regressor,
    ada_boosting_regressor,
    bagging_regressor,
    random_forest_regressor,
    linear_regr,
    voting_reg,
):
    estimator.fit(x_train_Trans, y_train)

In [ ]:
# Predict on the transformed test features with every fitted regressor.
# The estimator order below matches the order of the target names.
(
    gdb_prediction,
    rf_prediction,
    lg_prediction,
    ada_b_prediction,
    bagging_r_prediction,
    voting_reg_prediction,
) = (
    fitted.predict(x_test_Trans)
    for fitted in (
        grad_boosting_regressor,
        random_forest_regressor,
        linear_regr,
        ada_boosting_regressor,
        bagging_regressor,
        voting_reg,
    )
)

In [ ]:
# The metric functions were never imported anywhere in the notebook, so this
# cell raised NameError on a fresh kernel; import them where they are used.
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

# Expects y_test and the *_prediction arrays to be in scope.
# One row per model: name, MSE, R^2, explained variance.
print('mean squared error', 'r2 score', 'explained variance score')
for model_name, prediction in (
    ('GradientBoostingRegressor', gdb_prediction),
    ('RandomForestRegressor', rf_prediction),
    ('LinearRegression', lg_prediction),
    ('AdaBoostRegressor', ada_b_prediction),
    ('BaggingRegressor', bagging_r_prediction),
    ('VotingRegressor', voting_reg_prediction),
):
    print(
        model_name,
        mean_squared_error(y_test, prediction),
        r2_score(y_test, prediction),
        explained_variance_score(y_test, prediction),
    )