## Import Libaries and Packages

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVR
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.feature_selection import SelectKBest, f_regression

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning);

## Import and Investigate the Data

In [57]:
df_train = pd.read_csv('/Users/ben/Desktop/DSI_GA_Materials/project_5/data/df_2000_to_2017_pollution_renewables.csv')

In [58]:
df_test = pd.read_csv('/Users/ben/Desktop/DSI_GA_Materials/project_5/data/df_2018_to_2020_pollution_renewables.csv')

In [59]:
df_train.dtypes

State                                                                object
Year                                                                  int64
Month                                                                 int64
O3 Mean                                                             float64
O3 1st Max Value                                                    float64
O3 1st Max Hour                                                     float64
O3 AQI                                                              float64
CO Mean                                                             float64
CO 1st Max Value                                                    float64
CO 1st Max Hour                                                     float64
CO AQI                                                              float64
SO2 Mean                                                            float64
SO2 1st Max Value                                                   float64
SO2 1st Max 

In [60]:
df_train.shape

(6063, 27)

In [61]:
df_test.shape

(1536, 27)

In [62]:
df_train.columns

Index(['State', 'Year', 'Month', 'O3 Mean', 'O3 1st Max Value',
       'O3 1st Max Hour', 'O3 AQI', 'CO Mean', 'CO 1st Max Value',
       'CO 1st Max Hour', 'CO AQI', 'SO2 Mean', 'SO2 1st Max Value',
       'SO2 1st Max Hour', 'SO2 AQI', 'NO2 Mean', 'NO2 1st Max Value',
       'NO2 1st Max Hour', 'NO2 AQI',
       'Renewable energy share in the total final energy consumption (%)',
       'Electricity from fossil fuels (TWh)', 'Electricity from nuclear (TWh)',
       'Electricity from renewables (TWh)',
       'Low-carbon electricity (% electricity)',
       'Primary energy consumption per capita (kWh/person)',
       'Energy intensity level of primary energy (MJ/$2017 PPP GDP)',
       'Renewables (% equivalent primary energy)'],
      dtype='object')

### Renewable energy amounts are by number of pollution observations, these are approximations based on the pollution amounts. Approximations make renewable, fossil fuel, energy usage uniform across states in order to get a rough estimate for totals upon the test set. 

## Modeling Section

### Define features and perform train-test split

In [63]:
df_train = pd.get_dummies(df_train, columns=['State'], prefix='State')

In [64]:
df_train

Unnamed: 0,Year,Month,O3 Mean,O3 1st Max Value,O3 1st Max Hour,O3 AQI,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,...,State_South Carolina,State_South Dakota,State_Tennessee,State_Texas,State_Utah,State_Vermont,State_Virginia,State_Washington,State_Wisconsin,State_Wyoming
0,2000,1,0.018242,0.033239,9.945652,31.891304,1.336178,2.400000,10.130435,27.250000,...,False,False,False,False,False,False,False,False,False,False
1,2000,2,0.023883,0.042200,10.387500,44.050000,0.985815,1.772500,9.212500,20.162500,...,False,False,False,False,False,False,False,False,False,False
2,2000,3,0.027969,0.045933,10.483146,44.157303,0.690650,1.179775,7.966292,13.426966,...,False,False,False,False,False,False,False,False,False,False
3,2000,4,0.034200,0.054920,10.760000,58.040000,0.661423,1.154000,9.160000,13.220000,...,False,False,False,False,False,False,False,False,False,False
4,2000,5,0.038819,0.059276,10.379310,70.620690,0.539292,0.920690,6.086207,10.568966,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6058,2017,8,0.043358,0.052194,9.064516,49.709677,0.189534,0.209677,1.161290,2.096774,...,False,False,False,False,False,False,False,False,False,True
6059,2017,9,0.037946,0.046000,10.350000,45.400000,0.196542,0.255000,5.400000,2.700000,...,False,False,False,False,False,False,False,False,False,True
6060,2017,10,0.034492,0.041727,9.863636,38.727273,0.121151,0.150000,3.000000,1.500000,...,False,False,False,False,False,False,False,False,False,True
6061,2017,11,0.028938,0.035960,10.760000,33.320000,0.119833,0.164000,5.520000,1.640000,...,False,False,False,False,False,False,False,False,False,True


In [65]:
X_features = ['Year', 'Month', 'O3 Mean', 'O3 1st Max Value', 'O3 1st Max Hour',
               'O3 AQI', 'CO Mean', 'CO 1st Max Value', 'CO 1st Max Hour', 'CO AQI',
               'SO2 Mean', 'SO2 1st Max Value', 'SO2 1st Max Hour', 'SO2 AQI',
               'NO2 Mean', 'NO2 1st Max Value', 'NO2 1st Max Hour', 'NO2 AQI'] + list(df_train.columns[df_train.columns.str.startswith('State_')])

y_features = ['Electricity from fossil fuels (TWh)', 'Electricity from nuclear (TWh)',
              'Electricity from renewables (TWh)', 'Low-carbon electricity (% electricity)']

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

### Scale Data

In [67]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

### Start with Linear, Lasso, Ridge, Regressions

In [68]:
lin = MultiOutputRegressor(LinearRegression())

In [69]:
lin.fit(X_train_sc, y_train)

In [70]:
#predict
predictions_lin = lin.predict(X_test_sc)

# Calculate scores for each variable
scores_r2 = [r2_score(y_test.iloc[:, i], predictions_lin[:, i]) for i in range(len(y_features))]
scores_mse = [mean_squared_error(y_test.iloc[:, i], predictions_lin[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse[i]}\n ")

# Display overall scores
train_score_lin = lin.score(X_train_sc, y_train)
test_score_lin = lin.score(X_test_sc, y_test)
print("The train score for Linear model is {}".format(train_score_lin))
print("The test score for Linear model is {}".format(test_score_lin))

R-squared score for Electricity from fossil fuels (TWh): 0.9462156731929502 

Mean Squared Error for Electricity from fossil fuels (TWh): 0.3754109949686345
 
R-squared score for Electricity from nuclear (TWh): 0.9535835337001718 

Mean Squared Error for Electricity from nuclear (TWh): 0.022132917675315507
 
R-squared score for Electricity from renewables (TWh): 0.32147453382668256 

Mean Squared Error for Electricity from renewables (TWh): 0.010454117157766358
 
R-squared score for Low-carbon electricity (% electricity): 0.9315634526535604 

Mean Squared Error for Low-carbon electricity (% electricity): 3.140227856763706e-05
 
The train score for Linear model is 0.7903806733036038
The test score for Linear model is 0.7882092983433413


In [71]:
# Lasso
lasso = MultiOutputRegressor(Lasso())

lasso.fit(X_train_sc,y_train)

In [72]:
predictions_lasso = lasso.predict(X_test_sc)


# Calculate scores for each variable
scores_r2_lasso = [r2_score(y_test.iloc[:, i], predictions_lasso[:, i]) for i in range(len(y_features))]
scores_mse_lasso = [mean_squared_error(y_test.iloc[:, i], predictions_lasso[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature} (Lasso): {scores_r2_lasso[i]} \n")
    print(f"Mean Squared Error for {feature} (Lasso): {scores_mse_lasso[i]} \n ")

# Display overall scores
train_score_lasso = lasso.score(X_train_sc, y_train)
test_score_lasso = lasso.score(X_test_sc, y_test)
print("The train score for Lasso model is {}".format(train_score_lasso))
print("The test score for Lasso model is {}".format(test_score_lasso))

R-squared score for Electricity from fossil fuels (TWh) (Lasso): 0.7965041484660975 

Mean Squared Error for Electricity from fossil fuels (TWh) (Lasso): 1.4203873996674883 
 
R-squared score for Electricity from nuclear (TWh) (Lasso): -0.0048639747771392905 

Mean Squared Error for Electricity from nuclear (TWh) (Lasso): 0.4791526240918315 
 
R-squared score for Electricity from renewables (TWh) (Lasso): -0.0017352130477865657 

Mean Squared Error for Electricity from renewables (TWh) (Lasso): 0.015433845596572859 
 
R-squared score for Low-carbon electricity (% electricity) (Lasso): -0.0050124687070458585 

Mean Squared Error for Low-carbon electricity (% electricity) (Lasso): 0.0004611524504081981 
 
The train score for Lasso model is 0.1993848610661758
The test score for Lasso model is 0.19622312298353145


#### Lasso will not be of any help here

In [73]:
# Ridge
ridge = MultiOutputRegressor(Ridge())

ridge.fit(X_train_sc,y_train)

In [74]:
# Predictions
predictions_ridge = ridge.predict(X_test_sc)

# Calculate scores for each variable
scores_r2_ridge = [r2_score(y_test.iloc[:, i], predictions_ridge[:, i]) for i in range(len(y_features))]
scores_mse_ridge = [mean_squared_error(y_test.iloc[:, i], predictions_ridge[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature} (Ridge): {scores_r2_ridge[i]} \n")
    print(f"Mean Squared Error for {feature} (Ridge): {scores_mse_ridge[i]} \n")

# Display overall scores
train_score_ridge = ridge.score(X_train_sc, y_train)
test_score_ridge = ridge.score(X_test_sc, y_test)
print("The train score for Ridge model is {}".format(train_score_ridge))
print("The test score for Ridge model is {}".format(test_score_ridge))

R-squared score for Electricity from fossil fuels (TWh) (Ridge): 0.9463575216647688 

Mean Squared Error for Electricity from fossil fuels (TWh) (Ridge): 0.3744209021460309 

R-squared score for Electricity from nuclear (TWh) (Ridge): 0.9536025739595679 

Mean Squared Error for Electricity from nuclear (TWh) (Ridge): 0.022123838645235764 

R-squared score for Electricity from renewables (TWh) (Ridge): 0.32217787459592884 

Mean Squared Error for Electricity from renewables (TWh) (Ridge): 0.010443280708480229 

R-squared score for Low-carbon electricity (% electricity) (Ridge): 0.9316527121495083 

Mean Squared Error for Low-carbon electricity (% electricity) (Ridge): 3.136132162773805e-05 

The train score for Ridge model is 0.7903468010451873
The test score for Ridge model is 0.7884476705924435


## Polynomial Features

In [75]:
poly = PolynomialFeatures(degree=2,include_bias=False)

In [76]:
X_train_sc = poly.fit_transform(X_train_sc)

X_test_sc = poly.transform(X_test_sc)

In [77]:
lin.fit(X_train_sc, y_train)

In [78]:
ridge.fit(X_train_sc,y_train)

In [79]:
#predict
predictions_lin = lin.predict(X_test_sc)

# Calculate scores for each variable
scores_r2 = [r2_score(y_test.iloc[:, i], predictions_lin[:, i]) for i in range(len(y_features))]
scores_mse = [mean_squared_error(y_test.iloc[:, i], predictions_lin[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse[i]}\n ")

# Display overall scores
train_score_lin = lin.score(X_train_sc, y_train)
test_score_lin = lin.score(X_test_sc, y_test)
print("The train score for Lin model is {}".format(train_score_lin))
print("The test score for Lin model is {}".format(test_score_lin))

R-squared score for Electricity from fossil fuels (TWh): 0.9496173662767988 

Mean Squared Error for Electricity from fossil fuels (TWh): 0.3516674053209133
 
R-squared score for Electricity from nuclear (TWh): 0.9506363799260442 

Mean Squared Error for Electricity from nuclear (TWh): 0.02353821879061183
 
R-squared score for Electricity from renewables (TWh): 0.3702513606467138 

Mean Squared Error for Electricity from renewables (TWh): 0.009702607173864829
 
R-squared score for Low-carbon electricity (% electricity): 0.9384851945380743 

Mean Squared Error for Low-carbon electricity (% electricity): 2.8226220229531897e-05
 
The train score for Lin model is 0.8315635519623237
The test score for Lin model is 0.8022475753469078


#### Lasso will not be of any help here

In [80]:
# Predictions
predictions_ridge = ridge.predict(X_test_sc)

# Calculate scores for each variable
scores_r2_ridge = [r2_score(y_test.iloc[:, i], predictions_ridge[:, i]) for i in range(len(y_features))]
scores_mse_ridge = [mean_squared_error(y_test.iloc[:, i], predictions_ridge[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature} (Ridge): {scores_r2_ridge[i]} \n")
    print(f"Mean Squared Error for {feature} (Ridge): {scores_mse_ridge[i]} \n")

# Display overall scores
train_score_ridge = ridge.score(X_train_sc, y_train)
test_score_ridge = ridge.score(X_test_sc, y_test)
print("The train score for Ridge model is {}".format(train_score_ridge))
print("The test score for Ridge model is {}".format(test_score_ridge))

R-squared score for Electricity from fossil fuels (TWh) (Ridge): 0.9506618565537822 

Mean Squared Error for Electricity from fossil fuels (TWh) (Ridge): 0.34437693321880575 

R-squared score for Electricity from nuclear (TWh) (Ridge): 0.9521143300217445 

Mean Squared Error for Electricity from nuclear (TWh) (Ridge): 0.02283348294137548 

R-squared score for Electricity from renewables (TWh) (Ridge): 0.3825441220784168 

Mean Squared Error for Electricity from renewables (TWh) (Ridge): 0.009513211234277986 

R-squared score for Low-carbon electricity (% electricity) (Ridge): 0.9395549838707824 

Mean Squared Error for Low-carbon electricity (% electricity) (Ridge): 2.773534475528676e-05 

The train score for Ridge model is 0.8294261405899502
The test score for Ridge model is 0.8062188231311815


## Logistic Regression 

In [81]:
# KNeighborsRegressor()
knn = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=15))

In [82]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [83]:
knn.fit(X_train,y_train)

In [84]:
#predict
predictions_knn = knn.predict(X_test_sc)

# Calculate scores for each variable
scores_r2 = [r2_score(y_test.iloc[:, i], predictions_knn[:, i]) for i in range(len(y_features))]
scores_mse = [mean_squared_error(y_test.iloc[:, i], predictions_knn[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse[i]}\n ")

# Display overall scores
train_score_knn = knn.score(X_train_sc, y_train)
test_score_knn = knn.score(X_test_sc, y_test)
print("The train score for KNN model is {}".format(train_score_knn))
print("The test score for KNN model is {}".format(test_score_knn))

R-squared score for Electricity from fossil fuels (TWh): -2.623343744281072 

Mean Squared Error for Electricity from fossil fuels (TWh): 25.29069639625275
 
R-squared score for Electricity from nuclear (TWh): -2.81908442312836 

Mean Squared Error for Electricity from nuclear (TWh): 1.8210667004716101
 
R-squared score for Electricity from renewables (TWh): -7.386150763608887 

Mean Squared Error for Electricity from renewables (TWh): 0.12920635548123316
 
R-squared score for Low-carbon electricity (% electricity): -4.260795859077295 

Mean Squared Error for Low-carbon electricity (% electricity): 0.0024139291571495585
 




The train score for KNN model is -4.826613888611487
The test score for KNN model is -4.272343697523904




In [85]:
# Define the parameter grid to search
param_grid = {
    'estimator__n_neighbors': [5, 10, 15, 20, 25],
    'estimator__weights': ['uniform', 'distance'],
    'estimator__p': [1, 2, 3]  # Experiment with different values for p
}

# Create the KNN model with MultiOutputRegressor
knn = MultiOutputRegressor(KNeighborsRegressor())

# Create the GridSearchCV object
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_sc, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model for predictions
best_knn = grid_search.best_estimator_
predictions_knn = best_knn.predict(X_test_sc)

Best Hyperparameters: {'estimator__n_neighbors': 5, 'estimator__p': 3, 'estimator__weights': 'distance'}


In [86]:
# Calculate scores for each variable
scores_r2 = [r2_score(y_test.iloc[:, i], predictions_knn[:, i]) for i in range(len(y_features))]
scores_mse = [mean_squared_error(y_test.iloc[:, i], predictions_knn[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse[i]}\n ")

# Display overall scores
train_score_knn = best_knn.score(X_train_sc, y_train)
test_score_knn = best_knn.score(X_test_sc, y_test)
print("The train score for the best KNN model is {}".format(train_score_knn))
print("The test score for the best KNN model is {}".format(test_score_knn))

R-squared score for Electricity from fossil fuels (TWh): 0.9154774027860599 

Mean Squared Error for Electricity from fossil fuels (TWh): 0.5899620614617281
 
R-squared score for Electricity from nuclear (TWh): 0.9156867365847401 

Mean Squared Error for Electricity from nuclear (TWh): 0.04020337321787994
 
R-squared score for Electricity from renewables (TWh): 0.30445183460627767 

Mean Squared Error for Electricity from renewables (TWh): 0.010716387773776037
 
R-squared score for Low-carbon electricity (% electricity): 0.8883246809413519 

Mean Squared Error for Low-carbon electricity (% electricity): 5.124249562820562e-05
 
The train score for the best KNN model is 1.0
The test score for the best KNN model is 0.7559851637296073


In [87]:
# Create a KNN model with MultiOutputRegressor
knn = MultiOutputRegressor(KNeighborsRegressor())

# Define the parameter grid to search
param_grid = {
    'estimator__n_neighbors': [5, 10, 15, 20, 25],
    'estimator__weights': ['uniform', 'distance'],
    'estimator__p': [1, 2, 3]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_sc, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model for predictions
best_knn = grid_search.best_estimator_
predictions_knn = best_knn.predict(X_test_sc)

Best Hyperparameters: {'estimator__n_neighbors': 5, 'estimator__p': 3, 'estimator__weights': 'distance'}


In [88]:
# Calculate scores for each variable
scores_r2 = [r2_score(y_test.iloc[:, i], predictions_knn[:, i]) for i in range(len(y_features))]
scores_mse = [mean_squared_error(y_test.iloc[:, i], predictions_knn[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse[i]}\n ")

# Display overall scores
train_score_knn = best_knn.score(X_train_sc, y_train)
test_score_knn = best_knn.score(X_test_sc, y_test)
print("The train score for the best KNN model is {}".format(train_score_knn))
print("The test score for the best KNN model is {}".format(test_score_knn))

R-squared score for Electricity from fossil fuels (TWh): 0.9154774027860599 

Mean Squared Error for Electricity from fossil fuels (TWh): 0.5899620614617281
 
R-squared score for Electricity from nuclear (TWh): 0.9156867365847401 

Mean Squared Error for Electricity from nuclear (TWh): 0.04020337321787994
 
R-squared score for Electricity from renewables (TWh): 0.30445183460627767 

Mean Squared Error for Electricity from renewables (TWh): 0.010716387773776037
 
R-squared score for Low-carbon electricity (% electricity): 0.8883246809413519 

Mean Squared Error for Low-carbon electricity (% electricity): 5.124249562820562e-05
 
The train score for the best KNN model is 1.0
The test score for the best KNN model is 0.7559851637296073


### More Models

#### Decision Tree Regressor

In [89]:
dtr = MultiOutputRegressor(DecisionTreeRegressor(random_state=42))

In [90]:
dtr.fit(X_train_sc, y_train)

In [91]:
#predict
predictions_dtr = dtr.predict(X_test_sc)

# Calculate scores for each variable
scores_r2 = [r2_score(y_test.iloc[:, i], predictions_dtr[:, i]) for i in range(len(y_features))]
scores_mse = [mean_squared_error(y_test.iloc[:, i], predictions_dtr[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse[i]}\n ")

# Display overall scores
train_score_dtr = dtr.score(X_train_sc, y_train)
test_score_dtr = dtr.score(X_test_sc, y_test)
print("The train score for DTR model is {}".format(train_score_dtr))
print("The test score for DTR model is {}".format(test_score_dtr))

R-squared score for Electricity from fossil fuels (TWh): 1.0 

Mean Squared Error for Electricity from fossil fuels (TWh): 6.052501152523774e-28
 
R-squared score for Electricity from nuclear (TWh): 1.0 

Mean Squared Error for Electricity from nuclear (TWh): 1.972339235443503e-29
 
R-squared score for Electricity from renewables (TWh): 1.0 

Mean Squared Error for Electricity from renewables (TWh): 2.5402475499399985e-29
 
R-squared score for Low-carbon electricity (% electricity): 1.0 

Mean Squared Error for Low-carbon electricity (% electricity): 1.2368328453819195e-31
 
The train score for DTR model is 1.0
The test score for DTR model is 1.0


In [92]:
# Create a Decision Tree Regressor model with MultiOutputRegressor
dtr = MultiOutputRegressor(DecisionTreeRegressor())

# Define the parameter grid to search
param_grid = {
    'estimator__max_depth': [None, 5, 10, 15],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(dtr, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_sc, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model for predictions
best_dtr = grid_search.best_estimator_
predictions_dtr = best_dtr.predict(X_test_sc)

# Calculate and display scores for each variable
scores_r2 = [r2_score(y_test.iloc[:, i], predictions_dtr[:, i]) for i in range(len(y_features))]
scores_mse = [mean_squared_error(y_test.iloc[:, i], predictions_dtr[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse[i]}\n ")

# Display overall scores
train_score_dtr = best_dtr.score(X_train_sc, y_train)
test_score_dtr = best_dtr.score(X_test_sc, y_test)
print("The train score for the best DTR model is {}".format(train_score_dtr))
print("The test score for the best DTR model is {}".format(test_score_dtr))

Best Hyperparameters: {'estimator__max_depth': None, 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 10}
R-squared score for Electricity from fossil fuels (TWh): 1.0 

Mean Squared Error for Electricity from fossil fuels (TWh): 5.713440541191518e-28
 
R-squared score for Electricity from nuclear (TWh): 1.0 

Mean Squared Error for Electricity from nuclear (TWh): 2.937901243986202e-29
 
R-squared score for Electricity from renewables (TWh): 1.0 

Mean Squared Error for Electricity from renewables (TWh): 2.451122688703492e-29
 
R-squared score for Low-carbon electricity (% electricity): 1.0 

Mean Squared Error for Low-carbon electricity (% electricity): 1.1924014986682077e-31
 
The train score for the best DTR model is 1.0
The test score for the best DTR model is 1.0


#### Bagging Tree

In [95]:
# Create a BaggingRegressor model with MultiOutputRegressor
br = MultiOutputRegressor(BaggingRegressor())

# Define the parameter grid to search
param_grid = {
    'estimator__n_estimators': [10, 50, 100],
    'estimator__max_samples': [0.5, 0.7, 1.0],
    'estimator__max_features': [0.5, 0.7, 1.0]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(br, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_sc, y_train)

# Get the best parameters
best_params_br = grid_search.best_params_
print("Best Hyperparameters:", best_params_br)

# Use the best model for predictions
best_br = grid_search.best_estimator_
predictions_br = best_br.predict(X_test_sc)

# Calculate and display scores for each variable
scores_r2_br = [r2_score(y_test.iloc[:, i], predictions_br[:, i]) for i in range(len(y_features))]
scores_mse_br = [mean_squared_error(y_test.iloc[:, i], predictions_br[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2_br[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse_br[i]}\n ")

# Display overall scores
train_score_br = best_br.score(X_train_sc, y_train)
test_score_br = best_br.score(X_test_sc, y_test)
print("The train score for the best Bagging Regressor model is {}".format(train_score_br))
print("The test score for the best Bagging Regressor model is {}".format(test_score_br))

Best Hyperparameters: {'estimator__max_features': 1.0, 'estimator__max_samples': 0.5, 'estimator__n_estimators': 10}
R-squared score for Electricity from fossil fuels (TWh): 1.0 

Mean Squared Error for Electricity from fossil fuels (TWh): 8.179375507877319e-29
 
R-squared score for Electricity from nuclear (TWh): 1.0 

Mean Squared Error for Electricity from nuclear (TWh): 5.890321300593228e-30
 
R-squared score for Electricity from renewables (TWh): 1.0 

Mean Squared Error for Electricity from renewables (TWh): 1.679743702038517e-30
 
R-squared score for Low-carbon electricity (% electricity): 1.0 

Mean Squared Error for Low-carbon electricity (% electricity): 1.231515500465388e-32
 
The train score for the best Bagging Regressor model is 1.0
The test score for the best Bagging Regressor model is 1.0


#### Random Forest

In [99]:
# Create a RandomForestRegressor model with MultiOutputRegressor
rf = MultiOutputRegressor(RandomForestRegressor(random_state=42))

# Define the parameter grid to search
param_grid = {
    'estimator__n_estimators': [50, 100],
    'estimator__max_depth': [None, 5],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2],
    'estimator__max_features': ['auto', 'sqrt']
}

# Create the GridSearchCV object
grid_search_rf = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search_rf.fit(X_train_sc, y_train)

# Get the best parameters
best_params_rf = grid_search_rf.best_params_
print("Best Hyperparameters:", best_params_rf)

# Use the best model for predictions
best_rf = grid_search_rf.best_estimator_
predictions_rf = best_rf.predict(X_test_sc)

# Calculate and display scores for each variable
scores_r2_rf = [r2_score(y_test.iloc[:, i], predictions_rf[:, i]) for i in range(len(y_features))]
scores_mse_rf = [mean_squared_error(y_test.iloc[:, i], predictions_rf[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2_rf[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse_rf[i]}\n ")

# Display overall scores
train_score_rf = best_rf.score(X_train_sc, y_train)
test_score_rf = best_rf.score(X_test_sc, y_test)
print("The train score for the best RandomForest model is {}".format(train_score_rf))
print("The test score for the best RandomForest model is {}".format(test_score_rf))

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
71 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ben/miniforge3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ben/miniforge3/lib/python3.10/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/ben/miniforge3/lib/python3.10/site-packages/sklearn/multioutput.py", line 273, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
  File "/Users/ben/miniforge3/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 65, in __cal

Best Hyperparameters: {'estimator__max_depth': None, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 100}
R-squared score for Electricity from fossil fuels (TWh): 0.9869769812968648 

Mean Squared Error for Electricity from fossil fuels (TWh): 0.09089979737737046
 
R-squared score for Electricity from nuclear (TWh): 0.9895124063442164 

Mean Squared Error for Electricity from nuclear (TWh): 0.005000834089700572
 
R-squared score for Electricity from renewables (TWh): 0.9042215097433823 

Mean Squared Error for Electricity from renewables (TWh): 0.0014756698285527663
 
R-squared score for Low-carbon electricity (% electricity): 0.9880154795394802 

Mean Squared Error for Low-carbon electricity (% electricity): 5.4991267764528375e-06
 
The train score for the best RandomForest model is 0.9948940603045757
The test score for the best RandomForest model is 0.967181594230986


#### ADA Boost

In [100]:
# Create an AdaBoostRegressor model with MultiOutputRegressor
abr = MultiOutputRegressor(AdaBoostRegressor(random_state=42))

# Define the parameter grid to search
param_grid = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__learning_rate': [0.001, 0.01, 0.1, 1.0],
    'estimator__loss': ['linear', 'square', 'exponential']
}

# Create the GridSearchCV object
grid_search_abr = GridSearchCV(abr, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search_abr.fit(X_train_sc, y_train)

# Get the best parameters
best_params_abr = grid_search_abr.best_params_
print("Best Hyperparameters:", best_params_abr)

# Use the best model for predictions
best_abr = grid_search_abr.best_estimator_
predictions_abr = best_abr.predict(X_test_sc)

# Calculate and display scores for each variable
scores_r2_abr = [r2_score(y_test.iloc[:, i], predictions_abr[:, i]) for i in range(len(y_features))]
scores_mse_abr = [mean_squared_error(y_test.iloc[:, i], predictions_abr[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2_abr[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse_abr[i]}\n ")

# Display overall scores
train_score_abr = best_abr.score(X_train_sc, y_train)
test_score_abr = best_abr.score(X_test_sc, y_test)
print("The train score for the best AdaBoostRegressor model is {}".format(train_score_abr))
print("The test score for the best AdaBoostRegressor model is {}".format(test_score_abr))

Best Hyperparameters: {'estimator__learning_rate': 1.0, 'estimator__loss': 'exponential', 'estimator__n_estimators': 50}
R-squared score for Electricity from fossil fuels (TWh): 0.9938006939521971 

Mean Squared Error for Electricity from fossil fuels (TWh): 0.04327074056109046
 
R-squared score for Electricity from nuclear (TWh): 0.9964139387040841 

Mean Squared Error for Electricity from nuclear (TWh): 0.0017099535093526867
 
R-squared score for Electricity from renewables (TWh): 0.9054122865882823 

Mean Squared Error for Electricity from renewables (TWh): 0.0014573233975550526
 
R-squared score for Low-carbon electricity (% electricity): 0.9952593880016455 

Mean Squared Error for Low-carbon electricity (% electricity): 2.1752415094791734e-06
 
The train score for the best AdaBoostRegressor model is 0.9717907526804168
The test score for the best AdaBoostRegressor model is 0.9727215768115521


#### Support Vector Machine

In [103]:
# Create an SVR model with MultiOutputRegressor
svr = MultiOutputRegressor(SVR())

# Define the parameter grid to search
param_grid = {
    'estimator__C': [0.001, 0.01, 0.1, 1, 10],
    'estimator__kernel': ['linear', 'rbf', 'poly'],
    'estimator__degree': [2, 3],
    'estimator__epsilon': [0.1, 0.2, 0.5]
}

# Create the GridSearchCV object
grid_search_svr = GridSearchCV(svr, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search_svr.fit(X_train_sc, y_train)

# Get the best parameters
best_params_svr = grid_search_svr.best_params_
print("Best Hyperparameters:", best_params_svr)

# Use the best model for predictions
best_svr = grid_search_svr.best_estimator_
predictions_svr = best_svr.predict(X_test_sc)

# Calculate and display scores for each variable
scores_r2_svr = [r2_score(y_test.iloc[:, i], predictions_svr[:, i]) for i in range(len(y_features))]
scores_mse_svr = [mean_squared_error(y_test.iloc[:, i], predictions_svr[:, i]) for i in range(len(y_features))]

# Display scores for each variable
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2_svr[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse_svr[i]}\n ")

# Display overall scores
train_score_svr = best_svr.score(X_train_sc, y_train)
test_score_svr = best_svr.score(X_test_sc, y_test)
print("The train score for the best SVR model is {}".format(train_score_svr))
print("The test score for the best SVR model is {}".format(test_score_svr))

Best Hyperparameters: {'estimator__C': 10, 'estimator__degree': 2, 'estimator__epsilon': 0.1, 'estimator__kernel': 'rbf'}
R-squared score for Electricity from fossil fuels (TWh): 0.962802516717281 

Mean Squared Error for Electricity from fossil fuels (TWh): 0.25963593928750794
 
R-squared score for Electricity from nuclear (TWh): 0.9612455926001813 

Mean Squared Error for Electricity from nuclear (TWh): 0.018479392700753744
 
R-squared score for Electricity from renewables (TWh): 0.3359602468765217 

Mean Squared Error for Electricity from renewables (TWh): 0.010230934169232627
 
R-squared score for Low-carbon electricity (% electricity): -0.21643205076663707 

Mean Squared Error for Low-carbon electricity (% electricity): 0.0005581628471612727
 
The train score for the best SVR model is 0.5688587174354891
The test score for the best SVR model is 0.5108940763568366
