## Import Libaries and Packages

In [231]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVR
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.feature_selection import SelectKBest, f_regression

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning);

In [232]:
df = pd.read_csv('/Users/ben/Desktop/DSI_GA_Materials/project_5/data/Energy_CO2_1970to2021.csv')

In [233]:
df.set_index('Year', inplace=True)

In [234]:
df

Unnamed: 0_level_0,State,CO2_Value,Coal_Value,Natural_Gas_Value,Nuclear_Energy_Value,Petroleum_Energy_Value,Renewable_Energy_Value
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1970,Alabama,29.7,675603,307750,0,352050,132471
1970,Alaska,37.3,13159,64045,0,95999,8835
1970,Arizona,13.9,8623,204383,0,213298,68919
1970,Arkansas,18.7,0,383478,0,246630,56933
1970,California,14.7,61812,2241295,34375,2502535,521978
...,...,...,...,...,...,...,...
2021,Virginia,11.3,68603,699927,298458,789358,189262
2021,Washington,9.5,36943,382807,88909,712349,850547
2021,West Virginia,49.5,633582,279133,0,205121,48756
2021,Wisconsin,15.7,286760,559535,104149,533212,178716


In [235]:
df.dtypes

State                      object
CO2_Value                 float64
Coal_Value                 object
Natural_Gas_Value          object
Nuclear_Energy_Value       object
Petroleum_Energy_Value     object
Renewable_Energy_Value     object
dtype: object

In [236]:
def comma_drop(int_str):
    try:
        # Remove the ',' from string and convert to float
        comma_drop_value = float(str(int_str).replace(',',''))

        return comma_drop_value

    except ValueError:
        # Handle the case where the input is not a valid number with '%' symbol
        print("Error: Invalid input. Please provide a valid integer string.")
        return None

In [237]:
df.iloc[0:2653, 2:7] = df.iloc[0:2653, 2:7].map(comma_drop)


In [238]:
df

Unnamed: 0_level_0,State,CO2_Value,Coal_Value,Natural_Gas_Value,Nuclear_Energy_Value,Petroleum_Energy_Value,Renewable_Energy_Value
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1970,Alabama,29.7,675603.0,307750.0,0.0,352050.0,132471.0
1970,Alaska,37.3,13159.0,64045.0,0.0,95999.0,8835.0
1970,Arizona,13.9,8623.0,204383.0,0.0,213298.0,68919.0
1970,Arkansas,18.7,0.0,383478.0,0.0,246630.0,56933.0
1970,California,14.7,61812.0,2241295.0,34375.0,2502535.0,521978.0
...,...,...,...,...,...,...,...
2021,Virginia,11.3,68603.0,699927.0,298458.0,789358.0,189262.0
2021,Washington,9.5,36943.0,382807.0,88909.0,712349.0,850547.0
2021,West Virginia,49.5,633582.0,279133.0,0.0,205121.0,48756.0
2021,Wisconsin,15.7,286760.0,559535.0,104149.0,533212.0,178716.0


In [239]:
df['Coal_Value'] = df['Coal_Value'].astype(float)
df['Natural_Gas_Value'] = df['Natural_Gas_Value'].astype(float)
df['Nuclear_Energy_Value'] = df['Nuclear_Energy_Value'].astype(float)
df['Petroleum_Energy_Value'] = df['Petroleum_Energy_Value'].astype(float)
df['Renewable_Energy_Value'] = df['Renewable_Energy_Value'].astype(float)

In [240]:
df.dtypes


State                      object
CO2_Value                 float64
Coal_Value                float64
Natural_Gas_Value         float64
Nuclear_Energy_Value      float64
Petroleum_Energy_Value    float64
Renewable_Energy_Value    float64
dtype: object

## Import and Investigate the Data

### Renewable energy amounts are by number of pollution observations, these are approximations based on the pollution amounts. Approximations make renewable, fossil fuel, energy usage uniform across states in order to get a rough estimate for totals upon the test set. 

## Modeling Section

### Define features and perform train-test split

In [241]:
df = pd.get_dummies(df, columns=['State'], prefix='State')

In [242]:
df

Unnamed: 0_level_0,CO2_Value,Coal_Value,Natural_Gas_Value,Nuclear_Energy_Value,Petroleum_Energy_Value,Renewable_Energy_Value,State_Alabama,State_Alaska,State_Arizona,State_Arkansas,...,State_South Dakota,State_Tennessee,State_Texas,State_Utah,State_Vermont,State_Virginia,State_Washington,State_West Virginia,State_Wisconsin,State_Wyoming
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970,29.7,675603.0,307750.0,0.0,352050.0,132471.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1970,37.3,13159.0,64045.0,0.0,95999.0,8835.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1970,13.9,8623.0,204383.0,0.0,213298.0,68919.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1970,18.7,0.0,383478.0,0.0,246630.0,56933.0,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1970,14.7,61812.0,2241295.0,34375.0,2502535.0,521978.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,11.3,68603.0,699927.0,298458.0,789358.0,189262.0,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2021,9.5,36943.0,382807.0,88909.0,712349.0,850547.0,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2021,49.5,633582.0,279133.0,0.0,205121.0,48756.0,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2021,15.7,286760.0,559535.0,104149.0,533212.0,178716.0,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [243]:
X_features = ['Coal_Value',	'Natural_Gas_Value','Nuclear_Energy_Value','Petroleum_Energy_Value','Renewable_Energy_Value'] + list(df.columns[df.columns.str.startswith('State_')])


In [244]:
X = df[X_features]
y = df['CO2_Value']


In [245]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [246]:
X_train.shape

(2121, 56)

In [247]:
X_test.shape

(531, 56)

### Scale Data

In [248]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

### Start with Linear, Lasso, Ridge, Regressions

In [249]:
lin = LinearRegression()

In [250]:
lin.fit(X_train_sc, y_train)

In [251]:
predictions_lin_train = lin.predict(X_train_sc)
predictions_lin = lin.predict(X_test_sc)

In [252]:
train_score_lin = lin.score(X_train_sc, y_train)
test_score_lin = lin.score(X_test_sc, y_test)
print("The train score for Linear model is {}".format(train_score_lin))
print("The test score for Linear model is {}".format(test_score_lin))

The train score for Linear model is 0.9212459265286845
The test score for Linear model is 0.9279017447120519


In [253]:
mean_squared_error(y_train, predictions_lin_train)
mean_squared_error(y_test, predictions_lin)
print("The Linear model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_lin_train))))
print("The Linear model test mean squared error is (RMSE) {}".format(math.sqrt(mean_squared_error(y_test, predictions_lin))))

The Linear model train root mean squared error (RMSE) is 4.729673584307714
The Linear model test mean squared error is (RMSE) 4.793009188716111


In [254]:
# Lasso
lasso = Lasso()

lasso.fit(X_train_sc,y_train)

In [255]:
predictions_lasso_train = lasso.predict(X_train_sc)
predictions_lasso = lasso.predict(X_test_sc)

# Display overall scores
train_score_lasso = lasso.score(X_train_sc, y_train)
test_score_lasso = lasso.score(X_test_sc, y_test)
print("The train score for Lasso model is {}".format(train_score_lasso))
print("The test score for Lasso model is {}".format(test_score_lasso))

The train score for Lasso model is 0.8339998268856855
The test score for Lasso model is 0.8486543229661927


In [256]:
mean_squared_error(y_train, predictions_lasso_train)
mean_squared_error(y_test, predictions_lasso)
print("The Lasso model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_lasso_train))))
print("The Lasso model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_lasso))))

The Lasso model train root mean squared error (RMSE) is 6.866713479967608
The Lasso model test root mean squared error (RMSE) is 6.944338690446198


In [257]:
# Ridge
ridge = Ridge()

ridge.fit(X_train_sc,y_train)

In [258]:
# Predictions
predictions_ridge_train = ridge.predict(X_train_sc)
predictions_ridge = ridge.predict(X_test_sc)

# Display overall scores
train_score_ridge = ridge.score(X_train_sc, y_train)
test_score_ridge = ridge.score(X_test_sc, y_test)
print("The train score for Ridge model is {}".format(train_score_ridge))
print("The test score for Ridge model is {}".format(test_score_ridge))

The train score for Ridge model is 0.9212447374812631
The test score for Ridge model is 0.9279286139837684


In [259]:
mean_squared_error(y_train, predictions_ridge_train)
mean_squared_error(y_test, predictions_ridge)
print("The Ridge model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_ridge_train))))
print("The Ridge model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_ridge))))

The Ridge model train root mean squared error (RMSE) is 4.729709289031957
The Ridge model test root mean squared error (RMSE) is 4.7921159863308125


## Polynomial Features

In [260]:
poly = PolynomialFeatures(degree=2,include_bias=False)

In [261]:
X_train_sc = poly.fit_transform(X_train_sc)

X_test_sc = poly.transform(X_test_sc)

In [262]:
lin.fit(X_train_sc, y_train)

In [263]:
lasso.fit(X_train_sc, y_train)

In [264]:
ridge.fit(X_train_sc,y_train)

In [265]:
predictions_lin_train = lin.predict(X_train_sc)
predictions_lin = lin.predict(X_test_sc)

# Display overall scores
train_score_lin = lin.score(X_train_sc, y_train)
test_score_lin = lin.score(X_test_sc, y_test)
print("The train score for Lin model is {}".format(train_score_lin))
print("The test score for Lin model is {}".format(test_score_lin))

The train score for Lin model is 0.9957651284014981
The test score for Lin model is 0.9944418810438017


In [266]:
mean_squared_error(y_train, predictions_lin_train)
mean_squared_error(y_test, predictions_lin)
print("The Linear model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_lin_train))))
print("The Linear model test mean squared error is (RMSE) {}".format(math.sqrt(mean_squared_error(y_test, predictions_lin))))

The Linear model train root mean squared error (RMSE) is 1.0967680221235012
The Linear model test mean squared error is (RMSE) 1.3307908391667713


In [267]:
#predict
predictions_lasso_train = lasso.predict(X_train_sc)
predictions_lasso = lasso.predict(X_test_sc)

# Display overall scores
train_score_lasso = lasso.score(X_train_sc, y_train)
test_score_lasso = lasso.score(X_test_sc, y_test)
print("The train score for Lasso model is {}".format(train_score_lasso))
print("The test score for Lasso model is {}".format(test_score_lasso))

The train score for Lasso model is 0.9020492119638948
The test score for Lasso model is 0.9113628540726311


In [268]:
mean_squared_error(y_train, predictions_lasso_train)
mean_squared_error(y_test, predictions_lasso)
print("The Lasso model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_lasso_train))))
print("The Lasso model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_lasso))))

The Lasso model train root mean squared error (RMSE) is 5.274710433817572
The Lasso model test root mean squared error (RMSE) is 5.314394202920832


In [269]:
# Predictions
predictions_ridge_train = ridge.predict(X_train_sc)
predictions_ridge = ridge.predict(X_test_sc)

# Display overall scores
train_score_ridge = ridge.score(X_train_sc, y_train)
test_score_ridge = ridge.score(X_test_sc, y_test)
print("The train score for Ridge model is {}".format(train_score_ridge))
print("The test score for Ridge model is {}".format(test_score_ridge))

The train score for Ridge model is 0.994920181714507
The test score for Ridge model is 0.9936021302940357


In [270]:
mean_squared_error(y_train, predictions_ridge_train)
mean_squared_error(y_test, predictions_ridge)
print("The Ridge model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_ridge_train))))
print("The Ridge model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_ridge))))

The Ridge model train root mean squared error (RMSE) is 1.2012094619265186
The Ridge model test root mean squared error (RMSE) is 1.4277875135293363


## KNN Regression 

In [271]:
# KNeighborsRegressor()
knn = KNeighborsRegressor(n_neighbors=15)

In [272]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [273]:
knn.fit(X_train,y_train)

In [274]:
#predict
predictions_knn_train = knn.predict(X_train_sc)
predictions_knn = knn.predict(X_test_sc)

# Display overall scores
train_score_knn = knn.score(X_train_sc, y_train)
test_score_knn = knn.score(X_test_sc, y_test)
print("The train score for KNN model is {}".format(train_score_knn))
print("The test score for KNN model is {}".format(test_score_knn))

python3.10(54478) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


The train score for KNN model is -1.185926499939288
The test score for KNN model is -1.1200782069766828




In [286]:
mean_squared_error(y_train, predictions_knn_train)
mean_squared_error(y_test, predictions_knn)
print("The KNN model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_knn_train))))
print("The KNN model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_knn))))

The KNN model train root mean squared error (RMSE) is 0.0
The KNN model test root mean squared error (RMSE) is 1.2920279869833784


In [280]:
# Define the parameter grid to search
param_grid = {
    'n_neighbors': [5, 10, 15, 20, 25],
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3]  # Experiment with different values for p
}

# Create the KNN model with MultiOutputRegressor
knn = KNeighborsRegressor()

# Create the GridSearchCV object
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_sc, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model for predictions
best_knn = grid_search.best_estimator_
predictions_knn_train = best_knn.predict(X_train_sc)
predictions_knn = best_knn.predict(X_test_sc)

Best Hyperparameters: {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}


In [281]:
# Display overall scores
train_score_knn = best_knn.score(X_train_sc, y_train)
test_score_knn = best_knn.score(X_test_sc, y_test)
print("The train score for the best KNN model is {}".format(train_score_knn))
print("The test score for the best KNN model is {}".format(test_score_knn))

The train score for the best KNN model is 1.0
The test score for the best KNN model is 0.9947609556556071


In [285]:
mean_squared_error(y_train, predictions_knn_train)
mean_squared_error(y_test, predictions_knn)
print("The KNN model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_knn_train))))
print("The KNN model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_knn))))

The KNN model train root mean squared error (RMSE) is 0.0
The KNN model test root mean squared error (RMSE) is 1.2920279869833784


### More Models

#### Decision Tree Regressor

In [283]:
dtr = DecisionTreeRegressor(random_state=42)

In [284]:
dtr.fit(X_train_sc, y_train)

In [287]:
#predict
predictions_dtr_train = dtr.predict(X_train_sc)
predictions_dtr = dtr.predict(X_test_sc)

# Display overall scores
train_score_dtr = dtr.score(X_train_sc, y_train)
test_score_dtr = dtr.score(X_test_sc, y_test)
print("The train score for DTR model is {}".format(train_score_dtr))
print("The test score for DTR model is {}".format(test_score_dtr))

The train score for DTR model is 1.0
The test score for DTR model is 0.9880618496544075


In [292]:
mean_squared_error(y_train, predictions_dtr_train)
mean_squared_error(y_test, predictions_dtr)
print("The DTR model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_dtr_train))))
print("The DTR model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_dtr))))

The DTR model train root mean squared error (RMSE) is 6.680677472955323e-17
The DTR model test root mean squared error (RMSE) is 1.9506064053004768


In [289]:
# Create a Decision Tree Regressor model with MultiOutputRegressor
dtr = DecisionTreeRegressor()

# Define the parameter grid to search
param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(dtr, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_sc, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model for predictions
best_dtr = grid_search.best_estimator_
predictions_dtr_train = best_dtr.predict(X_train_sc)
predictions_dtr = best_dtr.predict(X_test_sc)

# Display overall scores
train_score_dtr = best_dtr.score(X_train_sc, y_train)
test_score_dtr = best_dtr.score(X_test_sc, y_test)
print("The train score for the best DTR model is {}".format(train_score_dtr))
print("The test score for the best DTR model is {}".format(test_score_dtr))

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
The train score for the best DTR model is 1.0
The test score for the best DTR model is 0.9880588123219665


In [291]:
mean_squared_error(y_train, predictions_dtr_train)
mean_squared_error(y_test, predictions_dtr)
print("The DTR model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_dtr_train))))
print("The DTR model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_dtr))))

The DTR model train root mean squared error (RMSE) is 6.680677472955323e-17
The DTR model test root mean squared error (RMSE) is 1.9506064053004768


#### Bagging Tree

In [294]:
# Create a BaggingRegressor model with MultiOutputRegressor
br = BaggingRegressor()

# Define the parameter grid to search
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(br, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_sc, y_train)

# Get the best parameters
best_params_br = grid_search.best_params_ 
print("Best Hyperparameters:", best_params_br)

# Use the best model for predictions
best_br = grid_search.best_estimator_
predictions_br_train = best_br.predict(X_train_sc)
predictions_br = best_br.predict(X_test_sc)

# Display overall scores
train_score_br = best_br.score(X_train_sc, y_train)
test_score_br = best_br.score(X_test_sc, y_test)
print("The train score for the best Bagging Regressor model is {}".format(train_score_br))
print("The test score for the best Bagging Regressor model is {}".format(test_score_br))

Best Hyperparameters: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100}
The train score for the best Bagging Regressor model is 0.9989159312285973
The test score for the best Bagging Regressor model is 0.9952688242114469


In [295]:
mean_squared_error(y_train, predictions_br_train)
mean_squared_error(y_test, predictions_br)
print("The Bagging Regressor model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_br_train))))
print("The Bagging Regressor model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_br))))

The Bagging Regressor model train root mean squared error (RMSE) is 0.5549106665279538
The Bagging Regressor model test root mean squared error (RMSE) is 1.2278079081372493


#### Random Forest

In [296]:
# Create a RandomForestRegressor model with MultiOutputRegressor
rf = RandomForestRegressor(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [10, 25, 50, 100],
    'max_depth': [None, 1, 2, 5],
    'min_samples_split': [1, 2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt']
}

# Create the GridSearchCV object
grid_search_rf = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search_rf.fit(X_train_sc, y_train)

# Get the best parameters
best_params_rf = grid_search_rf.best_params_
print("Best Hyperparameters:", best_params_rf)

# Use the best model for predictions
best_rf = grid_search_rf.best_estimator_
predictions_rf_train = best_rf.predict(X_train_sc)
predictions_rf = best_rf.predict(X_test_sc)

# Display overall scores
train_score_rf = best_rf.score(X_train_sc, y_train)
test_score_rf = best_rf.score(X_test_sc, y_test)
print("The train score for the best RandomForest model is {}".format(train_score_rf))
print("The test score for the best RandomForest model is {}".format(test_score_rf))

640 fits failed out of a total of 960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
352 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ben/miniforge3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ben/miniforge3/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/ben/miniforge3/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/ben/miniforge3/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_con

Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
The train score for the best RandomForest model is 0.9981620780564012
The test score for the best RandomForest model is 0.9907277547633194


In [298]:
mean_squared_error(y_train, predictions_rf_train)
mean_squared_error(y_test, predictions_rf)
print("The Random Forest model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_rf_train))))
print("The Random Forest model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_rf))))

The Random Forest model train root mean squared error (RMSE) is 0.722533832158648
The Random Forest model test root mean squared error (RMSE) is 1.718851425629867


#### ADA Boost

In [300]:
# Create an AdaBoostRegressor model with MultiOutputRegressor
abr = AdaBoostRegressor(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.001, 0.01, 0.1, 1.0],
    'loss': ['linear', 'square', 'exponential']
}

# Create the GridSearchCV object
grid_search_abr = GridSearchCV(abr, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search_abr.fit(X_train_sc, y_train)

# Get the best parameters
best_params_abr = grid_search_abr.best_params_
print("Best Hyperparameters:", best_params_abr)

# Use the best model for predictions
best_abr = grid_search_abr.best_estimator_
predictions_abr_train = best_abr.predict(X_train_sc)
predictions_abr = best_abr.predict(X_test_sc)

# Display overall scores
train_score_abr = best_abr.score(X_train_sc, y_train)
test_score_abr = best_abr.score(X_test_sc, y_test)
print("The train score for the best AdaBoostRegressor model is {}".format(train_score_abr))
print("The test score for the best AdaBoostRegressor model is {}".format(test_score_abr))

python3.10(54746) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python3.10(54747) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python3.10(54748) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python3.10(54749) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python3.10(54750) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python3.10(54751) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python3.10(54752) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python3.10(54753) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Best Hyperparameters: {'learning_rate': 1.0, 'loss': 'square', 'n_estimators': 50}
The train score for the best AdaBoostRegressor model is 0.8300868198467096
The test score for the best AdaBoostRegressor model is 0.8500276474643186


In [301]:
mean_squared_error(y_train, predictions_abr_train)
mean_squared_error(y_test, predictions_abr)
print("The ADA Boost model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_abr_train))))
print("The ADA Boost model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_abr))))

The ADA Boost model train root mean squared error (RMSE) is 6.947174219505573
The ADA Boost model test root mean squared error (RMSE) is 6.912760108656165


#### Support Vector Machine

In [302]:
# Create an SVR model with MultiOutputRegressor
svr = SVR()

# Define the parameter grid to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3],
    'epsilon': [0.1, 0.2, 0.5]
}

# Create the GridSearchCV object
grid_search_svr = GridSearchCV(svr, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search_svr.fit(X_train_sc, y_train)

# Get the best parameters
best_params_svr = grid_search_svr.best_params_
print("Best Hyperparameters:", best_params_svr)

# Use the best model for predictions
best_svr = grid_search_svr.best_estimator_
predictions_svr_train = best_svr.predict(X_train_sc)
predictions_svr = best_svr.predict(X_test_sc)

# Display overall scores
train_score_svr = best_svr.score(X_train_sc, y_train)
test_score_svr = best_svr.score(X_test_sc, y_test)
print("The train score for the best SVR model is {}".format(train_score_svr))
print("The test score for the best SVR model is {}".format(test_score_svr))

Best Hyperparameters: {'C': 10, 'degree': 2, 'epsilon': 0.5, 'kernel': 'rbf'}
The train score for the best SVR model is 0.9301417778198422
The test score for the best SVR model is 0.9431960326866551


In [304]:
mean_squared_error(y_train, predictions_svr_train)
mean_squared_error(y_test, predictions_svr)
print("The SVM Boost model train root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_train, predictions_svr_train))))
print("The SVM Boost model test root mean squared error (RMSE) is {}".format(math.sqrt(mean_squared_error(y_test, predictions_svr))))

The SVM Boost model train root mean squared error (RMSE) is 4.454545717462706
The SVM Boost model test root mean squared error (RMSE) is 4.254369616076896
