## Import Libraries and Packages

In [78]:
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVR
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor

warnings.simplefilter(action='ignore', category=FutureWarning);

## Import and Investigate the Data

In [79]:
# Folder that holds the project CSVs. The default is the original author's
# local directory; set POLLUTION_DATA_DIR to run the notebook on another machine.
DATA_DIR = os.environ.get('POLLUTION_DATA_DIR', '/Users/ben/Desktop/DSI_GA_Materials/project_5/data')

# 2000-2017 pollution/renewables table, used below as the modelling data.
df_train = pd.read_csv(os.path.join(DATA_DIR, 'df_2000_to_2017_pollution_renewables.csv'))

In [80]:
# Folder that holds the project CSVs. The default is the original author's
# local directory; set POLLUTION_DATA_DIR to run the notebook on another machine.
DATA_DIR = os.environ.get('POLLUTION_DATA_DIR', '/Users/ben/Desktop/DSI_GA_Materials/project_5/data')

# 2018-2020 pollution/renewables table (same 27-column layout as the 2000-2017 file).
df_test = pd.read_csv(os.path.join(DATA_DIR, 'df_2018_to_2020_pollution_renewables.csv'))

In [81]:
# (rows, columns) of the 2000-2017 frame; displayed as the cell output.
df_train.shape

(6063, 27)

In [82]:
# (rows, columns) of the 2018-2020 frame; the column count should match df_train.
df_test.shape

(1536, 27)

In [83]:
# List the available columns; the predictor and target names used in the
# modelling section are taken from this index.
df_train.columns

Index(['State', 'Year', 'Month', 'O3 Mean', 'O3 1st Max Value',
       'O3 1st Max Hour', 'O3 AQI', 'CO Mean', 'CO 1st Max Value',
       'CO 1st Max Hour', 'CO AQI', 'SO2 Mean', 'SO2 1st Max Value',
       'SO2 1st Max Hour', 'SO2 AQI', 'NO2 Mean', 'NO2 1st Max Value',
       'NO2 1st Max Hour', 'NO2 AQI',
       'Renewable energy share in the total final energy consumption (%)',
       'Electricity from fossil fuels (TWh)', 'Electricity from nuclear (TWh)',
       'Electricity from renewables (TWh)',
       'Low-carbon electricity (% electricity)',
       'Primary energy consumption per capita (kWh/person)',
       'Energy intensity level of primary energy (MJ/$2017 PPP GDP)',
       'Renewables (% equivalent primary energy)'],
      dtype='object')

### Renewable energy amounts are by number of pollution observations; these are approximations based on the pollution amounts. The approximations make renewable and fossil-fuel energy usage uniform across states in order to get a rough estimate of totals on the test set.

## Modeling Section

### Define features and perform train-test split

In [84]:
# Predictors: four summary statistics for each of the four pollutants, generated
# pollutant-major so the list order matches the column order of df_train.
pollutants = ('O3', 'CO', 'SO2', 'NO2')
pollutant_stats = ('Mean', '1st Max Value', '1st Max Hour', 'AQI')
X_features = [f'{gas} {stat}' for gas in pollutants for stat in pollutant_stats]

# Targets: the electricity-generation columns to be predicted jointly.
y_features = [
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Low-carbon electricity (% electricity)',
]

# Column subsets of the 2000-2017 frame used for modelling.
X = df_train.loc[:, X_features]
y = df_train.loc[:, y_features]

In [85]:
# Random 80/20 split of the rows of df_train (not a split by year); the fixed
# random_state keeps the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

### Scale Data

In [86]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits.
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

### Start with Linear, Lasso, and Ridge Regressions

In [103]:
# Baseline: ordinary least squares wrapped in MultiOutputRegressor, which fits
# one separate regressor per target column in y_features.
lin = MultiOutputRegressor(LinearRegression())

In [104]:
# Fit the baseline on the scaled training predictors (all four targets at once).
lin.fit(X_train_sc, y_train)

In [105]:
# Held-out predictions: one column per target, in y_features order
predictions_lin = lin.predict(X_test_sc)

# Per-target metrics, accumulated one target column at a time
scores_r2 = []
scores_mse = []
for col_idx in range(len(y_features)):
    actual = y_test.iloc[:, col_idx]
    predicted = predictions_lin[:, col_idx]
    scores_r2.append(r2_score(actual, predicted))
    scores_mse.append(mean_squared_error(actual, predicted))

# Report the two metrics for every target
for feature, r2_value, mse_value in zip(y_features, scores_r2, scores_mse):
    print(f"R-squared score for {feature}: {r2_value} \n")
    print(f"Mean Squared Error for {feature}: {mse_value}\n ")

# Overall estimator .score() on each split, to compare train vs. held-out fit
train_score_lin = lin.score(X_train_sc, y_train)
test_score_lin = lin.score(X_test_sc, y_test)
print("The train score for Linear model is {}".format(train_score_lin))
print("The test score for Linear model is {}".format(test_score_lin))

R-squared score for Electricity from fossil fuels (TWh): 0.5827204706787024 

Mean Squared Error for Electricity from fossil fuels (TWh): 2.9125831368036876
 
R-squared score for Electricity from nuclear (TWh): 0.5703275606175857 

Mean Squared Error for Electricity from nuclear (TWh): 0.2048821352916771
 
R-squared score for Electricity from renewables (TWh): 0.1327971732451908 

Mean Squared Error for Electricity from renewables (TWh): 0.01336109019101315
 
R-squared score for Low-carbon electricity (% electricity): 0.5334510801093122 

Mean Squared Error for Low-carbon electricity (% electricity): 0.00021407712276413912
 
The train score for Linear model is 0.6233417246261556
The test score for Linear model is 0.4548240711626978


In [90]:
# Lasso: one L1-regularised regressor per target column.
# NOTE(review): Lasso() is built with no arguments, so the regularisation
# strength is the library default and is never tuned. The near-zero scores
# reported below may reflect that rather than the method itself - consider
# tuning it (LassoCV is already imported) before ruling Lasso out.
lasso = MultiOutputRegressor(Lasso())

# Fit on the scaled training predictors.
lasso.fit(X_train_sc,y_train)

In [91]:
# Held-out predictions: one column per target, in y_features order
predictions_lasso = lasso.predict(X_test_sc)

# Per-target metrics, accumulated one target column at a time
scores_r2_lasso = []
scores_mse_lasso = []
for col_idx in range(len(y_features)):
    actual = y_test.iloc[:, col_idx]
    predicted = predictions_lasso[:, col_idx]
    scores_r2_lasso.append(r2_score(actual, predicted))
    scores_mse_lasso.append(mean_squared_error(actual, predicted))

# Report the two metrics for every target
for feature, r2_value, mse_value in zip(y_features, scores_r2_lasso, scores_mse_lasso):
    print(f"R-squared score for {feature} (Lasso): {r2_value} \n")
    print(f"Mean Squared Error for {feature} (Lasso): {mse_value} \n ")

# Overall estimator .score() on each split, to compare train vs. held-out fit
train_score_lasso = lasso.score(X_train_sc, y_train)
test_score_lasso = lasso.score(X_test_sc, y_test)
print("The train score for Lasso model is {}".format(train_score_lasso))
print("The test score for Lasso model is {}".format(test_score_lasso))

R-squared score for Electricity from fossil fuels (TWh) (Lasso): 0.24870791829230154 

Mean Squared Error for Electricity from fossil fuels (TWh) (Lasso): 5.243968357506238 
 
R-squared score for Electricity from nuclear (TWh) (Lasso): -0.0048639747771392905 

Mean Squared Error for Electricity from nuclear (TWh) (Lasso): 0.4791526240918315 
 
R-squared score for Electricity from renewables (TWh) (Lasso): -0.0017352130477865657 

Mean Squared Error for Electricity from renewables (TWh) (Lasso): 0.015433845596572859 
 
R-squared score for Low-carbon electricity (% electricity) (Lasso): -0.0050124687070458585 

Mean Squared Error for Low-carbon electricity (% electricity) (Lasso): 0.0004611524504081981 
 
The train score for Lasso model is 0.06141365354383993
The test score for Lasso model is 0.05927406544008246


#### Lasso will not be of any help here

In [92]:
# Ridge: one L2-regularised regressor per target column, built with the
# library-default settings (no arguments are passed to Ridge()).
ridge = MultiOutputRegressor(Ridge())

# Fit on the scaled training predictors.
ridge.fit(X_train_sc,y_train)

In [108]:
# Held-out predictions: one column per target, in y_features order
predictions_ridge = ridge.predict(X_test_sc)

# Per-target metrics, accumulated one target column at a time
scores_r2_ridge = []
scores_mse_ridge = []
for col_idx in range(len(y_features)):
    actual = y_test.iloc[:, col_idx]
    predicted = predictions_ridge[:, col_idx]
    scores_r2_ridge.append(r2_score(actual, predicted))
    scores_mse_ridge.append(mean_squared_error(actual, predicted))

# Report the two metrics for every target
for feature, r2_value, mse_value in zip(y_features, scores_r2_ridge, scores_mse_ridge):
    print(f"R-squared score for {feature} (Ridge): {r2_value} \n")
    print(f"Mean Squared Error for {feature} (Ridge): {mse_value} \n")

# Overall estimator .score() on each split, to compare train vs. held-out fit
train_score_ridge = ridge.score(X_train_sc, y_train)
test_score_ridge = ridge.score(X_test_sc, y_test)
print("The train score for Ridge model is {}".format(train_score_ridge))
print("The test score for Ridge model is {}".format(test_score_ridge))

R-squared score for Electricity from fossil fuels (TWh) (Ridge): 0.6390847186438195 

Mean Squared Error for Electricity from fossil fuels (TWh) (Ridge): 2.519164465130922 

R-squared score for Electricity from nuclear (TWh) (Ridge): 0.6366636276687127 

Mean Squared Error for Electricity from nuclear (TWh) (Ridge): 0.17325088827983298 

R-squared score for Electricity from renewables (TWh) (Ridge): 0.24242694326598713 

Mean Squared Error for Electricity from renewables (TWh) (Ridge): 0.011672012157966061 

R-squared score for Low-carbon electricity (% electricity) (Ridge): 0.6113683901836473 

Mean Squared Error for Low-carbon electricity (% electricity) (Ridge): 0.00017832457283187666 

The train score for Ridge model is 0.6006619302787535
The test score for Ridge model is 0.5323859199405416


## Polynomial Features

In [94]:
# Degree-2 polynomial expansion of the predictors; include_bias=False leaves out
# the constant column.
poly = PolynomialFeatures(degree=2,include_bias=False)

In [95]:
# Replace the scaled predictors with their degree-2 polynomial expansion.
# The expansion is rebuilt from the fitted scaler's output instead of from
# X_train_sc / X_test_sc themselves, so re-running this cell gives the same
# result and never expands features that were already expanded.
# NOTE: the names are deliberately kept, because the cells below read
# X_train_sc / X_test_sc; from here on they hold polynomial features.
X_train_sc = poly.fit_transform(ss.transform(X_train))

X_test_sc = poly.transform(ss.transform(X_test))

In [96]:
# Refit the same `lin` object on the polynomial-expanded predictors; this
# replaces the earlier fit on the plain scaled predictors.
lin.fit(X_train_sc, y_train)

In [97]:
# Refit the same `ridge` object on the polynomial-expanded predictors; this
# replaces the earlier fit on the plain scaled predictors.
ridge.fit(X_train_sc,y_train)

In [109]:
# Held-out predictions from the linear model refit on polynomial features:
# one column per target, in y_features order
predictions_lin = lin.predict(X_test_sc)

# Per-target metrics, one entry per column of y_features
scores_r2 = [r2_score(y_test.iloc[:, i], predictions_lin[:, i]) for i in range(len(y_features))]
scores_mse = [mean_squared_error(y_test.iloc[:, i], predictions_lin[:, i]) for i in range(len(y_features))]

# Report the two metrics for every target
for i, feature in enumerate(y_features):
    print(f"R-squared score for {feature}: {scores_r2[i]} \n")
    print(f"Mean Squared Error for {feature}: {scores_mse[i]}\n ")

# Overall estimator .score() on each split. These are scores of `lin`, the
# linear model; the labels previously said "Lasso" (copy-paste slip).
train_score_lin = lin.score(X_train_sc, y_train)
test_score_lin = lin.score(X_test_sc, y_test)
print("The train score for Linear model is {}".format(train_score_lin))
print("The test score for Linear model is {}".format(test_score_lin))

R-squared score for Electricity from fossil fuels (TWh): 0.5827204706787024 

Mean Squared Error for Electricity from fossil fuels (TWh): 2.9125831368036876
 
R-squared score for Electricity from nuclear (TWh): 0.5703275606175857 

Mean Squared Error for Electricity from nuclear (TWh): 0.2048821352916771
 
R-squared score for Electricity from renewables (TWh): 0.1327971732451908 

Mean Squared Error for Electricity from renewables (TWh): 0.01336109019101315
 
R-squared score for Low-carbon electricity (% electricity): 0.5334510801093122 

Mean Squared Error for Low-carbon electricity (% electricity): 0.00021407712276413912
 
The train score for Lasso model is 0.6233417246261556
The test score for Lasso model is 0.4548240711626978


#### Lasso will not be of any help here

In [110]:
# Held-out predictions from the ridge model: one column per target, in
# y_features order
predictions_ridge = ridge.predict(X_test_sc)

# Per-target metrics, accumulated one target column at a time
scores_r2_ridge = []
scores_mse_ridge = []
for col_idx in range(len(y_features)):
    actual = y_test.iloc[:, col_idx]
    predicted = predictions_ridge[:, col_idx]
    scores_r2_ridge.append(r2_score(actual, predicted))
    scores_mse_ridge.append(mean_squared_error(actual, predicted))

# Report the two metrics for every target
for feature, r2_value, mse_value in zip(y_features, scores_r2_ridge, scores_mse_ridge):
    print(f"R-squared score for {feature} (Ridge): {r2_value} \n")
    print(f"Mean Squared Error for {feature} (Ridge): {mse_value} \n")

# Overall estimator .score() on each split, to compare train vs. held-out fit
train_score_ridge = ridge.score(X_train_sc, y_train)
test_score_ridge = ridge.score(X_test_sc, y_test)
print("The train score for Ridge model is {}".format(train_score_ridge))
print("The test score for Ridge model is {}".format(test_score_ridge))

R-squared score for Electricity from fossil fuels (TWh) (Ridge): 0.6390847186438195 

Mean Squared Error for Electricity from fossil fuels (TWh) (Ridge): 2.519164465130922 

R-squared score for Electricity from nuclear (TWh) (Ridge): 0.6366636276687127 

Mean Squared Error for Electricity from nuclear (TWh) (Ridge): 0.17325088827983298 

R-squared score for Electricity from renewables (TWh) (Ridge): 0.24242694326598713 

Mean Squared Error for Electricity from renewables (TWh) (Ridge): 0.011672012157966061 

R-squared score for Low-carbon electricity (% electricity) (Ridge): 0.6113683901836473 

Mean Squared Error for Low-carbon electricity (% electricity) (Ridge): 0.00017832457283187666 

The train score for Ridge model is 0.6006619302787535
The test score for Ridge model is 0.5323859199405416


## Logistic Regression 