In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib

from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

In [2]:
# DEFINE SPLITTER FUNCTION

def tts_strat_cont(X, y, train_size, val_size, test_size, random_state):
    
    # Stratify based on y using pd.qcut
    y_bin = pd.qcut(y, q=5)

    # First split, stratify on y_bin
    X_train, X_other, y_train, y_other = train_test_split(
        X, y, train_size=train_size, stratify=y_bin, random_state=random_state)

    # Scale the validation size to reflect the remaining data
    val_size_scaled = val_size / (val_size + test_size)

    # Create stratified bins for the second split
    y_bin_other = pd.qcut(y_other, q=5)
    
    # Second split, stratify on y_bin_other
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other, train_size=val_size_scaled, stratify=y_bin_other, random_state=random_state)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [10]:
# BUILD MODEL PIPELINE

# Lists to store results for multiple models
all_test_scores = {}
all_best_models = {}
all_best_r2 = {}

# function for the ML pipeline
def MLpipe_Strat_RMSE(X, y, train_size, val_size, test_size, preprocessor, ML_algo, param_grid, random_states):
    
    best_models = []
    test_scores = []
    r2_scores = []
    
    for i in range(random_states):
        
        try:
            ML_algo_instance = ML_algo(random_state=i)
        except TypeError:
            ML_algo_instance = ML_algo
        
        try:
            ML_algo_instance.set_params(max_iter=1000000)
        except (TypeError, ValueError):
            pass

        # call my split function
        X_train, y_train, X_val, y_val, X_test, y_test = tts_strat_cont(X, y, train_size, val_size, test_size, random_state=i)

        # preprocess the sets and fit model with inputs
        pipe = make_pipeline(preprocessor, ML_algo) 
        
        # search over specified parameter values
        grid = GridSearchCV(pipe, param_grid=param_grid, scoring='neg_root_mean_squared_error', 
                            return_train_score=True, n_jobs=-1, cv=[(np.arange(len(X_train)), np.arange(len(X_val)))],
                            verbose=True)
        grid.fit(X_train, y_train)
        # access results and save them into a data frame
        results = pd.DataFrame(grid.cv_results_)
        #print(results)
        # access best models and save them
        best_models.append(grid.best_estimator_)
        print('best model parameters:',grid.best_params_)
        print('validation score:',grid.best_score_) # this is the mean validation score over all iterations
        y_test_pred = grid.predict(X_test)
        test_scores.append(np.sqrt(mean_squared_error(y_test,y_test_pred)))
        r2_scores.append(r2_score(y_test,y_test_pred))
         #print("")

   
    return test_scores, best_models, r2_scores


In [4]:
df = pd.read_csv("interp_data/interp2.csv") # QUADRATIC INTERPOLATION
df = df.drop(df.columns[0], axis=1)

# add a random column to test the coefficients
# df["rand"] = np.random.randint(1, 6, df.shape[0])


In [11]:
X = df[['no_wrk_aux','no2_wrk_aux','o3_wrk_aux','temp','rh','t_since_depl']] #,'rand'
y = df['no2_ref']

train_size = 0.6
val_size = 0.2
test_size = 0.2

# preprocessor will be applied to all columns
num_cols = X.columns

# collect all the encoders into one preprocessor (eliminated imputer!)
num_pipeline= Pipeline([('poly_features', PolynomialFeatures(degree=6, include_bias=False, interaction_only=True)),
                        ('scaler', StandardScaler())])

preprocessor = ColumnTransformer([("num_pipeline", num_pipeline, num_cols)])

In [16]:
# LINEAR REGRESSION WITH NO REGULARIZATION

# hyperparameters to tune
param_grid_lr = {
    'linearregression__fit_intercept': [True]} 

# the model
ML_algo_lr = LinearRegression() 

# Execute the pipeline
test_scores_lr, best_models_lr, r2_scores_lr = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo_lr, param_grid=param_grid_lr, random_states=5
)

all_test_scores['Linear Regression'] = test_scores_lr
all_best_models['Linear Regression'] = best_models_lr
all_best_r2['Linear Regression'] = r2_scores_lr


print("")
print("Model: Linear Regression")
print("Mean of test scores:", np.mean(test_scores_lr))
print("Standard deviation of test scores:", np.std(test_scores_lr))
print("Mean of test R2 values:",np.mean(r2_scores_lr))
print("Standard deviation of test R2 values:",np.std(r2_scores_lr))


Fitting 1 folds for each of 1 candidates, totalling 1 fits
best model parameters: {'linearregression__fit_intercept': True}
validation score: -2.0271965984697866
Fitting 1 folds for each of 1 candidates, totalling 1 fits
best model parameters: {'linearregression__fit_intercept': True}
validation score: -1.954954400150525
Fitting 1 folds for each of 1 candidates, totalling 1 fits
best model parameters: {'linearregression__fit_intercept': True}
validation score: -2.0010531444137243
Fitting 1 folds for each of 1 candidates, totalling 1 fits
best model parameters: {'linearregression__fit_intercept': True}
validation score: -1.9849204682567154
Fitting 1 folds for each of 1 candidates, totalling 1 fits
best model parameters: {'linearregression__fit_intercept': True}
validation score: -1.8891945467542468

Model: Linear Regression
Mean of test scores: 2.0502856281160384
Standard deviation of test scores: 0.07273685948599959
Mean of test R2 values: 0.8894617756126524
Standard deviation of test 

In [14]:
# RIDGE

# hyperparameters to tune
param_grid_rdg = {
    'ridge__alpha': [0, 0.0001, 0.001, 0.01]} 

# the model
ML_algo_rdg = Ridge() 

# Execute the pipeline
test_scores_rdg, best_models_rdg, r2_scores_rdg = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo_rdg, param_grid=param_grid_rdg, random_states=5
)

all_test_scores['Ridge Regression'] = test_scores_rdg
all_best_models['Ridge Regression'] = best_models_rdg
all_best_r2['Ridge Regression'] = r2_scores_rdg

print("")
print("Model: Ridge Regression")
print("Mean of test scores:", np.mean(test_scores_rdg))
print("Standard deviation of test scores:", np.std(test_scores_rdg))
print("Mean of test R2 values:",np.mean(r2_scores_rdg))
print("Standard deviation of test R2 values:",np.std(r2_scores_rdg))


Fitting 1 folds for each of 4 candidates, totalling 4 fits
best model parameters: {'ridge__alpha': 0}
validation score: -2.027196598424725
Fitting 1 folds for each of 4 candidates, totalling 4 fits
best model parameters: {'ridge__alpha': 0.0001}
validation score: -1.9547393378068294
Fitting 1 folds for each of 4 candidates, totalling 4 fits
best model parameters: {'ridge__alpha': 0}
validation score: -2.0010531444120234
Fitting 1 folds for each of 4 candidates, totalling 4 fits
best model parameters: {'ridge__alpha': 0}
validation score: -1.9849204682570234
Fitting 1 folds for each of 4 candidates, totalling 4 fits
best model parameters: {'ridge__alpha': 0}
validation score: -1.8891945467716325

Model: Ridge Regression
Mean of test scores: 2.050094981343801
Standard deviation of test scores: 0.07260954047718093
Mean of test R2 values: 0.8894824750400974
Standard deviation of test R2 values: 0.0067466106957535625


In [None]:
# LASSO 

# hyperparameters to tune
param_grid_lss = {
    'lasso__alpha': [0.001, 0.01, 0.1]} 

# the model
ML_algo_lss = Lasso() 

# Execute the pipeline
test_scores_lss, best_models_lss, r2_scores_lss = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo_lss, param_grid=param_grid_lss, random_states=5
)

all_test_scores['Lasso Regression'] = test_scores_lss
all_best_models['Lasso Regression'] = best_models_lss
all_best_r2['Lasso Regression'] = r2_scores_lss

print("")
print("Model: Lasso Regression")
print("Mean of test scores:", np.mean(test_scores_lss))
print("Standard deviation of test scores:", np.std(test_scores_lss))
print("Mean of test R2 values:",np.mean(r2_scores_lss))
print("Standard deviation of test R2 values:",np.std(r2_scores_lss))


Fitting 1 folds for each of 3 candidates, totalling 3 fits
best model parameters: {'lasso__alpha': 0.001}
validation score: -2.178505817176367
Fitting 1 folds for each of 3 candidates, totalling 3 fits
best model parameters: {'lasso__alpha': 0.001}
validation score: -2.1065293820613644
Fitting 1 folds for each of 3 candidates, totalling 3 fits


In [11]:
# RANDOM FOREST - CHANGE PARAMETER GRID!

from sklearn.ensemble import RandomForestRegressor

# parameter grid
param_grid_rf = {
    'randomforestregressor__n_estimators': [30, 40, 100],
    'randomforestregressor__max_depth': [10, 30, 50]} 

# model
ML_algo_rf = RandomForestRegressor()

# Execute the pipeline
test_scores_rf, best_models_rf, r2_scores_rf = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo_rf, param_grid=param_grid_rf, random_states=5
)

all_test_scores['Random Forest'] = test_scores_rf
all_best_models['Random Forest'] = best_models_rf
all_best_r2['Random Forest'] = r2_scores_rf

print("")
print("Model: Random Forest")
print("Mean of test scores:", np.mean(test_scores_rf))
print("Standard deviation of test scores:", np.std(test_scores_rf))
print("Mean of test R2 values:",np.mean(r2_scores_rf))
print("Standard deviation of test R2 values:",np.std(r2_scores_rf))

Fitting 1 folds for each of 9 candidates, totalling 9 fits
best model parameters: {'randomforestregressor__max_depth': 50, 'randomforestregressor__n_estimators': 100}
validation score: -0.5767014983542453
Fitting 1 folds for each of 9 candidates, totalling 9 fits
best model parameters: {'randomforestregressor__max_depth': 30, 'randomforestregressor__n_estimators': 100}
validation score: -0.5577533622720137
Fitting 1 folds for each of 9 candidates, totalling 9 fits
best model parameters: {'randomforestregressor__max_depth': 30, 'randomforestregressor__n_estimators': 100}
validation score: -0.5657834571686248
Fitting 1 folds for each of 9 candidates, totalling 9 fits
best model parameters: {'randomforestregressor__max_depth': 30, 'randomforestregressor__n_estimators': 100}
validation score: -0.5694692438169906
Fitting 1 folds for each of 9 candidates, totalling 9 fits
best model parameters: {'randomforestregressor__max_depth': 50, 'randomforestregressor__n_estimators': 100}
validation sc

In [12]:
# NEAREST NEIGHBOR
from sklearn.neighbors import KNeighborsRegressor

# parameter grid
param_grid_knn = {
    'kneighborsregressor__n_neighbors': [1, 3, 10],
    'kneighborsregressor__weights': ["distance", "uniform"]} 

# model
ML_algo_knn = KNeighborsRegressor() # no random state

# Execute the pipeline
test_scores_knn, best_models_knn, r2_scores_knn = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo_knn, param_grid=param_grid_knn, random_states=5
)

all_test_scores['Nearest Neighbors'] = test_scores_knn
all_best_models['Nearest Neighbors'] = best_models_knn
all_best_r2['Nearest Neighbors'] = r2_scores_knn

print("")
print("Model: Nearest Neighbors")
print("Mean of test scores:", np.mean(test_scores_knn))
print("Standard deviation of test scores:", np.std(test_scores_knn))
print("Mean of test R2 values:",np.mean(r2_scores_knn))
print("Standard deviation of test R2 values:",np.std(r2_scores_knn))

Fitting 1 folds for each of 6 candidates, totalling 6 fits
best model parameters: {'kneighborsregressor__n_neighbors': 1, 'kneighborsregressor__weights': 'uniform'}
validation score: 0.0
Fitting 1 folds for each of 6 candidates, totalling 6 fits
best model parameters: {'kneighborsregressor__n_neighbors': 1, 'kneighborsregressor__weights': 'uniform'}
validation score: 0.0
Fitting 1 folds for each of 6 candidates, totalling 6 fits
best model parameters: {'kneighborsregressor__n_neighbors': 1, 'kneighborsregressor__weights': 'uniform'}
validation score: 0.0
Fitting 1 folds for each of 6 candidates, totalling 6 fits
best model parameters: {'kneighborsregressor__n_neighbors': 1, 'kneighborsregressor__weights': 'uniform'}
validation score: 0.0
Fitting 1 folds for each of 6 candidates, totalling 6 fits
best model parameters: {'kneighborsregressor__n_neighbors': 1, 'kneighborsregressor__weights': 'uniform'}
validation score: 0.0

Model: Nearest Neighbors
Mean of test scores: 2.2750764270208323

In [1]:
# Elastic Net - STILL DOESN'T CONVERGE!

# hyperparameters to tune
param_grid_eln = {
    'elasticnet__alpha': [0.0001, 0.001, 0.01], 
    'elasticnet__l1_ratio': [0.85, 0.95, 0.99]}

# the model
ML_algo_eln = ElasticNet() 

# Execute the pipeline
#test_scores_eln, best_models_eln = MLpipe_Strat_RMSE(
#    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
#    preprocessor=preprocessor, ML_algo=ML_algo_eln, param_grid=param_grid_eln, random_states=5
#)

all_test_scores['Elastic Net Regression'] = test_scores_eln
all_best_models['Elastic Net Regression'] = best_models_eln

print("")
print("Model: Elastic Net Regression")
print("Mean of test scores:", np.mean(test_scores_eln))
print("Standard deviation of test scores:", np.std(test_scores_eln))

NameError: name 'ElasticNet' is not defined

In [73]:
# SUPPORT VECTOR MACHINE

# ADD KERNEL AS A HYPERPARAMETER

from sklearn.svm import SVR

# parameter grid
param_grid_svc = {
    'svr__gamma': [1, 10, 100],
    'svr__C': [10, 90, 100, 120} 

# model
ML_algo_svc = SVR() # no random state

# Execute the pipeline
test_scores_svc, best_models_svc = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo_svc, param_grid=param_grid_svc, random_states=5
)

all_test_scores['SVC'] = test_scores_svr
all_best_models['SVC'] = best_models_svr

#print("Mean of test scores:", np.mean(test_scores))
#print("Standard deviation of test scores:", np.std(test_scores))

print("")
print(f"Model: Support Vector Machine")
print(f"Mean Test Score: {np.mean(test_scores_svc)}")
print(f"Std Dev Test Score: {np.std(test_scores_svc)}")

Fitting 1 folds for each of 25 candidates, totalling 25 fits
best model parameters: {'svr__C': 100.0, 'svr__gamma': 10.0}
validation score: -0.09937376988880782
Fitting 1 folds for each of 25 candidates, totalling 25 fits
best model parameters: {'svr__C': 100.0, 'svr__gamma': 10.0}
validation score: -0.09962267979118887
Fitting 1 folds for each of 25 candidates, totalling 25 fits
best model parameters: {'svr__C': 100.0, 'svr__gamma': 10.0}
validation score: -0.09898630037610172
Fitting 1 folds for each of 25 candidates, totalling 25 fits
best model parameters: {'svr__C': 100.0, 'svr__gamma': 10.0}
validation score: -0.09936157445681344
Fitting 1 folds for each of 25 candidates, totalling 25 fits
best model parameters: {'svr__C': 100.0, 'svr__gamma': 10.0}
validation score: -0.09920904203032768
Mean of test scores: 2.607242414881502
Standard deviation of test scores: 1.421133888181789
