In [42]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib


from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

In [43]:
def tts_strat_cont(X, y, train_size, val_size, test_size, random_state):
    
    # Stratify based on y using pd.qcut
    y_bin = pd.qcut(y, q=5)

    # First split, stratify on y_bin
    X_train, X_other, y_train, y_other = train_test_split(
        X, y, train_size=train_size, stratify=y_bin, random_state=random_state)

    # Scale the validation size to reflect the remaining data
    val_size_scaled = val_size / (val_size + test_size)

    # Create stratified bins for the second split
    y_bin_other = pd.qcut(y_other, q=5)
    
    # Second split, stratify on y_bin_other
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other, train_size=val_size_scaled, stratify=y_bin_other, random_state=random_state)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [44]:
best_models = []
test_scores = []

# function for the ML pipeline
def MLpipe_Strat_RMSE(X, y, train_size, val_size, test_size, preprocessor, ML_algo, param_grid, random_states):
    
        for i in range(random_states):

            # call my split function
            X_train, y_train, X_val, y_val, X_test, y_test = tts_strat_cont(X, y, train_size, val_size, test_size, random_state=i)
            
            # preprocess the sets and fit model with inputs
            pipe = make_pipeline(preprocessor, ML_algo) 

            # Exhaustive search over specified parameter values
            grid = GridSearchCV(pipe, param_grid=param_grid, scoring='neg_root_mean_squared_error', 
                                return_train_score=True, n_jobs=-1, cv=[(np.arange(len(X_train)), np.arange(len(X_val)))],
                                verbose=True)

            grid.fit(X_train, y_train)
       
            # access results and save them into a data frame
            results = pd.DataFrame(grid.cv_results_)
            print(results)

            # access best models and save them
            best_models.append(grid.best_estimator_)
        
           # print('best model parameters:',grid.best_params_)
           # print('validation score:',grid.best_score_) # this is the mean validation score over all iterations

            y_test_pred = grid.predict(X_test)
            test_scores.append(np.sqrt(mean_squared_error(y_test,y_test_pred)))
           # print("")

        return test_scores, best_models


In [45]:
df = pd.read_csv("processed_data_1/merge.csv")
df = df.drop(df.columns[0], axis=1)

X = df[['no_wrk_aux','no2_wrk_aux','o3_wrk_aux','temp','rh','t_since_depl']]
y = df['no2_ref']

train_size = 0.6
val_size = 0.2
test_size = 0.2
random_state = 10

# collect which encoder to use on each feature
num_cols = X.columns

# collect all the encoders into one preprocessor
num_pipeline= Pipeline([('imputer', SimpleImputer(strategy='median')),
                        ('poly_features', PolynomialFeatures(degree=6, include_bias=False, interaction_only=True)),
                        ('scaler', StandardScaler())])

preprocessor = ColumnTransformer([("num_pipeline", num_pipeline, num_cols)])

# hyperparameters to tune
param_grid = {
    'ridge__alpha': [0, 0.1, 1, 10, 100]} 

# the model
ML_algo = Ridge(random_state = 1, max_iter = 12000) 

# random states
random_states = 10


#test_scores, final_models = MLpipe_KFold_RMSE(X, y, preprocessor, ML_algo, param_grid, random_states)

# THIS DOESN'T SEEM RIGHT, BECAUSE I'M GETTING A NEGATIVE VALIDATION SCORE? SHOULD BE RMSE?


In [46]:
# Execute the pipeline
test_scores, best_models = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo, param_grid=param_grid, random_states=10
)

print("Mean of test scores:", np.mean(test_scores))
print("Standard deviation of test scores:", np.std(test_scores))

Fitting 1 folds for each of 5 candidates, totalling 5 fits
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.013181           0.0         0.002383             0.0   
1       0.013171           0.0         0.003719             0.0   
2       0.013152           0.0         0.002690             0.0   
3       0.013274           0.0         0.001961             0.0   
4       0.016019           0.0         0.004612             0.0   

   param_ridge__alpha                 params  split0_test_score  \
0                 0.0    {'ridge__alpha': 0}          -2.193172   
1                 0.1  {'ridge__alpha': 0.1}          -2.270942   
2                 1.0    {'ridge__alpha': 1}          -2.356394   
3                10.0   {'ridge__alpha': 10}          -2.560865   
4               100.0  {'ridge__alpha': 100}          -2.931561   

   mean_test_score  std_test_score  rank_test_score  split0_train_score  \
0        -2.193172             0.0                1         

In [31]:
# RANDOM FOREST

from sklearn.ensemble import RandomForestRegressor

# parameter grid
param_grid = {
    'randomforestregressor__n_estimators': [1, 3, 10, 30],
    'randomforestregressor__max_depth': [1, 2, 3, 10, 30]} 

# model
ML_algo = RandomForestRegressor(random_state = 1)

# Execute the pipeline
test_scores, best_models = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo, param_grid=param_grid, random_states=10
)

print("Mean of test scores:", np.mean(test_scores))
print("Standard deviation of test scores:", np.std(test_scores))

Fitting 1 folds for each of 20 candidates, totalling 20 fits
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Fitting 1 folds for each of 20 candidates, totalling 20 fits
Mean of test scores: 1.6192366177542632
Standard deviation of test scores: 0.07141522555049278


In [32]:
# SUPPORT VECTOR MACHINE

# ADD KERNEL AS A HYPERPARAMETER

from sklearn.svm import SVR

# parameter grid
param_grid = {
    'svr__gamma': [1e-3, 1e-1, 1e1, 1e3, 1e5],
    'svr__C': [1e-2, 1e-1, 1e0, 1e1, 1e2]} 

# model
ML_algo = SVR() # no random state

# Execute the pipeline
test_scores, best_models = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo, param_grid=param_grid, random_states=10
)
print("Mean of test scores:", np.mean(test_scores))
print("Standard deviation of test scores:", np.std(test_scores))

Fitting 1 folds for each of 25 candidates, totalling 25 fits
Fitting 1 folds for each of 25 candidates, totalling 25 fits
Fitting 1 folds for each of 25 candidates, totalling 25 fits
Fitting 1 folds for each of 25 candidates, totalling 25 fits
Fitting 1 folds for each of 25 candidates, totalling 25 fits
Fitting 1 folds for each of 25 candidates, totalling 25 fits
Fitting 1 folds for each of 25 candidates, totalling 25 fits
Fitting 1 folds for each of 25 candidates, totalling 25 fits
Fitting 1 folds for each of 25 candidates, totalling 25 fits
Fitting 1 folds for each of 25 candidates, totalling 25 fits
Mean of test scores: 3.829574158725759
Standard deviation of test scores: 2.2120472399845914


In [34]:
# NEAREST NEIGHBOR
from sklearn.neighbors import KNeighborsRegressor

# parameter grid
param_grid = {
    'kneighborsregressor__n_neighbors': [1, 3, 10, 30],
    'kneighborsregressor__weights': ["distance", "uniform"]} 

# model
ML_algo = KNeighborsRegressor() # no random state

# Execute the pipeline
test_scores, best_models = MLpipe_Strat_RMSE(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, 
    preprocessor=preprocessor, ML_algo=ML_algo, param_grid=param_grid, random_states=10
)
print("Mean of test scores:", np.mean(test_scores))
print("Standard deviation of test scores:", np.std(test_scores))

Fitting 1 folds for each of 8 candidates, totalling 8 fits
Fitting 1 folds for each of 8 candidates, totalling 8 fits
Fitting 1 folds for each of 8 candidates, totalling 8 fits
Fitting 1 folds for each of 8 candidates, totalling 8 fits
Fitting 1 folds for each of 8 candidates, totalling 8 fits
Fitting 1 folds for each of 8 candidates, totalling 8 fits
Fitting 1 folds for each of 8 candidates, totalling 8 fits
Fitting 1 folds for each of 8 candidates, totalling 8 fits
Fitting 1 folds for each of 8 candidates, totalling 8 fits
Fitting 1 folds for each of 8 candidates, totalling 8 fits
Mean of test scores: 3.336582676322932
Standard deviation of test scores: 1.936842188117602
