## Hyperparameter Optimization

In [1]:
import numpy as np
import pandas as pd
import random as random

# from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer

from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    train_test_split,
)

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


from sklearn.metrics import make_scorer

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from scipy.stats import lognorm, loguniform, randint


In [2]:
# adapted from DSCI 573 Lecture

def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

### Data splitting + preprocessing (+ poss subsetting)

In [3]:
df = pd.read_csv('data/train.csv')

df.head()

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,248.682615,0
1,1,State_1,Commercial,Warehouse_Distribution_or_Shipping_center,274000.0,1955.0,45.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,26.50015,1
2,1,State_1,Commercial,Retail_Enclosed_mall,280025.0,1951.0,97.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,24.693619,2
3,1,State_1,Commercial,Education_Other_classroom,55325.0,1980.0,46.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,48.406926,3
4,1,State_1,Commercial,Warehouse_Nonrefrigerated,66000.0,1985.0,100.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,3.899395,4


In [4]:
target = "site_eui"
train_df, test_df = train_test_split(df, test_size=0.4, random_state=123)

X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_test, y_test = test_df.drop(columns=[target]), test_df[target]

In [5]:
numerical_features = train_df.select_dtypes('number').columns.tolist()
numerical_features.remove('id')
numerical_features.remove("site_eui")
categorical_features = train_df.select_dtypes('object').columns.tolist()
drop_features = ['id']

In [6]:
ct = make_column_transformer(
    (make_pipeline(SimpleImputer(), StandardScaler()), numerical_features),
    (OneHotEncoder(sparse=False, handle_unknown="ignore"), categorical_features),
    ("drop", drop_features)
)

#### Subsetting data

In [7]:
len(y_train)

45454

In [24]:
def subset_data(X, y):
    """
    Function to shuffle + subset data
    """

    # discuss size of subset 

### Baseline Models: 

In [9]:
results = {}
scoring_metrics = ["neg_root_mean_squared_error", "r2", "neg_mean_absolute_percentage_error"]

####  DummyRegressor

In [10]:
dummy_pipe = make_pipeline(
    ct,
    DummyRegressor()
)

In [11]:
results["Dummy"] = mean_std_cross_val_scores(
    dummy_pipe, X_train, y_train, scoring=scoring_metrics, return_train_score=True
)

pd.DataFrame(results)

Unnamed: 0,Dummy
fit_time,0.066 (+/- 0.013)
score_time,0.012 (+/- 0.000)
test_neg_root_mean_squared_error,-58.644 (+/- 3.172)
train_neg_root_mean_squared_error,-58.706 (+/- 0.795)
test_r2,-0.000 (+/- 0.000)
train_r2,0.000 (+/- 0.000)
test_neg_mean_absolute_percentage_error,-0.930 (+/- 0.013)
train_neg_mean_absolute_percentage_error,-0.930 (+/- 0.004)


#### Linear Regression (Ridge) 

In [12]:
ridge_pipe = make_pipeline(
    ct,
    Ridge()
)

In [13]:
results["Ridge"] = mean_std_cross_val_scores(
    ridge_pipe, X_train, y_train, scoring=scoring_metrics, return_train_score=True, error_score="raise"
)

pd.DataFrame(results)

Unnamed: 0,Dummy,Ridge
fit_time,0.066 (+/- 0.013),0.104 (+/- 0.009)
score_time,0.012 (+/- 0.000),0.014 (+/- 0.001)
test_neg_root_mean_squared_error,-58.644 (+/- 3.172),-47.136 (+/- 3.109)
train_neg_root_mean_squared_error,-58.706 (+/- 0.795),-46.816 (+/- 0.766)
test_r2,-0.000 (+/- 0.000),0.354 (+/- 0.026)
train_r2,0.000 (+/- 0.000),0.364 (+/- 0.006)
test_neg_mean_absolute_percentage_error,-0.930 (+/- 0.013),-0.604 (+/- 0.011)
train_neg_mean_absolute_percentage_error,-0.930 (+/- 0.004),-0.601 (+/- 0.005)


#### KNN, SVR, RF

In [14]:
models = {
    "knn": make_pipeline(ct, KNeighborsRegressor()),
    "svr": make_pipeline(ct, SVR()),
    "randomforest": make_pipeline(ct, RandomForestRegressor())
}

In [15]:
# DONT RUN THIS AGAIN plz


# for model in models:
#     print(model)
#     results[model] = mean_std_cross_val_scores(
#         models[model],
#         X_train,
#         y_train,
#         scoring=scoring_metrics,
#         return_train_score=True,
#         verbose = False,
#         n_jobs = -1
#     )

#### Baseline Scores

In [16]:
pd.DataFrame(results)

Unnamed: 0,Dummy,Ridge
fit_time,0.066 (+/- 0.013),0.104 (+/- 0.009)
score_time,0.012 (+/- 0.000),0.014 (+/- 0.001)
test_neg_root_mean_squared_error,-58.644 (+/- 3.172),-47.136 (+/- 3.109)
train_neg_root_mean_squared_error,-58.706 (+/- 0.795),-46.816 (+/- 0.766)
test_r2,-0.000 (+/- 0.000),0.354 (+/- 0.026)
train_r2,0.000 (+/- 0.000),0.364 (+/- 0.006)
test_neg_mean_absolute_percentage_error,-0.930 (+/- 0.013),-0.604 (+/- 0.011)
train_neg_mean_absolute_percentage_error,-0.930 (+/- 0.004),-0.601 (+/- 0.005)


### Hyperparameter optimization

#### Linear Regression (Ridge) ~ 
*alpha : Regularization strength*

In [17]:
param_grid = {"ridge__alpha": 10.0 ** np.arange(-6, 6, 1)}

ridge_search = RandomizedSearchCV(
    ridge_pipe, param_grid, return_train_score=True, n_jobs=-1
)

ridge_search.fit(X_train, y_train);


In [18]:
print("Best hyperparameter values: ", ridge_search.best_params_)
print("Best score: %0.3f" % (ridge_search.best_score_))

Best hyperparameter values:  {'ridge__alpha': 1.0}
Best score: 0.354


 default is the best performing?

In [19]:
best_ridge = ridge_search.best_estimator_

results["Ridge (tuned)"] = mean_std_cross_val_scores(
    best_ridge, X_train, y_train, scoring=scoring_metrics, return_train_score=True, error_score="raise"
)
pd.DataFrame(results)

Unnamed: 0,Dummy,Ridge,Ridge (tuned)
fit_time,0.066 (+/- 0.013),0.104 (+/- 0.009),0.139 (+/- 0.081)
score_time,0.012 (+/- 0.000),0.014 (+/- 0.001),0.015 (+/- 0.001)
test_neg_root_mean_squared_error,-58.644 (+/- 3.172),-47.136 (+/- 3.109),-47.136 (+/- 3.109)
train_neg_root_mean_squared_error,-58.706 (+/- 0.795),-46.816 (+/- 0.766),-46.816 (+/- 0.766)
test_r2,-0.000 (+/- 0.000),0.354 (+/- 0.026),0.354 (+/- 0.026)
train_r2,0.000 (+/- 0.000),0.364 (+/- 0.006),0.364 (+/- 0.006)
test_neg_mean_absolute_percentage_error,-0.930 (+/- 0.013),-0.604 (+/- 0.011),-0.604 (+/- 0.011)
train_neg_mean_absolute_percentage_error,-0.930 (+/- 0.004),-0.601 (+/- 0.005),-0.601 (+/- 0.005)


*preserved results prior to computer implosion*

![](hp.png)

#### KNN

*n_neighbors : # of neighbors*

In [20]:
param_grid2 = {"kneighborsregressor__n_neighbors": np.arange(5, 95, 10, dtype =int)}

In [21]:
# ok i can def make this a loop but my brainz cant deal w that rn

knn_search =  GridSearchCV(
    models["knn"],
    param_grid2,
    return_train_score=True,
    n_jobs=-1
)

In [22]:
# knn_search.fit(X_train, y_train);

In [23]:
print("Best hyperparameter values: ", knn_search.best_params_)
print("Best score: %0.3f" % (knn_search.best_score_))

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

#### SVR

*C : Regularization strength*

*Gamma : Kernel coefficient*


In [None]:
param_grid3 = {"svr__gamma": np.logspace(-3, 2, 6), "svr__C": np.linspace(1, 3, 6)}

In [None]:
svr_search = RandomizedSearchCV(
    models["svr"], 
    param_grid3, 
    return_train_score=True, 
    n_jobs=-1
)


In [None]:
# svr_search.fit(X_train, y_train);

In [None]:
print("Best hyperparameter values: ", svr_search.best_params_)
print("Best score: %0.3f" % (svr_search.best_score_))

#### RandomForest
*n_estimators : number of trees in forest*

*max_depth : depth of trees*

*max_features: features to consider when looking for the best split*

In [None]:
 param_grid4 ={
        "randomforestregressor__n_estimators": np.arange(5, 200, 5),
        "randomforestregressor__max_depth": np.arange(2, 20, 4),
        "randomforestregressor__max_features": np.arange(0.05, 1, 0.05)
    }

In [None]:
rf_search = RandomizedSearchCV(
    models["randomforest"], 
    param_grid3, 
    return_train_score=True, 
    n_jobs=-1
)


In [None]:
# rf_search.fit(X_train, y_train);

In [None]:
print("Best hyperparameter values: ", rf_search.best_params_)
print("Best score: %0.3f" % (rf_search.best_score_))