# Model Selection

In [1]:
# import os

# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.impute import SimpleImputer
# from sklearn.linear_model import Ridge
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    train_test_split,
)

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
# from sklearn.tree import DecisionTreeRegressor, export_graphviz

# from pandas_profiling import ProfileReport

# import altair as alt

from sklearn.metrics import make_scorer

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor

# import shap
# from catboost import CatBoostClassifier
# from lightgbm.sklearn import LGBMRegressor
# from xgboost import XGBRegressor

# from sklearn.ensemble import VotingRegressor
# from sklearn.ensemble import StackingRegressor
# %matplotlib inline

In [2]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross-validation and summarise each result column as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn estimator or pipeline
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to ``cross_validate`` (for example ``scoring``,
        ``return_train_score``, ``n_jobs``)

    Returns
    ----------
        pandas Series of strings indexed by the numeric keys returned by
        ``cross_validate``; each value is formatted as
        "mean (+/- std)" with three decimals
    """

    # Build the per-fold table once and reuse it for both statistics.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    # numeric_only skips object columns (e.g. fitted estimators when the
    # caller passes return_estimator=True) instead of failing on them.
    mean_scores = scores.mean(numeric_only=True)
    std_scores = scores.std(numeric_only=True)

    # Pair the values positionally with zip: the Series are labelled by metric
    # name, and integer keys such as mean_scores[0] rely on a positional
    # fallback that pandas has deprecated for non-integer indexes.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

## Splitting the data

In [3]:
# Read the raw training table from its path relative to the working directory.
df = pd.read_csv("data/raw/train.csv")

# Preview the first five rows.
df.head(5)

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,248.682615,0
1,1,State_1,Commercial,Warehouse_Distribution_or_Shipping_center,274000.0,1955.0,45.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,26.50015,1
2,1,State_1,Commercial,Retail_Enclosed_mall,280025.0,1951.0,97.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,24.693619,2
3,1,State_1,Commercial,Education_Other_classroom,55325.0,1980.0,46.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,48.406926,3
4,1,State_1,Commercial,Warehouse_Nonrefrigerated,66000.0,1985.0,100.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,3.899395,4


In [4]:
# Name of the column being predicted.
target = "site_eui"

# Hold out 40% of the rows; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.4, random_state=123)

# Separate predictors from the target for each partition.
X_train = train_df.drop(columns=[target])
y_train = train_df[target]
X_test = test_df.drop(columns=[target])
y_test = test_df[target]

## Preprocessing

In [5]:
# Numeric predictors: start from every numeric column of the training frame,
# then take out the row identifier and the target.
numerical_features = train_df.select_dtypes('number').columns.tolist()
for excluded in ('id', "site_eui"):
    numerical_features.remove(excluded)

# String-typed columns are treated as categorical.
categorical_features = train_df.select_dtypes('object').columns.tolist()

# Columns listed here are handed to the "drop" step of the column transformer.
drop_features = ['id']

In [6]:
# OneHotEncoder's dense-output switch was renamed from `sparse` to
# `sparse_output` (the old name was deprecated in scikit-learn 1.2 and later
# removed). Pick whichever keyword the installed version exposes so this cell
# runs on both old and new releases.
ohe_dense_kwarg = (
    "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"
)

# Preprocessing applied inside every model pipeline:
#   - numeric columns: impute missing values (SimpleImputer default), then standardise
#   - categorical columns: dense one-hot encoding; categories unseen during fit are ignored
#   - identifier column: dropped
ct = make_column_transformer(
    (make_pipeline(SimpleImputer(), StandardScaler()), numerical_features),
    (
        OneHotEncoder(handle_unknown="ignore", **{ohe_dense_kwarg: False}),
        categorical_features,
    ),
    ("drop", drop_features),
)

## Baseline Model

In [7]:
# Collects one entry of formatted cross-validation scores per model name.
results = dict()

# Metrics passed as `scoring` to every cross-validation run below.
scoring_metrics = [
    "neg_root_mean_squared_error",
    "r2",
    "neg_mean_absolute_percentage_error",
]

In [8]:
# Baseline: shared preprocessing followed by a DummyRegressor with default settings.
dummy_pipe = make_pipeline(ct, DummyRegressor())

In [9]:
# Cross-validate the baseline and store its formatted scores under "Dummy".
results["Dummy"] = mean_std_cross_val_scores(
    dummy_pipe,
    X_train,
    y_train,
    return_train_score=True,
    scoring=scoring_metrics,
)

# Show the comparison table collected so far.
pd.DataFrame(data=results)

Unnamed: 0,Dummy
fit_time,0.069 (+/- 0.013)
score_time,0.013 (+/- 0.001)
test_neg_root_mean_squared_error,-58.644 (+/- 3.172)
train_neg_root_mean_squared_error,-58.706 (+/- 0.795)
test_r2,-0.000 (+/- 0.000)
train_r2,0.000 (+/- 0.000)
test_neg_mean_absolute_percentage_error,-0.930 (+/- 0.013)
train_neg_mean_absolute_percentage_error,-0.930 (+/- 0.004)


## Linear Regression

In [10]:
# Linear model: shared preprocessing followed by Ridge with default settings.
ridge_pipe = make_pipeline(ct, Ridge())

In [11]:
# Cross-validate Ridge; error_score="raise" surfaces any fold failure
# immediately rather than recording a placeholder score.
results["Ridge"] = mean_std_cross_val_scores(
    ridge_pipe,
    X_train,
    y_train,
    error_score="raise",
    return_train_score=True,
    scoring=scoring_metrics,
)

# Show the comparison table collected so far.
pd.DataFrame(data=results)

Unnamed: 0,Dummy,Ridge
fit_time,0.069 (+/- 0.013),0.101 (+/- 0.005)
score_time,0.013 (+/- 0.001),0.014 (+/- 0.002)
test_neg_root_mean_squared_error,-58.644 (+/- 3.172),-47.136 (+/- 3.109)
train_neg_root_mean_squared_error,-58.706 (+/- 0.795),-46.816 (+/- 0.766)
test_r2,-0.000 (+/- 0.000),0.354 (+/- 0.026)
train_r2,0.000 (+/- 0.000),0.364 (+/- 0.006)
test_neg_mean_absolute_percentage_error,-0.930 (+/- 0.013),-0.604 (+/- 0.011)
train_neg_mean_absolute_percentage_error,-0.930 (+/- 0.004),-0.601 (+/- 0.005)


## Other Models

In [12]:
# Candidate non-linear models, each wrapped with the shared preprocessing.
# The random forest is stochastic, so it is seeded (same seed as the
# train/test split) to make its cross-validation scores repeatable.
models = {
    "knn": make_pipeline(ct, KNeighborsRegressor()),
    "svr": make_pipeline(ct, SVR()),
    "randomforest": make_pipeline(ct, RandomForestRegressor(random_state=123)),
}

In [13]:
# Cross-validate every candidate pipeline in parallel across all cores and
# record its formatted scores under the model's name.
for model, estimator in models.items():
    print(model)
    results[model] = mean_std_cross_val_scores(
        estimator,
        X_train,
        y_train,
        n_jobs=-1,
        verbose=True,
        return_train_score=True,
        scoring=scoring_metrics,
    )

knn


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.1min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


svr


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  7.9min remaining: 11.9min


randomforest


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  8.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.3min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.4min finished


In [14]:
# Side-by-side table of the formatted scores for every model evaluated so far.
pd.DataFrame(data=results)

Unnamed: 0,Dummy,Ridge,knn,svr,randomforest
fit_time,0.069 (+/- 0.013),0.101 (+/- 0.005),0.153 (+/- 0.010),146.769 (+/- 1.056),78.430 (+/- 0.511)
score_time,0.013 (+/- 0.001),0.014 (+/- 0.002),16.936 (+/- 0.541),66.039 (+/- 0.446),0.328 (+/- 0.021)
test_neg_root_mean_squared_error,-58.644 (+/- 3.172),-47.136 (+/- 3.109),-50.951 (+/- 3.031),-54.119 (+/- 3.251),-42.490 (+/- 2.929)
train_neg_root_mean_squared_error,-58.706 (+/- 0.795),-46.816 (+/- 0.766),-41.182 (+/- 0.617),-54.140 (+/- 0.808),-15.837 (+/- 0.148)
test_r2,-0.000 (+/- 0.000),0.354 (+/- 0.026),0.245 (+/- 0.025),0.149 (+/- 0.011),0.475 (+/- 0.037)
train_r2,0.000 (+/- 0.000),0.364 (+/- 0.006),0.508 (+/- 0.005),0.150 (+/- 0.003),0.927 (+/- 0.002)
test_neg_mean_absolute_percentage_error,-0.930 (+/- 0.013),-0.604 (+/- 0.011),-0.582 (+/- 0.017),-0.616 (+/- 0.011),-0.472 (+/- 0.024)
train_neg_mean_absolute_percentage_error,-0.930 (+/- 0.004),-0.601 (+/- 0.005),-0.472 (+/- 0.004),-0.614 (+/- 0.002),-0.176 (+/- 0.002)
