# 1 Dependency import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import set_config

from xgboost import XGBRegressor

In [2]:
def evaluate(model, grid_params):
    set_config(display="diagram")
    # set_config(display="text")

    num_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="mean"))
    ])

    cat_less_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    cat_lot_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num_pipe", num_pipe, numerical_cols),
        ("cat_less_unique_pipe", cat_less_unique_pipe, categorical_cols_less_unique),
        ("cat_lot_unique_pipe", cat_lot_unique_pipe, categorical_cols_lot_unique)
    ])

    pipeline = Pipeline([
        ("transforms", preprocessor),
        ("model", model)
    ])

    gs = GridSearchCV(pipeline, grid_params, scoring="neg_root_mean_squared_error").fit(X_train, y_train)

    score = gs.score(X_test, y_test)
    ratio = (score * 100) / y_test.mean()
    ratio = np.abs(ratio)

    return score, ratio, gs, pipeline

In [3]:
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [4]:
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

***
# 2 Loading data

In [5]:
data = pd.read_csv("data/data-cleaned.csv", delimiter=",")

In [6]:
#target = "GHGEmissionsIntensity(kgCO2e/ft2)"
target = "SiteEnergyUse(kBtu)"

In [7]:
y = data[target]
X = data.drop(columns=[target])

In [8]:
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [9]:
categorical_cols_less_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
categorical_cols_lot_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() >= 10 and X_train_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [10]:
my_cols = categorical_cols_lot_unique + categorical_cols_less_unique + numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [11]:
y_test.describe()

count    1.341000e+03
mean     5.540900e+06
std      1.523880e+07
min      0.000000e+00
25%      9.285849e+05
50%      1.835805e+06
75%      4.217108e+06
max      2.958126e+08
Name: SiteEnergyUse(kBtu), dtype: float64

***
# 3 Feature engineering

## 3.1 Mutual Information

In [12]:
X_MI = X.copy()

In [13]:
for col in X_MI.select_dtypes("object"):
    X_MI[col], _ = X_MI[col].factorize()

In [14]:
discrete_features = X_MI.dtypes == int

In [15]:
# make_mi_scores(X_MI, y, discrete_features)

***
# 4 Modeling

***
## 4.1 DummyRegressor

In [16]:
%%time

model = DummyRegressor()

grid_params = [
    {
        "model__strategy": ["mean", "median"]
    },
    {
        "model__strategy": ["quantile"],
        "model__quantile": np.arange(0, 1.1, 0.1),
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -1.52393e+07 / RATIO: 275.0%
best_params: {'model__strategy': 'mean'}
CPU times: user 6.8 s, sys: 169 ms, total: 6.97 s
Wall time: 7 s


***
## 4.2 LinearRegression

In [17]:
%%time

model = LinearRegression()

grid_params = [
    {
        "model__fit_intercept": [True]
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -761564 / RATIO: 13.74%
best_params: {'model__fit_intercept': True}
CPU times: user 7.47 s, sys: 123 ms, total: 7.59 s
Wall time: 1.38 s


***
## 4.3 SupportVectorRegression

In [18]:
%%time

model = SVR()

grid_params = [
    {
        "model__kernel": ["rbf"],
        "model__degree": [3],
        "model__gamma": ["scale"]
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -1.5691e+07 / RATIO: 283.2%
best_params: {'model__degree': 3, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
CPU times: user 19.3 s, sys: 255 ms, total: 19.6 s
Wall time: 19.4 s


***
## 4.4 DecisionTreeRegressor

In [19]:
%%time

model = DecisionTreeRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__max_depth": [2, 3, 4],
        "model__min_samples_leaf": range(1, 11, 1),
        "model__criterion": ["mse"]
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -2.95676e+06 / RATIO: 53.36%
best_params: {'model__criterion': 'mse', 'model__max_depth': 4, 'model__min_samples_leaf': 1, 'model__random_state': 1}
CPU times: user 18.5 s, sys: 22.9 ms, total: 18.5 s
Wall time: 18.5 s


***
## 4.5 RandomForestRegressor

In [20]:
%%time

model = RandomForestRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__n_estimators": [25],
        "model__min_samples_leaf": [1],
        "model__criterion": ["mse"]
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -1.7076e+06 / RATIO: 30.82%
best_params: {'model__criterion': 'mse', 'model__min_samples_leaf': 1, 'model__n_estimators': 25, 'model__random_state': 1}
CPU times: user 1min 58s, sys: 52.2 ms, total: 1min 58s
Wall time: 1min 59s


***
## 4.6 XGBRegressor

In [21]:
%%time

model = XGBRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__max_depth": [3],
        "model__n_estimators": [100],
        "model__learning_rate": [0.1],
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -1.45187e+06 / RATIO: 26.2%
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}
CPU times: user 4min 27s, sys: 8.31 s, total: 4min 35s
Wall time: 26.7 s


***
## Initial

MSE: -875275 / RATIO: 19.0%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 19s, sys: 2.08 s, total: 1min 21s  
Wall time: 7.4 s

***
# 5 ...