# 1 Dependency import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import set_config

from xgboost import XGBRegressor

In [2]:
def evaluate(model, grid_params):
    set_config(display="diagram")
    # set_config(display="text")

    num_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="mean"))
    ])

    cat_less_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    cat_lot_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num_pipe", num_pipe, numerical_cols),
        ("cat_less_unique_pipe", cat_less_unique_pipe, categorical_cols_less_unique),
        ("cat_lot_unique_pipe", cat_lot_unique_pipe, categorical_cols_lot_unique)
    ])

    pipeline = Pipeline([
        ("transforms", preprocessor),
        ("model", model)
    ])

    gs = GridSearchCV(pipeline, grid_params, scoring="neg_root_mean_squared_error").fit(X_train, y_train)

    score = gs.score(X_test, y_test)
    ratio = (score * 100) / y_test.mean()
    ratio = np.abs(ratio)

    return score, ratio, gs, pipeline

In [3]:
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [4]:
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

***
# 2 Loading data

In [5]:
data = pd.read_csv("data/data-cleaned.csv", delimiter=",")

In [6]:
#target = "GHGEmissionsIntensity(kgCO2e/ft2)"
target = "SiteEnergyUse(kBtu)_2016"

In [7]:
y = data[target]
X = data.drop(columns=[target])

In [8]:
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [9]:
categorical_cols_less_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
categorical_cols_lot_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() >= 10 and X_train_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [10]:
my_cols = categorical_cols_lot_unique + categorical_cols_less_unique + numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [11]:
y_test.describe()

count    6.550000e+02
mean     4.682071e+06
std      9.431604e+06
min      0.000000e+00
25%      9.083228e+05
50%      1.841154e+06
75%      3.804507e+06
max      1.026737e+08
Name: SiteEnergyUse(kBtu)_2016, dtype: float64

***
# 3 Feature engineering

## 3.1 Mutual Information

In [12]:
X_MI = X.copy()

In [13]:
for col in X_MI.select_dtypes("object"):
    X_MI[col], _ = X_MI[col].factorize()

In [14]:
discrete_features = X_MI.dtypes == int

In [15]:
# make_mi_scores(X_MI, y, discrete_features)

***
# 4 Modeling

***
## 4.1 DummyRegressor

In [16]:
%%time

model = DummyRegressor()

grid_params = [
    {
        "model__strategy": ["mean", "median"]
    },
    {
        "model__strategy": ["quantile"],
        "model__quantile": np.arange(0, 1.1, 0.1),
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

RMSE: -9.43276e+06 / RATIO: 201.5%
best_params: {'model__strategy': 'mean'}
CPU times: user 18.4 s, sys: 423 ms, total: 18.8 s
Wall time: 19 s


***
## 4.2 LinearRegression

In [17]:
%%time

model = LinearRegression()

grid_params = [
    {
        "model__fit_intercept": [True]
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -563193 / RATIO: 12.03%
best_params: {'model__fit_intercept': True}
CPU times: user 39.9 s, sys: 1.53 s, total: 41.4 s
Wall time: 16.7 s


***
## 4.3 SupportVectorRegression

In [18]:
%%time

model = SVR()

grid_params = [
    {
        "model__kernel": ["rbf"],
        "model__degree": [3],
        "model__gamma": ["scale"]
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -9.86282e+06 / RATIO: 210.7%
best_params: {'model__degree': 3, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
CPU times: user 20.1 s, sys: 286 ms, total: 20.4 s
Wall time: 20.7 s


***
## 4.4 DecisionTreeRegressor

In [19]:
%%time

model = DecisionTreeRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__max_depth": [2, 3, 4],
        "model__min_samples_leaf": range(1, 11, 1),
        "model__criterion": ["mse"]
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -1.88155e+06 / RATIO: 40.19%
best_params: {'model__criterion': 'mse', 'model__max_depth': 4, 'model__min_samples_leaf': 1, 'model__random_state': 1}
CPU times: user 49.3 s, sys: 128 ms, total: 49.5 s
Wall time: 50.1 s


***
## 4.5 RandomForestRegressor

In [20]:
%%time

model = RandomForestRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__n_estimators": [25],
        "model__min_samples_leaf": [1],
        "model__criterion": ["mse"]
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -846180 / RATIO: 18.07%
best_params: {'model__criterion': 'mse', 'model__min_samples_leaf': 1, 'model__n_estimators': 25, 'model__random_state': 1}
CPU times: user 2min 26s, sys: 62.2 ms, total: 2min 26s
Wall time: 2min 28s


***
## 4.6 XGBRegressor

In [21]:
%%time

model = XGBRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__max_depth": [3],
        "model__n_estimators": [100],
        "model__learning_rate": [0.1],
    }
]

score, ratio, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score:n} / RATIO: {ratio:.4}%")
print(f"best_params: {gs.best_params_}")

MSE: -760153 / RATIO: 16.24%
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}
CPU times: user 2min 10s, sys: 4.47 s, total: 2min 15s
Wall time: 50.8 s


***
# 5 Results

## 5.1 Initial

Only the 2015 dataset is used

MSE: -875275 / RATIO: 19.0%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 19s, sys: 2.08 s, total: 1min 21s  
Wall time: 7.4 s

***
## 5.2 Merge

The 2015 and 2016 datasets have been merged

MSE: -760153 / RATIO: 16.24%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 2min 10s, sys: 4.47 s, total: 2min 15s  
Wall time: 50.8 s  

***
# 6 ...