# 0 Configuration

In [24]:
config = dict(
    # No feature-engineering options are defined yet.
    feature_eng={},
    # One flag per model cell of section 4; a cell runs only when its flag is True.
    modeling=dict(
        dummy=False,
        linear_reg=False,
        svr=False,
        tree=False,
        forest=False,
        xgboost=True,
    ),
)

***
# 1 Dependency import

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import set_config

from xgboost import XGBRegressor

# Seed NumPy's global random generator so NumPy-based randomness is repeatable across runs.
np.random.seed(0)

In [26]:
def evaluate(model, grid_params, scoring="neg_root_mean_squared_error"):
    """Grid-search ``model`` behind the shared preprocessing and score it on the test split.

    The function reads these notebook-level variables: ``numerical_cols``,
    ``categorical_cols_less_unique``, ``categorical_cols_lot_unique``,
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``. It also switches
    scikit-learn's estimator display to "diagram" on every call.

    Parameters
    ----------
    model : estimator
        Used as the final ``"model"`` step of the pipeline.
    grid_params : dict or list of dict
        Forwarded unchanged to ``GridSearchCV``; keys address pipeline steps
        (e.g. ``"model__max_depth"``).
    scoring : str, default "neg_root_mean_squared_error"
        Scoring name forwarded to ``GridSearchCV``.

    Returns
    -------
    score : float
        ``gs.score(X_test, y_test)``. With the default scorer this is the
        *negated* RMSE, so negate it before showing it as an error.
    ratio : float
        ``abs(score * 100 / y_test.mean())``, i.e. the score as a percentage
        of the mean test target.
    gs : GridSearchCV
        The fitted search object.
    pipeline : Pipeline
        The pipeline object that was handed to ``GridSearchCV``.
        NOTE(review): GridSearchCV fits clones, so this object is presumably
        still unfitted; use ``gs.best_estimator_`` for predictions -- confirm.
    """
    set_config(display="diagram")

    def _impute_then_one_hot():
        # Fill gaps with the most frequent category, then one-hot encode;
        # handle_unknown="ignore" avoids errors on categories unseen during fit.
        return Pipeline(steps=[
            ("simple_imputer", SimpleImputer(strategy="most_frequent")),
            ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
        ])

    # Numerical columns only get mean imputation.
    numeric_steps = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="mean")),
    ])

    # Both categorical groups currently receive the same treatment.
    column_steps = [
        ("num_pipe", numeric_steps, numerical_cols),
        ("cat_less_unique_pipe", _impute_then_one_hot(), categorical_cols_less_unique),
        ("cat_lot_unique_pipe", _impute_then_one_hot(), categorical_cols_lot_unique),
    ]

    pipeline = Pipeline([
        ("transforms", ColumnTransformer(column_steps)),
        ("model", model),
    ])

    gs = GridSearchCV(pipeline, grid_params, scoring=scoring)
    gs.fit(X_train, y_train)

    score = gs.score(X_test, y_test)
    ratio = np.abs((score * 100) / y_test.mean())

    return score, ratio, gs, pipeline

In [27]:
def make_mi_scores(X, y, discrete_features):
    """Return the mutual-information score of each column of ``X`` against ``y``.

    Scores are computed by ``mutual_info_regression`` (``discrete_features`` is
    forwarded unchanged) and returned as a Series named "MI Scores", indexed by
    ``X.columns`` and sorted from highest to lowest score.
    """
    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    return (
        pd.Series(raw_scores, name="MI Scores", index=X.columns)
        .sort_values(ascending=False)
    )

In [28]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` using the pyplot interface.

    Bars are laid out in ascending order of score and labelled with the
    index labels of ``scores``.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))  # one bar slot per score
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

***
# 2 Loading data

In [29]:
# Load the cleaned dataset (comma-separated) from the project's data folder.
data = pd.read_csv("data/data-cleaned.csv", sep=",")

In [30]:
# Column to predict. The alternative target is kept commented out for quick switching.
#target = "GHGEmissionsIntensity(kgCO2e/ft2)"
target = "SiteEnergyUse(kBtu)"

In [31]:
# Separate the feature matrix (every column except the target) from the target series.
X = data.drop(columns=[target])
y = data[target]

In [32]:
# Hold out 20% of the rows as a test split; random_state fixes the shuffle so the split is repeatable.
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [33]:
# Group the training columns by dtype; object columns are further split by
# cardinality (fewer than 10 distinct values vs. 10 or more).
object_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype == "object"]
n_unique = {cname: X_train_full[cname].nunique() for cname in object_cols}

categorical_cols_less_unique = [cname for cname in object_cols if n_unique[cname] < 10]
categorical_cols_lot_unique = [cname for cname in object_cols if n_unique[cname] >= 10]
numerical_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype in ("int64", "float64")
]

In [34]:
# Keep only the columns assigned to one of the three column groups, in a fixed order:
# high-cardinality categoricals, low-cardinality categoricals, then numericals.
my_cols = [*categorical_cols_lot_unique, *categorical_cols_less_unique, *numerical_cols]
# Copies, so later edits never touch the full splits.
X_train, X_test = (split[my_cols].copy() for split in (X_train_full, X_test_full))

In [35]:
# Summary statistics of the test-split target values.
y_test.describe()

count    674.000000
mean       0.633046
std        1.675907
min        0.000000
25%        0.104283
50%        0.205197
75%        0.497017
max       31.430914
Name: SiteEnergyUse(kBtu), dtype: float64

***
# 3 Feature engineering

## 3.1 Mutual Information

In [36]:
# Work on a copy so the label encoding applied to X_MI does not modify X.
X_MI = X.copy()

In [37]:
# Replace every object-typed column of X_MI with the integer codes returned by factorize().
for col in X_MI.select_dtypes("object").columns:
    X_MI[col] = X_MI[col].factorize()[0]

In [38]:
# Boolean mask (indexed by column name) marking integer-typed columns as discrete.
# Comparing dtypes against the builtin `int` depends on the platform's default
# integer width, so int64 code columns can be missed where `int` maps to a 32-bit
# dtype; checking the dtype kind works for every integer width.
discrete_features = X_MI.dtypes.map(pd.api.types.is_integer_dtype)

In [39]:
# make_mi_scores(X_MI, y, discrete_features)

***
# 4 Modeling

***
## 4.1 DummyRegressor

In [40]:
%%time
if config["modeling"]["dummy"]:

    model = DummyRegressor()

    grid_params = [
        {
            "model__strategy": ["mean", "median"]
        },
        {
            "model__strategy": ["quantile"],
            "model__quantile": np.arange(0, 1.1, 0.1),
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


***
## 4.2 LinearRegression

In [41]:
%%time
if config["modeling"]["linear_reg"]:

    model = LinearRegression()

    grid_params = [
        {
            "model__fit_intercept": [True]
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs


***
## 4.3 SupportVectorRegression

In [42]:
%%time
if config["modeling"]["svr"]:

    model = SVR()

    grid_params = [
        {
            "model__kernel": ["rbf"],
            "model__degree": [3],
            "model__gamma": ["scale"]
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.48 µs


***
## 4.4 DecisionTreeRegressor

In [43]:
%%time
if config["modeling"]["tree"]:

    model = DecisionTreeRegressor()

    grid_params = [
        {
            "model__random_state": [1],
            "model__max_depth": [2, 3, 4],
            "model__min_samples_leaf": range(1, 11, 1),
            "model__criterion": ["mse"]
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


***
## 4.5 RandomForestRegressor

In [44]:
%%time
if config["modeling"]["forest"]:

    model = RandomForestRegressor()

    grid_params = [
        {
            "model__random_state": [1],
            "model__n_estimators": [25],
            "model__min_samples_leaf": [1],
            "model__criterion": ["mse"]
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


***
## 4.6 XGBRegressor

In [50]:
%%time
if config["modeling"]["xgboost"]:

    model = XGBRegressor()

    grid_params = [
        {
            "model__random_state": [1],
            "model__max_depth": [3],
            "model__n_estimators": [100],
            "model__learning_rate": [0.1],
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params, scoring="neg_root_mean_squared_error")

    print(f"RMSE: {-score:.4}")
    print(f"Target mean value: {y_test.mean():.4}")
    print(f"Ratio: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

RMSE: 0.628
Target mean value: 0.633
Ratio: 99.2%
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}
CPU times: user 1min 19s, sys: 2.02 s, total: 1min 21s
Wall time: 7.39 s


MAE -2.683020e+05 / RATIO: 4.85%  
MAE -4.532064e+05 / RATIO: 8.192%  
MAE: -5.319727e-02 / RATIO: 8.403%  
MAE: -3.695645e-02 / RATIO: 6.101%  

RMSE: -1.186661e+06 / RATIO: 21.45%  
"remove_empty": True / RMSE: -1.186661e+06 / RATIO: 21.45%  
usefullness "remove": True / RMSE: -1.186661e+06 / RATIO: 21.45%  
"to_num": True / RMSE: -5.338139e+06 / RATIO: 96.49%  
"strip_and_lower": True / RMSE: -1.186661e+06 / RATIO: 21.45%  
"remove": True, / RMSE: -3.065516e+06 / RATIO: 57.91%  
"scale": True, / RMSE: -1.353043e-01 / RATIO: 21.37%  

In [46]:
# Summary statistics of the test-split target values.
y_test.describe()

count    674.000000
mean       0.633046
std        1.675907
min        0.000000
25%        0.104283
50%        0.205197
75%        0.497017
max       31.430914
Name: SiteEnergyUse(kBtu), dtype: float64

***
# 5 Results

## 5.1 Initial

The 2016 dataset is used without any modifications

MSE: -1.95016e+06 / RATIO: 34.2%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 41s, sys: 2.59 s, total: 1min 44s  
Wall time: 10.1 s

***
## 5.2 Feature conversion: TaxParcelIdentificationNumber

The feature 'TaxParcelIdentificationNumber' has been converted from 'object' type to 'float64'.  
We can see that this conversion has a huge negative impact on the accuracy of the model. This is because the feature contains values that are much bigger than the other features' values, and so takes on a large importance in the training.  
preparation:4.2.2

MSE: -5.31182e+06 / RATIO: 93.16%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 18s, sys: 1.75 s, total: 1min 19s  
Wall time: 7.63 s

***
## 5.3 Scaling

We have now scaled all the numerical values to the range 0 to 1. This lets the model give 'equal' weight to all the numerical features. We can see a slight improvement compared to the initial error ratio.  
preparation:4.3.3

MSE: -0.0021134 / RATIO: 32.39%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 27s, sys: 1.94 s, total: 1min 29s  
Wall time: 8.66 s

***
## 5.4 Categorical feature inconsistencies

All the categorical features have been stripped and lowercased (leading and trailing spaces removed, and all content converted to lower case).  
It appears that this process brought no improvement, most probably because there were few or no inconsistencies such as near-identical words differing only by an uppercase first letter, which would lead the model to treat them as two different values.  
preparation:4.2.3

MSE: -0.0021134 / RATIO: 32.39%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 27s, sys: 2.04 s, total: 1min 29s  
Wall time: 8.75 s  

***
## 5.5 Outliers removal

It appears that a few entries have many values that are outliers. They are so few that it is acceptable to remove them.  
preparation:4.3.2

MSE: -0.00144008 / RATIO: 24.59%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 29s, sys: 1.97 s, total: 1min 31s  
Wall time: 8.87 s

***
## 5. ...

...
preparation:...

...

***
# 6 ...