# 0 Configuration

In [1519]:
config = {
    "feature_eng": {

    },
    "modeling": {
        "dummy": False,
        "linear_reg": False,
        "svr": False,
        "tree": False,
        "forest": False,
        "xgboost": True
    }
}

***
# 1 Dependency import

In [1520]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import set_config

import xgboost
from xgboost import XGBRegressor

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

np.random.seed(0)

In [1521]:
def evaluate(model, grid_params, scoring="neg_root_mean_squared_error"):
    set_config(display="diagram")
    # set_config(display="text")

    num_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="mean"))
    ])

    cat_less_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    cat_lot_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num_pipe", num_pipe, numerical_cols),
        ("cat_less_unique_pipe", cat_less_unique_pipe, categorical_cols_less_unique),
        ("cat_lot_unique_pipe", cat_lot_unique_pipe, categorical_cols_lot_unique)
    ])

    pipeline = Pipeline([
        ("transforms", preprocessor),
        ("model", model)
    ])

    gs = GridSearchCV(pipeline, grid_params, scoring=scoring).fit(X_train, y_train)

    score = gs.score(X_test, y_test)
    ratio = (score * 100) / y_test.mean()
    ratio = np.abs(ratio)

    return score, ratio, gs, pipeline

In [1522]:
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [1523]:
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

***
# 2 Loading data

In [1524]:
data = pd.read_csv("data/data-cleaned.csv", delimiter=",")

In [1525]:
#target = "GHGEmissionsIntensity(kgCO2e/ft2)"
target = "SiteEnergyUse(kBtu)"

In [1526]:
y = data[target]
X = data.drop(columns=[target])

In [1527]:
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [1528]:
categorical_cols_less_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
categorical_cols_lot_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() >= 10 and X_train_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [1529]:
my_cols = categorical_cols_lot_unique + categorical_cols_less_unique + numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [1530]:
y_test.describe()

count    661.000000
mean       0.565636
std        1.035041
min        0.000000
25%        0.113502
50%        0.221829
75%        0.582242
max       13.118825
Name: SiteEnergyUse(kBtu), dtype: float64

***
# 3 Feature engineering

## 3.1 Mutual Information

In [1531]:
X_MI = X.copy()

In [1532]:
X_MI["DefaultData"] = X_MI["DefaultData"].astype("object")

In [1533]:
X_MI_NUM = X_MI.select_dtypes(["int64", "float64"]).columns
X_MI_CAT = X_MI.select_dtypes(["object"]).columns

In [1534]:
discrete_features = X_MI.dtypes == object

In [1535]:
# impt_num = SimpleImputer(strategy="mean")
# X_MI[X_MI_NUM] = pd.DataFrame(impt_num.fit_transform(X_MI[X_MI_NUM]))

In [1536]:
# impt_cat = SimpleImputer(strategy="most_frequent")
# X_MI[X_MI_CAT] = pd.DataFrame(impt_cat.fit_transform(X_MI[X_MI_CAT]))

In [1537]:
# for col in X_MI_CAT:
#     X_MI[col], _ = X_MI[col].factorize()

In [1538]:
# scores = make_mi_scores(X_MI, y, discrete_features).sort_values(ascending=False)
# scores.head()

In [1539]:
# def plot_mi_scores(scores):
#     plt.figure(dpi=100, figsize=(8, 5))
#     scores = scores.sort_values(ascending=True)
#     width = np.arange(len(scores))
#     ticks = list(scores.index)
#     plt.barh(width, scores)
#     plt.yticks(width, ticks)
#     plt.title("Mutual Information Scores")

# plot_mi_scores(scores)

***
# 4 Modeling

***
## 4.1 DummyRegressor

In [1540]:
%%time
if config["modeling"]["dummy"]:

    model = DummyRegressor()

    grid_params = [
        {
            "model__strategy": ["mean", "median"]
        },
        {
            "model__strategy": ["quantile"],
            "model__quantile": np.arange(0, 1.1, 0.1),
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.11 µs


***
## 4.2 LinearRegression

In [1541]:
%%time
if config["modeling"]["linear_reg"]:

    model = LinearRegression()

    grid_params = [
        {
            "model__fit_intercept": [True]
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 7.63 µs


***
## 4.3 SupportVectorRegression

In [1542]:
%%time
if config["modeling"]["svr"]:

    model = SVR()

    grid_params = [
        {
            "model__kernel": ["rbf"],
            "model__degree": [3],
            "model__gamma": ["scale"]
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


***
## 4.4 DecisionTreeRegressor

In [1543]:
%%time
if config["modeling"]["tree"]:

    model = DecisionTreeRegressor()

    grid_params = [
        {
            "model__random_state": [1],
            "model__max_depth": [2, 3, 4],
            "model__min_samples_leaf": range(1, 11, 1),
            "model__criterion": ["mse"]
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 14.3 µs


***
## 4.5 RandomForestRegressor

In [1544]:
%%time
if config["modeling"]["forest"]:

    model = RandomForestRegressor()

    grid_params = [
        {
            "model__random_state": [1],
            "model__n_estimators": [25],
            "model__min_samples_leaf": [1],
            "model__criterion": ["mse"]
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params)

    print(f"RMSE: {score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.82 µs


***
## 4.6 XGBRegressor

In [1545]:
%%time
if config["modeling"]["xgboost"]:

    model = XGBRegressor()

    grid_params = [
        {
            "model__random_state": [1],
            "model__max_depth": [3],
            "model__n_estimators": [100],
            "model__learning_rate": [0.1],
        }
    ]

    score, ratio, gs, pipeline = evaluate(model, grid_params, scoring="neg_root_mean_squared_error")

    print(f"RMSE: {-score:.4}")
    print(f"Target mean value: {y_test.mean():.4}")
    print(f"Ratio: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

RMSE: 0.1439
Target mean value: 0.5656
Ratio: 25.43%
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}
CPU times: user 1min 24s, sys: 1.94 s, total: 1min 26s
Wall time: 8.45 s


In [1546]:
set_config(display="text")

In [1547]:
fi = gs.best_estimator_.named_steps["model"].feature_importances_
fi

array([0.06871989, 0.        , 0.00059661, ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [1550]:
hd = list(X_train.columns)
for i, f in zip(hd, fi):
     print(i,round(f*100,2))

PrimaryPropertyType 6.87
PropertyName 0.0
Address 0.06
Neighborhood 0.09
ListOfAllPropertyUseTypes 0.0
LargestPropertyUseType 0.06
SecondLargestPropertyUseType 0.02
ThirdLargestPropertyUseType 0.02
BuildingType 8.16
ComplianceStatus 14.58
Unnamed: 0 0.06
OSEBuildingID 0.0
ZipCode 0.03
TaxParcelIdentificationNumber 0.0
CouncilDistrictCode 0.17
Latitude 0.0
Longitude 0.0
YearBuilt 0.04
NumberofBuildings 0.03
NumberofFloors 0.02
PropertyGFATotal 52.49
PropertyGFAParking 0.23
PropertyGFABuilding(s) 16.84
LargestPropertyUseTypeGFA 0.1
SecondLargestPropertyUseTypeGFA 0.0
ThirdLargestPropertyUseTypeGFA 0.0
ENERGYSTARScore 0.0
SiteEUI(kBtu/sf) 0.0
SourceEUI(kBtu/sf) 0.0
SteamUse(kBtu) 0.0
Electricity(kWh) 0.0
NaturalGas(kBtu) 0.0
TotalGHGEmissions 0.0
GHGEmissionsIntensity 0.0


In [1548]:
# d_train = xgboost.DMatrix(X_train, label=y_train)
# d_test = xgboost.DMatrix(X_test, label=y_test)



# params = {
#     "eta": 0.01,
#     "objective": "binary:logistic",
#     "subsample": 0.5,
#     "base_score": np.mean(y_train),
#     "eval_metric": "logloss"
# }

# model = xgboost.train(params, d_train, 5000, evals = [(d_test, "test")], verbose_eval=100, early_stopping_rounds=20)

***
# 5 Results

## 5.1 Initial

The 2016 dataset is used without any modifications

MSE: -1.95016e+06 / RATIO: 34.2%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 41s, sys: 2.59 s, total: 1min 44s  
Wall time: 10.1 s

***
## 5.2 Feature conversion: TaxParcelIdentificationNumber

The feature 'TaxParcelIdentificationNumber' has been converted from 'object' type to 'float64'
We can see that this convertion has a huge negative impact on the accuracy of the model. This is because the feature contain value that are much bigger than the others feature's values, and so, take a big importance in the training.
preparation:4.2.2

MSE: -5.31182e+06 / RATIO: 93.16%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 18s, sys: 1.75 s, total: 1min 19s  
Wall time: 7.63 s

***
## 5.3 Scaling

We have now scale all the numerical values from 0 to 1. It lead to let the model process 'equally' in weight all the numericals features. We can see a slight improvement compare to the initial error ratio.  
preparation:4.3.3

MSE: -0.0021134 / RATIO: 32.39%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 27s, sys: 1.94 s, total: 1min 29s  
Wall time: 8.66 s

***
## 5.4 Categoricals features inconsistencies

All the categoricals features has been stripped an lowerred (removal of leading an tailing spaces and lower all the content).  
It appears that no improvement has been done by this process. Most probably due to no or little amount of inconsistencies like several words almost identical but with first letter in uppercase, leading the model to think this is two different values.  
preparation:4.2.3

MSE: -0.0021134 / RATIO: 32.39%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 27s, sys: 2.04 s, total: 1min 29s  
Wall time: 8.75 s  

***
## 5.5 Outliers removal

It appears that few entries have a lot of value that are outliers. So few that it's acceptable to remove them.  
preparation:4.3.2

MSE: -0.00144008 / RATIO: 24.59%  
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}  
CPU times: user 1min 29s, sys: 1.97 s, total: 1min 31s  
Wall time: 8.87 s

***
## 5. ...

...
preparation:...

...

***
# 6 ...

transformer la cible avec log -> cela donne des résultats aberrants  
analyser les résultats des modèles (xgboost): quel sont les points faibles ? quelles sont les features les plus parlantes ? ---> feature engineering
...