# 0 Configuration

In [1]:
# Notebook-wide switches.
# - "feature_eng": currently empty; no cell visible here reads it.
# - "modeling": one flag per model; each flag gates the matching cell in section 4
#   (`if config["modeling"][...]`). Set a flag to True to run that model's grid search.
config = dict(
    feature_eng={},
    modeling=dict(
        dummy=False,
        linear_reg=False,
        svr=False,
        tree=False,
        forest=False,
        xgboost=False,
    ),
)

***
# 1 Dependency import

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import set_config

import xgboost
from xgboost import XGBRegressor

import eli5
from eli5.sklearn import PermutationImportance
import shap

# plt.style.use(["dark_background"])
# Lift pandas' display limits so DataFrames are rendered with every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Seed NumPy's global random generator.
np.random.seed(seed=0)

In [3]:
def get_categorical_features_name(dataset, split_by_unique_count=True, split_count=10):
    """Return the names of the object/bool columns of `dataset`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are inspected.
    split_by_unique_count : bool, default True
        When True, split the names into two groups by number of distinct values.
    split_count : int, default 10
        Largest number of distinct values (as counted by `Series.nunique`) for a
        column to land in the first group.

    Returns
    -------
    tuple(list, list) or numpy.ndarray
        With `split_by_unique_count` set: `(low, high)` where `low` holds the columns
        with at most `split_count` distinct values (in frame order) and `high` holds
        the remaining ones as produced by `Index.difference`.
        Otherwise: the array of all object/bool column names.
    """
    categorical_columns = dataset.select_dtypes(["object", "bool"]).columns

    if not split_by_unique_count:
        return categorical_columns.values

    low_cardinality = []
    for column in categorical_columns:
        if dataset[column].nunique() <= split_count:
            low_cardinality.append(column)

    # Everything that is categorical but not low-cardinality.
    high_cardinality = categorical_columns.difference(low_cardinality).tolist()
    return (low_cardinality, high_cardinality)

In [4]:
def get_numerical_features_name(dataset):
    """Return, as a list, the names of the int64/float64 columns of `dataset` in frame order."""
    numeric_frame = dataset.select_dtypes(include=["int64", "float64"])
    return numeric_frame.columns.tolist()

In [5]:
def _one_hot_feature_names(fitted_preprocessor, transformer_name, input_columns):
    """Return the one-hot output names of one categorical group, or [] when the group is empty.

    A group with no input columns has nothing to encode; its encoder is not fitted by the
    ColumnTransformer, so asking it for feature names would raise instead of returning [].
    """
    if len(input_columns) == 0:
        return []
    encoder = fitted_preprocessor.named_transformers_[transformer_name]["one_hot_encoder"]
    return encoder.get_feature_names_out(input_columns).tolist()


def evaluate(model, grid_params, dataset, target, scoring="neg_root_mean_squared_error"):
    """Grid-search `model` inside a preprocessing pipeline and score it on a held-out split.

    The dataset is split 80/20 (random_state=1). Numerical columns are mean-imputed;
    categorical columns (low- and high-cardinality groups, as returned by
    `get_categorical_features_name`) are imputed with the most frequent value and one-hot
    encoded. Columns belonging to neither group are dropped by the ColumnTransformer's
    default remainder handling.

    Parameters
    ----------
    model : estimator
        Estimator placed as the "model" step of the pipeline.
    grid_params : dict or list of dict
        Parameter grid for GridSearchCV; model parameters use the "model__" prefix.
    dataset : pandas.DataFrame
        Features plus the target column.
    target : str
        Name of the target column in `dataset`.
    scoring : str, default "neg_root_mean_squared_error"
        Scoring passed to GridSearchCV (also used by `gs.score`).

    Returns
    -------
    tuple
        `(score, ratio, y_test, gs, best_model, feature_names)` where
        - `score` is `gs.score` on the test split (negated RMSE with the default scoring),
        - `ratio` is `abs(100 * score / y_test.mean())`,
        - `y_test` is the held-out target Series,
        - `gs` is the fitted GridSearchCV,
        - `best_model` is the "model" step of the refitted best pipeline,
        - `feature_names` lists the post-preprocessing column names
          (numerical, then low-cardinality one-hot, then high-cardinality one-hot).
    """
    set_config(display="diagram") # display="text" -> for textual output

    ### DATASET PREPARATION ###

    y = dataset[target].copy()
    X = dataset.drop(columns=[target]).copy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

    # Column groups are derived from the training split only.
    categorical_cols_less_unique, categorical_cols_lot_unique = get_categorical_features_name(X_train)
    numerical_cols = get_numerical_features_name(X_train)

    ### PIPELINE CONSTRUCTION ###

    num_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="mean"))
    ])

    cat_less_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    cat_lot_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num_pipe", num_pipe, numerical_cols),
        ("cat_less_unique_pipe", cat_less_unique_pipe, categorical_cols_less_unique),
        ("cat_lot_unique_pipe", cat_lot_unique_pipe, categorical_cols_lot_unique)
    ])

    pipeline = Pipeline([
        ("transforms", preprocessor),
        ("model", model)
    ])

    ### GRIDSEARCH DECLARATION AND FITTING ###

    gs = GridSearchCV(pipeline, grid_params, scoring=scoring, refit=True)
    gs.fit(X_train, y_train)

    ### MOST COMMONLY USED METRICS CALCULATION ###

    score = gs.score(X_test, y_test)
    # Score expressed as a percentage of the mean test target, sign removed.
    ratio = np.abs((score * 100) / y_test.mean())

    ### FEATURE IMPORTANCE OUTPUT ###

    best_model = gs.best_estimator_.named_steps["model"]
    fitted_preprocessor = gs.best_estimator_.named_steps["transforms"]
    # Look the encoders up by transformer name rather than by position in `transformers_`.
    less_unique_names = _one_hot_feature_names(
        fitted_preprocessor, "cat_less_unique_pipe", categorical_cols_less_unique
    )
    lot_unique_names = _one_hot_feature_names(
        fitted_preprocessor, "cat_lot_unique_pipe", categorical_cols_lot_unique
    )
    feature_names = numerical_cols + less_unique_names + lot_unique_names

    return score, ratio, y_test, gs, best_model, feature_names

In [6]:
def make_mi_scores(X, y, discrete_features):
    """Compute mutual-information scores of the columns of `X` against `y`.

    `discrete_features` is forwarded unchanged to `mutual_info_regression`.
    Returns a Series named "MI Scores", indexed by `X.columns` and sorted from the
    highest score to the lowest.
    """
    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    labelled_scores = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled_scores.sort_values(ascending=False)

In [7]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of `scores` (a Series) on the current matplotlib figure.

    Bars are plotted in ascending score order and labelled with the Series index.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

***
# 2 Loading data

In [8]:
# Load the cleaned dataset (comma-separated) from the path relative to the notebook.
data = pd.read_csv("data/data-cleaned.csv", sep=",")

In [9]:
# Remove the "Unnamed: 0" column (presumably the index saved with the CSV — confirm).
# Rebinding instead of inplace=True, with errors="ignore", keeps this cell safe to re-run:
# the in-place version raised KeyError on a second execution.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [10]:
# Column names of the two candidate regression targets. Only target_1 is passed to
# evaluate() in the cells visible here.
target_1, target_2 = "SiteEnergyUse(kBtu)", "GHGEmissionsIntensity(kgCO2e/ft2)"

In [11]:
# y = data[target]
# X = data.drop(columns=[target])

In [12]:
# X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [13]:
# categorical_cols_less_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
# categorical_cols_lot_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() >= 10 and X_train_full[cname].dtype == "object"]
# numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [14]:
# my_cols = categorical_cols_lot_unique + categorical_cols_less_unique + numerical_cols
# X_train = X_train_full[my_cols].copy()
# X_test = X_test_full[my_cols].copy()

In [15]:
# y_test.describe()

***
# 3 Feature engineering

## 3.1 Mutual Information

In [16]:
# X_MI = X.copy()

In [17]:
# X_MI["DefaultData"] = X_MI["DefaultData"].astype("object")

In [18]:
# X_MI_NUM = X_MI.select_dtypes(["int64", "float64"]).columns
# X_MI_CAT = X_MI.select_dtypes(["object"]).columns

In [19]:
# discrete_features = X_MI.dtypes == object

In [20]:
# impt_num = SimpleImputer(strategy="mean")
# X_MI[X_MI_NUM] = pd.DataFrame(impt_num.fit_transform(X_MI[X_MI_NUM]))

In [21]:
# impt_cat = SimpleImputer(strategy="most_frequent")
# X_MI[X_MI_CAT] = pd.DataFrame(impt_cat.fit_transform(X_MI[X_MI_CAT]))

In [22]:
# for col in X_MI_CAT:
#     X_MI[col], _ = X_MI[col].factorize()

In [23]:
# scores = make_mi_scores(X_MI, y, discrete_features).sort_values(ascending=False)
# scores.head()

In [24]:
# def plot_mi_scores(scores):
#     plt.figure(dpi=100, figsize=(8, 5))
#     scores = scores.sort_values(ascending=True)
#     width = np.arange(len(scores))
#     ticks = list(scores.index)
#     plt.barh(width, scores)
#     plt.yticks(width, ticks)
#     plt.title("Mutual Information Scores")

# plot_mi_scores(scores)

***
# 4 Modeling

***
## 4.1 DummyRegressor

In [25]:
%%time
if config["modeling"]["dummy"]:

    # Baseline: constant predictions (mean, median, or a quantile of the training target).
    model = DummyRegressor()

    grid_params = [
        {
            "model__strategy": ["mean", "median"]
        },
        {
            "model__strategy": ["quantile"],
            "model__quantile": np.arange(0, 1.1, 0.1),
        }
    ]

    # evaluate() requires the dataset and the target column, and returns six values.
    score, ratio, target, gs, best_model, feature_names = evaluate(model, grid_params, data, target_1)

    # With the default scoring, `score` is the negated RMSE: flip the sign for display.
    print(f"RMSE: {-score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


***
## 4.2 LinearRegression

In [26]:
%%time
if config["modeling"]["linear_reg"]:

    model = LinearRegression()

    grid_params = [
        {
            "model__fit_intercept": [True]
        }
    ]

    # evaluate() requires the dataset and the target column, and returns six values.
    score, ratio, target, gs, best_model, feature_names = evaluate(model, grid_params, data, target_1)

    # With the default scoring, `score` is the negated RMSE: flip the sign for display.
    print(f"RMSE: {-score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


***
## 4.3 SupportVectorRegression

In [27]:
%%time
if config["modeling"]["svr"]:

    model = SVR()

    grid_params = [
        {
            "model__kernel": ["rbf"],
            # NOTE(review): `degree` only affects the "poly" kernel; it has no effect with "rbf".
            "model__degree": [3],
            "model__gamma": ["scale"]
        }
    ]

    # evaluate() requires the dataset and the target column, and returns six values.
    score, ratio, target, gs, best_model, feature_names = evaluate(model, grid_params, data, target_1)

    # With the default scoring, `score` is the negated RMSE: flip the sign for display.
    print(f"RMSE: {-score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


***
## 4.4 DecisionTreeRegressor

In [28]:
%%time
if config["modeling"]["tree"]:

    model = DecisionTreeRegressor()

    grid_params = [
        {
            "model__random_state": [1],
            "model__max_depth": [2, 3, 4],
            "model__min_samples_leaf": range(1, 11, 1),
            # "mse" was renamed "squared_error" in scikit-learn 1.0 and removed in 1.2.
            "model__criterion": ["squared_error"]
        }
    ]

    # evaluate() requires the dataset and the target column, and returns six values.
    score, ratio, target, gs, best_model, feature_names = evaluate(model, grid_params, data, target_1)

    # With the default scoring, `score` is the negated RMSE: flip the sign for display.
    print(f"RMSE: {-score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


***
## 4.5 RandomForestRegressor

In [29]:
%%time
if config["modeling"]["forest"]:

    model = RandomForestRegressor()

    grid_params = [
        {
            "model__random_state": [1],
            "model__n_estimators": [25],
            "model__min_samples_leaf": [1],
            # "mse" was renamed "squared_error" in scikit-learn 1.0 and removed in 1.2.
            "model__criterion": ["squared_error"]
        }
    ]

    # evaluate() requires the dataset and the target column, and returns six values.
    score, ratio, target, gs, best_model, feature_names = evaluate(model, grid_params, data, target_1)

    # With the default scoring, `score` is the negated RMSE: flip the sign for display.
    print(f"RMSE: {-score:n} / RATIO: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


***
## 4.6 XGBRegressor

In [30]:
%%time
if config["modeling"]["xgboost"]:

    model = XGBRegressor()

    grid_params = [
        {
            "model__random_state": [1],
            "model__max_depth": [3],
            "model__n_estimators": [100],
            "model__learning_rate": [0.1],
        }
    ]

    # evaluate() returns exactly six values; `target` is the held-out target Series.
    score, ratio, target, gs, best_model, feature_names = evaluate(model, grid_params, data, target_1)

    # With the default scoring, `score` is the negated RMSE: flip the sign for display.
    print(f"RMSE: {-score:.4}")
    # `ratio` is computed against the test-split mean, so report that same mean here.
    print(f"Target mean value: {target.mean():.4}")
    print(f"Ratio: {ratio:.4}%")
    print(f"best_params: {gs.best_params_}")

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


***
## 4.7 Permutation, PDP, SHAP

In [31]:
# Fit the XGBoost pipeline unconditionally (no config flag): the cells below use
# `best_model` and `feature_names` produced here.
model = XGBRegressor()

# Single-point grid; the "model__" prefix targets the "model" step of evaluate()'s pipeline.
grid_params = [
    dict(
        model__random_state=[1],
        model__max_depth=[3],
        model__n_estimators=[100],
        model__learning_rate=[0.1],
    )
]

score, ratio, target, gs, best_model, feature_names = evaluate(
    model, grid_params, dataset=data, target=target_1
)

# `score` is negated before display; `target` is the held-out target returned by evaluate().
print(
    f"RMSE: {-score:.4}",
    f"Target mean value: {target.mean():.4}",
    f"Ratio: {ratio:.4}%",
    f"best_params: {gs.best_params_}",
    sep="\n",
)

RMSE: 0.1439
Target mean value: 0.5656
Ratio: 25.43%
best_params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__random_state': 1}


In [36]:
# Display eli5's weight table for the fitted model step returned by evaluate(), labelled
# with the post-preprocessing feature names that evaluate() built.
eli5.show_weights(best_model, feature_names=feature_names)

Weight,Feature
0.5249,Electricity(kWh)
0.1684,TotalGHGEmissions
0.1458,NumberofFloors
0.0816,NumberofBuildings
0.0687,OSEBuildingID
0.0023,NaturalGas(kBtu)
0.0017,SecondLargestPropertyUseTypeGFA
0.0010,GHGEmissionsIntensity
0.0009,TaxParcelIdentificationNumber
0.0006,Address_14027 lake city way ne


***
### 4.7.1 Permutation Importance

In [None]:
# Rebuild the held-out rows that evaluate() scored on: `target` is the test-split Series it
# returned, so its index identifies the matching rows of `data`.
y_test = target
X_test = data.drop(columns=[target_1]).loc[y_test.index]

# Apply the fitted preprocessing so the columns line up with `feature_names`.
X_test_preproc = gs.best_estimator_["transforms"].transform(X_test)
if hasattr(X_test_preproc, "toarray"):
    # NOTE(review): densified on the assumption that eli5 expects a dense array — confirm.
    X_test_preproc = X_test_preproc.toarray()

# Use the fitted model step (`best_model`); `model` is only the unfitted template that
# GridSearchCV cloned.
perm = PermutationImportance(best_model, random_state=1).fit(X_test_preproc, y_test)

***
### 4.7.3 SHAP

In [None]:
# num_impt = SimpleImputer(strategy="mean")
# X_train_num = pd.DataFrame(data=num_impt.fit_transform(X_train[numerical_cols]), columns=numerical_cols)

# cat_impt = SimpleImputer(strategy="most_frequent")
# X_train_cat = pd.DataFrame(data=cat_impt.fit_transform(X_train[categorical_cols_less_unique + categorical_cols_lot_unique]), columns=categorical_cols_less_unique + categorical_cols_lot_unique)

# ohe = OneHotEncoder(handle_unknown="ignore")
# X_train_cat = pd.DataFrame(data=ohe.fit_transform(X_train_cat).toarray())

# X = X_train_num.join([X_train_cat])

In [None]:
# # model = xgboost.XGBRegressor().fit(X, y_train)

# model = gs.best_estimator_["model"]

# explainer = shap.Explainer(model)
# shap_values = explainer(df_preproc)

In [None]:
# shap.plots.waterfall(shap_values[0])

***
## 4.6.1 ...

In [None]:
# set_config(display="text")

In [None]:
# fi = gs.best_estimator_.named_steps["model"].feature_importances_
# fi

In [None]:
# hd = list(X_train.columns)
# for i, f in zip(hd, fi):
#      print(i,round(f*100,2))

In [None]:
# d_train = xgboost.DMatrix(X_train, label=y_train)
# d_test = xgboost.DMatrix(X_test, label=y_test)



# params = {
#     "eta": 0.01,
#     "objective": "binary:logistic",
#     "subsample": 0.5,
#     "base_score": np.mean(y_train),
#     "eval_metric": "logloss"
# }

# model = xgboost.train(params, d_train, 5000, evals = [(d_test, "test")], verbose_eval=100, early_stopping_rounds=20)

***
# 5 ...

...

***
# 6 ...

transformer la cible avec log -> cela donne des résultats aberrants  
analyser les résultats des modèles (xgboost) : quels sont les points faibles ? quelles sont les features les plus parlantes ? ---> feature engineering
...