# 1 Dependency import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import set_config

from xgboost import XGBRegressor

In [76]:
def evaluate(model, X=None, y=None):
    """Cross-validate ``model`` behind the shared preprocessing steps and report its MAE.

    Numerical columns are mean-imputed. Categorical columns are imputed with the
    most frequent value and then one-hot encoded, ignoring categories that were
    not seen during fitting. The column lists are read from the notebook-level
    ``numerical_cols`` and ``categorical_cols``, which must exist before the
    first call.

    Parameters
    ----------
    model : estimator
        Regressor used as the final step of the pipeline.
    X, y : optional
        Features and target. When omitted, the notebook-level ``X`` and ``y``
        are looked up at call time.

    Returns
    -------
    float
        Mean absolute error averaged over the 5 cross-validation folds.
        The same value is printed.
    """
    # Defaults are resolved here rather than in the signature: `X=X, y=y` would be
    # evaluated when the `def` runs, which fails on a fresh kernel (X and y are
    # created later in the notebook) and would freeze a stale copy otherwise.
    if X is None:
        X = globals()["X"]
    if y is None:
        y = globals()["y"]

    numerical_transformer = SimpleImputer(strategy="mean")

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, numerical_cols),
            ("cat", categorical_transformer, categorical_cols)
        ]
    )

    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    # The scorer returns negated MAE (higher is better), so flip the sign back.
    fold_scores = cross_val_score(pipeline, X, y, cv=5, scoring="neg_mean_absolute_error")
    mae = -1 * fold_scores.mean()
    print(f"MAE: {mae}")
    return mae

In [91]:
def make_mi_scores(X, y, discrete_features, random_state=None):
    """Return the mutual information of each column of ``X`` with ``y``, highest first.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its column labels become the index of the result.
    y : array-like
        Regression target.
    discrete_features
        Forwarded unchanged to ``mutual_info_regression``.
    random_state : optional
        Forwarded to ``mutual_info_regression`` so that scores can be made
        reproducible. The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by column and sorted in descending order.
    """
    raw_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    labelled = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled.sort_values(ascending=False)

In [92]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the given scores with pyplot.

    The scores are sorted in ascending order before plotting, so the highest
    score is drawn at the highest bar position. Bars are labelled with the
    index of ``scores``.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(ordered.size)
    labels = ordered.index.tolist()
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

***
# 2 Loading data

In [2]:
# Load the cleaned 2015 dataset (comma-separated) from the path relative to the notebook.
data = pd.read_csv("data/2015-cleaned.csv", sep=",")

In [3]:
# Name of the column to predict. The commented-out line is an alternative
# target column that is currently not used.
#target = "GHGEmissionsIntensity(kgCO2e/ft2)"
target = "SiteEnergyUse(kBtu)"

In [4]:
# Split the table into the feature columns (everything except the target) and the target itself.
X = data.drop(columns=[target])
y = data[target]

In [5]:
# 80/20 train/test split with a fixed seed so the partition is reproducible.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [6]:
# Object-dtype columns with fewer than 10 distinct values are treated as categorical.
categorical_cols = [
    col
    for col in X_train_full.columns
    if X_train_full[col].dtype == "object" and X_train_full[col].nunique() < 10
]
# Columns stored as int64 or float64 are treated as numerical.
numerical_cols = [
    col
    for col in X_train_full.columns
    if X_train_full[col].dtype in ['int64', 'float64']
]

In [18]:
# Keep only the selected columns (categorical first, then numerical) in both splits.
# Copies are taken so later edits do not write back into the *_full frames.
my_cols = [*categorical_cols, *numerical_cols]
X_train = X_train_full.loc[:, my_cols].copy()
X_test = X_test_full.loc[:, my_cols].copy()

In [29]:
# Summary statistics of the held-out target; gives a sense of scale for the
# MAE values reported in the modelling section.
y_test.describe()

count    6.660000e+02
mean     4.606876e+06
std      1.055033e+07
min      0.000000e+00
25%      8.487488e+05
50%      1.619646e+06
75%      3.423317e+06
max      1.362414e+08
Name: SiteEnergyUse(kBtu), dtype: float64

***
# 3 Feature engineering

## 3.1 Mutual Information

In [30]:
# Work on a copy so that re-encoding columns for the mutual-information
# analysis does not modify X.
X_MI = X.copy()

In [31]:
# Replace every object-dtype column of X_MI with integer codes.
# factorize() returns (codes, uniques); only the codes are kept. Indexing with
# [0] instead of unpacking into `_` avoids rebinding `_`, which IPython uses
# for the last cell output.
for col in X_MI.select_dtypes("object"):
    X_MI[col] = X_MI[col].factorize()[0]

In [32]:
# Boolean mask (indexed by column) marking integer-typed columns as discrete.
# `X_MI.dtypes == int` compares against the platform default integer, which is
# not int64 everywhere (e.g. Windows with NumPy < 2), so int64 columns could be
# missed; is_integer_dtype accepts every integer width.
discrete_features = pd.Series(
    [pd.api.types.is_integer_dtype(dtype) for dtype in X_MI.dtypes],
    index=X_MI.columns,
    dtype=bool,
)

In [36]:
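# Disabled: the last run of this call raised the ValueError recorded below
# (presumably missing values remain in X_MI -- TODO impute or drop them first).
# make_mi_scores(X_MI, y, discrete_features)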

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

***
# 4 Modeling

***
## 4.1 DummyRegressor

### 4.1.1 Strategy 1

In [None]:
%%time

# Baseline: a regressor that always predicts the mean of the training target.
# No data is passed, so evaluate() falls back to its default X and y.
model = DummyRegressor(strategy="mean")
evaluate(model)

***
### 4.1.2 Strategy 2

In [None]:
%%time

# Baseline: a regressor that always predicts the median of the training target.
# No data is passed, so evaluate() falls back to its default X and y.
model = DummyRegressor(strategy="median")
evaluate(model)

***
### 4.1.3 Strategy 3

In [None]:
%%time

# Sweep the constant-quantile baseline over quantiles 0.0, 0.1, ..., 0.9.
for q in range(10):
    model = DummyRegressor(strategy="quantile", quantile=q / 10)
    evaluate(model)

***
## 4.2 DecisionTreeRegressor

### 4.2.1 Strategy 1

In [119]:
%%time

# Decision tree with default hyperparameters and a fixed seed.
# NOTE(review): this cell cross-validates on X_train / y_train, whereas cells
# that call evaluate(model) without data use evaluate's default X and y, so
# MAE values are not directly comparable between the two kinds of cell.
model = DecisionTreeRegressor(random_state=0)
evaluate(model, X_train, y_train)

MAE: 585487.5613508442
CPU times: user 820 ms, sys: 6.35 ms, total: 826 ms
Wall time: 853 ms


***
### 4.2.2 Strategy 2

In [78]:
%%time

# Sweep max_depth from 1 to 20; one MAE line is printed per depth, in order.
for md in range(1, 21):
    model = DecisionTreeRegressor(
        max_depth=md,
        random_state=0,
    )
    evaluate(model, X_train, y_train)

MAE: 4625734.660815676
MAE: 3286254.8231888013
MAE: 1824836.0651855927
MAE: 1525157.4801350944
MAE: 873516.3457854999
MAE: 765991.7467842886
MAE: 686762.2014245798
MAE: 619341.8451169787
MAE: 547388.4994035431
MAE: 393335.4635796078
MAE: 477485.715255011
MAE: 378991.0959711589
MAE: 496356.1426041438
MAE: 591320.2712276696
MAE: 442742.45513816585
MAE: 582215.6179572225
MAE: 517094.1495318698
MAE: 424031.2338662248
MAE: 591380.687778303
MAE: 538939.0062675466
CPU times: user 12.8 s, sys: 29.4 ms, total: 12.8 s
Wall time: 13 s


***
### 4.2.3 Strategy 3

In [114]:
%%time

# Sweep min_samples_leaf from 1 to 9; one MAE line is printed per value, in order.
for msl in range(1, 10):
    model = DecisionTreeRegressor(
        min_samples_leaf=msl,
        random_state=0,
    )
    evaluate(model)

MAE: 970592.4405405406
MAE: 585055.3392392393
MAE: 534005.7032582583
MAE: 986423.37002002
MAE: 622881.4031719815
MAE: 694501.724199405
MAE: 759987.6942498835
MAE: 840894.8128855936
MAE: 880936.8716241142
CPU times: user 7.72 s, sys: 3.2 ms, total: 7.72 s
Wall time: 7.84 s


***
### 4.2.4 Strategy 4

In [79]:
%%time

# Compare split criteria; one MAE line is printed per criterion, in order.
# NOTE(review): newer scikit-learn releases renamed "mse" / "mae" to
# "squared_error" / "absolute_error" -- confirm against the installed version.
for s in ("mse", "friedman_mse", "mae", "poisson"):
    model = DecisionTreeRegressor(
        criterion=s,
        random_state=0,
    )
    evaluate(model)

MAE: 970592.4405405406
MAE: 970592.4405405406
MAE: 661239.1747747747
MAE: 1110675.0091091092
CPU times: user 37.1 s, sys: 79.1 ms, total: 37.2 s
Wall time: 37.5 s


***
### 4.2.5 Strategy 5

In [None]:
%%time

# Combine the values that gave the lowest MAE in the min_samples_leaf sweep (3)
# and the criterion sweep ("mae"), according to the outputs recorded above.
model = DecisionTreeRegressor(min_samples_leaf=3, criterion="mae", random_state=0)
evaluate(model)

***
## 4.3 RandomForestRegressor

### 4.3.1 Strategy 1

In [None]:
%%time

# Random forest with default hyperparameters and a fixed seed.
# No data is passed, so evaluate() falls back to its default X and y.
model = RandomForestRegressor(random_state=1)
evaluate(model)

***
### 4.3.2 Strategy 2

In [None]:
%%time

# Sweep the number of trees over 50, 100, 150 and 200.
# The forest must be bound to `model`: previously it was constructed and
# discarded, so every iteration re-evaluated whatever `model` an earlier cell
# had left behind.
for ne in range(50, 250, 50):
    model = RandomForestRegressor(n_estimators=ne, random_state=1)
    evaluate(model)

***
### 4.3.3 Strategy 3

In [None]:
%%time

# Compare split criteria for the random forest.
# The forest must be bound to `model`: previously it was constructed and
# discarded, so every iteration re-evaluated whatever `model` an earlier cell
# had left behind.
# NOTE(review): newer scikit-learn releases renamed "mse" / "mae" to
# "squared_error" / "absolute_error" -- confirm against the installed version.
for c in ["mse", "mae", "poisson"]:
    model = RandomForestRegressor(criterion=c, random_state=1)
    evaluate(model)

***
### 4.3.4 Strategy 4

In [None]:
%%time

# Sweep min_samples_leaf over 5, 10, 15 and 20.
# The forest must be bound to `model`: previously it was constructed and
# discarded, so every iteration re-evaluated whatever `model` an earlier cell
# had left behind.
for msl in range(5, 25, 5):
    model = RandomForestRegressor(min_samples_leaf=msl, random_state=1)
    evaluate(model)

***
## 4.4 XGBRegressor

### 4.4.1 Strategy 1

In [None]:
%%time

# Gradient-boosted trees with default hyperparameters and a fixed seed.
# No data is passed, so evaluate() falls back to its default X and y.
model = XGBRegressor(random_state=1)
evaluate(model)

***
### 4.4.2 Strategy 2

In [None]:
%%time

# Sweep the number of boosting rounds; one MAE line is printed per value, in order.
for ne in (100, 200, 500, 1000):
    model = XGBRegressor(
        n_estimators=ne,
        random_state=1,
    )
    evaluate(model)

***
### 4.4.3 Strategy 3

In [None]:
%%time

# Sweep the learning rate with 200 boosting rounds, printing each value before its MAE.
for lr in (0.05, 0.075, 0.09, 0.1, 0.11, 0.12, 0.15):
    print(f"learning_rate:  {lr}")
    model = XGBRegressor(
        learning_rate=lr,
        n_estimators=200,
        random_state=1,
        verbosity=0,
    )
    evaluate(model)

***
### 4.4.4 Strategy 4

In [None]:
%%time

# Sweep the tree depth with learning_rate=0.09 and 200 boosting rounds,
# printing each depth before its MAE.
for md in (2, 3, 4, 5):
    print(f"max_depth:  {md}")
    model = XGBRegressor(
        max_depth=md,
        learning_rate=0.09,
        n_estimators=200,
        random_state=1,
        verbosity=0,
    )
    evaluate(model)

***
# 5 Hyperparameter search (GridSearchCV)

In [68]:
# Render estimators as diagrams while the pipeline is being assembled.
set_config(display="diagram")

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode, ignoring categories that were not seen during fitting.
categorical_transformer_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route numerical columns to a mean imputer and categorical columns to the branch above.
column_transformer = ColumnTransformer(
    transformers=[
        ("num_imputer", SimpleImputer(strategy="mean"), numerical_cols),
        ("cat_transformer_pipeline", categorical_transformer_pipeline, categorical_cols),
    ]
)

# Full estimator: preprocessing followed by a decision tree whose
# hyperparameters are addressed through the "model__" prefix.
pipeline = Pipeline(
    steps=[
        ("transforms", column_transformer),
        ("model", DecisionTreeRegressor()),
    ]
)

In [69]:
# Show the pipeline that was just assembled. `gs` is only created by the grid
# search further down, so displaying it here raised NameError on a fresh
# kernel (Restart & Run All).
display(pipeline)

In [70]:
# pipeline.get_params()

In [71]:
# Search space for the "model" step of the pipeline: a single grid covering
# tree depth 10..100 (step 10), leaf size 1..10 and four split criteria, with
# the seed pinned to 1.
# NOTE(review): newer scikit-learn releases renamed "mse" / "mae" to
# "squared_error" / "absolute_error" -- confirm against the installed version.
grid_params = [{
    "model__random_state": [1],
    "model__max_depth": range(10, 110, 10),
    "model__min_samples_leaf": range(1, 11),
    "model__criterion": ["mse", "friedman_mse", "mae", "poisson"],
}]

In [72]:
# Exhaustive search over grid_params, ranking candidates by R^2 on the training split.
# fit() returns the fitted search object, so rebinding `gs` keeps the cell output-free.
gs = GridSearchCV(estimator=pipeline, param_grid=grid_params, scoring="r2")
gs = gs.fit(X_train, y_train)

In [73]:
# Switch estimator rendering back to plain text.
set_config(display="text")

# Score the fitted search on the held-out test split. The search was created
# with scoring="r2", so this value is an R^2 score rather than an MAE.
gs.score(X_test, y_test)

0.993102355822537

In [74]:
# Hyperparameter combination selected by the grid search.
gs.best_params_

{'model__criterion': 'mse',
 'model__max_depth': 10,
 'model__min_samples_leaf': 1,
 'model__random_state': 1}