# 1 Dependency import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [None]:
def evaluate(model, *, cv=5):
    """Cross-validate ``model`` behind the shared preprocessing pipeline.

    Numerical columns are mean-imputed; categorical columns are imputed with
    their most frequent value and then one-hot encoded, ignoring categories
    that were not seen while fitting. Preprocessing and estimator live in one
    ``Pipeline`` that is handed to ``cross_val_score`` as a whole.

    Relies on the notebook-level globals ``X``, ``y``, ``numerical_cols`` and
    ``categorical_cols``; they must be defined before the first call.

    Args:
        model: Estimator used as the final step of the pipeline.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        The mean absolute error averaged over the folds. The same value is
        printed as ``MAE: <value>``.
    """
    numerical_transformer = SimpleImputer(strategy="mean")

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, numerical_cols),
            ("cat", categorical_transformer, categorical_cols),
        ]
    )

    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ])

    # The scorer is "neg_mean_absolute_error", so flip the sign to get the MAE.
    fold_scores = cross_val_score(
        pipeline, X, y, cv=cv, scoring="neg_mean_absolute_error"
    )
    mae = -1 * fold_scores.mean()
    print(f"MAE: {mae}")
    # Return the score as well so callers can collect and compare results.
    return mae

***
# 2 Loading data

In [None]:
# Read the comma-separated input file into a DataFrame.
data = pd.read_csv("data/2015-cleaned.csv", sep=",")

In [None]:
# Name of the column that is split off as the regression target `y`.
target = "SiteEnergyUse(kBtu)"

In [None]:
# Split the frame into the feature matrix (every column except the target)
# and the target series.
X = data.drop(target, axis="columns")
y = data.loc[:, target]

In [None]:
# 80/20 train/test split; the fixed random_state makes the split repeatable.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Object-dtype columns with fewer than 10 distinct values in the training
# split; these are the ones routed to the one-hot encoder in evaluate().
categorical_cols = [
    cname
    for cname, column in X_train_full.items()
    if column.nunique() < 10 and column.dtype == "object"
]

In [None]:
# Columns stored as int64 or float64; these are mean-imputed in evaluate().
numerical_cols = [
    cname
    for cname, column in X_train_full.items()
    if column.dtype in ("int64", "float64")
]

In [None]:
# Restrict both splits to the selected columns (categorical first, then
# numerical) and copy them so they are independent of the full frames.
# NOTE(review): evaluate() cross-validates on the full X / y, so X_train and
# X_test are not used by the modeling cells visible in this notebook.
my_cols = [*categorical_cols, *numerical_cols]
X_train = X_train_full.loc[:, my_cols].copy()
X_test = X_test_full.loc[:, my_cols].copy()

In [None]:
# Display summary statistics of the held-out target values.
y_test.describe()

***
# 3 Feature engineering

***
# 4 Modeling

***
## 4.1 DummyRegressor

### 4.1.1 Strategy 1

In [None]:
%%time

# Baseline that predicts a constant using the "mean" strategy.
model = DummyRegressor(strategy="mean")
evaluate(model)

***
### 4.1.2 Strategy 2

In [None]:
%%time

# Baseline that predicts a constant using the "median" strategy.
model = DummyRegressor(strategy="median")
evaluate(model)

***
### 4.1.3 Strategy 3

In [None]:
%%time

# Sweep the constant-quantile baseline over quantiles 0.0, 0.1, ..., 0.9.
for q in range(0, 10, 1):
    # Label each run: evaluate() prints only a bare "MAE: ..." line, which
    # cannot otherwise be matched to the quantile that produced it.
    print(f"quantile={q / 10}")
    model = DummyRegressor(strategy="quantile", quantile=q / 10)
    evaluate(model)

***
### 4.1.4 Best strategy

model = DummyRegressor(strategy="median")  
MAE: 4110889.2774774777

***
## 4.2 DecisionTreeRegressor

### 4.2.1 Strategy 1

In [None]:
%%time

# Decision tree with default hyper-parameters; random_state is fixed so the
# result is repeatable.
model = DecisionTreeRegressor(random_state=0)
evaluate(model)

***
### 4.2.2 Strategy 2

In [None]:
%%time

# Sweep the maximum tree depth over 10, 20, ..., 90.
for md in range(10, 100, 10):
    # Label each run: evaluate() prints only a bare "MAE: ..." line, which
    # cannot otherwise be matched to the depth that produced it.
    print(f"max_depth={md}")
    model = DecisionTreeRegressor(max_depth=md, random_state=0)
    evaluate(model)

***
### 4.2.3 Strategy 3

In [None]:
%%time

# Sweep the minimum number of samples per leaf over 1, 2, ..., 9.
for msl in range(1, 10, 1):
    # Label each run: evaluate() prints only a bare "MAE: ..." line, which
    # cannot otherwise be matched to the leaf size that produced it.
    print(f"min_samples_leaf={msl}")
    model = DecisionTreeRegressor(min_samples_leaf=msl, random_state=0)
    evaluate(model)

***
### 4.2.4 Strategy 4

In [None]:
%%time

# Compare the available split criteria.
# NOTE(review): scikit-learn 1.0 renamed "mse"/"mae" to
# "squared_error"/"absolute_error" and 1.2 removed the old spellings; update
# these strings if the notebook is run on scikit-learn >= 1.2.
for s in ["mse", "friedman_mse", "mae", "poisson"]:
    # Label each run: evaluate() prints only a bare "MAE: ..." line, which
    # cannot otherwise be matched to the criterion that produced it.
    print(f"criterion={s}")
    model = DecisionTreeRegressor(criterion=s, random_state=0)
    evaluate(model)

***
### 4.2.5 Strategy 5

In [None]:
%%time

# Decision tree combining min_samples_leaf=3 with the "mae" criterion.
# NOTE(review): "mae" was renamed "absolute_error" in scikit-learn 1.0 and the
# old spelling was removed in 1.2 - confirm against the installed version.
model = DecisionTreeRegressor(min_samples_leaf=3, criterion="mae", random_state=0)
evaluate(model)

***
## 4.3 RandomForestRegressor

### 4.3.1 Strategy 1

In [None]:
%%time

# Random forest with default hyper-parameters; random_state is fixed so the
# result is repeatable.
model = RandomForestRegressor(random_state=1)
evaluate(model)

***
### 4.3.2 Strategy 2

In [None]:
%%time

# Sweep the number of trees over 50, 100, 150, 200.
for ne in range(50, 250, 50):
    # Label each run so its "MAE: ..." line can be matched to n_estimators.
    print(f"n_estimators={ne}")
    # The forest must be bound to `model`: evaluate() scores whatever is
    # passed in, and an unassigned instance would leave a model from an
    # earlier cell to be scored again on every iteration.
    model = RandomForestRegressor(n_estimators=ne, random_state=1)
    evaluate(model)

***
### 4.3.3 Strategy 3

In [None]:
%%time

# Compare split criteria for the random forest.
# NOTE(review): scikit-learn 1.0 renamed "mse"/"mae" to
# "squared_error"/"absolute_error" and 1.2 removed the old spellings; update
# these strings if the notebook is run on scikit-learn >= 1.2.
for c in ["mse", "mae", "poisson"]:
    # Label each run so its "MAE: ..." line can be matched to the criterion.
    print(f"criterion={c}")
    # The forest must be bound to `model`: evaluate() scores whatever is
    # passed in, and an unassigned instance would leave a model from an
    # earlier cell to be scored again on every iteration.
    model = RandomForestRegressor(criterion=c, random_state=1)
    evaluate(model)

***
### 4.3.4 Strategy 4

In [None]:
%%time

# Sweep the minimum number of samples per leaf over 5, 10, 15, 20.
# range() replaces the list literal [5, 25, 5], which iterated over the three
# values 5, 25 and 5 (scoring 5 twice) instead of stepping from 5 in fives.
for msl in range(5, 25, 5):
    # Label each run so its "MAE: ..." line can be matched to the leaf size.
    print(f"min_samples_leaf={msl}")
    # The forest must be bound to `model`: evaluate() scores whatever is
    # passed in, and an unassigned instance would leave a model from an
    # earlier cell to be scored again on every iteration.
    model = RandomForestRegressor(min_samples_leaf=msl, random_state=1)
    evaluate(model)

***
## 4.4 XGBRegressor

### 4.4.1 Strategy 1

In [None]:
%%time

# Gradient-boosted trees (XGBoost) with the library's default settings.
model = XGBRegressor()
evaluate(model)

***
# 5 ...