# 1 Dependency import

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

***
# 2 Loading data

In [17]:
# Read the cleaned 2015 dataset (comma-separated) into a DataFrame.
data = pd.read_csv("data/2015-cleaned.csv", sep=",")

In [18]:
# Name of the column the models are trained to predict.
target = "SiteEnergyUse(kBtu)"

In [19]:
# Split the frame into the feature matrix and the column being predicted.
X = data.drop(target, axis="columns")
y = data[target]

In [20]:
# Hold out 20% of the rows; the fixed seed keeps the split identical between runs.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [21]:
# Keep only text (object-dtype) columns with fewer than 10 distinct values in the
# training split, so that one-hot encoding them stays small.
categorical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object" and X_train_full[cname].nunique() < 10
]

In [22]:
# Columns stored as 64-bit integers or floats are treated as numerical features.
numerical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ("int64", "float64")
]

In [23]:
# Restrict both splits to the selected columns (categorical first, then numerical);
# .copy() detaches the results from the full frames.
my_cols = [*categorical_cols, *numerical_cols]
X_train = X_train_full.loc[:, my_cols].copy()
X_test = X_test_full.loc[:, my_cols].copy()

In [24]:
# Summary statistics of the held-out target values.
y_test.describe()

count    6.660000e+02
mean     4.606876e+06
std      1.055033e+07
min      0.000000e+00
25%      8.487488e+05
50%      1.619646e+06
75%      3.423317e+06
max      1.362414e+08
Name: SiteEnergyUse(kBtu), dtype: float64

***
# 3 Feature engineering

***
# 4 Modeling

***
## 4.1 DummyRegressor

### 4.1.1 Strategy 1

In [25]:
# Numerical columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy="mean")

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown="ignore" tolerates categories not seen in fit).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Send each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [26]:
# Baseline that ignores the features and always predicts the mean of the training target.
model = DummyRegressor(strategy="mean")

In [27]:
# Chain preprocessing and the estimator so they are fitted together as one unit.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [28]:
# NOTE(review): scoring runs on the full X / y, which still contain the rows
# held out by train_test_split — confirm that is intended.
# The scorer yields negated MAE per fold, so flip the sign of the fold average.
fold_scores = cross_val_score(pipeline, X, y, cv=5, scoring="neg_mean_absolute_error")
mae = -1 * fold_scores.mean()
print(f"MAE: {mae}")

MAE: 5743745.266818169


### 4.1.2 Strategy 2

In [53]:
# Numerical columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy="mean")

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown="ignore" tolerates categories not seen in fit).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Send each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [54]:
# Baseline that ignores the features and always predicts the median of the training target.
model = DummyRegressor(strategy="median")

In [55]:
# Chain preprocessing and the estimator so they are fitted together as one unit.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [56]:
# NOTE(review): scoring runs on the full X / y, which still contain the rows
# held out by train_test_split — confirm that is intended.
# The scorer yields negated MAE per fold, so flip the sign of the fold average.
fold_scores = cross_val_score(pipeline, X, y, cv=5, scoring="neg_mean_absolute_error")
mae = -1 * fold_scores.mean()
print(f"MAE: {mae}")

MAE: 4110889.2774774777


### 4.1.3 Strategy 3

In [75]:
# Numerical columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy="mean")

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown="ignore" tolerates categories not seen in fit).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Send each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [76]:
# Baseline that ignores the features and always predicts a fixed quantile of the training target.
# NOTE(review): the 0.5 quantile is the median, so this should behave like
# DummyRegressor(strategy="median") — confirm whether a different quantile was intended.
model = DummyRegressor(strategy="quantile", quantile=0.5)

In [77]:
# Chain preprocessing and the estimator so they are fitted together as one unit.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [78]:
# NOTE(review): scoring runs on the full X / y, which still contain the rows
# held out by train_test_split — confirm that is intended.
# The scorer yields negated MAE per fold, so flip the sign of the fold average.
fold_scores = cross_val_score(pipeline, X, y, cv=5, scoring="neg_mean_absolute_error")
mae = -1 * fold_scores.mean()
print(f"MAE: {mae}")

MAE: 5212277.375525526


***
## 4.2 DecisionTreeRegressor

### 4.2.1 Strategy 1

In [32]:
# Numerical columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy="mean")

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown="ignore" tolerates categories not seen in fit).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Send each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [34]:
# Single decision tree with default hyper-parameters.
# The seed is fixed because scikit-learn permutes features at every split, so an
# unseeded tree (and its cross-validated MAE) can differ between runs; 1 matches
# the seed already used for the train/test split and the random forest.
model = DecisionTreeRegressor(random_state=1)

In [35]:
# Chain preprocessing and the estimator so they are fitted together as one unit.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [36]:
# NOTE(review): scoring runs on the full X / y, which still contain the rows
# held out by train_test_split — confirm that is intended.
# The scorer yields negated MAE per fold, so flip the sign of the fold average.
fold_scores = cross_val_score(pipeline, X, y, cv=5, scoring="neg_mean_absolute_error")
mae = -1 * fold_scores.mean()
print(f"MAE: {mae}")

MAE: 602440.8444444445


***
## 4.3 RandomForestRegressor

### 4.3.1 Strategy 1

In [38]:
# Numerical columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy="mean")

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown="ignore" tolerates categories not seen in fit).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Send each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [42]:
# Random forest of 100 trees; the fixed seed keeps the fitted ensemble reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=1)

In [43]:
# Chain preprocessing and the estimator so they are fitted together as one unit.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [44]:
# NOTE(review): scoring runs on the full X / y, which still contain the rows
# held out by train_test_split — confirm that is intended.
# The scorer yields negated MAE per fold, so flip the sign of the fold average.
fold_scores = cross_val_score(pipeline, X, y, cv=5, scoring="neg_mean_absolute_error")
mae = -1 * fold_scores.mean()
print(f"MAE: {mae}")

MAE: 498087.1706216216


***
# 5 ...