# 1 Dependency import

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import set_config

from xgboost import XGBRegressor

In [12]:
def evaluate(model, grid_params):
    set_config(display="diagram")
    # set_config(display="text")

    categorical_transformer_pipeline = Pipeline(steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    column_transformer = ColumnTransformer([
        ("num_imputer", SimpleImputer(strategy="mean"), numerical_cols),
        ("cat_transformer_pipeline", categorical_transformer_pipeline, categorical_cols)
    ])

    pipeline = Pipeline([
        ("transforms", column_transformer),
        ("model", model)
    ])

    gs = GridSearchCV(pipeline, grid_params, scoring="neg_root_mean_squared_error").fit(X_train, y_train)

    score = gs.score(X_test, y_test)

    return score, gs.best_params_, pipeline

    # display(pipeline)
    # pipeline.get_params()

In [None]:
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [None]:
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

***
# 2 Loading data

In [3]:
data = pd.read_csv("data/2015-cleaned.csv", delimiter=",")

In [4]:
#target = "GHGEmissionsIntensity(kgCO2e/ft2)"
target = "SiteEnergyUse(kBtu)"

In [5]:
y = data[target]
X = data.drop(columns=[target])

In [6]:
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [7]:
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [8]:
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [9]:
y_test.describe()

count    6.660000e+02
mean     4.606876e+06
std      1.055033e+07
min      0.000000e+00
25%      8.487488e+05
50%      1.619646e+06
75%      3.423317e+06
max      1.362414e+08
Name: SiteEnergyUse(kBtu), dtype: float64

***
# 3 Feature engineering

## 3.1 Mutual Information

In [None]:
X_MI = X.copy()

In [None]:
for col in X_MI.select_dtypes("object"):
    X_MI[col], _ = X_MI[col].factorize()

In [None]:
discrete_features = X_MI.dtypes == int

In [None]:
# make_mi_scores(X_MI, y, discrete_features)

***
# 4 Modeling

***
## 4.1 DummyRegressor

In [22]:
%%time

model = DummyRegressor()

grid_params = [
    {
        "model__strategy": ["mean", "median"]
    },
    {
        "model__strategy": ["quantile"],
        "model__quantile": np.arange(0, 1.1, 0.1),
    }
]

score, b_params, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {b_params}")

MSE: -10552887.019848462
best_params: {'model__strategy': 'mean'}
CPU times: user 1.07 s, sys: 4.27 ms, total: 1.07 s
Wall time: 1.08 s


***
## 4.2 LinearRegression

In [29]:
%%time

model = LinearRegression()

grid_params = [
    {
        "model__fit_intercept": [True]
    }
]

score, b_params, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {b_params}")

MSE: -24997.29450086674
best_params: {'model__fit_intercept': True}
CPU times: user 701 ms, sys: 13.3 ms, total: 714 ms
Wall time: 138 ms


***
## 4.3 SupportVectorRegression

In [31]:
%%time

model = SVR()

grid_params = [
    {
        "model__kernel": ["rbf"],
        "model__degree": [3],
        "model__gamma": ["scale"]
    }
]

score, b_params, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {b_params}")

***
## 4.4 DecisionTreeRegressor

In [23]:
%%time

model = DecisionTreeRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__max_depth": range(1, 21, 1),
        "model__min_samples_leaf": range(1, 11, 1),
        "model__criterion": ["mse"]
    }
]

score, b_params, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {b_params}")

MSE: -1154672.2938696651
best_params: {'model__criterion': 'mse', 'model__max_depth': 19, 'model__min_samples_leaf': 1, 'model__random_state': 1}
CPU times: user 41.6 s, sys: 8.97 ms, total: 41.6 s
Wall time: 41.8 s


***
## 4.5 RandomForestRegressor

In [24]:
%%time

model = RandomForestRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__max_depth": range(1, 21, 1),
        "model__n_estimators": range(100, 1100, 100),
        "model__min_samples_leaf": range(1, 11, 1),
        "model__criterion": ["mse"]
    }
]

score, b_params, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {b_params}")

KeyboardInterrupt: 

***
## 4.6 XGBRegressor

In [None]:
%%time

model = XGBRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__max_depth": range(1, 21, 1),
        "model__n_estimators": range(100, 1100, 100),
        "model__learning_rate": np.arange(0.0, 2.1, 0.1),
        "model__criterion": ["mse"]
    }
]

score, b_params, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {b_params}")

***
# 5 ...