# 1 Dependency import

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import set_config

from xgboost import XGBRegressor

In [50]:
def evaluate(model, grid_params):
    set_config(display="diagram")
    # set_config(display="text")

    num_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="mean"))
    ])

    cat_less_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_uknown="ignore"))
    ])

    cat_lot_unique_pipe = Pipeline(steps=[
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_uknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num_pipe", num_pipe, numerical_cols),
        ("cat_less_unique_pipe", cat_less_unique_pipe, categorical_cols_less_unique),
        ("cat_lot_unique_pipe", cat_lot_unique_pipe, categorical_cols_lot_unique)
    ])

    pipeline = Pipeline([
        ("transforms", preprocessor),
        ("model", model)
    ])

    gs = GridSearchCV(pipeline, grid_params, scoring="neg_root_mean_squared_error").fit(X_train, y_train)

    score = gs.score(X_test, y_test)

    return score, gs, pipeline

In [3]:
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [4]:
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

***
# 2 Loading data

In [5]:
data = pd.read_csv("data/2015-cleaned.csv", delimiter=",")

In [6]:
#target = "GHGEmissionsIntensity(kgCO2e/ft2)"
target = "SiteEnergyUse(kBtu)"

In [7]:
y = data[target]
X = data.drop(columns=[target])

In [8]:
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [9]:
categorical_cols_less_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
categorical_cols_lot_unique = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() >= 10 and X_train_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [10]:
my_cols = categorical_cols_lot_unique + categorical_cols_less_unique + numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [11]:
y_test.describe()

count    6.660000e+02
mean     4.606876e+06
std      1.055033e+07
min      0.000000e+00
25%      8.487488e+05
50%      1.619646e+06
75%      3.423317e+06
max      1.362414e+08
Name: SiteEnergyUse(kBtu), dtype: float64

***
# 3 Feature engineering

## 3.1 Mutual Information

In [12]:
X_MI = X.copy()

In [13]:
for col in X_MI.select_dtypes("object"):
    X_MI[col], _ = X_MI[col].factorize()

In [14]:
discrete_features = X_MI.dtypes == int

In [15]:
# make_mi_scores(X_MI, y, discrete_features)

***
# 4 Modeling

***
## 4.1 DummyRegressor

In [47]:
%%time

model = DummyRegressor()

grid_params = [
    {
        "model__strategy": ["mean", "median"]
    },
    {
        "model__strategy": ["quantile"],
        "model__quantile": np.arange(0, 1.1, 0.1),
    }
]

score, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {gs.best_params_}")

categorical_cols_less_unique pre: 
          BuildingType DefaultData  \
count             2664        2664   
unique               7           2   
top     NonResidential          No   
freq              1150        2236   

                                                  Comment ComplianceStatus  \
count                                                   7             2664   
unique                                                  7                1   
top     One of two office buildings on McKinstry's cam...        Compliant   
freq                                                    1             2664   

             Outlier  
count             65  
unique             2  
top     High Outlier  
freq              35  
categorical_cols_lot_unique pre: 
         PrimaryPropertyType PropertyName TaxParcelIdentificationNumber  \
count                   2664         2664                          2663   
unique                    27         2628                          2596   
top     L

Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "/opt/anaconda/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 236, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/opt/anaconda/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 53, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/opt/anaconda/lib/python3.8/site-packages/sklearn/utils/metaestimators.py", line 120, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  File "/opt/anaconda/lib/python3.8/site-packages/sklearn/pipeline.py", line 418, in predict
    Xt = transform.transform(Xt)
  File "/opt/anaconda/lib/

MSE: -10552887.019848462
best_params: {'model__strategy': 'mean'}
CPU times: user 6.88 s, sys: 153 ms, total: 7.03 s
Wall time: 7.35 s


***
## 4.2 LinearRegression

In [None]:
%%time

model = LinearRegression()

grid_params = [
    {
        "model__fit_intercept": [True]
    }
]

score, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {gs.best_params_}")

***
## 4.3 SupportVectorRegression

In [None]:
%%time

model = SVR()

grid_params = [
    {
        "model__kernel": ["rbf"],
        "model__degree": [3],
        "model__gamma": ["scale"]
    }
]

score, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {gs.best_params_}")

***
## 4.4 DecisionTreeRegressor

In [None]:
%%time

model = DecisionTreeRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__max_depth": [2, 3, 4],
        "model__min_samples_leaf": range(1, 11, 1),
        "model__criterion": ["mse"]
    }
]

score, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {gs.best_params_}")

***
## 4.5 RandomForestRegressor

In [None]:
%%time

model = RandomForestRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__n_estimators": [25],
        "model__min_samples_leaf": [1],
        "model__criterion": ["mse"]
    }
]

score, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {gs.best_params_}")

***
## 4.6 XGBRegressor

In [None]:
%%time

model = XGBRegressor()

grid_params = [
    {
        "model__random_state": [1],
        "model__max_depth": [3],
        "model__n_estimators": [100],
        "model__learning_rate": [0.1],
    }
]

score, gs, pipeline = evaluate(model, grid_params)

print(f"MSE: {score}")
print(f"best_params: {gs.best_params_}")

***
# 5 ...