# 1 Dependency import

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

***
# 2 Loading data

In [2]:
data = pd.read_csv("data/2015-cleaned.csv", delimiter=",")

In [3]:
target = "SiteEnergyUse(kBtu)"

In [4]:
y = data[target]
X = data.drop(columns=[target])

In [5]:
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [6]:
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]

In [7]:
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [8]:
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [19]:
y_test.describe()

count    6.660000e+02
mean     4.606876e+06
std      1.055033e+07
min      0.000000e+00
25%      8.487488e+05
50%      1.619646e+06
75%      3.423317e+06
max      1.362414e+08
Name: SiteEnergyUse(kBtu), dtype: float64

***
# 3 Feature engineering actions

***
# 4 Modeling

***
## 4.1 Naive approach

In [9]:
numerical_transformer = SimpleImputer(strategy="mean")

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

In [22]:
model = DummyRegressor(strategy="mean")

In [23]:
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

In [24]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', SimpleImputer(),
                                                  ['Unnamed: 0',
                                                   'OSEBuildingID', 'DataYear',
                                                   'CouncilDistrictCode',
                                                   'YearBuilt',
                                                   'NumberofBuildings',
                                                   'NumberofFloors',
                                                   'PropertyGFATotal',
                                                   'PropertyGFAParking',
                                                   'PropertyGFABuilding(s)',
                                                   'LargestPropertyUseTypeGFA',
                                                   'SecondLargestPropertyUseTypeGFA',
                                                   'ThirdLargestPropertyUseT...
  

In [25]:
predictions = pipeline.predict(X_test)

In [26]:
mae = mean_absolute_error(y_test, predictions)
print(f"MAE: {mae}")

MAE: 5187411.554622189


***
### 4.1.1 Results

DummyRegressor(strategy="mean")  
MAE: 5187411.554622189

***
# 5 ...