# 1 Dependency import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

***
# 2 Loading data

In [None]:
# Load the cleaned 2015 dataset from a comma-separated file.
data = pd.read_csv(
    "data/2015-cleaned.csv",
    sep=",",
)

In [None]:
# Labels of the categorical (object-dtype) columns. select_dtypes returns a
# DataFrame, so take .columns to get the column labels, which is what column
# selectors such as ColumnTransformer expect.
categorical_cols = data.select_dtypes("object").columns

In [None]:
# Numerical features are every column that is neither categorical nor the
# prediction target. The target has to be left out because it is dropped from
# X below, and ColumnTransformer rejects column names missing from its input.
# Unpacking categorical_cols yields column labels whether it holds a
# DataFrame or an Index.
numerical_cols = data.columns.difference(
    [*categorical_cols, "SiteEnergyUse(kBtu)"]
)

In [None]:
# Separate the prediction target (site energy use) from the feature matrix.
y = data["SiteEnergyUse(kBtu)"]
X = data.drop("SiteEnergyUse(kBtu)", axis="columns")

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so the
# reported MAE is reproducible across runs, in line with the seeded model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

***
# 3 Feature engineering actions

***
# 4 Modeling

***
## 4.1 Naive approach

In [None]:
# Numerical features: replace missing values with a constant. No fill_value
# is given, so SimpleImputer's default for the "constant" strategy applies.
numerical_transformer = SimpleImputer(strategy="constant")

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns to its matching transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [None]:
# Random forest with 100 trees; the fixed seed keeps training repeatable.
model = RandomForestRegressor(
    random_state=1,
    n_estimators=100,
)

In [None]:
# Chain preprocessing and the regressor so both are fitted in one call.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

In [None]:
# Fit the preprocessing steps and the model on the training split.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test features.
predictions = pipeline.predict(X=X_test)

In [None]:
# Report the mean absolute error of the predictions on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"MAE: {mae}")

***
# 5 ...