## Import dependencies

In [3]:
# display full output in Notebook, instead of only the last result
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# customized preprocessing functions
import util

# standard libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime
import time
import matplotlib.pyplot as plt


# scikit-learn
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# models
from pmdarima import auto_arima
import statsmodels.api as sm
import xgboost as xgb

## Import data

In [None]:
# folder that holds the feature tables, relative to this notebook
data_path = "../00_data"

df_rapperswil = pd.read_csv(os.path.join(data_path, "features_rapperswil.csv"), sep=",")
# The Burgdorf data set is currently not loaded. Re-enable the read below
# together with the matching shape print when it is needed again.
# df_burgdorf = pd.read_csv(os.path.join(data_path, "features_burgdorf.csv"), sep=",")

print('Dataset shape of Rapperswil data: {}'.format(df_rapperswil.shape))
# Disabled together with the read above: `df_burgdorf` does not exist, so
# printing its shape raised a NameError.
# print('Dataset shape of Burgdorf data: {}'.format(df_burgdorf.shape))

# convert the `date` column to pandas datetimes
df_rapperswil['date'] = pd.to_datetime(df_rapperswil['date'])

In [None]:
# Rapperswil: cast the categorical columns to `object` dtype in one step
categorical_columns = ['hour', 'day_of_week', 'quarter', 'month', 'day_of_year',
                       'day_of_month', 'week_of_year', 'weather', 'holiday']
df_rapperswil = df_rapperswil.astype({column: object for column in categorical_columns})

## Models

In [2]:
def split_features(df, label=None):
    """Select the model input columns and, optionally, the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every feature column listed below (and the
        `label` column when one is requested).
    label : str, optional
        Name of the target column. When it is omitted (or otherwise falsy)
        only the features are returned.

    Returns
    -------
    X : pandas.DataFrame
        The feature columns, in the fixed order defined below.
    y : pandas.Series
        The target column; only returned when `label` is given.
    """
    feature_columns = [
        'date_only',
        'hour', 'day_of_week', 'quarter', 'month',
        'day_of_year', 'day_of_month', 'week_of_year',
        'temperature', 'weather', 'holiday',
        't-7', 't-3', 't-2', 't-1',
    ]
    features = df[feature_columns]

    # without a label there is no target to hand back
    if not label:
        return features
    return features, df[label]

In [None]:
# Produce the training and test frames from the Rapperswil data; the second
# argument is presumably the cut-off timestamp between the two.
# NOTE(review): `split` is neither defined nor imported by name in the visible
# cells. Presumably it is the helper from the `util` module (`util.split`) or it
# came from a cell that no longer exists; confirm, otherwise this line raises
# NameError on a fresh kernel.
df_train, df_test = split(df_rapperswil, '2021-07-01 01:00:00')

In [None]:
# separate model inputs (X) and target (y) for the training and the test frame
target_column = 'occupancy_rate'
X_train, y_train = split_features(df_train, label=target_column)
X_test, y_test = split_features(df_test, label=target_column)

**Feature scaling** \
Features vary in magnitude and units, which is why numeric features are standardised with `StandardScaler()`, while categorical features are one-hot encoded with `OneHotEncoder()`. For example, the input value `day_of_week` should not be used as a continuous value from 1 to 7, since this would give the last weekdays (5, 6 and 7) a higher weight than the first ones (1, 2 and 3).

In [None]:
# Split numerical and categorical columns by dtype.
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError on current installations).
data_num = X_train.select_dtypes(include=[np.number])
data_cat = X_train.select_dtypes(include=[object])

# create data pipeline: numeric columns are standardised
num_pipeline = Pipeline([('std_scaler', StandardScaler())])

num_attribs = list(data_num)
cat_attribs = list(data_cat)

# Columns that are neither numeric nor object dtype are not listed in either
# branch and therefore do not reach the model (ColumnTransformer drops the
# remainder by default).
# NOTE(review): OneHotEncoder() with default settings rejects categories at
# transform time that were not seen during fit - confirm every categorical
# column of the test period only contains values present in the training period.
full_pipeline = ColumnTransformer([
        ('num', num_pipeline, num_attribs),
        ('cat', OneHotEncoder(), cat_attribs)])

# fit and transform training data set with preprocessing pipeline and
# only transform test feature set with the pipeline fit on training feature set to not
# artificially improve test performance
#
# X_train / X_test are overwritten with the transformed matrices, so this cell
# must not be re-run without re-running the feature split cell first.
X_train = full_pipeline.fit_transform(X_train)
X_test = full_pipeline.transform(X_test)

## 1. Random forest

For hyperparameter optimization, we first apply a randomized search (`RandomizedSearchCV`) with cross-validation to narrow down the range of reasonable values for each hyperparameter. A full grid search (`GridSearchCV`) with cross-validation is then run over the narrowed value range obtained from the randomized search.

1) Define random grid

In [None]:
# Search space for the randomized hyperparameter search of the random forest.

# number of trees in random forest: 200, 225, ..., 1975
n_estimators = list(range(200, 2000, 25))

# number of features to consider at every split.
# 1.0 (= all features) replaces the former 'auto' option, which meant the same
# for regressors but was removed in scikit-learn 1.3; the float form is
# accepted by old and new versions alike.
max_features = [1.0, 'sqrt']

# maximum number of levels in tree: 10, 20, ..., 110 and None (unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# create grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

2) Run randomized search

In [None]:
# Randomized hyperparameter search over `random_grid`.

# base estimator whose hyperparameters are tuned
rf = RandomForestRegressor(random_state=42)

# time-ordered folds, so that each validation part follows its training part
cv = TimeSeriesSplit(n_splits=3)

# 75 sampled parameter combinations, evaluated on the 3 folds above,
# using all available cores
search_settings = dict(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=75,
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random = RandomizedSearchCV(**search_settings)

# run the search on the training data
rf_random.fit(X_train, y_train)

# show best parameters
print('Best parameters:')
rf_random.best_params_

3) Define new grid with optimal value range

In [None]:
# Define a more specific grid for the exhaustive search.
#
# The previous version of this cell repeated the broad randomized-search space
# unchanged (roughly 16,000 combinations), which is not feasible for a full
# grid search. The values below span a small neighbourhood of the settings used
# for the final model (n_estimators=600, max_depth=20, min_samples_split=2,
# min_samples_leaf=2): 3 * 2 * 3 * 2 * 3 = 108 combinations.
# Adjust them to the best parameters reported by the randomized search.

# number of trees in random forest
n_estimators = [500, 600, 700]

# number of features to consider at every split
# (1.0 = all features; replaces 'auto', which was removed in scikit-learn 1.3)
max_features = [1.0, 'sqrt']

# maximum number of levels in tree
max_depth = [10, 20, 30]

# minimum number of samples required to split a node
min_samples_split = [2, 5]

# minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# create grid (the name `random_grid` is kept because the grid search cell reads it)
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

4) Run grid search

In [None]:
# create random forest model
rf = RandomForestRegressor(random_state = 42)

# ensure prediction is made on subsequent data
cv = TimeSeriesSplit(n_splits=3)

# Exhaustive search over every combination in the grid, using 3 fold cross
# validation and all available cores.
# GridSearchCV takes the grid as `param_grid` and has no `n_iter` or
# `random_state` argument (those belong to RandomizedSearchCV and raised a
# TypeError here). The search gets its own name so that `rf` stays the plain
# estimator.
rf_grid = GridSearchCV(estimator = rf, param_grid = random_grid, cv = cv,
                       verbose=2, n_jobs = -1)
# fit the grid search model
rf_grid.fit(X_train, y_train)

# show best parameters
print('Best parameters:')
rf_grid.best_params_

5) Fit model with optimal parameters

In [None]:
# Random forest with the selected hyperparameters.
# max_features = 1.0 considers all features at every split, which is what the
# former 'auto' option meant for regressors; 'auto' was removed in
# scikit-learn 1.3, while the float form works in old and new versions.
rf = RandomForestRegressor(random_state = 42,
                           n_estimators = 600,
                           max_features = 1.0,
                           max_depth = 20,
                           min_samples_split = 2,
                           min_samples_leaf = 2)

In [None]:
# train the random forest on the preprocessed training features
rf.fit(X_train, y_train)

# store the test-set predictions in `df_test` and
# stack test and training rows into one frame
rf_forecast = rf.predict(X_test)
df_test['random_forest'] = rf_forecast
df_all = pd.concat([df_test, df_train], sort=False)

In [None]:
# Test-set errors of the random forest predictions.
rf_mae = mean_absolute_error(y_true=df_test['occupancy_rate'], y_pred=df_test['random_forest'])
# RMSE is computed as the square root of the MSE: the `squared=False` argument
# was deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form
# works with every version.
rf_rmse = np.sqrt(mean_squared_error(y_true=df_test['occupancy_rate'], y_pred=df_test['random_forest']))

print('MAE: ', round(rf_mae, 2))
print('RMSE: ', round(rf_rmse, 2))

## 2. XGBoost

1) Optimal parameters obtained from last random grid search

In [None]:
xgb = xgb.XGBRegressor(n_estimators = 600,
                            min_child_weight = 17,
                            max_leaf_nodes = 110,
                            # max_features = 'auto',
                            max_depth = 51,
                            gamma = 0,
                            eta = 0.01,
                            colsample_bytree = 0.9,
                            random_state=42)

In [None]:
# fit the random search model
xgb.fit(X_train, y_train)

# run prediction
df_test['f1 features'] = xgb.predict(X_test)
df_all = pd.concat([df_test, df_train], sort=False)

Best Performance: 3.93 (MAE), 6.03 (RMSE) 

In [None]:
# Test-set errors of the XGBoost predictions (column 'f1 features').
xgb_mae = mean_absolute_error(y_true=df_test['occupancy_rate'], y_pred=df_test['f1 features'])
# RMSE is computed as the square root of the MSE: the `squared=False` argument
# was deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form
# works with every version.
xgb_rmse = np.sqrt(mean_squared_error(y_true=df_test['occupancy_rate'], y_pred=df_test['f1 features']))

print('MAE: ', round(xgb_mae, 2))
print('RMSE: ', round(xgb_rmse, 2))

In [None]:
# plot the true target values for the test set versus the estimated values with the best model
# The predictions are stored in the 'f1 features' column; the previously used
# 'time-related features' column is never created and raised a KeyError.
plt.scatter(df_test['occupancy_rate'], df_test['f1 features'], alpha=0.1);
# dashed diagonal = perfect prediction
plt.plot([0, 100], [0, 100], "k--", lw=3);
plt.xlabel('Measured parking occupancy (%)');
plt.ylabel('Predicted parking occupancy (%)');
plt.title('F1 features: True values vs predicted values');
file = 'f1_features.png'
# the target folder must already exist
plt.savefig(os.path.join('../05_visualisations_of_eda/', file));