# Advanced Models

This IPython notebook contains the entire source code for recreating the advanced model (gradient boosted regressor) as discussed in the work _Predictive Models for Parking Space Occupation_. Please note that running the script `feature_engineering` is a prerequisite to run this notebook.

Table of contents:

1. [Dependencies](#dependencies)
2. [Data Import](#data_import)
3. [Data Preprocessing](#data_preprocessing)
4. [Model Training](#model_training)
5. [Appendix](#appendix)
    1. [Single Model](#single_model)
    2. [Rolling Predictions](#rolling_predictions)
    3. [Random Forest](#rnd_forest)

For the purpose of legibility, code cells are separated, where applicable, to reflect segments required only when running the notebook interactively.

<a id='dependencies'></a>

## 1. Dependencies

Some libraries might require pip install.

In [4]:
# standard libraries
import numpy as np
import pandas as pd
import time
import os

# machine learning libraries
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import xgboost as xgb


In [5]:
# display full output in Notebook, instead of only the last result
# (later cells contain bare expressions such as `params[loc]` that are meant to be shown)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import matplotlib.pyplot as plt

# Define the locations to be used; each name selects the file `features_<location>.csv`.
locations = ['burgdorf', 'rapperswil']

# Set this to False if another grid-search should be performed.
# While True, the hard-coded hyperparameters and the cached rolling predictions are used.
use_default = True

# Set this to False if no tex environment is installed (forwarded to matplotlib's `text.usetex`).
use_tex = True

# Set this to True if plots should be saved to disk.
save_figs = True


<a id='data_import'></a>

## 2. Data Import

In [6]:
data_path = '../00_data'

# raw per-site reads; in this cell they only serve to report the on-disk shapes
df_rapperswil = pd.read_csv(os.path.join(data_path, "features_rapperswil.csv"), sep=",")
df_burgdorf = pd.read_csv(os.path.join(data_path, "features_burgdorf.csv"), sep=",")

print('Dataset shape of Rapperswil data: {}'.format(df_rapperswil.shape))
print('Dataset shape of Burgdorf data: {}'.format(df_burgdorf.shape))

# columns treated as categorical: they are cast to `object` so that the
# preprocessing step can tell them apart from numeric columns by dtype
categorical_cols = ['hour', 'day_of_week', 'quarter', 'month', 'day_of_year', 'day_of_month',
                    'week_of_year', 'weather', 'weather_t-1', 'weather_t-2', 'weather_t-3', 'weather_t-7',
                    'holiday']

dfs = {}

for loc in locations:
    frame = pd.read_csv(os.path.join(data_path, "features_{}.csv".format(loc)), sep=",")
    frame['date'] = pd.to_datetime(frame['date'])

    # declare categorical columns
    frame = frame.astype({col: object for col in categorical_cols})

    # use the datetime column as index
    dfs[loc] = frame.set_index('date')


Dataset shape of Rapperswil data: (12303, 30)
Dataset shape of Burgdorf data: (7768, 30)


In [None]:
# report the shape of every loaded dataframe (`date` is the index at this point)
for loc in locations:
    shape = dfs[loc].shape
    print('Dataset shape of {} data: {}'.format(loc.capitalize(), shape))


<a id='data_preprocessing'></a>

## 3. Data Preprocessing

Split each dataset into training and test data using a 70/30 ratio while preserving the chronological order of the time series.

In [None]:
# identify split date manually and use beginning of the month of the specific date
# e.g. 2021-08-16 04:00:00 --> 2021-08-01 01:00:00, only for pragmatic reasons for better paper story
splits = {}

for loc in locations:
    splits[loc] = 0.7 * len(dfs[loc])


In [None]:
## Splits a datetime-indexed dataframe according to a specified split date.
# @param df         a datetime-indexed dataframe.
# @param split_date the split timestamp (a single point in time, e.g. one element of df.index).
#
# @returns A copy of the lower part of the dataframe (including split date) and a copy of the upper part.

def split(df:pd.DataFrame, split_date:pd.Timestamp):
    # rows up to and including the split date form the training set, later rows the test set;
    # both are copies so that columns added to them later never touch `df`
    df_train = df.loc[df.index <= split_date].copy()
    df_test = df.loc[df.index > split_date].copy()

    return df_train, df_test


In [None]:
dfs_train, dfs_test = {}, {}

for loc in locations:
    # the timestamp found at the 70 % position of the index acts as the cutoff
    cutoff = dfs[loc].index[int(splits[loc])]
    dfs_train[loc], dfs_test[loc] = split(dfs[loc], cutoff)


In [None]:
# show the cutoff timestamp and the resulting train/test sizes per location
for loc in locations:
    cutoff = dfs[loc].index[int(splits[loc])]
    n_train, n_test = len(dfs_train[loc]), len(dfs_test[loc])
    print('Cutoff date {}:\t{}\t({}/{} entries)'.format(loc.capitalize(), cutoff, n_train, n_test))


In [None]:
## Selects the model input columns and, optionally, a label column from a dataframe.
# @param df     the dataframe.
# @param label  the label of the column to be extracted (optional).
#
# @returns the feature columns of @p df and the column indexed by @p label, or None if no label was given.
def extract_features(df:pd.DataFrame, label:str=None):
    # 'date_only' and the current 'temperature' are deliberately not part of the features
    feature_cols = [
        # calendar features
        'hour', 'day_of_week', 'quarter', 'month',
        'day_of_year', 'day_of_month', 'week_of_year',
        # lagged temperature
        'temperature_t-1', 'temperature_t-2', 'temperature_t-3', 'temperature_t-7',
        # current and lagged weather
        'weather', 'weather_t-1', 'weather_t-2', 'weather_t-3', 'weather_t-7',
        'holiday',
        # lagged target values
        't-7', 't-3', 't-2', 't-1',
    ]

    X = df[feature_cols]
    y = df[label] if label else None
    return X, y


Split datasets

In [None]:
# extract the occupancy rate as a label
X_train, X_test = {}, {}
y_train, y_test = {}, {}

for loc in locations:
    X_train[loc], y_train[loc] = extract_features(dfs_train[loc], label='occupancy_rate')
    X_test[loc], y_test[loc] = extract_features(dfs_test[loc], label='occupancy_rate')


### Feature standardization and scaling

Features vary in magnitude and units, which is why feature scaling is applied, using `StandardScaler()` for numeric features and `OneHotEncoder()` for categorical features. For example, the input value `day_of_week` should not be used as a continuous value from 1 to 7, since this would associate a higher weight to the later weekdays (5, 6 or 7) than to the earlier ones (1,2 and 3).

In [None]:
## Standardize features using standard scaling and one-hot encoding.
# @p df_train   the training dataframe (numeric columns are scaled, object columns one-hot encoded).
# @p df_test    the test dataframe.
#
# @returns  the standardized training and test datasets, as well as the generated pipeline object.
# @throws   ValueError if a training column is neither numeric nor of dtype object, because such
#           a column would otherwise be dropped from the model input without any notice.
def standardize_features(df_train:pd.DataFrame, df_test:pd.DataFrame):
    # split numerical and categorical columns by dtype
    num_attribs = list(df_train.select_dtypes(include=[np.number]))
    cat_attribs = list(df_train.select_dtypes(include=[object]))

    # check whether no columns got lost during type allocation
    unassigned = [col for col in df_train.columns
                  if col not in num_attribs and col not in cat_attribs]
    if unassigned:
        raise ValueError(
            'Columns are neither numeric nor object and would be dropped: {}'.format(unassigned))

    # create data pipeline
    num_pipeline = Pipeline([('std_scaler', StandardScaler())])

    full_pipeline = ColumnTransformer([
            ('num', num_pipeline, num_attribs),
            # categories that only occur in the test data are ignored instead of raising an error
            ('cat', OneHotEncoder(handle_unknown = 'ignore'), cat_attribs)])

    # fit and transform training data set with preprocessing pipeline and
    # only transform test feature set with the pipeline fit on training feature set to not
    # artificially improve test performance
    train_transformed = full_pipeline.fit_transform(df_train)
    test_transformed = full_pipeline.transform(df_test)

    return train_transformed, test_transformed, full_pipeline


In [None]:
pipelines = {}

# NOTE: the feature dataframes are replaced by their transformed counterparts, so this
# cell must not be run twice without re-running the feature extraction first.
for loc in locations:
    transformed = standardize_features(X_train[loc], X_test[loc])
    X_train[loc], X_test[loc], pipelines[loc] = transformed


<a id='model_training'></a>

## 4. Model Training

For hyperparameter optimization, we apply `random grid search` with cross validation to narrow down the range of reasonable values for the given parameters for the models. Then, `full grid search` with cross validation is applied with the value range obtained from the random grid search.

1) Define random grid and run grid search

In [None]:
t_start, t_end, params = {}, {}, {}

if use_default:
    # hard-coded hyperparameters (presumably the outcome of an earlier search run)
    params['burgdorf'] = {
        'n_estimators': 725,
        'min_child_weight': 1,
        'max_leaf_nodes': 40,
        'max_depth': 26,
        'gamma': 0,
        'eta': 0.11,
        'colsample_bytree': 1}

    params['rapperswil'] = {
        'n_estimators': 425,
        'min_child_weight': 49,
        'max_leaf_nodes': 175,
        'max_depth': 51,
        'gamma': 1,
        'eta': 0.01,
        'colsample_bytree': 0.8}

else:
    # candidate values for every tuned hyperparameter
    random_grid = {
        # number of boosted trees
        'n_estimators': list(range(200, 2000, 25)),
        # learning rate
        'eta': np.arange(0.01, 1.0, 0.05).tolist(),
        # maximum depth of a tree
        'max_depth': list(range(1, 250, 5)),
        # minimum sum of weights of all observations required in a child
        'min_child_weight': list(range(1, 100, 8)),
        # regularization
        'gamma': [0, 0.5, 1, 3, 5],
        # fraction of columns used by each tree
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
        # maximum number of leaf nodes in a tree (None = no explicit value)
        'max_leaf_nodes': list(range(10, 300, 5)) + [None],
    }

    for loc in locations:
        # a fresh, untuned model per location
        regressor_rnd = xgb.XGBRegressor(random_state=42)

        # folds respect the time order, so validation data always follows the training data
        cv = TimeSeriesSplit(n_splits=3)

        # try 75 random combinations with 3-fold cross validation on all available cores
        xbg_random = RandomizedSearchCV(
            estimator=regressor_rnd,
            param_distributions=random_grid,
            n_iter=75,
            cv=cv,
            verbose=2,
            random_state=42,
            n_jobs=-1,
        )

        # run the search and record how long it takes
        t_start[loc] = time.time()
        xbg_random.fit(X_train[loc], y_train[loc])
        t_end[loc] = time.time()

        params[loc] = xbg_random.best_params_

In [None]:
# search duration and winning parameters (only available when a search was actually run)
if not use_default:
    for loc in locations:
        elapsed = t_end[loc] - t_start[loc]
        print("Grid-search {} took {} seconds.".format(loc.capitalize(), elapsed))
        print("Best parameters are:")
        # bare expression, meant to be rendered by the notebook
        params[loc]
        print("\n\n")

2. Use same procedure to run the `full_grid_search` with the obtained values from `random_grid_search`

In [None]:
# provide more granular range based on values from random grid search
if not use_default:
    for loc in locations:
        # result of the coarse search; every refined range below is centred on it
        best = params[loc]

        # number of boosted trees
        n_estimators = list(range(max(best['n_estimators'] - 25, 1), best['n_estimators'] + 25))

        # learning rate (kept fixed at the value found by the coarse search)
        eta = [best['eta']]

        # defines the minimum sum of weights of all observations required in a child
        min_child_weight = list(range(max(best['min_child_weight'] - 8, 1), best['min_child_weight'] + 8))

        # maximum depth of a tree
        max_depth = list(range(max(best['max_depth'] - 5, 1), best['max_depth'] + 5))

        # regularization
        gamma = [0, 1, 5]

        # number of columns used by each tree
        colsample_bytree = [0.6, 0.7, 0.8, 0.9, 1]

        # maximum number of leaf nodes in tree, centred on the previously found
        # `max_leaf_nodes` (not on `max_depth`). The coarse search may also have picked
        # None, in which case there is no numeric neighbourhood to explore.
        if best['max_leaf_nodes'] is None:
            max_leaf_nodes = [None]
        else:
            max_leaf_nodes = list(range(max(best['max_leaf_nodes'] - 5, 1), best['max_leaf_nodes'] + 5))
            max_leaf_nodes.append(None)

        # create the refined grid
        random_grid = {'n_estimators' : n_estimators,
            'eta' : eta,
            'max_depth': max_depth,
            'min_child_weight': min_child_weight,
            'gamma' : gamma,
            'colsample_bytree' : colsample_bytree,
            'max_leaf_nodes': max_leaf_nodes}

        # create xgb model
        regressor_rnd = xgb.XGBRegressor(random_state=42)

        # ensure prediction is made on subsequent data
        cv = TimeSeriesSplit(n_splits=3)

        # NOTE: this is again a randomized search (75 sampled combinations, 3-fold cross
        # validation, all available cores) over the refined grid, not an exhaustive grid search
        xbg_random = RandomizedSearchCV(estimator = regressor_rnd, param_distributions = random_grid,
                                        n_iter = 75, cv = cv,
                                        verbose=2, random_state=42, n_jobs = -1)

        # fit the xgboost model
        t_start[loc] = time.time()
        xbg_random.fit(X_train[loc], y_train[loc])
        t_end[loc] = time.time()

        # the refined result replaces the coarse one
        params[loc] = xbg_random.best_params_

else:
    # hard-coded hyperparameters; they replace the defaults assigned for the coarse search
    params['burgdorf'] = {
        'n_estimators': 742,
        'min_child_weight': 6,
        'max_leaf_nodes': 23,
        'max_depth': 22,
        'gamma': 5,
        'eta': 0.11,
        'colsample_bytree': 0.9
    }

    params['rapperswil'] = {
        'n_estimators': 424,
        'min_child_weight': 42,
        'max_leaf_nodes': 47,
        'max_depth': 55,
        'gamma': 0,
        'eta': 0.01,
        'colsample_bytree': 1
    }


In [None]:
# duration and outcome of the refined search (skipped when defaults are used)
if not use_default:
    for loc in locations:
        elapsed = t_end[loc] - t_start[loc]
        print("Grid-search {} took {} seconds.".format(loc.capitalize(), elapsed))
        print("Best parameters are:")
        # bare expression, meant to be rendered by the notebook
        params[loc]
        print("\n\n")


In [None]:
regressors = {}

for loc in locations:
    # final model per location, configured with the selected hyperparameters
    model = xgb.XGBRegressor(**params[loc], random_state=42)
    regressors[loc] = model

    # train on the full training set and time the fit
    t_start[loc] = time.time()
    _ = model.fit(X_train[loc], y_train[loc])
    t_end[loc] = time.time()


In [None]:
# report the training time per location
for loc in locations:
    elapsed = t_end[loc] - t_start[loc]
    print("Fitting for {} took {} seconds.".format(loc.capitalize(), elapsed))


3. Run predictions

In [None]:
maes, rmses = {}, {}

for loc in locations:
    # predict the test set and time the prediction; the result is stored next to the true values
    t_start[loc] = time.time()
    dfs_test[loc]['pred_xgb_all_features'] = regressors[loc].predict(X_test[loc])
    t_end[loc] = time.time()

    y_true = dfs_test[loc]['occupancy_rate']
    y_pred = dfs_test[loc]['pred_xgb_all_features']

    maes[loc] = mean_absolute_error(y_true=y_true, y_pred=y_pred)
    # RMSE as the square root of the MSE; unlike the `squared=False` argument this
    # works with every scikit-learn version
    rmses[loc] = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))


In [None]:
# prediction time and error metrics per location
for loc in locations:
    n_entries = len(dfs_test[loc])
    elapsed = round(t_end[loc] - t_start[loc], 2)
    print("Prediction for {} ({} entries) took {} seconds.".format(loc.capitalize(), n_entries, elapsed))
    print("\tMAE: ", round(maes[loc], 2))
    print("\tRMSE: ", round(rmses[loc], 2))


4. Plot results

In [None]:
# global matplotlib styling for all figures of this notebook
plt_params = {
    'text.usetex' : use_tex,
    'font.size' : 20,
    'xtick.labelsize' : 18,
    'ytick.labelsize' : 18,
    'lines.linewidth': 1,
    'grid.linewidth':   2,
}
plt.rcParams.update(plt_params)

# one figure per configured location (follows `locations` instead of a hard-coded list)
for loc in locations:
    fig = plt.figure(figsize=(20,8))

    # plot the true target values for the test set versus the estimated values with the best model
    _ = plt.scatter(dfs_test[loc]['occupancy_rate'],
        dfs_test[loc]['pred_xgb_all_features'],
        alpha=0.1)
    # dashed diagonal = perfect prediction
    _ = plt.plot([0, 100], [0, 100], "k--", lw=3)
    _ = plt.xlabel(r'Measured parking occupancy $\left[\%\right]$')
    _ = plt.ylabel(r'Predicted parking occupancy $\left[\%\right]$')
    _ = plt.title('All features: True values vs predicted values ({})'.format(loc.capitalize()))

    # provide MAE and RMSE details to the plot
    textstr = '\n'.join((
        r'MAE: %.2f' % (maes[loc]),
        r'RMSE: %.2f' % (rmses[loc])
    ))

    # style infobox
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

    # place a text box in upper left in axes coords
    _ = plt.text(0.03, 0.95, textstr, verticalalignment='top', bbox=props, transform=plt.gca().transAxes)

    if save_figs:
        # export - pay attention to an appropriate name
        filename = 'f_all_' + loc + '.png'
        plt.savefig('../05_visualisations_of_eda/' + filename)


<a id='appendix'></a>
## 5. Appendix

<a id='single_model'></a>

### 1. Single Model

In the subsequent code section, a single model is generated to predict occupancies for multiple parking locations. The data is selected such that only entries from a date range present in both datasets are considered. In order to distinguish the data, an additional column `loc` is added, which is a categorical feature and thus one-hot encoded.

In [None]:
# Find overlapping window: latest start and earliest end of the two datasets
start_date = max(dfs['burgdorf'].index[0], dfs['rapperswil'].index[0])
end_date = min(dfs['burgdorf'].index[-1], dfs['rapperswil'].index[-1])

if not (start_date < end_date):
    raise AssertionError("Non-overlapping time windows!")

# Merge dataframes by time and location name: restrict each site to the common window and
# tag its rows with the site name. pd.concat is used because DataFrame.append is no longer
# available in current pandas versions.
parts = []
for site in ['burgdorf', 'rapperswil']:
    in_window = (dfs[site].index >= start_date) & (dfs[site].index <= end_date)
    parts.append(dfs[site].loc[in_window].assign(loc=site))

dfcomb = pd.concat(parts)
dfcomb.sort_values(['date', 'loc'],inplace=True)

print('Merged datasets (total {} entries)'.format(len(dfcomb)))

# Perform Train/Test split at the timestamp found at the 70 % position
splitcomb = 0.7 * len(dfcomb)
comb_train, comb_test = split(dfcomb, dfcomb.index[int(splitcomb)])
print('Cutoff date:\t{}\t({}/{} entries)'.format(dfcomb.index[int(splitcomb)], len(comb_train), len(comb_test)))


In [None]:
## Selects the model input columns and, optionally, a label column from the combined dataframe.
# @param df     the dataframe.
# @param label  the label of the column to be extracted (optional).
#
# @returns the feature columns of @p df and the column indexed by @p label, or None if no label was given.
def extract_combined_features(df:pd.DataFrame, label:str=None):
    # 'date_only' is deliberately not part of the features
    feature_cols = [
        # calendar features
        'hour', 'day_of_week', 'quarter', 'month',
        'day_of_year', 'day_of_month', 'week_of_year',
        # current conditions
        'temperature', 'weather', 'holiday',
        # lagged target values
        't-7', 't-3', 't-2', 't-1',
        # name of the parking location
        'loc',
    ]

    X = df[feature_cols]
    y = df[label] if label else None
    return X, y


In [None]:
# split the combined train/test frames into model inputs and the occupancy label
train_parts = extract_combined_features(comb_train, label='occupancy_rate')
test_parts = extract_combined_features(comb_test, label='occupancy_rate')
combX_train, comby_train = train_parts
combX_test, comby_test = test_parts

# scale / encode; the preprocessing pipeline is fitted on the training part only
comb_transformed = standardize_features(combX_train, combX_test)
combX_train, combX_test, comb_pipeline = comb_transformed


In [None]:
t_start_comb, t_end_comb, comb_params = None, None, None

if use_default:
    # hard-coded hyperparameters (presumably the outcome of an earlier search run)
    comb_params = {
        'n_estimators': 200,
        'min_child_weight': 73,
        'max_leaf_nodes': 165,
        'max_depth': 31,
        'gamma': 1,
        'eta': 0.060000000000000005,
        'colsample_bytree': 0.9}

else:
    # candidate values for every tuned hyperparameter
    random_grid = {
        # number of boosted trees
        'n_estimators': list(range(200, 2000, 25)),
        # learning rate
        'eta': np.arange(0.01, 1.0, 0.05).tolist(),
        # maximum depth of a tree
        'max_depth': list(range(1, 250, 5)),
        # minimum sum of weights of all observations required in a child
        'min_child_weight': list(range(1, 100, 8)),
        # regularization
        'gamma': [0, 1, 5],
        # fraction of columns used by each tree
        'colsample_bytree': [0.8, 0.9, 1],
        # maximum number of leaf nodes in a tree (None = no explicit value)
        'max_leaf_nodes': list(range(10, 300, 5)) + [None],
    }

    # a fresh, untuned model
    regressor_rnd = xgb.XGBRegressor(random_state=42)

    # folds respect the time order, so validation data always follows the training data
    cv = TimeSeriesSplit(n_splits=3)

    # try 75 random combinations with 3-fold cross validation on all available cores;
    # a failing fit aborts the search instead of being scored
    xbg_random = RandomizedSearchCV(
        estimator=regressor_rnd,
        param_distributions=random_grid,
        n_iter=75,
        cv=cv,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        error_score='raise',
    )

    # run the search and record how long it takes
    t_start_comb = time.time()
    xbg_random.fit(combX_train, comby_train)
    t_end_comb = time.time()

    comb_params = xbg_random.best_params_

    print("Grid-search took {} seconds.".format(t_end_comb - t_start_comb))
    print("Best parameters are:")
    # bare expression, meant to be rendered by the notebook
    comb_params
    print("\n\n")


In [None]:
# Build the combined model with the selected hyperparameters
regressor_comb = xgb.XGBRegressor(**comb_params, random_state=42)

# train on the combined training set and time the fit
t_start_comb = time.time()
_ = regressor_comb.fit(combX_train, comby_train)
t_end_comb = time.time()

fit_seconds = t_end_comb - t_start_comb
print("Fitting took {} seconds.".format(fit_seconds))


In [None]:
# Run predictions on the combined test set and time them
t_start_comb = time.time()
comb_test['pred_xgb_all_features'] = regressor_comb.predict(combX_test)
t_end_comb = time.time()

print("Prediction ({} entries) took {} seconds.".format(len(comb_test), t_end_comb - t_start_comb))

maes_comb, rmses_comb = {}, {}

# error metrics over all rows and separately per parking location
for val in ['all', 'burgdorf', 'rapperswil']:
    subset = comb_test if val == 'all' else comb_test[comb_test['loc'] == val]

    maes_comb[val] = mean_absolute_error(y_true=subset.occupancy_rate, y_pred=subset.pred_xgb_all_features)
    # RMSE as the square root of the MSE; unlike the `squared=False` argument this
    # works with every scikit-learn version
    rmses_comb[val] = np.sqrt(mean_squared_error(y_true=subset.occupancy_rate, y_pred=subset.pred_xgb_all_features))

    print("Prediction for {}:\tMAE = {}\tRMSE = {}".format(val.capitalize(), round(maes_comb[val], 2), round(rmses_comb[val], 2)))


In [None]:
# one figure for the whole combined test set and one per parking location
for loc in ['all', 'burgdorf', 'rapperswil']:
    fig = plt.figure(figsize=(20,8))

    # rows shown in this figure: everything, or only the rows of one location
    subset = comb_test if loc == 'all' else comb_test[comb_test['loc'] == loc]
    y_true = subset.occupancy_rate
    y_pred = subset.pred_xgb_all_features

    # plot the true target values versus the estimated values of the selected rows
    # (not of the whole test set, which would make all three figures identical)
    _ = plt.scatter(y_true, y_pred, alpha=0.1)
    # dashed diagonal = perfect prediction
    _ = plt.plot([0, 100], [0, 100], "k--", lw=3)
    _ = plt.xlabel(r'Measured parking occupancy $\left[\%\right]$')
    _ = plt.ylabel(r'Predicted parking occupancy $\left[\%\right]$')
    _ = plt.title('Combined Model (All features): True values vs predicted values ({})'.format(loc.capitalize()))

    # provide MAE and RMSE details to the plot
    textstr = '\n'.join((
        r'MAE: %.2f' % (maes_comb[loc]),
        r'RMSE: %.2f' % (rmses_comb[loc])
    ))

    # style infobox
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

    # place a text box in upper left in axes coords
    _ = plt.text(0.03, 0.95, textstr, verticalalignment='top', bbox=props, transform=plt.gca().transAxes)

    if save_figs:
        # export - pay attention to an appropriate name
        filename = 'combined_f_all_' + loc + '.png'
        plt.savefig('../05_visualisations_of_eda/' + filename)

<a id='rolling_predictions'></a>

### 2. Rolling Predictions

In the subsequent code section the previously generated models (individual models) are used to perform rolling predictions. This is used to reflect the prediction accuracy over larger time horizons. The following cases are considered:

- 12 hours
- 24 hours
- 7 days

In [None]:
# prediction horizons: label -> [number of prediction steps, time span]
horizons = {
    '12h': [12, pd.Timedelta('12h')],
    '24h': [24, pd.Timedelta('24h')],
    '7d': [168, pd.Timedelta('7d')]
}

rolling_pred = {}

if not use_default:
    # the first horizon creates the result rows, all later horizons fill in their column
    first_hor = next(iter(horizons))

    # lagged-target feature -> number of steps it looks back
    lag_steps = {'t-1': 1, 't-2': 2, 't-3': 3, 't-7': 7}

    for loc in locations:
        rolling_pred[loc] = pd.DataFrame(columns=horizons.keys()).assign(date=None)
        for hor in horizons:
            n_steps, span = horizons[hor]
            preds = [None] * n_steps

            rolling_start = time.time()
            print('Starting rolling prediction for {} over {}.'.format(loc.capitalize(), hor))

            for i in range(0, len(dfs_test[loc])):
                # if there is no reference entry left, skip the horizon
                if (dfs_test[loc].index[i] + span > dfs_test[loc].index[-1]):
                    break
                # same stop criterion on row positions, so that the row lookups below
                # stay in range even if the index has gaps
                if i + n_steps >= len(dfs_test[loc]):
                    break

                for j in range(0, n_steps):
                    entry, _ = extract_features(dfs_test[loc].iloc[[(i + j)]], label='occupancy_rate')
                    # work on an explicit copy and write through a single indexer; chained
                    # assignment (entry[col].iloc[0] = ...) is not guaranteed to modify `entry`
                    entry = entry.copy()

                    # replace measured lag values by the model's own earlier predictions
                    # as soon as those are available within this roll-out
                    for lag_col, lag in lag_steps.items():
                        if j >= lag:
                            entry.iloc[0, entry.columns.get_loc(lag_col)] = preds[j - lag]

                    preds[j] = regressors[loc].predict(pipelines[loc].transform(entry))[0]

                # NOTE(review): preds[-1] was computed from row i + n_steps - 1 but is stored
                # under the timestamp of row i + n_steps - confirm that this shift is intended.
                target_date = dfs_test[loc].index[i + n_steps]

                if hor == first_hor:
                    # new result row: first column = first horizon, last column = date
                    newline = [None] * (len(horizons) + 1)
                    newline[0] = preds[-1]
                    newline[-1] = target_date
                    rolling_pred[loc].loc[len(rolling_pred[loc])] = newline
                else:
                    rolling_pred[loc].loc[(rolling_pred[loc].date == target_date), hor] = preds[-1]

            rolling_end = time.time()
            # written after every horizon; the file of the last horizon holds all columns
            rolling_pred[loc].to_csv('../00_data/{}_{}_rolling.csv'.format(loc, hor))

            print('\ttook {} seconds.'.format(rolling_end - rolling_start))

        rolling_pred[loc].set_index('date', inplace=True)

else:
    for loc in locations:
        # load the cached results from the file of the last horizon; dates are parsed so
        # that the index holds timestamps, as it does when the predictions are computed
        rolling_pred[loc] = pd.read_csv('../00_data/{}_{}_rolling.csv'.format(loc, list(horizons.keys())[-1]),
                                        index_col=0, parse_dates=['date'])
        rolling_pred[loc].set_index('date', inplace=True)


In [None]:
maes_rolling, rmses_rolling = {}, {}

for loc in locations:
    maes_rolling[loc], rmses_rolling[loc] = {}, {}
    for hor in horizons:
        # predictions available for this horizon
        y_pred = rolling_pred[loc][hor][rolling_pred[loc][hor].notna()]

        # measured occupancy between the first and the last predicted timestamp
        # (assumes one measurement per predicted timestamp in that window - TODO confirm)
        in_window = (dfs[loc].index >= y_pred.index[0]) & (dfs[loc].index <= y_pred.index[-1])
        y_true = dfs[loc].occupancy_rate[in_window]

        maes_rolling[loc][hor] = mean_absolute_error(y_true=y_true, y_pred=y_pred)
        # RMSE as the square root of the MSE; unlike the `squared=False` argument this
        # works with every scikit-learn version
        rmses_rolling[loc][hor] = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

        print("Prediction for {} for a horizon of {}:\tMAE = {}\tRMSE = {}".format(loc.capitalize(), hor, round(maes_rolling[loc][hor], 2), round(rmses_rolling[loc][hor], 2)))


<a id='rnd_forest'></a>

### 3. Random forest

For hyperparameter optimization, `random grid search` with cross validation is applied to narrow down the range of reasonable values for the given parameters for the models. Then, `full grid search` with cross validation is applied with the value range obtained from the random grid search.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


1) Define random grid and run grid search

In [None]:
params_rf = {}

if not use_default:
    # number of trees in random forest
    n_estimators = [x for x in range(200, 2000, 25)]

    # number of features to consider at every split; 1.0 (= all features) is what
    # 'auto' meant for regressors and is accepted by old and new scikit-learn versions
    max_features = [1.0, 'sqrt']

    # maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)

    # important note - consider regularization for the next run

    # minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]

    # minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]

    # create grid
    random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf}

    for loc in locations:
        # create random forest model
        regressor_rnd = RandomForestRegressor(random_state=42)

        # ensure prediction is made on subsequent data
        cv = TimeSeriesSplit(n_splits=3)

        # random search of parameters, using 3 fold cross validation,
        # search across 75 different combinations, and use all available cores
        rf_random = RandomizedSearchCV(estimator = regressor_rnd, param_distributions = random_grid,
                                        n_iter = 75, cv = cv,
                                        verbose=2, random_state=42, n_jobs = -1)
        # fit the random forest model
        t_start[loc] = time.time()
        rf_random.fit(X_train[loc], y_train[loc])
        t_end[loc] = time.time()

        params_rf[loc] = rf_random.best_params_

        print("Grid-search for {} took {} seconds.".format(loc.capitalize(), t_end[loc] - t_start[loc]))
        print("Best parameters are:")
        # bare expression, meant to be rendered by the notebook
        params_rf[loc]
        print("\n\n")

else:
    # hard-coded hyperparameters; max_features=1.0 replaces the former 'auto' (all features)
    params_rf['burgdorf'] = {
        'n_estimators': 1275,
        'min_samples_split': 2,
        'min_samples_leaf': 2,
        'max_features': 1.0,
        'max_depth': 90
    }

    params_rf['rapperswil'] = {
        'n_estimators': 600,
        'min_samples_split': 2,
        'min_samples_leaf': 2,
        'max_features': 1.0,
        'max_depth': 20
    }
    

In [None]:
rfs = {}
maes_rf, rmses_rf = {}, {}
t_start, t_end = {}, {}

for loc in locations:
    rfs[loc] = RandomForestRegressor(**params_rf[loc], random_state=42)

    # fit the random forest with the selected hyperparameters and time the fit
    t_start[loc] = time.time()
    _ = rfs[loc].fit(X_train[loc], y_train[loc])
    t_end[loc] = time.time()

    # the measured interval covers the fit, not the prediction
    print("Fitting for {} took {} seconds.".format(loc.capitalize(), t_end[loc] - t_start[loc]))

    # X_test[loc] is the transformed feature matrix and cannot take a named column;
    # the predictions are stored in the test dataframe next to the true values instead
    dfs_test[loc]['random_forest'] = rfs[loc].predict(X_test[loc])

    maes_rf[loc] = mean_absolute_error(y_true=y_test[loc], y_pred=dfs_test[loc]['random_forest'])
    # RMSE as the square root of the MSE; unlike the `squared=False` argument this
    # works with every scikit-learn version
    rmses_rf[loc] = np.sqrt(mean_squared_error(y_true=y_test[loc], y_pred=dfs_test[loc]['random_forest']))

    print("\tMAE = {}\tRMSE = {}".format(round(maes_rf[loc], 2), round(rmses_rf[loc], 2)))


In [None]:
# summary of the random-forest test errors per location; the values come from the
# `maes_rf` / `rmses_rf` dictionaries because no single `df_test` dataframe exists
for loc in locations:
    print('{}:'.format(loc.capitalize()))
    print('MAE: ', round(maes_rf[loc], 2))
    print('RMSE: ', round(rmses_rf[loc], 2))