## Parameters

In [1]:
# Where the data is located and the engineered features will be saved.
data_path = '../00_data/'

## Import dependencies

In [2]:
# display full output in Notebook, instead of only the last result
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# standard libraries
import numpy as np
import pandas as pd
import os
from datetime import datetime

# make this notebook's output stable across runs
np.random.seed(42)
    
# ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

## Import data

In [3]:
# file names
rapperswil_data = 'rapperswil.csv'
burgdorf_data = 'burgdorf.csv'
weather_rapperswil = 'weather_rapperswil.csv'
weather_burgdorf = 'weather_burgdorf.csv'
public_holidays_rapperswil = 'public_holidays_zurich.csv'
public_holidays_bern = 'public_holidays_bern.csv'

# function to import data
def load_data(data_path, data_file, **kwargs):
    csv_path = os.path.join(data_path, data_file)
    return pd.read_csv(csv_path, **kwargs)

# load weather data
df_weather_rapperswil = load_data(data_path, weather_rapperswil, sep=',')
df_weather_burgdorf = load_data(data_path, weather_burgdorf, sep=',')

# load parking data
df_rapperswil = load_data(data_path, rapperswil_data, sep=',')
df_burgdorf = load_data(data_path, burgdorf_data, sep=',')

# load public holiday data
# columns to keep
df_holidays_rapperswil = load_data(data_path, public_holidays_rapperswil, sep=';', encoding='latin1', usecols=['Date'])
df_holidays_burgdorf = load_data(data_path, public_holidays_bern, sep=';', encoding='latin1', usecols=['Date'])

print('Dataset length of Rapperswil data: {}'.format(len(df_rapperswil)))
print('Dataset length of Burgdorf data: {}'.format(len(df_burgdorf)))

Dataset length of Rapperswil data: 12324
Dataset length of Burgdorf data: 7171


## Data cleaning 

#### 0) Public holiday data

source: https://www.feiertagskalender.ch/

columns:
- **date:** represents a public holiday 

In [4]:
# Change Date column to date and convert to timestamp
df_holidays_rapperswil = df_holidays_rapperswil.rename(columns={'Date' : 'date'})
df_holidays_rapperswil['date'] = [datetime.strptime(i, '%d.%m.%Y').date() for i in df_holidays_rapperswil.date]

df_holidays_burgdorf = df_holidays_burgdorf.rename(columns={'Date' : 'date'})
df_holidays_burgdorf['date'] = [datetime.strptime(i, '%d.%m.%Y').date() for i in df_holidays_burgdorf.date]

#### 1) Parking data

columns:
- **datetime:** hourly
- **occupancy_rate:** avg. parking occupancy for given hour

In [5]:
#rename columns
df_rapperswil = df_rapperswil.rename(columns={'DATE': 'date', 'BELEGUNGSQUOTE (%)': 'occupancy_rate'})
df_burgdorf = df_burgdorf.rename(columns={'category': 'date', 'Auslastung': 'occupancy_rate'})

#convert date column into datetime format
df_rapperswil['date'] = pd.to_datetime(df_rapperswil['date'])
df_burgdorf['date'] = pd.to_datetime(df_burgdorf['date'])

#remove time zone
df_rapperswil['date'] = df_rapperswil['date'].apply(lambda x: x.replace(tzinfo=None))

Calculate hours for Burgdorf

In [6]:
# add hh:mm artificially for Burgdorf data since raw data does not explicitly provide HH:MM level but is based on HH:MM
# transform 1, 2, ... 23 to 01:00, 02:00 

count = 0
dates = []

for i in range(len(df_burgdorf.date)):
    if count == 23:
        dates.append(str(count)+':00')
        count = 0

    else:
        if count < 10:
            dates.append('{:02d}'.format(count)+':00')
        else:
            dates.append(str(count)+':00')
        count += 1

# Join dates with the corresponding hours
df_burgdorf['date'] = df_burgdorf['date'].astype(str) + ' ' + dates

# transform string date column to datetime column
df_burgdorf['date'] = pd.to_datetime(df_burgdorf['date'])

#### 2) Weather data

source: https://home.openweathermap.org/history_bulks/new (fee required)

columns:
- **temperature** (kelvin)
- **weather:** rain, clear, clouds, etc.
- **datetime:** hourly

In [7]:
# only keep essential columns
df_weather_rapperswil = df_weather_rapperswil[['dt', 'temp', 'weather_main']]
df_weather_burgdorf = df_weather_burgdorf[['dt', 'temp', 'weather_main']]

# convert date column into datetime format
df_weather_rapperswil['date'] = pd.to_datetime(df_weather_rapperswil['dt'], unit='s')
df_weather_burgdorf['date'] = pd.to_datetime(df_weather_burgdorf['dt'], unit='s')

# drop dt column
df_weather_rapperswil.drop(columns=['dt'], inplace = True)
df_weather_burgdorf.drop(columns=['dt'], inplace = True)

# rename columns
df_weather_rapperswil = df_weather_rapperswil.rename(columns={'weather_main': 'weather', 'temp' : 'temperature'})
df_weather_burgdorf = df_weather_burgdorf.rename(columns={'weather_main': 'weather', 'temp' : 'temperature'})

## Feature Engineering

We see the following variables as relevant for feature-based models:

**Historical features:**
- Hour
- Day of week
- Quarter
- Month
- Day of year
- Day of month
- Week of year

**Related features:**
- Weather type (clouds, rain, snow, etc.)
- Temperature (Kelvin)
- Public holiday (boolean value)

**Lag features:**
- *t-1* shift
- *t-2* shift
- *t-3* shift
- *t-7* shift


### Historical features

In [8]:
def time_features(df):
    """
    Input parking data.
    Creates several time dimensions, such as quarter or day of week.
    Return df with historical features.
    """
    df = df.copy()
    df['date_only'] = df['date'].dt.date # only for visualization purposes, not a feature
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_name'] = df['date'].dt.day_name()
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['day_of_year'] = df['date'].dt.dayofyear
    df['day_of_month'] = df['date'].dt.day
    df['week_of_year'] = df['date'].dt.weekofyear
    
    return df

In [9]:
# create historical features
df_rapperswil = time_features(df_rapperswil)
df_burgdorf = time_features(df_burgdorf)

### Related features

Data for related features (weather, holidays) already exists and, thus, needs to be merged with parking data

#### A) Merge weather data, parking and public holidays data

In [10]:
def merge_weather(df_p, df_w):
    df_park = df_p.copy()
    df_weather = df_w.copy()

    # set datetime as index for merging
    df_weather.set_index('date')
    df_park.set_index('date')

    # merge parking and weather data
    df_park = df_park.merge(df_weather, left_index=True, right_index=True)
    df_park.drop(['date_y'], axis=1, inplace=True)
    df_park = df_park.rename(columns={'date_x' : 'date'})
    df_park.set_index('date')

    return df_park

In [11]:
df_rapperswil = merge_weather(df_rapperswil, df_weather_rapperswil)
df_burgdorf = merge_weather(df_burgdorf, df_weather_burgdorf)

#### B) Add public holiday information

In [12]:
# create boolean column by identifying whether data corresponds to a public holiday
holidays = [i for i in df_holidays_rapperswil.date]
df_rapperswil['holiday'] = df_rapperswil['date_only'].isin(holidays)

holidays = [i for i in df_holidays_burgdorf.date]
df_burgdorf['holiday'] = df_burgdorf['date_only'].isin(holidays)

# convert true/false into integers
df_rapperswil.replace({'holiday': {False: 0, True: 1}}, inplace=True)
df_burgdorf.replace({'holiday': {False: 0, True: 1}}, inplace=True)

In [13]:
print('Length and number of columns of Rapperswil data: {}, {}'.format(len(df_rapperswil), len(df_rapperswil.columns)))
print('Length and number of columns of Burgdorf data: {}, {}'.format(len(df_burgdorf), len(df_burgdorf.columns)))

Length and number of columns of Rapperswil data: 12324, 14
Length and number of columns of Burgdorf data: 7171, 14


### Lag features

In [14]:
def lag_features(df_p):
    df_park = df_p.copy()

    # prepare lag data for rapperswil
    df_lags = pd.concat([df_park['occupancy_rate'].shift(7),
                        df_park['occupancy_rate'].shift(3),
                        df_park['occupancy_rate'].shift(2),
                        df_park['occupancy_rate'].shift(1)], axis=1)

    df_lags.columns = ['t-7', 't-3', 't-2', 't-1']

    # join lag df with parking and weather data of rapperswil
    df_park = pd.concat([df_lags, df_park], axis=1)

    # drop rows that contain NaN values caused by shifting
    df_park.dropna(inplace=True)

    # set datetime as index
    df_park = df_park.set_index('date')

    return df_park

In [15]:
df_rapperswil = lag_features(df_rapperswil)
df_burgdorf = lag_features(df_burgdorf)

In [16]:
print('Length and number of columns of Rapperswil data: {}, {}'.format(len(df_rapperswil), len(df_rapperswil.columns)))
print('Length and number of columns of Burgdorf data: {}, {}'.format(len(df_burgdorf), len(df_burgdorf.columns)))

Length and number of columns of Rapperswil data: 12317, 17
Length and number of columns of Burgdorf data: 7164, 17


In [17]:
# format columns
df_rapperswil = df_rapperswil.astype({'t-7': float,
                                      't-3': float,
                                      't-2': float,
                                      't-1': float,
                                      'occupancy_rate': float})

df_rapperswil['date_only'] = pd.to_datetime(df_rapperswil['date_only'])
df_burgdorf['date_only'] = pd.to_datetime(df_burgdorf['date_only'])

In [18]:
# Print to see if everything is ok :)
df_rapperswil
df_burgdorf

Unnamed: 0_level_0,t-7,t-3,t-2,t-1,occupancy_rate,date_only,hour,day_of_week,day_name,quarter,month,day_of_year,day_of_month,week_of_year,temperature,weather,holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-06-01 07:00:00,12.582781,6.643357,8.940397,10.927152,12.913907,2020-06-01,7,0,Monday,2,6,153,1,23,275.82,Clouds,1
2020-06-01 08:00:00,6.993007,8.940397,10.927152,12.913907,14.900662,2020-06-01,8,0,Monday,2,6,153,1,23,275.22,Clouds,1
2020-06-01 09:00:00,7.284768,10.927152,12.913907,14.900662,19.081272,2020-06-01,9,0,Monday,2,6,153,1,23,274.02,Clouds,1
2020-06-01 10:00:00,7.342657,12.913907,14.900662,19.081272,34.105960,2020-06-01,10,0,Monday,2,6,153,1,23,276.22,Clouds,1
2020-06-01 11:00:00,6.643357,14.900662,19.081272,34.105960,53.973510,2020-06-01,11,0,Monday,2,6,153,1,23,277.84,Clouds,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-10-27 08:00:00,3.478261,3.478261,5.241935,13.043478,46.568627,2021-10-27,8,2,Wednesday,4,10,300,27,43,297.88,Rain,0
2021-10-27 09:00:00,3.773585,5.241935,13.043478,46.568627,78.629032,2021-10-27,9,2,Wednesday,4,10,300,27,43,294.47,Rain,0
2021-10-27 10:00:00,3.225806,13.043478,46.568627,78.629032,85.887097,2021-10-27,10,2,Wednesday,4,10,300,27,43,291.80,Thunderstorm,0
2021-10-27 11:00:00,3.773585,46.568627,78.629032,85.887097,89.919355,2021-10-27,11,2,Wednesday,4,10,300,27,43,290.92,Rain,0


Unnamed: 0_level_0,t-7,t-3,t-2,t-1,occupancy_rate,date_only,hour,day_of_week,day_name,quarter,month,day_of_year,day_of_month,week_of_year,temperature,weather,holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-01-01 07:00:00,0.96,4.69,4.06,0.67,2.21,2021-01-01,7,4,Friday,1,1,1,1,53,275.86,Rain,1
2021-01-01 08:00:00,4.73,4.06,0.67,2.21,4.61,2021-01-01,8,4,Friday,1,1,1,1,53,275.38,Clouds,1
2021-01-01 09:00:00,4.69,0.67,2.21,4.61,4.88,2021-01-01,9,4,Friday,1,1,1,1,53,273.24,Clouds,1
2021-01-01 10:00:00,4.69,2.21,4.61,4.88,4.64,2021-01-01,10,4,Friday,1,1,1,1,53,276.45,Clouds,1
2021-01-01 11:00:00,4.69,4.61,4.88,4.64,3.75,2021-01-01,11,4,Friday,1,1,1,1,53,278.26,Clouds,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-10-26 14:00:00,68.67,60.86,48.33,36.77,22.25,2021-10-26,14,1,Tuesday,4,10,299,26,43,283.49,Clouds,0
2021-10-26 15:00:00,74.18,48.33,36.77,22.25,15.68,2021-10-26,15,1,Tuesday,4,10,299,26,43,282.34,Clouds,0
2021-10-26 16:00:00,77.38,36.77,22.25,15.68,11.74,2021-10-26,16,1,Tuesday,4,10,299,26,43,285.38,Clear,0
2021-10-26 17:00:00,72.18,22.25,15.68,11.74,5.53,2021-10-26,17,1,Tuesday,4,10,299,26,43,287.13,Clear,0


## Export Feature to CSV

In [20]:
df_rapperswil.to_csv(os.path.join(data_path, "features_rapperswil.csv"), sep=",")
df_burgdorf.to_csv(os.path.join(data_path, "features_burgdorf.csv"), sep=",")