## Import dependencies

In [34]:
# autosave every 60 seconds
%autosave 60

#display full output in Notebook, instead of only the last result
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from matplotlib.ticker import MaxNLocator
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

#standard libraries
import numpy as np
import pandas as pd
import os

#make this notebook's output stable across runs
np.random.seed(42)
    
#ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#ARIMA model
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

#indicate how many differencing rounds are necessary
from pmdarima.arima.utils import ndiffs

#preprocessing libraries
import json
from datetime import datetime, timezone
import time

#model libraries
from sklearn.preprocessing import OrdinalEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from fbprophet import Prophet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.base import clone

#viz
import seaborn as sns
# plt.style.use('fivethirtyeight')
sns.set_style("whitegrid")

#store model
import pickle

Autosaving every 60 seconds


## Import data

In [43]:
# data path
data_path = '/Users/christopherkindl/working/start-hack-2021/00_data/'

#define variables
rapperswil_data = 'rapperswil.csv'
burgdorf_data = 'burgdorf.csv'
weather_data = 'weather_rapperswil.csv'

#function to import data
def load_data(data_path, data_type):
    csv_path = os.path.join(data_path, data_type)
    return pd.read_csv(csv_path, sep=';')

#function to import data (,)
def load_data_2(data_path, data_type):
    csv_path = os.path.join(data_path, data_type)
    return pd.read_csv(csv_path, sep=',')

#load rapperswil data
df_weather = load_data_2(data_path, weather_data)
df_rapperswil = load_data(data_path, rapperswil_data)

## Data cleaning 

In [48]:
#rename columns
df_rapperswil = df_rapperswil.rename(columns={'Datum': 'date', 'BELEGUNGSQUOTE (%)': 'occupancy_rate'})

#convert date column into datetime format
df_rapperswil['date'] = pd.to_datetime(df_rapperswil['date'])

#remove time zone
df_rapperswil['date'] = df_rapperswil['date'].apply(lambda x: x.replace(tzinfo=None))

## Feature Engineering

We see the following features as relevant:

**Date-time features:**
- Hour
- Day of week
- Quarter
- Month
- Day of year
- Day of month
- Week of year

**Local features:**
- Weather type
- Temperature (Fahrenheit)
- Public holiday or not (Coming soon)

**Lag features:** (TBD)


### Date-time features

In [22]:
def time_features(df, label=None):
    """
    Input initial df of parking lots (e.g. df_rapperswil).
    Create several time dimensions, such as quarter or day of week.
    Return df with new time dimensions.
    """
    df = df.copy()
    #df['id'] = df.index
    df['date_only'] = df['date'].dt.date
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.dayofweek
    #df['day_name'] = df['date'].dt.day_name()
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    #df['year'] = df['date'].dt.year
    df['day_of_year'] = df['date'].dt.dayofyear
    df['day_of_month'] = df['date'].dt.day
    df['week_of_year'] = df['date'].dt.weekofyear
    
    return df

In [45]:
df_rapperswil = time_features(df_rapperswil)

### Local features

In [52]:
#only keep essential columns
df_weather = df_weather[['dt', 'temp', 'weather_main']]

#convert date column into datetime format
df_weather['date'] = pd.to_datetime(df_weather['dt'], unit='s')

# keep only essential columns
# df_weather = df_weather[['date', 'weather_main']]

#drop dt column
df_weather.drop(columns=['dt'], inplace = True)

# rename columns
df_weather = df_weather.rename(columns={'weather_main': 'weather', 'temp' : 'temperature'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_weather['date'] = pd.to_datetime(df_weather['dt'], unit='s')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [46]:
# apply function
df_rapperswil = time_features(df_rapperswil)

Merge weather data with parking data

In [98]:
merged_df = pd.merge(df_rapperswil, df_weather, how='inner', on = 'date')

# drop date_only column
merged_df = merged_df.drop('date_only', 1)

In [None]:
# set date as index
merged_df.set_index('date')

## Modelling, training and performance

1) General configurations

In [101]:
# split into X and y
X, y = merged_df.drop('occupancy_rate', axis=1), merged_df[['occupancy_rate']]

# number of cross-validation rounds for model evaluation
ts_cv = TimeSeriesSplit(n_splits=5)

2) Pre-processing pipeline
- drop (or impute) Nan values
- encode features
- standardise features (scaling)

In [56]:
# transform categorical values into integers
X['weather_num'] = pd.Categorical(X['weather'])
X['weather_num'] = X.weather_num.cat.codes

In [109]:
#split numerical and categorical columns
data_num = X.select_dtypes(include=[np.number])
data_cat = X.select_dtypes(include=[np.object])

#create data pipeline
num_pipeline = Pipeline([('std_scaler', StandardScaler())])

num_attribs = list(data_num)
cat_attribs = list(data_cat)

full_pipeline = ColumnTransformer([
        ('num', num_pipeline, num_attribs),
        ('cat', OneHotEncoder(), cat_attribs), 
    ])

X = full_pipeline.fit_transform(X)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data_cat = X.select_dtypes(include=[np.object])


In [None]:
#assign initial colum names in seperate list
cat_encoder = full_pipeline.named_transformers_['cat']
cat_encoder = list(cat_encoder.categories_)
cat_encoder_attribs = [str(cat_attribs[index]) + '_' + category for index,categories 
                       in enumerate(cat_hot_attribs) for category in categories]
attributes = num_attribs + cat_encoder_attribs 

3) Run models

In [110]:
# run cross-validation

# Init RF and CV
cv = TimeSeriesSplit(n_splits=3)
rf = RandomForestRegressor(n_estimators=250, random_state=42) # will be optimised by grid search

scores = cross_validate(rf, X, y, cv=cv, scoring='neg_mean_squared_error', return_estimator=True)

# Base RMSLE
base_rmsle = np.sqrt(-np.mean(scores["test_score"]))
print("Base Root Meat Squared Error is: {:.5f}".format(base_rmsle))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Base Root Meat Squared Error is: 7.71417


4) Feature importance

In [152]:
#get feature importance
for idx,estimator in enumerate(scores['estimator']):
    print("Features sorted by their score for estimator {}:".format(idx))
    feature_importances = pd.DataFrame(estimator.feature_importances_,
                                       index = attributes,
                                       columns=['importance']).sort_values('importance', ascending=False)
    print('-----------------------------------------------')
    print(feature_importances)

Features sorted by their score for estimator 0:
-----------------------------------------------
                      importance
hour                    0.758953
temperature             0.119762
day_of_week             0.048661
day_of_year             0.034109
day_of_month            0.014885
week_of_year            0.011609
month                   0.002842
weather_Clear           0.001690
quarter                 0.001561
weather_Rain            0.001538
weather_Clouds          0.001481
weather_Fog             0.001080
weather_Mist            0.000775
weather_Snow            0.000595
weather_Drizzle         0.000285
weather_Thunderstorm    0.000115
weather_Haze            0.000058
Features sorted by their score for estimator 1:
-----------------------------------------------
                      importance
hour                    0.765452
temperature             0.120031
day_of_week             0.048495
day_of_year             0.031244
day_of_month            0.011551
week_of_year    