# Data preprocessing, model training and model selection

### Import of dependencies

In [1]:
# autosave every 60 seconds
%autosave 60

#display full output in Notebook, instead of only the last result
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from matplotlib.ticker import MaxNLocator
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

#import standard libraries
import numpy as np
import pandas as pd
import os


#seed NumPy's global random number generator with a fixed value so that
#results drawn from np.random are repeatable across runs
np.random.seed(42)
    
#silence only those warnings whose message starts with 'internal gelsd'
#(see SciPy issue #5998); all other warnings are still shown
import warnings
warnings.filterwarnings(action='ignore', message='^internal gelsd')

#preprocessing libraries
from sklearn.preprocessing import OrdinalEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

#model libraries
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

#evaluation libraries
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.base import clone

#store model
import pickle

Autosaving every 60 seconds


### Import data

In [10]:
#name of the data folder, located inside the current working directory
data_folder = '/00_data'
#join the parts with os.path.join instead of string concatenation so the
#path separator is correct on every operating system; the leading '/' is
#stripped first because os.path.join would otherwise treat the folder as an
#absolute path and discard the working directory
data_path = os.path.join(os.getcwd(), data_folder.lstrip('/'))

In [15]:
#file name of the Rapperswil dataset (looked up inside the data folder)
rapperswil_data = "rapperswil.csv"

#function to import data
def load_data(data_path, data_type, sep=';'):
    """Read one delimited text file into a pandas DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that contains the file.
    data_type : str
        Name of the file inside ``data_path`` (for example
        ``'rapperswil.csv'``).
    sep : str, default ';'
        Field delimiter passed on to ``pandas.read_csv``. The default keeps
        the previous behaviour (semicolon-separated files), so existing
        two-argument calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the file.
    """
    csv_path = os.path.join(data_path, data_type)
    return pd.read_csv(csv_path, sep=sep)

#read the Rapperswil file from the data folder into a DataFrame
df = load_data(data_path=data_path, data_type=rapperswil_data)

### Data cleaning

In [16]:
#replace the source column headers with short snake_case names
df = df.rename(
    columns={
        'Datum': 'date',
        'BELEGUNGSQUOTE (%)': 'occupancy_rate',
    }
)

In [17]:
#inspect the DataFrame after the columns have been renamed
df

Unnamed: 0,date,occupancy_rate
0,2020-08-01T01:00:00+01:00,36.163522
1,2020-08-01T04:00:00+01:00,9.748428
2,2020-08-01T06:00:00+01:00,9.119497
3,2020-08-01T10:00:00+01:00,16.981132
4,2020-08-01T19:00:00+01:00,71.698113
...,...,...
6341,2020-09-28T19:00:00+01:00,44.025157
6342,2020-09-28T23:00:00+01:00,11.006289
6343,2020-09-29T05:00:00+01:00,7.291667
6344,2020-09-29T09:00:00+01:00,47.798742
