# Baseline Modeling

## Import libraries and read in data

In [5]:
import pandas as pd
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataframe saved by 1_EDA_DC_Covid.ipynb and 2_Feature_Engineering.ipynb,
# parse 'Date' as datetimes, use it as the index, and order the rows by that index.
model_df = (
    pd.read_csv('./Data/model_df.csv')
    .assign(Date=lambda raw: pd.to_datetime(raw['Date']))
    .set_index('Date')
    .sort_index()
)

# Preview the first rows
model_df.head()

Unnamed: 0_level_0,Ward_1_Cases,Ward_2_Cases,Ward_3_Cases,Ward_4_Cases,Ward_5_Cases,Ward_6_Cases,Ward_7_Cases,Ward_8_Cases,Avg_Temp
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-04-01,10.0,13.0,3.0,11.0,7.0,11.0,6.0,7.0,47.6
2020-04-02,9.0,6.0,4.0,12.0,15.0,18.0,17.0,7.0,54.0
2020-04-03,16.0,13.0,2.0,27.0,24.0,24.0,21.0,18.0,56.8
2020-04-04,7.0,0.0,4.0,10.0,4.0,2.0,13.0,25.0,54.4
2020-04-05,12.0,15.0,6.0,23.0,10.0,31.0,21.0,7.0,57.1


## Create Baseline Time Series Model Using Persistence Algorithm

In [6]:
def get_base_model_rmses(df, target_cols=None, test_size=.15):
    """Print and return the persistence-baseline RMSE for each target column.

    The persistence algorithm forecasts the value at time 't' as the value
    observed one lag earlier ('t-1'). Each column is scored on the final
    ``test_size`` portion of the rows. The rows are not sorted here, so ``df``
    must already be in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one series per column.
    target_cols : iterable of column labels, optional
        Columns to score. When omitted, every column except the last one is
        scored; the last column is treated as the exogenous variable
        (Avg Temp) and skipped.
    test_size : float or int, default .15
        Forwarded unchanged to ``train_test_split`` to size the held-out tail.

    Returns
    -------
    dict
        Maps each scored column label to its baseline test RMSE, in the
        order the columns were scored.
    """
    if target_cols is None:
        # Excluding Avg Temp (the last column) since this is an exogenous variable
        target_cols = df.columns[:-1]

    rmses = {}
    for col in target_cols:

        # Creating lagged dataframe: 't-1' is the series shifted down one row
        lagged_df = pd.concat([df[col].shift(1), df[col]], axis=1)
        lagged_df.columns = ['t-1', 't']

        # shuffle=False keeps the original row order, so the test set is the
        # trailing block of rows; the train portion is not needed by a
        # persistence model and is discarded.
        _, X_test, _, y_test = train_test_split(
            lagged_df['t-1'], lagged_df['t'], test_size=test_size, shuffle=False
        )

        # Persistence forecast: the prediction for 't' is simply the 't-1'
        # value, so the lagged test values are scored directly.
        test_score = root_mean_squared_error(y_test, X_test)
        rmses[col] = test_score
        print(f'The baseline RMSE for {col} is {test_score}')
        print()

    return rmses

In [7]:
# Report the persistence-baseline RMSE for every column of model_df except the last (Avg_Temp)
get_base_model_rmses(model_df)

The baseline RMSE for Ward_1_Cases is 17.031388135386436

The baseline RMSE for Ward_2_Cases is 19.3502407783939

The baseline RMSE for Ward_3_Cases is 9.12414379544733

The baseline RMSE for Ward_4_Cases is 23.858484292328228

The baseline RMSE for Ward_5_Cases is 24.367918551765033

The baseline RMSE for Ward_6_Cases is 22.084548279900876

The baseline RMSE for Ward_7_Cases is 23.106964711571663

The baseline RMSE for Ward_8_Cases is 20.775313147185926

