In [None]:
import pandas as pd
import altair as alt
from pathlib import Path

# Identifier of the source table; TABLE_PATH is the folder under ./data named after it.
TABLE_ID = '80072ned'
TABLE_PATH = Path(f'./data/{TABLE_ID}')

# Switch Altair to its 'png' renderer so charts are emitted as static images.
# NOTE(review): presumably needed so charts survive export to PDF - confirm.
alt.renderers.enable('png')

# UWV Exploratory Analysis

In [None]:
# Read the table from the parquet file named after TABLE_ID inside TABLE_PATH,
# then print a structural summary of the resulting frame.
slp: pd.DataFrame = pd.read_parquet(TABLE_PATH / f'{TABLE_ID}.parquet')
slp.info()

In [None]:
# Display the loaded frame for a first visual inspection.
slp

In [None]:
# Count how many rows fall under each category group title.
slp.category_group_title.value_counts()

## Get train and test

We will use 2022 and up as the final test data.
All data prior to 2022 will be training data. To test the trained model, we will use 2021. So we get three splits:

- All data from 2013 onward and prior to 2021 is the real train data. This is the data to perform exploratory data analysis on. 
- All data from 2021 will be the test set to test our trained models on.
- When we are really done, 2022 and onwards will be the final test set. 

Additionally, we will only use the quarterly numbers (period_type = 'KW')

In [None]:
# Every split keeps only the quarterly rows (period_type 'KW').
# Final hold-out set: 2022 and later.
slp_test = slp.loc[slp.period_year.ge(2022) & slp.period_type.eq('KW')]

# Training data: years after 2012 and before 2021.
slp_train = slp.loc[
    slp.period_year.gt(2012)
    & slp.period_year.lt(2021)
    & slp.period_type.eq('KW')
]

# Intermediate test set for trained models: the year 2021 only.
slp_train_test = slp.loc[slp.period_year.eq(2021) & slp.period_type.eq('KW')]

In [None]:
# Print a structural summary of the training split.
slp_train.info()

In [None]:
# Sick leave percentage per period, drawn as one coloured line per sbi_title.
alt.Chart(slp_train).mark_line().encode(
    alt.X('period'),
    alt.Y('sick_leave_percentage'),
    alt.Color('sbi_title'),
)

In [None]:
# Distribution of sick leave percentage within each period, as box plots.
alt.Chart(slp_train).mark_boxplot().encode(
    alt.X('period'),
    alt.Y('sick_leave_percentage'),
)

In [None]:
# Restrict the training data to the rows whose sbi code is 'T001081'.
# NOTE(review): presumably this code is the all-sectors total - confirm.
slp_train_total = slp_train.loc[slp_train.sbi.eq('T001081')]

# Sick leave percentage against quarter number, coloured by year.
alt.Chart(
    slp_train_total,
    title='Seasonality of T001081 sick leave %',
).mark_line().encode(
    alt.X('period_quarter_number'),
    alt.Y('sick_leave_percentage'),
    alt.Color('period_year'),
)

In [None]:
# Scatter of sick leave percentage against the (ordinal) quarter number for
# all rows in the training split, coloured by year.
(
    alt.Chart(slp_train)
    .mark_point()
    .encode(
        alt.X(alt.repeat("column"), type='ordinal'),
        alt.Y('sick_leave_percentage'),
        alt.Color('period_year'),
    )
    .repeat(column=['period_quarter_number'])
    .properties(title='Seasonality sick leave % (all categories)')
)

In [None]:
# Same scatter as for the full training split, restricted to the T001081 rows.
(
    alt.Chart(slp_train_total)
    .mark_point()
    .encode(
        alt.X(alt.repeat("column"), type='ordinal'),
        alt.Y('sick_leave_percentage'),
        alt.Color('period_year'),
    )
    .repeat(column=['period_quarter_number'])
    .properties(title='Seasonality of T001081 sick leave %')
)

In [None]:
# One scatter panel per quarter (1-4), placed side by side.
# The panels are generated from a single definition instead of four
# copy-pasted ones, so they cannot drift apart when the encoding is edited.
alt.hconcat(*[
    alt.Chart(slp_train[slp_train.period_quarter_number == quarter])
    .mark_point()
    .encode(
        x='period',
        y='sick_leave_percentage',
        color='period_year',
    )
    for quarter in range(1, 5)
])

In [None]:

# One scatter panel per quarter (1-4) for the T001081 rows, placed side by side.
# The panels are generated from a single definition instead of four
# copy-pasted ones, so they cannot drift apart when the encoding is edited.
alt.hconcat(*[
    alt.Chart(slp_train_total[slp_train_total.period_quarter_number == quarter])
    .mark_point()
    .encode(
        x='period',
        y='sick_leave_percentage',
        color='period_year',
    )
    for quarter in range(1, 5)
])

In [None]:
from pandas.plotting import lag_plot
import matplotlib.pyplot as plt

# Lag plots of the T001081 series for lags 1 through 8, on a 2x4 grid.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))

# axes.flat walks the grid row by row, so lag 1-4 fill the top row and 5-8 the bottom.
for lag, ax in zip(range(1, 9), axes.flat):
    lag_plot(slp_train_total.sick_leave_percentage, lag=lag, ax=ax)

plt.show()

In [None]:
import math

start_lag = 0
lag_length = 21

# Border at +/- 1.96 / sqrt(n), with n the number of observations in the series.
# NOTE(review): presumably the usual approximate 95% white-noise band - confirm.
white_noise_border = 1.96 / math.sqrt(len(slp_train_total.sick_leave_percentage))

# The border lines run one lag further than the bars on either side.
wn_border = pd.DataFrame({'lag': range(start_lag - 1, lag_length + 1)})
wn_border['pos_white_noise_border'] = white_noise_border
wn_border['neg_white_noise_border'] = -white_noise_border

# Autocorrelation of the series with itself for every lag in [start_lag, lag_length).
lagged_auto_correlation = pd.DataFrame({'lag': range(start_lag, lag_length)})
lagged_auto_correlation['auto_correlation'] = [
    slp_train_total.sick_leave_percentage.autocorr(lag=lag)
    for lag in range(start_lag, lag_length)
]

# Bars for the autocorrelations, with the two dashed border lines layered on top.
alt.layer(
    alt.Chart(lagged_auto_correlation).mark_bar().encode(
        x='lag',
        y='auto_correlation',
    ),
    alt.Chart(wn_border).mark_line(strokeDash=[1,1]).encode(
        x='lag',
        y='pos_white_noise_border',
    ),
    alt.Chart(wn_border).mark_line(strokeDash=[1,1]).encode(
        x='lag',
        y='neg_white_noise_border',
    ),
)

In [None]:
# The raw series next to its centred moving averages for the odd windows 3, 5, ..., 15.
moving_average = pd.DataFrame({
    'quarter': slp_train_total.period,
    'sick': slp_train_total.sick_leave_percentage,
    **{
        f'{window}-MA': slp_train_total.sick_leave_percentage.rolling(window, center=True).mean()
        for window in range(3, 16, 2)
    },
})

moving_average


In [None]:
# One line chart for the raw series followed by one per moving-average column,
# stacked vertically.
charts = [
    alt.Chart(moving_average).mark_line().encode(x='quarter', y=column)
    for column in ['sick', *(f'{window}-MA' for window in range(3, 16, 2))]
]

alt.vconcat(*charts)

In [None]:
from statsmodels.tsa.seasonal import STL

# Sick leave percentage of the T001081 rows, indexed by period.
# set_index returns a new object, so neither slp_train_total nor the column
# Series it hands out gets its index overwritten. Assigning `.index` on the
# column directly can leak into the frame on pandas without copy-on-write and
# misalign other cells that read slp_train_total.sick_leave_percentage on re-run.
slp_series = slp_train_total.set_index('period')['sick_leave_percentage']
slp_series

In [None]:

# Use a small font for the multi-panel figure below.
# NOTE(review): plt.rc changes matplotlib's global rc settings, so figures
# drawn after this cell (and earlier cells when re-run) also get size 6 - confirm intended.
plt.rc("font", size=6)
# STL on the period-indexed series with a seasonal period of 4 observations.
stl = STL(slp_series, period=4)
res = stl.fit()

# Plot the fitted result and auto-format the x-axis tick labels.
fig = res.plot()
fig.autofmt_xdate()



In [None]:
from statsmodels.graphics.tsaplots import plot_pacf

# Partial autocorrelation of the series for lags up to 15.
# NOTE(review): alpha=0.1 presumably requests 90% confidence bands - confirm against statsmodels docs.
plot_pacf(slp_series, lags=15, alpha=0.1)
plt.show()