In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Load the data
from uwv.config import CBS80072NED, CBS_OPENDATA_PROCESSED_DATA_DIR, OUTPUT_DIR

# Build the parquet path from the configured directory and the CBS80072NED
# name, then read the file into a DataFrame.
cbs_parquet_path = CBS_OPENDATA_PROCESSED_DATA_DIR / f"{CBS80072NED}.parquet"
cbs = pd.read_parquet(cbs_parquet_path)

In [None]:
# Drop rows whose 'period_quarter_number' is 0 (presumably non-quarterly
# aggregates such as yearly figures -- TODO confirm against the dataset).
# An explicit copy is taken so the column assignments below operate on an
# independent frame instead of a filtered slice of the loaded data, which
# avoids pandas' chained-assignment (SettingWithCopy) pitfalls.
QUARTER_START_MONTH = {1: 1, 2: 4, 3: 7, 4: 10}  # quarter number -> first month of that quarter

cbs_quarterly = cbs.loc[cbs['period_quarter_number'] != 0].copy()

# Map quarters to months and create the 'date' column (first day of each quarter)
cbs_quarterly['month'] = cbs_quarterly['period_quarter_number'].map(QUARTER_START_MONTH)
cbs_quarterly['date'] = pd.to_datetime(
    {'year': cbs_quarterly['period_year'], 'month': cbs_quarterly['month'], 'day': 1}
)

# Use the new 'date' column as the index. A new frame is bound to `cbs`
# rather than mutating in place, so re-running this cell is safe.
cbs = cbs_quarterly.set_index('date')

In [None]:
# Keep only the rows whose 'sbi' column matches the selected code
sbi_code = 'T001081'  # Example: "A-U Alle economische activiteiten" (all economic activities)

# Sort by date before slicing: label-based slicing on a DatetimeIndex needs a
# monotonic index when a bound (here '2023-12-31') is not itself an index
# label, otherwise pandas raises a KeyError. Sorting also guarantees the
# chronological order that the downstream time-series steps rely on.
filtered_cbs = cbs[cbs['sbi'] == sbi_code].sort_index()

# Filter the data to include only dates from 2016 to 2023 (both bounds inclusive)
filtered_cbs = filtered_cbs.loc['2016-01-01':'2023-12-31']

In [None]:
# Line chart of the sick_leave_percentage column over the filtered period
sick_leave_ax = filtered_cbs['sick_leave_percentage'].plot(figsize=(12, 8))
sick_leave_ax.set_title('Sick Leave Percentage (2016-2023)')
plt.show()

In [None]:
# Augmented Dickey-Fuller Test to check if the data is stationary
from statsmodels.tsa.stattools import adfuller

In [None]:
def adf_test(series, title=''):
    """
    Run an Augmented Dickey-Fuller test on a time series and print a report.

    Missing values are dropped before testing and the lag length is chosen
    with autolag='AIC'. The report (test statistic, p-value, lags used,
    number of observations and the critical values) is printed, followed by
    an interpretation based on a 0.05 p-value threshold.

    Parameters
    ----------
    series : pandas.Series
        The time series to test.
    title : str, optional
        Label shown in the report header.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    adf_stat, p_value, lags_used, n_obs, critical_values, *_ = adfuller(
        series.dropna(), autolag='AIC'
    )

    report = pd.Series(
        [adf_stat, p_value, lags_used, n_obs],
        index=['ADF test statistic', 'p-value', '# lags used', '# observations'],
    )
    for level, threshold in critical_values.items():
        report[f'critical value ({level})'] = threshold

    # .to_string() removes the trailing "dtype: float64" line
    print(report.to_string())

    reject_null = (
        "Strong evidence against the null hypothesis",
        "Reject the null hypothesis",
        "Data has no unit root and is stationary",
    )
    keep_null = (
        "Weak evidence against the null hypothesis",
        "Fail to reject the null hypothesis",
        "Data has a unit root and is non-stationary",
    )
    # The null hypothesis of the ADF test is that the series has a unit root.
    for message in (reject_null if p_value <= 0.05 else keep_null):
        print(message)

In [None]:
# Check the sick_leave_percentage series for stationarity with the ADF test
sick_leave_series = filtered_cbs['sick_leave_percentage']
adf_test(sick_leave_series, title='Sick Leave Percentage')

In [None]:
# Granger causality test (only meaningful if you have a second series to compare with).
# The next cell demonstrates it with a hypothetical second column named 'sbi_code' in
# `filtered_cbs`; this is a placeholder column name, not the `sbi_code` variable defined earlier.
# Replace 'sbi_code' with the actual column name you'd like to test against.

In [None]:
# Run the Granger causality test only when the comparison column exists
granger_columns = ['sick_leave_percentage', 'sbi_code']
if granger_columns[1] in filtered_cbs.columns:
    from statsmodels.tsa.stattools import grangercausalitytests
    granger_input = filtered_cbs[granger_columns].dropna()
    grangercausalitytests(granger_input, maxlag=3)

In [None]:
# Build a seeded (reproducible) 50x2 frame of random integers in [20, 30)
# standing in for a 'test' series and its 'predictions', then plot both.
np.random.seed(42)
random_values = np.random.randint(20, 30, (50, 2))
df_random = pd.DataFrame(random_values, columns=['test', 'predictions'])

random_ax = df_random.plot(figsize=(12, 8))
random_ax.set_title('Simulated Test vs Predictions')
plt.show()

In [None]:
# Calculating RMSE between test and predictions
from statsmodels.tools.eval_measures import rmse

In [None]:
# Root-mean-squared error between the simulated 'test' and 'predictions' columns
actual_values = df_random['test']
predicted_values = df_random['predictions']
rmse_value = rmse(actual_values, predicted_values)
print(f'RMSE between test and predictions: {rmse_value}')

In [None]:
# Resampling sick_leave_percentage to quarterly and plotting
from statsmodels.graphics.tsaplots import month_plot, quarter_plot

In [None]:
# Resample to quarter-start frequency (mean per quarter) and draw the
# per-quarter seasonal plot.
quarterly_sick_leave = filtered_cbs['sick_leave_percentage'].resample('QS').mean()
quarter_plot(quarterly_sick_leave)
plt.title('Quarterly Plot of Sick Leave Percentage')
plt.show()