In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Read the processed CBS table into a DataFrame.
# The file name is built from the CBS80072NED identifier and the file lives in
# the processed open-data directory, both taken from the project config.
# NOTE(review): OUTPUT_DIR is imported but not used in the visible cells.
from uwv.config import CBS80072NED, CBS_OPENDATA_PROCESSED_DATA_DIR, OUTPUT_DIR

cbs = pd.read_parquet(
    path=CBS_OPENDATA_PROCESSED_DATA_DIR / f"{CBS80072NED}.parquet",
)

In [3]:
# Keep only rows whose 'period_quarter_number' is not 0
# (presumably 0 marks non-quarterly/annual records -- TODO confirm against the dataset).
# .copy() detaches the filtered result from the original frame so the column
# assignments below operate on an independent object instead of a slice
# (avoids pandas' SettingWithCopyWarning).
cbs = cbs.loc[cbs['period_quarter_number'] != 0].copy()

# Translate the quarter number into the first calendar month of that quarter,
# then assemble a quarter-start timestamp from year / month / day=1.
cbs['month'] = cbs['period_quarter_number'].map({1: 1, 2: 4, 3: 7, 4: 10})
cbs['date'] = pd.to_datetime({'year': cbs['period_year'], 'month': cbs['month'], 'day': 1})

# Use the new 'date' column as the index. Reassigning (rather than
# inplace=True) keeps the step explicit and chain-friendly.
cbs = cbs.set_index('date')

In [4]:
# Select a single series by its code in the 'sbi' column.
sbi_code = 'T001081'  # Example: A-U Alle economische activiteiten

# Pipeline:
#   1. keep the rows for the chosen SBI code;
#   2. sort chronologically -- label-based slicing of a DatetimeIndex needs a
#      monotonic index when the bounds are not exact labels, and the positional
#      train/test split further down assumes time order;
#   3. restrict to dates from 2016 through 2023 (both bounds inclusive);
#   4. copy, so that later column assignments on `filtered_cbs` act on an
#      independent frame rather than on a slice of `cbs`.
filtered_cbs = (
    cbs.loc[cbs['sbi'] == sbi_code]
    .sort_index()
    .loc['2016-01-01':'2023-12-31']
    .copy()
)

In [5]:
# Tag the date index with a quarter-start ('QS') frequency.
# NOTE(review): the dropna() in the following cell builds a new frame; confirm
# the frequency set here is still present on the index afterwards.
quarterly_index = filtered_cbs.index
quarterly_index.freq = 'QS'

In [6]:
# Remove every row that contains at least one missing value in any column.
# NOTE(review): this also drops rows whose only NaN is in a column unrelated
# to 'sick_leave_percentage' -- confirm that is intended.
filtered_cbs = filtered_cbs.dropna(axis=0, how='any')

In [7]:
# Make sure 'sick_leave_percentage' is stored as float.
# astype with a column mapping returns a new frame instead of assigning into
# the existing one, so no SettingWithCopyWarning can be raised when
# `filtered_cbs` was produced by filtering another frame; all other columns
# are carried over unchanged.
filtered_cbs = filtered_cbs.astype({'sick_leave_percentage': float})

In [None]:
# Line chart of the sick-leave percentage over the date index.
filtered_cbs['sick_leave_percentage'].plot(
    title='Sick Leave Percentage',
    figsize=(16, 5),
)

In [None]:
# Seasonal decomposition to understand the components of the time series
# (multiplicative model).
from statsmodels.tsa.seasonal import seasonal_decompose

# period=4: the series is quarterly, i.e. four observations per seasonal cycle.
# Passing it explicitly means the call does not depend on the index still
# carrying a frequency attribute, which intermediate steps such as dropna()
# may not preserve; with a 'QS' frequency present the result is the same.
result = seasonal_decompose(
    filtered_cbs['sick_leave_percentage'],
    model='multiplicative',
    period=4,
)
result.plot()
plt.show()

In [None]:
# Show only the seasonal component extracted by the decomposition above.
result.seasonal.plot(
    title='Seasonal Component of Sick Leave Percentage',
    figsize=(18, 5),
)

In [11]:
# Hold out the most recent 8 quarters for evaluation; everything before that
# is used to fit the model.
train = filtered_cbs.head(-8)  # all rows except the last 8
test = filtered_cbs.tail(8)    # the last 8 rows

In [None]:
# Let pmdarima search for a seasonal ARIMA specification on the training
# series (m=4: four periods per seasonal cycle) and display its summary.
from pmdarima import auto_arima

auto_arima(
    y=train['sick_leave_percentage'],
    seasonal=True,
    m=4,
).summary()

In [13]:
# Fit the SARIMAX model without differencing
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [14]:
# SARIMAX specification for the training series:
#   order          = (p, d, q)    -> (1, 0, 1), i.e. no differencing
#   seasonal_order = (P, D, Q, s) -> (1, 0, 1, 4), no seasonal differencing,
#                                    season length of 4 quarters
# Stationarity and invertibility constraints are both switched off.
model = SARIMAX(
    endog=train['sick_leave_percentage'],
    order=(1, 0, 1),
    seasonal_order=(1, 0, 1, 4),
    enforce_stationarity=False,
    enforce_invertibility=False,
)

In [15]:
# Estimate the model parameters with the L-BFGS optimizer, allowing up to
# 1000 iterations.
results = model.fit(method='lbfgs', maxiter=1000)

In [None]:
# Print the fitted model's summary table as plain text.
print(results.summary().as_text())

In [17]:
# Predict the hold-out window: positions len(train) .. len(train)+len(test)-1,
# i.e. the observations immediately following the training sample.
start = len(train)
end = start + len(test) - 1
predictions = results.predict(start=start, end=end).rename('SARIMAX Predictions')

In [None]:
# Overlay the held-out actuals and the model predictions on the same axes.
ax = test['sick_leave_percentage'].plot(
    title='Sick Leave Percentage - Test vs Predictions',
    figsize=(15, 8),
    legend=True,
)
predictions.plot(ax=ax, legend=True)

In [None]:
# Root-mean-squared error between the held-out actuals and the predictions.
from statsmodels.tools.eval_measures import rmse

rmse_value = rmse(
    x1=test['sick_leave_percentage'],
    x2=predictions,
)
print(f'RMSE: {rmse_value}')

In [20]:
# Same SARIMAX specification as the evaluation model, but built on the full
# filtered series (training + test quarters):
#   order (1, 0, 1) and seasonal_order (1, 0, 1, 4) -- no differencing,
#   stationarity / invertibility constraints disabled.
final_model = SARIMAX(
    endog=filtered_cbs['sick_leave_percentage'],
    order=(1, 0, 1),
    seasonal_order=(1, 0, 1, 4),
    enforce_stationarity=False,
    enforce_invertibility=False,
)

In [None]:
# Fit the full-sample model with the L-BFGS optimizer (up to 1000 iterations).
final_results = final_model.fit(method='lbfgs', maxiter=1000)

In [22]:
# Forecast beyond the end of the sample: the window starts at the position
# right after the last observation and spans `forecast_steps` quarters.
forecast_steps = 8  # Number of quarters to forecast
fcast = final_results.predict(
    start=len(filtered_cbs),
    end=len(filtered_cbs) + forecast_steps - 1,
).rename('SARIMAX Forecast')

In [None]:
# Draw the observed series and the out-of-sample forecast on shared axes.
ax = filtered_cbs['sick_leave_percentage'].plot(
    title='Sick Leave Percentage - Actual vs Forecast',
    figsize=(15, 8),
    legend=True,
)
fcast.plot(ax=ax, legend=True)

plt.show()