In [None]:
import pandas as pd
from prophet import Prophet
from prophet.plot import add_changepoints_to_plot 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

READING IN AND FORMATTING CBS DATA

In [None]:
# Read the processed CBS table from the project's processed-data directory.
# (OUTPUT_DIR is imported here as well but is not used in the cells shown.)
from uwv.config import CBS80072NED, CBS_OPENDATA_PROCESSED_DATA_DIR, OUTPUT_DIR

cbs_parquet_path = CBS_OPENDATA_PROCESSED_DATA_DIR / f"{CBS80072NED}.parquet"
cbs = pd.read_parquet(cbs_parquet_path)

In [None]:
# First month of each quarter: Q1 -> Jan, Q2 -> Apr, Q3 -> Jul, Q4 -> Oct
QUARTER_START_MONTH = {1: 1, 2: 4, 3: 7, 4: 10}

# Keep only rows that belong to an actual quarter (quarter number 0 is excluded)
has_quarter = cbs['period_quarter_number'] != 0
cbs = cbs[has_quarter]

# Derive the quarter's first month, then assemble a quarter-start 'date'
# (year, month, day=1) from the period columns
cbs = cbs.assign(month=cbs['period_quarter_number'].map(QUARTER_START_MONTH))
cbs = cbs.assign(
    date=pd.to_datetime({'year': cbs['period_year'], 'month': cbs['month'], 'day': 1})
)

# Index the frame by the new 'date' column
cbs = cbs.set_index('date')

In [None]:
# Select a single SBI category by its code (the 'sbi' column)
sbi_code = 'T001081'  # Example: A-U Alle economische activiteiten
is_selected_sbi = cbs['sbi'] == sbi_code

# Keep that category and restrict it to 2016 through 2023 with a
# label-based slice on the date index
filtered_cbs = cbs[is_selected_sbi].loc['2016-01-01':'2023-12-31']

In [None]:
# Declare the date index as quarter-start ('QS') data; this matches how the
# dates were built (first day of the first month of each quarter).
# NOTE(review): pandas checks an assigned freq against the index values, so this
# presumably raises if a quarter is missing or appears more than once - confirm
# against the data.
filtered_cbs.index.freq = 'QS'  # Set the frequency to quarterly

In [None]:
# Cast the target column to float so the model receives a numeric series
sick_leave_as_float = filtered_cbs['sick_leave_percentage'].astype(float)
filtered_cbs['sick_leave_percentage'] = sick_leave_as_float

START PROPHET CODE

In [None]:
# Build the two-column frame Prophet expects: 'ds' (timestamp) and 'y' (target).
# The date index is turned back into a column first so it can be selected.
prophet_columns = {'date': 'ds', 'sick_leave_percentage': 'y'}
prophet_data = (
    filtered_cbs.reset_index()[['date', 'sick_leave_percentage']]
    .rename(columns=prophet_columns)
)

In [None]:
# Trend flexibility knob passed to Prophet; tune this value to make the trend
# more or less flexible
CHANGEPOINT_PRIOR_SCALE = 0.1

# Create the model and fit it on the historical quarters
model = Prophet(changepoint_prior_scale=CHANGEPOINT_PRIOR_SCALE)
model.fit(prophet_data)

In [None]:
# Create a dataframe holding the historical dates plus 8 further quarters.
# The training dates are quarter *starts* (1 Jan / 1 Apr / 1 Jul / 1 Oct), so the
# future dates must be generated with the same 'QS' convention. Using 'QE' would
# produce quarter-end dates (31 Mar, 30 Jun, ...): the first "future" row would
# then fall inside the last observed quarter, and the seasonality would be
# evaluated at dates unlike the ones it was fitted on.
future_dates = model.make_future_dataframe(periods=8, freq='QS')  # 8 quarters into the future

In [None]:
# Make predictions for every date in future_dates (history + horizon).
# Prophet estimates yhat_lower / yhat_upper by random simulation using NumPy's
# global random state, so seed it first; otherwise the interval bounds change
# every time the notebook is re-run.
np.random.seed(42)
forecast = model.predict(future_dates)

In [None]:
# Show the last few forecast rows: date, point forecast and its lower/upper bounds
summary_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
forecast.loc[:, summary_columns].tail()

In [None]:
# Draw Prophet's built-in forecast figure and render it
forecast_figure = model.plot(forecast)
plt.show()

In [None]:
# Draw Prophet's built-in component breakdown of the forecast and render it
components_figure = model.plot_components(forecast)
plt.show()

In [None]:
# Attach the model's fitted value (yhat) to each historical observation.
# A left merge on 'ds' keeps exactly the rows of prophet_data.
fitted_values = forecast[['ds', 'yhat']]
comparison_df = pd.merge(prophet_data, fitted_values, how='left', on='ds')

In [None]:
# Remove rows with any missing value. comparison_df only holds historical dates
# (it was left-merged from prophet_data), so this discards observations whose
# actual or fitted value is missing rather than future dates.
comparison_df = comparison_df.dropna()

In [None]:
# Score the fitted values against the actuals (MAE, MSE, RMSE).
# Every row in comparison_df comes from the data the model was fitted on, so
# these are in-sample errors, not a measure of out-of-sample forecast accuracy.
actual_values = comparison_df['y']
fitted_series = comparison_df['yhat']

mae = mean_absolute_error(actual_values, fitted_series)
mse = mean_squared_error(actual_values, fitted_series)
rmse = np.sqrt(mse)  # root of the MSE, back in the units of the target

In [None]:
# Report the three error metrics, one per line
print(f'MAE: {mae}', f'MSE: {mse}', f'RMSE: {rmse}', sep='\n')

In [None]:
# Overlay the observed series and the model's fitted values on one axis
fig, ax = plt.subplots(figsize=(10, 6))
for column, series_label in (('y', 'Actual'), ('yhat', 'Predicted')):
    ax.plot(comparison_df['ds'], comparison_df[column], label=series_label)
ax.set_title('Actual vs Predicted Sick Leave Percentage')
ax.set_xlabel('Date')
ax.set_ylabel('Sick Leave Percentage')
ax.legend()
plt.show()

In [None]:
# Residual = actual minus fitted value; stored as a new column for plotting
comparison_df['residuals'] = comparison_df['y'].sub(comparison_df['yhat'])

# Residuals over time, with a dashed zero line as the "no error" reference
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(comparison_df['ds'], comparison_df['residuals'], label='Residuals', color='red')
ax.axhline(0, color='black', linestyle='--', linewidth=0.8)
ax.set_title('Residuals of the Predictions')
ax.set_xlabel('Date')
ax.set_ylabel('Residuals')
ax.legend()
plt.show()