In [1]:
# 0. Imports and Configuration
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error
from itertools import product
from pmdarima import auto_arima
from scipy.stats import boxcox
import plotly.graph_objects as go
from uwv.config import KNMI_PROCESSED_DATA_DIR, KNMI_AVG_TEMP, CBS_OPENDATA_PROCESSED_DATA_DIR, CBS80072NED

2024-11-12 16:04:05.990 | INFO     | uwv.config:<module>:11 - PROJ_ROOT path is: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\uwv


In [2]:
# 1. Load Data
# Both inputs are parquet files whose directory and base name come from uwv.config.
cbs_path = CBS_OPENDATA_PROCESSED_DATA_DIR / f"{CBS80072NED}.parquet"
knmi_path = KNMI_PROCESSED_DATA_DIR / f"{KNMI_AVG_TEMP}.parquet"

cbs = pd.read_parquet(cbs_path)
knmi = pd.read_parquet(knmi_path)

In [3]:
# 2. Merge the datasets on 'period_year' and 'period_quarter_number'
merge_keys = ['period_year', 'period_quarter_number']
# Inner join: only (year, quarter) combinations present in both frames survive.
cbsk = cbs.merge(knmi, how="inner", on=merge_keys)

In [4]:
# 3. Data Preprocessing
# Maps a quarter number to the first month of that quarter.
QUARTER_START_MONTH = {1: 1, 2: 4, 3: 7, 4: 10}

# Build a new frame in one chain instead of assigning columns onto a boolean-filtered
# slice (which triggers SettingWithCopyWarning) and mutating it with inplace=True.
# Steps: drop rows whose quarter number is 0, derive the quarter's first month,
# assemble a timestamp for the first day of the quarter, and use it as the index.
# NOTE(review): quarter 0 presumably marks non-quarterly (e.g. annual) rows - confirm.
cbsk = (
    cbsk.loc[cbsk['period_quarter_number'] != 0]
    .assign(month=lambda df: df['period_quarter_number'].map(QUARTER_START_MONTH))
    .assign(date=lambda df: pd.to_datetime({'year': df['period_year'], 'month': df['month'], 'day': 1}))
    .set_index('date')
)

In [5]:
# 4. Filter by specific 'sbi' code and date range
sbi_code = '422400'  # Example code

# Take an explicit copy: later cells add columns to filtered_cbs, and writing to a
# slice of cbsk raises SettingWithCopyWarning (and may not stick under copy-on-write).
# Rows containing any NaN are dropped *before* the frequency is declared, so the
# declared frequency describes the index the model will actually see.
filtered_cbs = (
    cbsk.loc[cbsk['sbi'] == sbi_code]
    .sort_index()  # label-based date slicing and freq validation need a monotonic index
    .loc['2008-01-01':'2024-12-31']
    .dropna()
    .copy()
)

# Declare the quarterly-start frequency. pandas validates this against the actual
# timestamps and raises ValueError if quarters are missing or duplicated, which is
# preferable to silently fitting a seasonal model on an irregular series.
filtered_cbs.index.freq = 'QS'

In [6]:
# 5. Transformations (if needed)
# Box-Cox on the sick-leave percentage shifted by +1; scipy returns the transformed
# values together with the fitted lambda.
# NOTE(review): later cells in this notebook model 'sick_leave_percentage' directly;
# 'sick_leave_boxcox' and lambda_val are not used further in the visible code.
shifted_sick_leave = filtered_cbs['sick_leave_percentage'] + 1
boxcox_values, lambda_val = boxcox(shifted_sick_leave)
filtered_cbs['sick_leave_boxcox'] = boxcox_values

In [7]:
# 6. Rolling average for 'avg_temp' as exogenous variable
# Mean over the current row and the four preceding rows (window of 5).
temp_rolling_mean = filtered_cbs['avg_temp'].rolling(window=5).mean()
# The first four rows have no full window; they are back-filled with the first
# available rolling mean.
# NOTE(review): back-filling copies a value computed from later rows into earlier
# ones - confirm this look-ahead is acceptable for the first training quarters.
filtered_cbs['avg_temp_rolling'] = temp_rolling_mean.bfill()

In [8]:
# 7. Define SARIMAX Model Selection Function
def find_best_sarimax(train, exog_train, p_range, d_range, q_range, P_range, D_range, Q_range, m):
    """Grid-search SARIMAX orders and keep the fit with the lowest AIC.

    Every (p, d, q) from the non-seasonal ranges is combined with every (P, D, Q)
    from the seasonal ranges; ``m`` is appended as the seasonal period. Each
    candidate is fitted with the Powell optimiser (up to 1000 iterations) with
    stationarity and invertibility enforcement switched off.

    Parameters
    ----------
    train : endogenous series passed to SARIMAX.
    exog_train : exogenous regressors passed to SARIMAX.
    p_range, d_range, q_range : iterables of candidate non-seasonal orders.
    P_range, D_range, Q_range : iterables of candidate seasonal orders.
    m : seasonal period.

    Returns
    -------
    tuple
        ``(fitted_results, (p, d, q), (P, D, Q))`` for the lowest-AIC candidate.
        The seasonal tuple does not include ``m``. All three entries are ``None``
        when no candidate could be fitted.

    Notes
    -----
    A candidate that raises is reported with ``print`` and skipped, so a single
    failing combination does not abort the search.
    """
    lowest_aic = float("inf")
    winner = (None, None, None)  # (fitted results, order, seasonal order)

    for order in product(p_range, d_range, q_range):
        for seasonal_order in product(P_range, D_range, Q_range):
            try:
                fitted = SARIMAX(
                    train,
                    order=order,
                    seasonal_order=(*seasonal_order, m),
                    exog=exog_train,
                    enforce_stationarity=False,
                    enforce_invertibility=False,
                ).fit(disp=False, maxiter=1000, method='powell')
                # Strict comparison: on ties the earliest candidate is kept.
                if fitted.aic < lowest_aic:
                    lowest_aic = fitted.aic
                    winner = (fitted, order, seasonal_order)
            except Exception as e:
                print(f"Error with {order} and {seasonal_order}: {e}")

    return winner

In [9]:
# 8. Split Data for Training and Testing
# Training window: everything up to the end of 2022. Test window: calendar year 2023.
train_rows = filtered_cbs.loc[:'2022-12-31']
test_rows = filtered_cbs.loc['2023-01-01':'2023-12-31']

# Target is a Series; the exogenous regressor is kept as a one-column DataFrame.
train = train_rows['sick_leave_percentage']
test = test_rows['sick_leave_percentage']
exog_train = train_rows[['avg_temp_rolling']]
exog_test = test_rows[['avg_temp_rolling']]

In [10]:
# 9. Broad Parameter Search
# Seasonal period: 4 observations per cycle.
m = 4

# Non-seasonal candidates.
p_range = range(0, 3)
d_range = [1, 2]
q_range = range(0, 3)

# Seasonal candidates (seasonal differencing fixed at 1).
P_range = range(0, 3)
D_range = [1]
Q_range = range(0, 3)

best_model, best_order, best_seasonal_order = find_best_sarimax(
    train,
    exog_train,
    p_range=p_range,
    d_range=d_range,
    q_range=q_range,
    P_range=P_range,
    D_range=D_range,
    Q_range=Q_range,
    m=m,
)

In [11]:
# 10. Refine Parameter Search around Best Values
def _neighbourhood(value):
    """Return the range value-1 .. value+1, with the lower bound clipped at 0."""
    return range(max(0, value - 1), value + 2)


# AR / MA orders are searched one step either side of the broad-search winner.
p_range = _neighbourhood(best_order[0])
q_range = _neighbourhood(best_order[2])
P_range = _neighbourhood(best_seasonal_order[0])
Q_range = _neighbourhood(best_seasonal_order[2])
# Seasonal differencing is pinned to the winning value; d_range and m are reused
# unchanged from the broad-search cell.
D_range = [best_seasonal_order[1]]

final_model, _, _ = find_best_sarimax(
    train, exog_train, p_range, d_range, q_range, P_range, D_range, Q_range, m
)

In [12]:
# 11. Make Predictions on Test Data
# Positions are counted from the start of the training sample, so the test window
# begins right after the last training observation.
start = len(train)
end = start + len(test) - 1

raw_predictions = final_model.predict(start=start, end=end, exog=exog_test)
predictions = raw_predictions.rename('SARIMAX Predictions')

In [13]:
# 12. Evaluate Predictions
# Mean absolute error over the full test window.
mae_all = mean_absolute_error(test, predictions)

# Mean absolute error for the first test observation only (reported as Q1).
first_actual = test.iloc[:1]
first_predicted = predictions.iloc[:1]
mae_q1 = mean_absolute_error(first_actual, first_predicted)

print(f'MAE for all quarters: {mae_all:.4f}, MAE for Q1: {mae_q1:.4f}')

MAE for all quarters: 0.6522, MAE for Q1: 0.1669


In [14]:
# 13. Define forecast steps and period for plotting
forecast_steps = 4  # Number of quarters to forecast
forecast_period = pd.date_range(start='2024-01-01', periods=forecast_steps, freq='QS')

# final_model was fitted on data up to the end of 2022, so calling get_forecast on it
# directly starts at 2023-Q1 and merely repeats the test-period predictions, which would
# then be plotted against the 2024 dates. Extend the fitted results with the observed
# 2023 values (parameters are kept, no refit) so the forecast origin is the end of 2023.
forecast_model = final_model.append(test, exog=exog_test)

# Future temperatures are not available here; the most recent rolling averages from the
# test window are reused as a stand-in for the forecast quarters.
future_exog = exog_test.iloc[-forecast_steps:]

# Generate forecast
forecast = forecast_model.get_forecast(steps=forecast_steps, exog=future_exog)

In [15]:
# 14. Visualization with Combined Plot for Test, Predictions, and Forecast
def _line_trace(x, y, name, color, dash=None):
    """Build a lines+markers trace of width 2; ``dash`` is only set when given."""
    line_style = {'color': color, 'width': 2}
    if dash is not None:
        line_style['dash'] = dash
    return go.Scatter(x=x, y=y, mode='lines+markers', name=name, line=line_style)


fig = go.Figure()

# Traces in drawing order: observed test values, in-sample test predictions,
# and the forecast for the future period.
plot_traces = [
    _line_trace(test.index, test, 'Actual', '#0078d2'),
    _line_trace(test.index, predictions, 'Predictions', 'orange', dash='dash'),
    _line_trace(forecast_period, forecast.predicted_mean, 'Forecast', 'green', dash='dot'),
]
for trace in plot_traces:
    fig.add_trace(trace)

# MAE values as text annotations above the plot area (paper coordinates).
mae_labels = [
    (1.15, f"MAE for all four quarters: {mae_all:.4f}"),
    (1.20, f"MAE for next quarter (Q1): {mae_q1:.4f}"),
]
for y_position, label in mae_labels:
    fig.add_annotation(
        xref="paper", yref="paper", x=0.00, y=y_position, showarrow=False,
        text=label,
        font={'size': 12, 'color': "black"},
    )

# Axis styling. The x-axis runs from the first test date to the last forecast date.
x_axis = {
    'showgrid': False,
    'tickformat': "%Y-%m",  # Year-Month format for clearer date labeling
    'range': [test.index.min(), forecast_period[-1]],
}
y_axis = {
    'showgrid': True,
    'gridcolor': 'lightgrey',
    'showline': True,
    'linewidth': 0.5,
    'linecolor': 'black',
}

# Layout. The title text is hard-coded rather than derived from sbi_code.
fig.update_layout(
    title='Sick Leave Percentage - Test, Predictions, and Forecast for Q Healthcare and Social Work',
    xaxis_title='Date',
    yaxis_title='Sick Leave Percentage',
    plot_bgcolor='white',
    xaxis=x_axis,
    yaxis=y_axis,
    legend={'x': 0.91, 'y': 1.5, 'traceorder': "normal"},
    font={'family': "Roboto", 'size': 14},
    margin={'l': 50, 'r': 50, 't': 80, 'b': 50},
    width=1100,
    height=500,
)

# Show Plot
fig.show()

# Optionally Save Plot (static JPEG and interactive HTML, written to the working directory)
fig.write_image("Sick_leave_predict_sarimax.jpeg")
fig.write_html("Sick_leave_predict_sarimax.html")
