In [1]:
# 0. Import the libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error
from pmdarima import auto_arima
from statsmodels.tsa.seasonal import seasonal_decompose

In [2]:
# 1. Load the dataset
file_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\merged_tables.csv"
df = pd.read_csv(file_path)

In [3]:
# 2. Filter for the specific branch
branch_name = 'Q Gezondheids- en welzijnszorg'
df = df[df['BedrijfstakkenBranchesSBI2008'].str.strip().str.lower() == branch_name.strip().lower()]

In [4]:
# 3. Verify and process the 'Year' and 'Quarter' columns
if 'Year' not in df.columns or 'Quarter' not in df.columns:
    raise KeyError("The 'Year' and 'Quarter' columns are required in the dataset.")

In [5]:
# 4. Create a 'Date' column
df['Year'] = df['Year'].astype(int)
df['Quarter'] = df['Quarter'].astype(int)
df['Date'] = pd.PeriodIndex.from_fields(year=df['Year'], quarter=df['Quarter'], freq='Q').to_timestamp()

In [6]:
# 5. Drop rows where 'Date' could not be created
df = df.dropna(subset=['Date'])

# Set 'Date' as the index for time-series analysis
df.set_index('Date', inplace=True, drop=True)
df = df.asfreq('QS')

In [7]:
# 6. Define target variable
target_column = '80072ned_Ziekteverzuimpercentage_1'

In [8]:
#Block 7: Handle Q1 2022 Outlier
# Identify the outlier
outlier_mask = (df['Year'] == 2022) & (df['Quarter'] == 1)

# Replace outlier with a smoothed value using seasonal decomposition
decomposition = seasonal_decompose(df[target_column].dropna(), model='additive', period=4)
df.loc[outlier_mask, target_column] = decomposition.trend[outlier_mask].fillna(decomposition.seasonal[outlier_mask])

# Debug: Confirm outlier replacement
print("Outlier Replaced for Q1 2022:\n", df.loc[outlier_mask, target_column])

# Apply smoothing for COVID-era data (2020-2021)
covid_mask = (df['Year'] >= 2020) & (df['Year'] <= 2021)
df.loc[covid_mask, target_column] = df[target_column].rolling(window=4, min_periods=1).mean()

# Debug: Confirm smoothing for COVID-era
print("Smoothed COVID-era Data:\n", df.loc[covid_mask, target_column])


Outlier Replaced for Q1 2022:
 Date
2022-01-01    7.65
Freq: QS-JAN, Name: 80072ned_Ziekteverzuimpercentage_1, dtype: float64
Smoothed COVID-era Data:
 Date
2020-01-01    5.875
2020-04-01    5.975
2020-07-01    6.125
2020-10-01    6.375
2021-01-01    6.425
2021-04-01    6.525
2021-07-01    6.625
2021-10-01    6.775
Freq: QS-JAN, Name: 80072ned_Ziekteverzuimpercentage_1, dtype: float64


In [9]:
# %% Block 8: Split Training and Testing Data
# Training for 2022 predictions: Data up to Q4 2021
train_df_2022 = df[df['Year'] <= 2021]
y_train_2022 = train_df_2022[target_column]

# Training for 2023 predictions: Data up to Q4 2022
train_df_2023 = df[df['Year'] <= 2022]
y_train_2023 = train_df_2023[target_column]

# Test data: Data from 2022 and 2023
test_df = df[(df['Year'] > 2021) & (df['Year'] <= 2023)]
y_test = test_df[target_column]


In [10]:
# %% Block 9: Optimize SARIMAX Parameters for 2022
print("Performing auto_arima for 2022 parameter optimization...")
auto_model_2022 = auto_arima(
    y_train_2022,
    seasonal=True,
    m=4,  # Quarterly data
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True
)

best_order_2022 = auto_model_2022.order
best_seasonal_order_2022 = auto_model_2022.seasonal_order
print(f"Optimal Parameters for 2022: order={best_order_2022}, seasonal_order={best_seasonal_order_2022}")

# %% Block 9.1: Optimize SARIMAX Parameters for 2023
print("Performing auto_arima for 2023 parameter optimization...")
auto_model_2023 = auto_arima(
    y_train_2023,
    seasonal=True,
    m=4,  # Quarterly data
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True
)

best_order_2023 = auto_model_2023.order
best_seasonal_order_2023 = auto_model_2023.seasonal_order
print(f"Optimal Parameters for 2023: order={best_order_2023}, seasonal_order={best_seasonal_order_2023}")

Performing auto_arima for 2022 parameter optimization...
Performing stepwise search to minimize aic
 ARIMA(2,2,2)(1,0,1)[4]             : AIC=14.929, Time=1.37 sec
 ARIMA(0,2,0)(0,0,0)[4]             : AIC=130.190, Time=0.01 sec
 ARIMA(1,2,0)(1,0,0)[4]             : AIC=57.472, Time=0.26 sec
 ARIMA(0,2,1)(0,0,1)[4]             : AIC=inf, Time=0.61 sec
 ARIMA(2,2,2)(0,0,1)[4]             : AIC=24.683, Time=1.73 sec
 ARIMA(2,2,2)(1,0,0)[4]             : AIC=inf, Time=1.33 sec
 ARIMA(2,2,2)(2,0,1)[4]             : AIC=16.828, Time=1.90 sec
 ARIMA(2,2,2)(1,0,2)[4]             : AIC=16.941, Time=1.68 sec
 ARIMA(2,2,2)(0,0,0)[4]             : AIC=22.780, Time=0.49 sec
 ARIMA(2,2,2)(0,0,2)[4]             : AIC=24.810, Time=1.08 sec
 ARIMA(2,2,2)(2,0,0)[4]             : AIC=17.105, Time=1.78 sec
 ARIMA(2,2,2)(2,0,2)[4]             : AIC=22.144, Time=2.10 sec
 ARIMA(1,2,2)(1,0,1)[4]             : AIC=inf, Time=1.54 sec
 ARIMA(2,2,1)(1,0,1)[4]             : AIC=14.896, Time=1.20 sec
 ARIMA(2,2,1

In [11]:
# Block 10: Fit SARIMAX for 2022 Predictions
print("Fitting SARIMAX model for 2022 predictions...")
model_2022 = sm.tsa.SARIMAX(
    y_train_2022,
    order=best_order_2022,
    seasonal_order=best_seasonal_order_2022,
    enforce_stationarity=False,
    enforce_invertibility=False
)
results_2022 = model_2022.fit(disp=False)

# Predict for 2022
forecast_2022 = results_2022.get_forecast(steps=4)  # 4 quarters of 2022
y_pred_2022 = forecast_2022.predicted_mean

Fitting SARIMAX model for 2022 predictions...


In [12]:
# Block 11: Fit SARIMAX for 2023 Predictions
print("Fitting SARIMAX model for 2023 predictions...")
model_2023 = sm.tsa.SARIMAX(
    y_train_2023,  # Training data up to Q4 2022
    order=(0, 1, 1),  # Optimal order from auto_arima
    seasonal_order=(0, 1, 2, 4),  # Optimal seasonal order from auto_arima
    enforce_stationarity=False,
    enforce_invertibility=False
)
results_2023 = model_2023.fit(disp=False)

# Predict for 2023
forecast_2023 = results_2023.get_forecast(steps=4)  # 4 quarters of 2023
y_pred_2023 = forecast_2023.predicted_mean

# Debug: Print predictions for 2023
print("Predictions for 2023:\n", y_pred_2023)


Fitting SARIMAX model for 2023 predictions...
Predictions for 2023:
 2023-01-01    8.341107
2023-04-01    8.120478
2023-07-01    7.876359
2023-10-01    8.530485
Freq: QS-JAN, Name: predicted_mean, dtype: float64


In [13]:
forecast_steps = len(y_test)  # Exclude additional steps beyond 2023

In [14]:
# 12. Debugging Test Data
print("Unique Years in y_test:", y_test.index.year.unique())


Unique Years in y_test: Index([2022, 2023], dtype='int32', name='Date')


In [15]:
# %% Block 13: Calculate MAE for 2022 and 2023

# Create masks for 2022
q1_2022_mask = (y_test.index.year == 2022) & (y_test.index.quarter == 1)
all_2022_mask = y_test.index.year == 2022

# Align the masks with the prediction index for 2022
q1_2022_mask_pred = y_pred_2022.index.quarter == 1
all_2022_mask_pred = y_pred_2022.index.year == 2022

# Calculate MAE for Q1 2022
mae_q1_2022 = mean_absolute_error(
    y_test[q1_2022_mask].dropna(),
    y_pred_2022[q1_2022_mask_pred].dropna()
)

# Calculate MAE for all quarters of 2022
mae_all_2022 = mean_absolute_error(
    y_test[all_2022_mask].dropna(),
    y_pred_2022[all_2022_mask_pred].dropna()
)

# Create masks for 2023
q1_2023_mask = (y_test.index.year == 2023) & (y_test.index.quarter == 1)
all_2023_mask = y_test.index.year == 2023

# Align the masks with the prediction index for 2023
q1_2023_mask_pred = y_pred_2023.index.quarter == 1
all_2023_mask_pred = y_pred_2023.index.year == 2023

# Calculate MAE for Q1 2023
mae_q1_2023 = mean_absolute_error(
    y_test[q1_2023_mask].dropna(),
    y_pred_2023[q1_2023_mask_pred].dropna()
)

# Calculate MAE for all quarters of 2023
mae_all_2023 = mean_absolute_error(
    y_test[all_2023_mask].dropna(),
    y_pred_2023[all_2023_mask_pred].dropna()
)

# Print results
print(f"MAE for Q1 2022: {mae_q1_2022:.4f}")
print(f"MAE for all quarters of 2022: {mae_all_2022:.4f}")
print(f"MAE for Q1 2023: {mae_q1_2023:.4f}")
print(f"MAE for all quarters of 2023: {mae_all_2023:.4f}")


MAE for Q1 2022: 0.7907
MAE for all quarters of 2022: 0.5360
MAE for Q1 2023: 0.2411
MAE for all quarters of 2023: 0.8421


In [16]:
# Visualization for 2022 & 2023 Predictions
fig = go.Figure()

# Add actual sick leave percentage line (test data)
fig.add_trace(go.Scatter(
    x=y_test.index,
    y=y_test,
    mode='lines+markers',
    name='Actual',
    line=dict(color='#0078d2', width=2)
))

# Add predictions line for test period
fig.add_trace(go.Scatter(
    x=y_test.index,
    y=y_pred_test,
    mode='lines+markers',
    name='Predictions',
    line=dict(color='orange', width=2, dash='dash')
))

# Add MAE values as annotations
fig.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.05, showarrow=False,
    text=f"MAE Q1 2022: {f'{mae_q1_2022:.4f}' if mae_q1_2022 is not None else 'N/A'}, "
         f"MAE All 2022: {f'{mae_all_2022:.4f}' if mae_all_2022 is not None else 'N/A'}",
    font=dict(size=12, color="black")
)
fig.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.00, showarrow=False,
    text=f"MAE Q1 2023: {f'{mae_q1_2023:.4f}' if mae_q1_2023 is not None else 'N/A'}, "
         f"MAE All 2023: {f'{mae_all_2023:.4f}' if mae_all_2023 is not None else 'N/A'}",
    font=dict(size=12, color="black")
)

# Set layout and design for better visualization
fig.update_layout(
    title=f'Sick Leave Predictions (2022-2023) - {branch_name}',
    xaxis_title='Date',
    yaxis_title='Sick Leave Percentage',
    plot_bgcolor='white',
    xaxis=dict(
        showgrid=False,
        tickformat="%Y-%m",
        range=[y_test.index.min(), "2023-12-31"]  # Restrict to 2023
    ),
    yaxis=dict(
        showgrid=True, gridcolor='lightgrey', showline=True, linewidth=0.5, linecolor='black'
    ),
    legend=dict(
        x=0.91, y=1.1, traceorder="normal"
    ),
    font=dict(family="Roboto", size=14),
    margin=dict(l=50, r=50, t=80, b=50),
    width=1100, height=500
)

# Show Plot
fig.show()

# Optionally Save Plot as HTML
fig.write_html("C:/Users/c.hakker/Downloads/Sick_leave_predict_2022_2023_Q_Healthcare.html")



NameError: name 'y_pred_test' is not defined

In [17]:
residuals = y_test - y_pred_test
print("Residuals for 2023:\n", residuals[y_test.index.year == 2023])

Residuals for 2023:
 Date
2023-01-01    1.002301
2023-04-01   -0.314767
2023-07-01   -0.871497
2023-10-01   -0.040314
Freq: QS-JAN, dtype: float64
