In [1]:
# 0. Import the libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import statsmodels.api as sm
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.seasonal import seasonal_decompose

In [2]:
# 1. Load the dataset
# The CSV location can be overridden with the MERGED_TABLES_CSV environment
# variable so the notebook also runs on machines other than the original
# author's; the original OneDrive location remains the default.
default_file_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\merged_tables.csv"
file_path = os.environ.get("MERGED_TABLES_CSV", default_file_path)
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"Dataset not found at '{file_path}'. "
        "Set the MERGED_TABLES_CSV environment variable to the location of merged_tables.csv."
    )
df = pd.read_csv(file_path)

In [3]:
# 2. Filter for the specific branch
branch_name = 'Q Gezondheids- en welzijnszorg'  # Change this for other branches
# Match case-insensitively and ignore surrounding whitespace on both sides.
branch_mask = df['BedrijfstakkenBranchesSBI2008'].str.strip().str.lower() == branch_name.strip().lower()
if not branch_mask.any():
    raise ValueError(
        f"No rows found for branch '{branch_name}' in column 'BedrijfstakkenBranchesSBI2008'."
    )
# .copy() yields an independent frame, so the column assignments made in later
# cells do not write into a slice of the loaded data (SettingWithCopyWarning).
df = df[branch_mask].copy()

In [4]:
# 3. Verify that the 'Year' and 'Quarter' columns are present
# Both columns are used in the following step to build the quarterly 'Date' column.
required_columns = ('Year', 'Quarter')
if not all(column in df.columns for column in required_columns):
    raise KeyError("The 'Year' and 'Quarter' columns are required in the dataset.")

In [5]:
# 4. Create a 'Date' column
# Coerce to numeric first: a bare astype(int) aborts on missing or non-numeric
# entries, so such rows could never reach the "drop rows" step that follows.
year_values = pd.to_numeric(df['Year'], errors='coerce')
quarter_values = pd.to_numeric(df['Quarter'], errors='coerce')
valid_period = year_values.notna() & quarter_values.between(1, 4)
if not valid_period.any():
    raise ValueError("No rows with a usable 'Year' and 'Quarter' (1-4) were found.")
if not valid_period.all():
    print(f"Dropping {int((~valid_period).sum())} row(s) with an invalid 'Year' or 'Quarter'.")

# Work on a copy so the assignments below never write into a slice of another frame.
df = df.loc[valid_period].copy()
df['Year'] = year_values.loc[valid_period].astype(int)
df['Quarter'] = quarter_values.loc[valid_period].astype(int)
# Timestamp of the first day of each quarter (e.g. 2022 Q1 -> 2022-01-01).
df['Date'] = pd.PeriodIndex.from_fields(year=df['Year'], quarter=df['Quarter'], freq='Q').to_timestamp()

In [6]:
# 5. Drop rows without a 'Date', index the frame by 'Date', and put it on a
#    regular quarter-start grid for time-series analysis
df = (
    df
    .dropna(subset=['Date'])        # discard rows whose 'Date' is missing
    .set_index('Date', drop=True)   # 'Date' becomes the index and leaves the columns
    .asfreq('QS')                   # conform the index to quarter-start frequency
)

In [7]:
# 6. Define target variable
target_column = '80072ned_Ziekteverzuimpercentage_1'
# Fail fast with a descriptive message, mirroring the 'Year'/'Quarter' check,
# instead of surfacing a bare KeyError from a later step.
if target_column not in df.columns:
    raise KeyError(f"The target column '{target_column}' is required in the dataset.")

In [8]:
# 7. Handle Q1 2022 Outlier
# Identify the outlier
outlier_mask = (df['Year'] == 2022) & (df['Quarter'] == 1)
outlier_dates = df.index[outlier_mask.to_numpy()]

# Replace outlier with a smoothed value (the trend component) using seasonal
# decomposition. extrapolate_trend='freq' fills the NaN values the moving
# average leaves at both ends of the trend, so a trend value exists for every
# observed date. The previous fallback to the seasonal component alone is
# dropped: in an additive model that component is an offset around the trend,
# not a level, so it is not a usable replacement for the observation.
decomposition = seasonal_decompose(
    df[target_column].dropna(),
    model='additive',
    period=4,
    extrapolate_trend='freq',
)
# Only dates that were part of the decomposition receive a replacement; a
# flagged row whose target is missing stays missing.
smoothed_values = decomposition.trend.reindex(outlier_dates).dropna()
df.loc[smoothed_values.index, target_column] = smoothed_values

# NOTE(review): the replaced quarter is part of the 2022 test split, so the MAE
# reported later is measured against this smoothed value rather than the
# originally observed one - confirm that this is intended.

# Debug: Confirm outlier replacement
print("Outlier Replaced for Q1 2022:\n", df.loc[outlier_mask, target_column])

Outlier Replaced for Q1 2022:
 Date
2022-01-01    7.65
Freq: QS-JAN, Name: 80072ned_Ziekteverzuimpercentage_1, dtype: float64


In [9]:
# 8. Split data into training and testing
train_end_year = 2021
# The forecast is produced for the periods immediately after the last training
# observation, so the test year is derived from train_end_year instead of being
# hard-coded; otherwise changing train_end_year would compare forecasts with
# actuals from a different year.
test_year = train_end_year + 1
train_df = df[df['Year'] <= train_end_year]
test_df = df[df['Year'] == test_year]

if train_df.empty or test_df.empty:
    raise ValueError(
        f"Empty split: {len(train_df)} training row(s) up to {train_end_year}, "
        f"{len(test_df)} test row(s) for {test_year}."
    )

y_train = train_df[target_column]
y_test = test_df[target_column]

In [10]:
# 9. Optimize SARIMAX parameters
print("Performing auto_arima for parameter optimization...")

# Search configuration, gathered in one place so it can be read at a glance.
search_settings = dict(
    seasonal=True,
    m=4,  # Quarterly data
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True,
)
auto_model = auto_arima(y_train, **search_settings)

# Keep the selected orders for the SARIMAX fit in the next step.
best_order, best_seasonal_order = auto_model.order, auto_model.seasonal_order
print(f"Optimal Parameters: order={best_order}, seasonal_order={best_seasonal_order}")

Performing auto_arima for parameter optimization...
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(1,1,1)[4]             : AIC=-6.039, Time=1.73 sec
 ARIMA(0,1,0)(0,1,0)[4]             : AIC=23.368, Time=0.02 sec
 ARIMA(1,1,0)(1,1,0)[4]             : AIC=-9.697, Time=0.27 sec
 ARIMA(0,1,1)(0,1,1)[4]             : AIC=-6.009, Time=0.16 sec
 ARIMA(1,1,0)(0,1,0)[4]             : AIC=13.337, Time=0.02 sec
 ARIMA(1,1,0)(2,1,0)[4]             : AIC=-8.815, Time=0.43 sec
 ARIMA(1,1,0)(1,1,1)[4]             : AIC=-8.699, Time=0.34 sec
 ARIMA(1,1,0)(0,1,1)[4]             : AIC=-6.153, Time=0.36 sec
 ARIMA(1,1,0)(2,1,1)[4]             : AIC=inf, Time=1.91 sec
 ARIMA(0,1,0)(1,1,0)[4]             : AIC=-7.738, Time=0.23 sec
 ARIMA(2,1,0)(1,1,0)[4]             : AIC=-7.708, Time=0.52 sec
 ARIMA(1,1,1)(1,1,0)[4]             : AIC=-7.926, Time=0.52 sec
 ARIMA(0,1,1)(1,1,0)[4]             : AIC=-9.883, Time=0.13 sec
 ARIMA(0,1,1)(0,1,0)[4]             : AIC=10.043, Time=0.27 sec
 ARIMA(0,1,1

In [11]:
# 10. Fit the SARIMAX model
print(f"Fitting SARIMAX model for branch '{branch_name}'...")

# Orders come from the auto_arima search; both enforcement switches are disabled.
sarimax_options = {
    'order': best_order,
    'seasonal_order': best_seasonal_order,
    'enforce_stationarity': False,
    'enforce_invertibility': False,
}
model = sm.tsa.SARIMAX(y_train, **sarimax_options)
results = model.fit(disp=False)

# Forecast one step per observation in the test set
forecast_steps = len(y_test)
forecast = results.get_forecast(steps=forecast_steps)
y_pred_test = forecast.predicted_mean

Fitting SARIMAX model for branch 'Q Gezondheids- en welzijnszorg'...


In [12]:
# 11. Calculate MAE for 2022
# Create masks for Q1 and all quarters of 2022
q1_2022_mask = (y_test.index.quarter == 1)
all_2022_mask = (y_test.index.year == 2022)

# Pair actuals and predictions by position (the same pairing the plot uses) so
# that incomplete pairs are dropped together. Calling dropna() on each series
# separately can leave them with different lengths or shift which prediction
# is compared with which actual.
eval_pairs = pd.DataFrame(
    {
        'actual': np.asarray(y_test, dtype=float),
        'predicted': np.asarray(y_pred_test, dtype=float),
    },
    index=y_test.index,
)
q1_pairs = eval_pairs[q1_2022_mask].dropna()
all_pairs = eval_pairs[all_2022_mask].dropna()

# Calculate MAE
mae_q1_2022 = mean_absolute_error(q1_pairs['actual'], q1_pairs['predicted'])
mae_all_2022 = mean_absolute_error(all_pairs['actual'], all_pairs['predicted'])

# Print MAE results
print(f"MAE for Q1 2022: {mae_q1_2022:.4f}")
print(f"MAE for all quarters of 2022: {mae_all_2022:.4f}")

MAE for Q1 2022: 0.1417
MAE for all quarters of 2022: 0.1390


In [13]:
# 12. Visualization for 2022 Predictions
test_dates = y_test.index

# Actual sick leave percentage (test data)
actual_trace = go.Scatter(
    x=test_dates,
    y=y_test,
    mode='lines+markers',
    name='Actual',
    line=dict(color='#0078d2', width=2),
)

# Model predictions for the same test dates
prediction_trace = go.Scatter(
    x=test_dates,
    y=y_pred_test,
    mode='lines+markers',
    name='Predictions',
    line=dict(color='orange', width=2, dash='dash'),
)

fig = go.Figure(data=[actual_trace, prediction_trace])

# MAE values shown as a text annotation above the plot area
mae_text = f"MAE Q1 2022: {mae_q1_2022:.4f}, MAE All 2022: {mae_all_2022:.4f}"
fig.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.05, showarrow=False,
    text=mae_text,
    font=dict(size=12, color="black"),
)

# Layout and design settings
x_axis_settings = dict(
    showgrid=False,
    tickformat="%Y-%m",
    range=[test_dates.min(), test_dates.max()],
)
y_axis_settings = dict(
    showgrid=True, gridcolor='lightgrey', showline=True, linewidth=0.5, linecolor='black',
)
fig.update_layout(
    title=f'Sick Leave Predictions (2022) - {branch_name}',
    xaxis_title='Date',
    yaxis_title='Sick Leave Percentage',
    plot_bgcolor='white',
    xaxis=x_axis_settings,
    yaxis=y_axis_settings,
    legend=dict(x=0.91, y=1.1, traceorder="normal"),
    font=dict(family="Roboto", size=14),
    margin=dict(l=50, r=50, t=80, b=50),
    width=1100, height=500,
)

# Render the figure in the notebook
fig.show()

# Also save the figure as a standalone HTML file
fig.write_html("C:/Users/c.hakker/Downloads/Sick_leave_predict_2022_Q_Healthcare.html")
