In [1]:
# 0. Import Libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from pmdarima import auto_arima

In [2]:
# 1. Load Dataset
# Raw string literal so the Windows backslashes are taken literally.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine
# where this file exists at exactly this location.
file_path = (
    r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\merged_tables.csv"
)
# Read the merged CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# 2. Filter for the Specific Branch
# Keep only the rows of one branch; the comparison ignores surrounding whitespace and case.
branch_name = 'C Industrie'
branch_mask = df['BedrijfstakkenBranchesSBI2008'].str.strip().str.lower() == branch_name.strip().lower()

# Fail early with a clear message instead of letting an empty frame break a later cell.
if not branch_mask.any():
    raise ValueError(f"No rows found for branch '{branch_name}' in column 'BedrijfstakkenBranchesSBI2008'.")

# .copy() makes the subset an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
df = df.loc[branch_mask].copy()

In [33]:
# 3. Verify and Process the 'Year' and 'Quarter' Columns
# Both calendar fields are needed to build the quarterly 'Date' column below.
if not {'Year', 'Quarter'}.issubset(df.columns):
    raise KeyError("The 'Year' and 'Quarter' columns are required in the dataset.")

# Cast the calendar fields to plain integers (raises if a value cannot be converted).
for calendar_field in ('Year', 'Quarter'):
    df[calendar_field] = df[calendar_field].astype(int)

# Combine year + quarter into quarterly periods, then convert those to timestamps.
quarter_periods = pd.PeriodIndex.from_fields(year=df['Year'], quarter=df['Quarter'], freq='Q')
df['Date'] = quarter_periods.to_timestamp()

In [34]:
# 4. Set Index and Frequency
# Use the quarterly timestamp as the index (non-inplace, so `df` is simply rebound).
df = df.set_index('Date')

# asfreq() cannot reindex duplicate labels; report that case with an actionable message.
if df.index.has_duplicates:
    raise ValueError("Duplicate quarters found in 'Date'; expected exactly one row per quarter.")

# Conform to a regular quarter-start frequency; quarters missing from the data
# become rows filled with NaN.
df = df.asfreq('QS')

# Rows inserted by asfreq() have NaN in 'Year'/'Quarter'. Re-derive both from the
# index so later filters on these columns keep those rows and the series stays
# regularly spaced.
df['Year'] = df.index.year
df['Quarter'] = df.index.quarter

In [35]:
# 5. Define Target and Exogenous Variables
# Column holding the series that is modelled.
target_column = '80072ned_Ziekteverzuimpercentage_1'
# Columns supplied to the model as external regressors.
exog_columns = [
    '81589NED_Rechtspersonen_13',
    '85663NED_CaoLonenPerMaandExclBijzBeloningen_1',
]

# Add Q1-Specific Features
# Indicator column: 1 on first-quarter rows, 0 everywhere else.
df['Q1_Dummy'] = df['Quarter'].eq(1).astype(int)

In [36]:
# 6. Split Data into Training and Testing Sets
# Everything up to and including `train_end_year` is used for fitting;
# the single year `test_year` is held out for evaluation.
train_end_year, test_year = 2021, 2022

train_df = df.loc[df['Year'].le(train_end_year)]
test_df = df.loc[df['Year'].eq(test_year)]

# Target series for each split.
y_train, y_test = train_df[target_column], test_df[target_column]

# Regressor matrices: the external columns plus the Q1 indicator.
# NOTE(review): missing regressor values are replaced with 0 — confirm that 0 is a
# sensible stand-in for these columns before relying on the fitted coefficients.
regressor_columns = exog_columns + ['Q1_Dummy']
exog_train = train_df[regressor_columns].fillna(0)
exog_test = test_df[regressor_columns].fillna(0)

In [37]:
# 7. Normalize Exogenous Variables
# The scaler is fitted on the training regressors only; the same fitted
# transformation is then applied to both splits.
scaler = StandardScaler().fit(exog_train)
exog_train_scaled = scaler.transform(exog_train)
exog_test_scaled = scaler.transform(exog_test)

In [38]:
# 8. Use Simple Model's Fixed Parameters
print("Using Simple Model's Parameters...")

# Fixed (non-seasonal and seasonal) orders taken over from the simple model,
# with the stationarity / invertibility constraints switched off.
sarimax_spec = dict(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 4),
    enforce_stationarity=False,
    enforce_invertibility=False,
)

# Fit on the training target with the scaled regressors; disp=False silences optimizer output.
model = sm.tsa.SARIMAX(y_train, exog=exog_train_scaled, **sarimax_spec)
results = model.fit(disp=False)
print(results.summary())

Using Simple Model's Parameters...
                                       SARIMAX Results                                        
Dep. Variable:     80072ned_Ziekteverzuimpercentage_1   No. Observations:                   56
Model:                  SARIMAX(1, 1, 1)x(1, 1, 1, 4)   Log Likelihood                   3.681
Date:                                Thu, 28 Nov 2024   AIC                              8.637
Time:                                        19:52:20   BIC                             23.091
Sample:                                    01-01-2008   HQIC                            14.025
                                         - 10-01-2021                                         
Covariance Type:                                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.2865      0.117      2.443    

In [41]:
# 9. Predictions, Forecast for 2023, and MAE Calculation
# Predict for the test set (out-of-sample relative to the fitted training window)
y_pred_test = results.predict(start=y_test.index[0], end=y_test.index[-1], exog=exog_test_scaled)

# Forecast for the 4 quarters AFTER the test year.
# `results` only contains the training sample, so calling results.get_forecast(steps=4)
# directly would return the test-year quarters again (it always starts right after the
# last observation the results object knows about). Append the observed test-year data
# first — keeping the already estimated parameters (refit=False) — so that the forecast
# origin is the end of the test year.
forecast_steps = 4
# Assumption: future regressor values are unknown, so the most recent scaled values are reused.
forecast_exog = exog_test_scaled[-forecast_steps:]
results_through_test = results.append(y_test, exog=exog_test_scaled, refit=False)
forecast = results_through_test.get_forecast(steps=forecast_steps, exog=forecast_exog)
forecast_mean = forecast.predicted_mean
# Quarter-start dates that follow the last test observation (derived from the data
# instead of a hard-coded start date); the first generated date is the last test
# quarter itself and is dropped.
forecast_index = pd.date_range(start=y_test.index[-1], periods=forecast_steps + 1, freq='QS')[1:]

# Calculate MAE
# Separate error for the first quarter of the test year and for the whole test year.
q1_mask = test_df['Quarter'] == 1
mae_q1 = mean_absolute_error(y_test[q1_mask], y_pred_test[q1_mask])
mae_all = mean_absolute_error(y_test, y_pred_test)

print(f"\nMAE for Q1 {test_year} for branch '{branch_name}': {mae_q1:.4f}")
print(f"MAE for all quarters of {test_year} for branch '{branch_name}': {mae_all:.4f}")


MAE for Q1 2022 for branch 'C Industrie': 0.1174
MAE for all quarters of 2022 for branch 'C Industrie': 0.0788


In [42]:
# 10. Visualization with Forecast
# One figure with three traces: observed test values, test-period predictions,
# and the forecast beyond the test period, plus the two MAE values as annotations.
fig = go.Figure()

# Add actual sick leave percentage line (test data)
fig.add_trace(go.Scatter(
    x=y_test.index,
    y=y_test,
    mode='lines+markers',
    name='Actual',
    line=dict(color='#0078d2', width=2)
))

# Add predictions line for test period
fig.add_trace(go.Scatter(
    x=y_test.index,
    y=y_pred_test,
    mode='lines+markers',
    name='Predictions',
    line=dict(color='orange', width=2, dash='dash')
))

# Add forecast line for 2023
fig.add_trace(go.Scatter(
    x=forecast_index,
    y=forecast_mean,
    mode='lines+markers',
    name='Forecast',
    line=dict(color='green', width=2, dash='dot')
))

# Add MAE values as text annotations (positioned in paper coordinates above the plot area)
fig.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.15, showarrow=False,
    text=f"MAE for all four quarters: {mae_all:.4f}",
    font=dict(size=12, color="black")
)

fig.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.20, showarrow=False,
    text=f"MAE for next quarter (Q1): {mae_q1:.4f}",
    font=dict(size=12, color="black")
)

# Set layout and design for better visualization
fig.update_layout(
    # Use the branch name as written: str.capitalize() would lower-case everything
    # after the first character (e.g. 'C Industrie' -> 'C industrie').
    title=f'Sick Leave Predictions and Forecast - {branch_name}',
    xaxis_title='Date',
    yaxis_title='Sick Leave Percentage',
    plot_bgcolor='white',
    xaxis=dict(
        showgrid=False,
        tickformat="%Y-%m",  # Year-Month format for clearer date labeling
        # Span from the first test quarter to the last forecast quarter.
        range=[y_test.index.min(), forecast_index.max()]
    ),
    yaxis=dict(
        showgrid=True, gridcolor='lightgrey', showline=True, linewidth=0.5, linecolor='black'
    ),
    legend=dict(
        x=0.91, y=1.5, traceorder="normal"
    ),
    font=dict(family="Roboto", size=14),
    margin=dict(l=50, r=50, t=80, b=50),
    width=1100, height=500
)

# Show Plot
fig.show()

# Optionally Save Plot as HTML
# NOTE(review): absolute, user-specific output path — adjust when running elsewhere.
fig.write_html("C:/Users/c.hakker/Downloads/Sick_leave_predict_forecast_C_industrie.html")
