In [1]:
# 0. Import the libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from pmdarima import auto_arima

In [2]:
# 1. Load the dataset
file_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\merged_tables.csv"
df = pd.read_csv(file_path)

In [3]:
# 2. Filter for the specific branch 'G Handel'
branch_name = 'G Handel'
df = df[df['BedrijfstakkenBranchesSBI2008'].str.strip().str.lower() == branch_name.strip().lower()]

In [4]:
# 3. Verify and process the 'Year' and 'Quarter' columns for date alignment
if 'Year' not in df.columns or 'Quarter' not in df.columns:
    raise KeyError("The 'Year' and 'Quarter' columns are required in the dataset.")

In [5]:
# 4. Create a 'Date' column representing the start of each quarter
df['Year'] = df['Year'].astype(int)
df['Quarter'] = df['Quarter'].astype(int)
df['Date'] = pd.PeriodIndex.from_fields(year=df['Year'], quarter=df['Quarter'], freq='Q').to_timestamp()

In [6]:
# 5. Drop rows where 'Date' could not be created
df = df.dropna(subset=['Date'])

# Set 'Date' as the index for time-series analysis
df.set_index('Date', inplace=True, drop=True)

# Remove duplicate index values if they exist
df = df.loc[~df.index.duplicated(keep='first')]

In [7]:
# 6. Set the frequency explicitly
try:
    df = df.asfreq('QS')  # Infer the quarterly frequency and set it explicitly
except ValueError as e:
    print(f"Warning: {e}. Frequency could not be set explicitly.")

In [8]:
# 7. Define parameters for SARIMAX model and validate features
branch_name = 'g handel'
target_column = '80072ned_Ziekteverzuimpercentage_1'
exog_columns = [
    '83451NED_ArbeidsvolumeMetSeizoenscorrectie_3',
    '83451NED_MaandloonExclusiefOverwerk_6'
]

# Add lagged and differenced features
df['Maandloon_Lag1'] = df['83451NED_MaandloonExclusiefOverwerk_6'].shift(1)
df['Arbeidsvolume_Diff'] = df['83451NED_ArbeidsvolumeMetSeizoenscorrectie_3'].diff()

# Calculate correlation matrix for numeric columns
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

# Select additional features based on correlation
additional_exog = [
    'Maandloon_Lag1',  # Lagged feature
    'Arbeidsvolume_Diff'  # Differenced feature
]

selected_exog_columns = exog_columns + additional_exog
selected_exog_columns = [col for col in selected_exog_columns if col in df.columns]  # Ensure columns exist

# Debug: Print selected columns
print("Final Selected Exogenous Columns:", selected_exog_columns)

Final Selected Exogenous Columns: ['83451NED_ArbeidsvolumeMetSeizoenscorrectie_3', '83451NED_MaandloonExclusiefOverwerk_6', 'Maandloon_Lag1', 'Arbeidsvolume_Diff']


In [9]:
# 8. Prepare the target and exogenous variables for training and testing
y = df[target_column]
exog = df[selected_exog_columns]

In [10]:
# 9. Define the year up to which training data is included
train_end_year = 2021  # Training includes data up to and including 2021
test_year = 2022       # Testing is for 2022

# Split data into training (2008-2021) and testing (2022)
train_df = df[df['Year'] <= train_end_year].copy()  # Add `.copy()` here
test_df = df[df['Year'] == test_year].copy()  # Add `.copy()` here

y_train = train_df[target_column]
exog_train = train_df[selected_exog_columns].copy()  # Add `.copy()` here
y_test = test_df[target_column]
exog_test = test_df[selected_exog_columns].copy()  # Add `.copy()` here


In [11]:
# 10. Handle missing values
exog_train.fillna(0, inplace=True)  # Handle NaN in exog_train
exog_test.fillna(0, inplace=True)  # Handle NaN in exog_test

In [12]:
# Normalize exogenous variables
scaler_exog = StandardScaler()
exog_train_scaled = pd.DataFrame(
    scaler_exog.fit_transform(exog_train),
    index=exog_train.index,
    columns=selected_exog_columns
)
exog_test_scaled = pd.DataFrame(
    scaler_exog.transform(exog_test),
    index=exog_test.index,
    columns=selected_exog_columns
)

# Debugging: Verify consistency
assert list(exog_train_scaled.columns) == list(exog_test_scaled.columns), "Column mismatch between train and test exog!"

In [13]:
# 12. Fit the SARIMAX model for G Handel
print(f"\nFitting SARIMAX model for branch '{branch_name}' with improved seasonal order...")
model = sm.tsa.SARIMAX(
    y_train,
    order=(1, 1, 1),  # Non-seasonal order
    seasonal_order=(2, 1, 1, 4),  # Updated seasonal order
    enforce_stationarity=False,
    enforce_invertibility=False
)
results = model.fit(disp=False)

# Print model summary
print(results.summary())

# Predict for the test set
forecast_test = results.get_forecast(steps=len(y_test))
y_pred_test = forecast_test.predicted_mean

# Calculate MAE for the test set
q1_mask = test_df['Quarter'] == 1
mae_q1 = mean_absolute_error(y_test[q1_mask], y_pred_test[q1_mask])
mae_all = mean_absolute_error(y_test, y_pred_test)

print(f"\nMAE for Q1 {test_year} for branch '{branch_name}': {mae_q1:.4f}")
print(f"MAE for all quarters of {test_year} for branch '{branch_name}': {mae_all:.4f}")





Fitting SARIMAX model for branch 'g handel' with improved seasonal order...
                                       SARIMAX Results                                        
Dep. Variable:     80072ned_Ziekteverzuimpercentage_1   No. Observations:                   56
Model:                  SARIMAX(1, 1, 1)x(2, 1, 1, 4)   Log Likelihood                  24.459
Date:                                Sat, 14 Dec 2024   AIC                            -36.917
Time:                                        14:19:15   BIC                            -26.491
Sample:                                    01-01-2008   HQIC                           -33.096
                                         - 10-01-2021                                         
Covariance Type:                                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1

In [14]:
# 13. Predictions for test year
forecast_test = results.get_forecast(steps=len(y_test), exog=exog_test_scaled)
y_pred_test = forecast_test.predicted_mean

# Back-transform predictions if log transformation was used
if 'log' in str(results.model.endog_names).lower():
    y_pred_test = np.exp(y_pred_test) - 1

# Calculate MAE
q1_mask = test_df['Quarter'] == 1
mae_q1 = mean_absolute_error(y_test[q1_mask], y_pred_test[q1_mask])
mae_all = mean_absolute_error(y_test, y_pred_test)

print(f"\nMAE for Q1 {test_year} for branch '{branch_name}': {mae_q1:.4f}")
print(f"MAE for all quarters of {test_year} for branch '{branch_name}': {mae_all:.4f}")




MAE for Q1 2022 for branch 'g handel': 0.1241
MAE for all quarters of 2022 for branch 'g handel': 0.3133


In [15]:
# 14. Visualization of Predictions for 2022
fig = go.Figure()

# Add actual sick leave percentage line (test data)
fig.add_trace(go.Scatter(
    x=y_test.index,
    y=y_test,
    mode='lines+markers',
    name='Actual',
    line=dict(color='#0078d2', width=2)
))

# Add predictions line for test period
fig.add_trace(go.Scatter(
    x=y_test.index,
    y=y_pred_test,
    mode='lines+markers',
    name='Predictions',
    line=dict(color='orange', width=2, dash='dash')
))

# Add MAE values as text annotations
fig.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.15, showarrow=False,
    text=f"MAE for all four quarters: {mae_all:.4f}",
    font=dict(size=12, color="black")
)

fig.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.20, showarrow=False,
    text=f"MAE for next quarter (Q1): {mae_q1:.4f}",
    font=dict(size=12, color="black")
)

# Set layout and design for better visualization
fig.update_layout(
    title=f'Sick Leave Predictions - Trade Branch (2022)',
    xaxis_title='Date',
    yaxis_title='Sick Leave Percentage',
    plot_bgcolor='white',
    xaxis=dict(
        showgrid=False,
        tickformat="%Y-%m",  # Year-Month format for clearer date labeling
        range=[y_test.index.min(), y_test.index.max()]  # Restrict range to 2022
    ),
    yaxis=dict(
        showgrid=True, gridcolor='lightgrey', showline=True, linewidth=0.5, linecolor='black'
    ),
    legend=dict(
        x=0.91, y=1.5, traceorder="normal"
    ),
    font=dict(family="Roboto", size=14),
    margin=dict(l=50, r=50, t=80, b=50),
    width=1100, height=500
)

# Show Plot
fig.show()

# Optionally Save Plot as HTML
fig.write_html("C:/Users/c.hakker/Downloads/Sick_leave_predict_G_Handel_2022.html")
