In [1]:
# 0. Import the libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error

In [2]:
# 1. Read the merged source table from disk into a DataFrame
file_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\merged_tables.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# 2. Keep only the rows belonging to the branch 'C Industrie'
branch_name = 'C Industrie'

# Compare on a whitespace-stripped, lower-cased copy of the label so that
# formatting differences in the source data do not drop matching rows.
branch_labels = df['BedrijfstakkenBranchesSBI2008'].str.strip().str.lower()

# .copy() detaches the filtered rows from the frame they were selected from.
# Without it, the column assignments made further down (df['Year'] = ...,
# df['Date'] = ...) are performed on a slice and pandas emits
# SettingWithCopyWarning.
df = df[branch_labels == branch_name.strip().lower()].copy()

In [4]:
# 3. Fail fast when the columns needed to build the quarterly date are absent
if not {'Year', 'Quarter'}.issubset(df.columns):
    raise KeyError("The 'Year' and 'Quarter' columns are required in the dataset.")

In [5]:
# 4. Derive a 'Date' column holding the first day of each row's quarter
# Cast both calendar fields to integers first; the period constructor below
# is fed these integer columns.
for calendar_field in ('Year', 'Quarter'):
    df[calendar_field] = df[calendar_field].astype(int)

# Build one quarterly period per row, then convert each period to its
# starting timestamp.
quarter_periods = pd.PeriodIndex.from_fields(year=df['Year'], quarter=df['Quarter'], freq='Q')
df['Date'] = quarter_periods.to_timestamp()

In [6]:
# 5. Index the frame by 'Date' with at most one row per date
# Rows without a usable date are discarded, then 'Date' is moved out of the
# columns and becomes the index used for the time-series steps.
df = df.dropna(subset=['Date']).set_index('Date', drop=True)

# When several rows share a date, only the first occurrence is kept.
is_repeated_date = df.index.duplicated(keep='first')
df = df[~is_repeated_date]

In [7]:
# 6. Put the index on an explicit quarter-start ('QS') frequency
# asfreq conforms the index to a regular quarter-start grid; it does not infer
# the frequency from the data. `df` is only replaced when the conversion
# succeeds; on a ValueError a warning is printed and `df` is left as it was.
try:
    df_with_freq = df.asfreq('QS')
except ValueError as freq_error:
    print(f"Warning: {freq_error}. Frequency could not be set explicitly.")
else:
    df = df_with_freq

In [8]:
# 7. Names used by the modelling cells: the branch label (for messages and
# plot titles) and the column that holds the series to forecast.
branch_name, target_column = 'C Industrie', '80072ned_Ziekteverzuimpercentage_1'

In [9]:
# Split boundaries: every row with Year <= train_end_year is used for training,
# rows whose Year is listed in test_years are held out for evaluation.
train_end_year = 2021
test_years = [2022, 2023]

# Independent copies so later changes to either subset never touch `df`.
train_df = df.loc[df['Year'].le(train_end_year)].copy()
test_df = df.loc[df['Year'].isin(test_years)].copy()

# Target series for each subset.
y_train, y_test = train_df[target_column], test_df[target_column]

# Model on the log(1 + y) scale; forecasts are mapped back with exp(.) - 1.
y_train_log = np.log(1 + y_train)
y_test_log = np.log(1 + y_test)


In [10]:
# 10. Fit the SARIMAX model on the log-transformed training series
print(f"\nFitting SARIMAX model for branch '{branch_name}' without exogenous factors...")

# NOTE: an earlier version of this cell built a linearly increasing `weights`
# array and passed it as `.fit(weights=weights)` to "emphasise recent data".
# SARIMAX has no observation-weighting option: extra keyword arguments given to
# `fit` are handed to the numerical optimizer, which does not use a `weights`
# entry, so the array never influenced the estimates. The dead array and the
# argument are removed so the code no longer claims a weighting that is not
# applied; the fitted parameters are the same as before.
results = sm.tsa.SARIMAX(
    y_train_log,
    order=(1, 1, 0),              # non-seasonal (p, d, q); adjust as needed
    seasonal_order=(1, 1, 1, 4),  # seasonal (P, D, Q, s) with s=4 quarters per year
    enforce_stationarity=False,
    enforce_invertibility=False
).fit()

# Print model summary
print(results.summary())


Fitting SARIMAX model for branch 'C Industrie' without exogenous factors...
                                       SARIMAX Results                                        
Dep. Variable:     80072ned_Ziekteverzuimpercentage_1   No. Observations:                   56
Model:                SARIMAX(1, 1, 0)x(1, 1, [1], 4)   Log Likelihood                  84.436
Date:                                Sun, 15 Dec 2024   AIC                           -160.872
Time:                                        17:55:39   BIC                           -153.557
Sample:                                    01-01-2008   HQIC                          -158.131
                                         - 10-01-2021                                         
Covariance Type:                                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1



In [11]:
# 11. Make predictions for the test years (2022 and 2023)
# One forecast step is requested per row of the held-out test series.
forecast_test = results.get_forecast(steps=len(y_test_log))
y_pred_test_log = forecast_test.predicted_mean
y_pred_test = np.exp(y_pred_test_log) - 1  # Inverse of the log(y + 1) transform applied to the targets

# NOTE(review): the adjustments below (smoothing, the 0.95 and 1.02 factors)
# are hand-set constants applied only to 2023, which is part of the test
# period. They look calibrated against the 2023 actuals; if so, the MAE values
# reported further down are optimistic and not a true out-of-sample score.
# Confirm how these constants were chosen.
#
# NOTE(review): every mask is built from test_df and is then used to index
# y_pred_test. This presumably relies on the forecast index being identical to
# test_df.index -- verify.

# Smooth the predictions for Q2, Q3, Q4 of 2023
# A 2-value rolling mean is taken over the selected 2023 quarters only; it
# leaves NaN wherever a full 2-value window is not available (filled below).
q2_q4_2023_mask = (test_df['Quarter'].isin([2, 3, 4])) & (test_df['Year'] == 2023)
y_pred_test.loc[q2_q4_2023_mask] = y_pred_test[q2_q4_2023_mask].rolling(window=2, center=True).mean()

# Apply a correction factor to reduce overestimation for 2023
correction_factor = 0.95  # Scales every 2023 prediction down by 5%
all_quarters_2023_mask = test_df['Year'] == 2023
y_pred_test.loc[all_quarters_2023_mask] *= correction_factor

# Fine-tune the correction factor for Q1 2023
# Applied on top of the 0.95 factor above, so Q1 2023 ends up scaled by 0.95 * 1.02.
q1_2023_correction_factor = 1.02  # Slight adjustment for Q1 2023
q1_2023_mask = (test_df['Quarter'] == 1) & (test_df['Year'] == 2023)
y_pred_test.loc[q1_2023_mask] *= q1_2023_correction_factor

# Fill any NaN values that arise from rolling smoothing
# Backward fill runs first (NaN takes the next available value); forward fill
# then covers any NaN that has no later value to borrow from.
y_pred_test.bfill(inplace=True)  # Use backward fill as an example
y_pred_test.ffill(inplace=True)  # Use forward fill as a fallback

# Check for NaN values
if y_pred_test.isna().sum() > 0:
    print("Warning: There are still NaN values in the predictions after smoothing.")

# Calculate MAE for Q1 of 2022 and 2023 separately
# All MAE values below are computed on the adjusted predictions, in the
# original (back-transformed) scale of the target.
q1_2022_mask = (test_df['Quarter'] == 1) & (test_df['Year'] == 2022)
mae_q1_2022 = mean_absolute_error(y_test[q1_2022_mask], y_pred_test[q1_2022_mask])

mae_q1_2023 = mean_absolute_error(y_test[q1_2023_mask], y_pred_test[q1_2023_mask])
print(f"MAE for Q1 2022: {mae_q1_2022:.4f}")
print(f"MAE for Q1 2023: {mae_q1_2023:.4f}")

# Calculate MAE for all quarters of 2022 and 2023 separately
all_quarters_2022_mask = test_df['Year'] == 2022
mae_all_2022 = mean_absolute_error(y_test[all_quarters_2022_mask], y_pred_test[all_quarters_2022_mask])

mae_all_2023 = mean_absolute_error(y_test[all_quarters_2023_mask], y_pred_test[all_quarters_2023_mask])
print(f"MAE for all quarters of 2022: {mae_all_2022:.4f}")
print(f"MAE for all quarters of 2023: {mae_all_2023:.4f}")




MAE for Q1 2022: 0.1491
MAE for Q1 2023: 0.1179
MAE for all quarters of 2022: 0.1667
MAE for all quarters of 2023: 0.2722


In [12]:

# 12. Plot actual vs. predicted sick leave for the test period (2022 and 2023)
fig = go.Figure()

# One entry per line on the chart: (values, legend label, line styling).
# The observed series is drawn first, the forecast second.
series_specs = [
    (y_test, 'Actual', dict(color='#0078d2', width=2)),
    (y_pred_test, 'Predictions', dict(color='orange', width=2, dash='dash')),
]
for series_values, series_label, line_style in series_specs:
    fig.add_trace(go.Scatter(
        x=test_df.index,
        y=series_values,
        mode='lines+markers',
        name=series_label,
        line=line_style,
    ))

# MAE summaries shown as two text rows above the plotting area
# (positions are given in paper coordinates).
annotation_rows = [
    (1.25, f"MAE for Q1 2022: {mae_q1_2022:.4f}, Q1 2023: {mae_q1_2023:.4f}"),
    (1.20, f"MAE for all quarters 2022: {mae_all_2022:.4f}, 2023: {mae_all_2023:.4f}"),
]
for paper_y, annotation_text in annotation_rows:
    fig.add_annotation(
        xref="paper", yref="paper", x=0.00, y=paper_y, showarrow=False,
        text=annotation_text,
        font=dict(size=12, color="black"),
    )

# Layout: titles, white background, x-axis limited to the test period.
layout_options = {
    'title': f'Sick Leave Predictions - {branch_name} (2022 & 2023)',
    'xaxis_title': 'Date',
    'yaxis_title': 'Sick Leave Percentage',
    'plot_bgcolor': 'white',
    'xaxis': {
        'showgrid': False,
        'tickformat': "%Y-%m",  # year-month tick labels
        'range': [test_df.index.min(), test_df.index.max()],  # first to last test date
    },
    'yaxis': {
        'showgrid': True,
        'gridcolor': 'lightgrey',
        'showline': True,
        'linewidth': 0.5,
        'linecolor': 'black',
    },
    'legend': {'x': 0.91, 'y': 1.5, 'traceorder': "normal"},
    'font': {'family': "Roboto", 'size': 14},
    'margin': {'l': 50, 'r': 50, 't': 100, 'b': 50},
    'width': 1100,
    'height': 500,
}
fig.update_layout(**layout_options)

# Render the figure in the notebook
fig.show()

# Also export the interactive figure as a standalone HTML file
fig.write_html("C:/Users/c.hakker/Downloads/Sick_leave_predict_C_Industrie_2022_2023_separate_MAEs.html")

In [18]:
import joblib

# Persist the fitted SARIMAX results object to disk with joblib so it can be
# reloaded later without refitting.
model_save_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\SavedModels\best_sarimax_model_C.pkl"
joblib.dump(results, filename=model_save_path)

# Report where the model file was written.
print(f"SARIMAX model saved to {model_save_path}")

SARIMAX model saved to C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\SavedModels\best_sarimax_model_C.pkl


In [19]:
# Export the test-period dates, observed values and (adjusted) predictions
# side by side as a CSV file.
predictions_save_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\SavedModels\predictions_best_model_C.csv"

# Column name -> values, in the order the columns appear in the file.
prediction_columns = dict(Date=test_df.index, Actual=y_test, Predicted=y_pred_test)
predictions_df = pd.DataFrame(prediction_columns)

# The date is already a regular column, so the index is not written.
predictions_df.to_csv(predictions_save_path, index=False)

print(f"Predictions saved to {predictions_save_path}")

Predictions saved to C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\SavedModels\predictions_best_model_C.csv


In [20]:
# Write the four MAE scores to a JSON file
import json

mae_save_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\SavedModels\mae_results_C.json"

# Keys are the field names stored in the JSON document.
mae_results = dict(
    MAE_Q1_2022=mae_q1_2022,
    MAE_Q1_2023=mae_q1_2023,
    MAE_All_2022=mae_all_2022,
    MAE_All_2023=mae_all_2023,
)

with open(mae_save_path, 'w') as mae_file:
    json.dump(mae_results, mae_file)

print(f"MAE results saved to {mae_save_path}")

MAE results saved to C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\SavedModels\mae_results_C.json


In [21]:
# Create the companion .py file next to the saved model.
# Only a one-line placeholder comment is written here; the notebook's actual
# code is not copied into the file by this cell.
script_save_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\SavedModels\best_model_code_C.py"
placeholder_source = """# Add the current script's code here..."""

with open(script_save_path, 'w') as script_file:
    script_file.write(placeholder_source)

print(f"Code saved to {script_save_path}")

Code saved to C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\SavedModels\best_model_code_C.py
