In [1]:
# 0. Import the libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error

In [2]:
# 1. Load the dataset
# The CSV location can be overridden with the MERGED_TABLES_CSV environment
# variable so the notebook also runs on machines where the data lives somewhere
# else. When the variable is not set, the original path below is used.
DEFAULT_FILE_PATH = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\merged_tables.csv"
file_path = os.environ.get("MERGED_TABLES_CSV", DEFAULT_FILE_PATH)
df = pd.read_csv(file_path)

In [3]:
# 2. Filter for the specific branch 'C Manufacturing'
branch_name = 'C Manufacturing'

# Compare case-insensitively and ignore surrounding whitespace on both sides.
branch_key = branch_name.strip().lower()
branch_labels = df['BedrijfstakkenBranchesSBI2008'].str.strip().str.lower()

# .copy() makes the selection an independent frame, so columns can be added or
# overwritten on it later without pandas treating it as a slice of the full table.
df = df[branch_labels == branch_key].copy()

# Fail here with a clear message instead of letting an empty frame reach the
# date-index and modelling steps.
if df.empty:
    raise ValueError(f"No rows found for branch '{branch_name}'.")

In [4]:
# 3. Make sure the 'Year' and 'Quarter' columns needed for date alignment exist
required_columns = ('Year', 'Quarter')
if not all(column in df.columns for column in required_columns):
    raise KeyError("The 'Year' and 'Quarter' columns are required in the dataset.")

In [5]:
# 4. Create a 'Date' column representing the start of each quarter
# `assign` returns a new frame instead of writing columns into `df` in place,
# which keeps this cell safe to run even when `df` is a slice of another frame.
df = df.assign(
    Year=lambda frame: frame['Year'].astype(int),
    Quarter=lambda frame: frame['Quarter'].astype(int),
)

# Build quarterly periods from (Year, Quarter) and convert each period to a
# timestamp.
df = df.assign(
    Date=pd.PeriodIndex.from_fields(
        year=df['Year'], quarter=df['Quarter'], freq='Q'
    ).to_timestamp()
)

In [6]:
# 5. Drop rows where 'Date' could not be created
df = df.dropna(subset=['Date'])

# Set 'Date' as the index for time-series analysis
df = df.set_index('Date')

# Remove duplicate index values if they exist (the first row in file order wins)
df = df.loc[~df.index.duplicated(keep='first')]

# Put the observations in chronological order. The quarterly index assigned
# below is matched to the rows by position, so unsorted rows would silently end
# up labelled with the wrong quarter.
df = df.sort_index()

if df.empty:
    raise ValueError("No rows left after cleaning; cannot build a quarterly index.")

# Ensure the frequency of the Date index is explicitly set.
# First verify that every quarter between the first and last observation is
# present; only then is it safe to replace the index with a 'QS' date range.
expected_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='QS')
missing_quarters = expected_index.difference(df.index)
if len(missing_quarters) > 0:
    missing_labels = ', '.join(str(ts.date()) for ts in missing_quarters)
    raise ValueError(
        f"The quarterly series has gaps; missing quarter start dates: {missing_labels}"
    )
df.index = expected_index

In [7]:
# 6. Set the frequency explicitly
# Conform the frame to a quarter-start ('QS') frequency. A ValueError raised
# while doing so is reported as a warning instead of stopping the notebook.
try:
    df = df.asfreq(freq='QS')
except ValueError as freq_error:
    print(f"Warning: {freq_error}. Frequency could not be set explicitly.")

In [8]:
# 7. Parameters shared by the SARIMAX cells below
# Column holding the series that is modelled.
target_column = "80072ned_Ziekteverzuimpercentage_1"
# Branch label used in the progress messages.
branch_name = "C Manufacturing"

In [9]:
# 8. Train/test split by calendar year
train_end_year = 2021  # last year that belongs to the training window
test_years = [2022, 2023]  # years held out for evaluation

# Rows with Year <= train_end_year form the training set; rows whose Year is
# listed in test_years form the test set.
year_column = df['Year']
train_df = df.loc[year_column.le(train_end_year)].copy()
test_df = df.loc[year_column.isin(test_years)].copy()

# Target series for each partition
y_train, y_test = train_df[target_column], test_df[target_column]

# log(y + 1) transform of the target; predictions are mapped back with exp(.) - 1
y_train_log = np.log(y_train + 1)
y_test_log = np.log(y_test + 1)

In [10]:
# 9. Fit the SARIMAX model for 2022 and predict
print(f"\nFitting SARIMAX model for branch '{branch_name}' for 2022...")

# Slicing the frame by year drops the index frequency, so set it again on the
# training series. pandas validates that the dates really are quarter starts.
y_train_log.index.freq = 'QS'

# Fit SARIMAX model on training data up to 2021 (log scale)
model_2022 = sm.tsa.SARIMAX(
    y_train_log,
    order=(1, 1, 1),  # Adjust as needed
    seasonal_order=(1, 1, 1, 4),  # Adjust as needed; 4 = quarterly seasonality
    enforce_stationarity=False,
    enforce_invertibility=False
)
results_2022 = model_2022.fit(disp=False)

# Forecast one step per 2022 test row
test_df_2022 = test_df[test_df['Year'] == 2022]
forecast_2022 = results_2022.get_forecast(steps=len(test_df_2022))
y_pred_2022_log = forecast_2022.predicted_mean
y_pred_2022 = np.exp(y_pred_2022_log) - 1  # Back-transform predictions to original scale

# Label the predictions with the test dates so they line up with `y_test`
y_pred_2022 = pd.Series(y_pred_2022.values, index=test_df_2022.index)

# Boolean Q1 mask aligned with `y_test` (kept for backward compatibility)
q1_mask_2022 = (test_df_2022['Quarter'] == 1).reindex(y_test.index, fill_value=False)

# Select by date label rather than by boolean mask: `y_pred_2022` only covers
# the 2022 dates, so a mask built on the full test index does not share its
# index and would only work through implicit realignment.
q1_dates_2022 = test_df_2022.loc[test_df_2022['Quarter'] == 1].index

# Calculate MAE for Q1 and all quarters of 2022
mae_q1_2022 = mean_absolute_error(
    y_test.loc[q1_dates_2022],
    y_pred_2022.loc[q1_dates_2022]
)
mae_all_2022 = mean_absolute_error(
    y_test.loc[test_df_2022.index],
    y_pred_2022
)

print(f"\nMAE for Q1 2022: {mae_q1_2022:.4f}")
print(f"MAE for all quarters of 2022: {mae_all_2022:.4f}")



Fitting SARIMAX model for branch 'C Manufacturing' for 2022...

MAE for Q1 2022: 0.1550
MAE for all quarters of 2022: 0.1008


In [11]:
# 10. Rolling one-step-ahead forecasts for 2023
#
# Before each 2023 quarter the model is refit on everything observed so far
# (training data, the true 2022 values and the 2023 quarters already passed)
# and then forecasts exactly one quarter ahead.


def _sarimax_orders_for_quarter(quarter):
    """Return the ``(order, seasonal_order)`` pair used to forecast `quarter`.

    Q2 gets a simplified specification without differencing; every other
    quarter uses the default specification.
    """
    # NOTE(review): the Q2 special case looks like it was tuned against the 2023
    # test data itself - confirm it is justified on held-out data.
    if quarter == 2:
        return (1, 0, 0), (1, 0, 1, 4)
    return (1, 1, 1), (2, 1, 1, 4)


# Dates of the 2023 test rows, in test-set order
dates_2023 = test_df[test_df['Year'] == 2023].index

# Initialize rolling training data with training data extended to include true 2022 data
train_df_extended = pd.concat([train_df, test_df[test_df['Year'] == 2022]])
rolling_train = train_df_extended.copy()
y_pred_2023 = []

# Rolling Predictions for 2023
for date in dates_2023:
    # Identify the current quarter and pick the matching model specification
    current_quarter = test_df.loc[date, 'Quarter']
    order, seasonal_order = _sarimax_orders_for_quarter(current_quarter)
    if current_quarter == 2:
        print(f"Applying adjusted parameters for Q2 2023 at {date}")

    # Log-transform the target and give it an explicit quarter-start frequency.
    # asfreq matches observations by their dates, so no value can be relabelled
    # with a wrong quarter (unlike overwriting the index with a fresh date range).
    endog_log = np.log(rolling_train[target_column] + 1).asfreq('QS')

    # Fit SARIMAX model on the rolling training set
    model_rolling = sm.tsa.SARIMAX(
        endog_log,
        order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
        enforce_invertibility=False
    )
    results_rolling = model_rolling.fit(
        disp=False,                # Suppress verbose output
        method='powell',           # Powell optimization method
        maxiter=1000,              # Increased maximum iterations
        xtol=1e-4                  # Convergence tolerance specific to Powell
    )

    # Forecast the next step (1 quarter ahead) and back-transform it
    prediction_log = results_rolling.get_forecast(steps=1).predicted_mean
    prediction = np.exp(prediction_log) - 1
    y_pred_2023.append(prediction.iloc[0])

    # Update rolling training data with the actual value from the test set
    rolling_train = pd.concat([rolling_train, test_df.loc[[date]]])

# Convert predictions list to a pandas Series labelled with the 2023 dates
y_pred_2023 = pd.Series(y_pred_2023, index=dates_2023)

# Calculate MAE for Q1 and all quarters of 2023.
# Rows are selected by date label because `y_pred_2023` only covers the 2023
# dates, while masks built on `test_df` span the whole test period.
q1_mask_2023 = test_df['Quarter'] == 1
q1_dates_2023 = test_df.loc[q1_mask_2023 & (test_df['Year'] == 2023)].index
mae_q1_2023 = mean_absolute_error(
    y_test.loc[q1_dates_2023],
    y_pred_2023.loc[q1_dates_2023]
)
mae_all_2023 = mean_absolute_error(
    y_test.loc[dates_2023],
    y_pred_2023
)

# Calculate MAE for each quarter of 2023
for quarter in [1, 2, 3, 4]:
    q_mask = (test_df['Year'] == 2023) & (test_df['Quarter'] == quarter)
    quarter_dates = test_df.loc[q_mask].index
    mae_quarter = mean_absolute_error(
        y_test.loc[quarter_dates],
        y_pred_2023.loc[quarter_dates]
    )
    print(f"MAE for Q{quarter} 2023: {mae_quarter:.4f}")

# Print overall MAEs
print(f"\nMAE for Q1 2023: {mae_q1_2023:.4f}")
print(f"MAE for all quarters of 2023: {mae_all_2023:.4f}")


Applying adjusted parameters for Q2 2023 at 2023-04-01 00:00:00
MAE for Q1 2023: 0.1512
MAE for Q2 2023: 0.2287
MAE for Q3 2023: 0.0930
MAE for Q4 2023: 0.0050

MAE for Q1 2023: 0.1512
MAE for all quarters of 2023: 0.1195


In [12]:
# 11. Independent copies of the test rows for each evaluation year
test_df_2022, test_df_2023 = (
    test_df.loc[test_df['Year'] == test_year].copy()
    for test_year in (2022, 2023)
)

In [15]:
# 12. Visualization for 2022 and 2023


def _build_forecast_figure(dates, actual, predicted, *, year, title,
                           predicted_name, predicted_line, mae_q1, mae_all):
    """Build the actual-vs-predicted sick leave figure for one year.

    Parameters
    ----------
    dates : x-axis values shared by both traces.
    actual, predicted : observed and predicted values to plot against `dates`.
    year : year shown in the 'Actual' trace name and in the MAE annotations.
    title : figure title.
    predicted_name : legend label of the prediction trace.
    predicted_line : plotly line style dict of the prediction trace.
    mae_q1, mae_all : MAE values written above the plot area.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    fig = go.Figure()

    # Actual sick leave percentage
    fig.add_trace(go.Scatter(
        x=dates,
        y=actual,
        mode='lines+markers',
        name=f'Actual ({year})',
        line=dict(color='#0078d2', width=2)
    ))

    # Model predictions
    fig.add_trace(go.Scatter(
        x=dates,
        y=predicted,
        mode='lines+markers',
        name=predicted_name,
        line=predicted_line
    ))

    # MAE values as text annotations, positioned in paper coordinates above the plot
    for y_position, label, value in ((1.10, 'Q1', mae_q1), (1.05, 'All', mae_all)):
        fig.add_annotation(
            xref="paper", yref="paper", x=0.00, y=y_position, showarrow=False,
            text=f"MAE {label} {year}: {value:.4f}",
            font=dict(size=12, color="black")
        )

    fig.update_layout(
        title=title,
        xaxis_title='Date',
        yaxis_title='Sick Leave Percentage',
        plot_bgcolor='white',
        xaxis=dict(
            showgrid=False,
            tickformat="%Y-%m",
            range=[dates.min(), dates.max()]
        ),
        yaxis=dict(
            showgrid=True, gridcolor='lightgrey', showline=True, linewidth=0.5, linecolor='black'
        ),
        font=dict(family="Roboto", size=14),
        margin=dict(l=50, r=50, t=100, b=50),
        width=1100, height=500
    )
    return fig


# Visualization for 2022
fig_2022 = _build_forecast_figure(
    test_df_2022.index,
    y_test[test_df_2022.index],
    y_pred_2022,
    year=2022,
    title='Sick Leave Test - C Manufacturing (2022)',
    predicted_name='Predictions (2022)',
    predicted_line=dict(color='orange', width=2, dash='dash'),
    mae_q1=mae_q1_2022,
    mae_all=mae_all_2022,
)
fig_2022.show()

# Visualization for 2023
fig_2023 = _build_forecast_figure(
    test_df_2023.index,
    y_test[test_df_2023.index],
    y_pred_2023,
    year=2023,
    title='Sick Leave Validation - C Manufacturing (2023)',
    predicted_name='Rolling Predictions (2023)',
    predicted_line=dict(color='green', width=2, dash='dot'),
    mae_q1=mae_q1_2023,
    mae_all=mae_all_2023,
)
fig_2023.show()


In [16]:
# MAE per quarter for both evaluation years, each scored against the
# prediction series that belongs to that year
predictions_by_year = {2022: y_pred_2022, 2023: y_pred_2023}
for year, y_pred_year in predictions_by_year.items():
    in_year = test_df['Year'] == year
    for quarter in range(1, 5):
        q_mask = in_year & (test_df['Quarter'] == quarter)
        mae_q = mean_absolute_error(y_test[q_mask], y_pred_year[q_mask])
        print(f"MAE for Q{quarter} {year}: {mae_q:.4f}")

MAE for Q1 2022: 0.1550
MAE for Q2 2022: 0.0689
MAE for Q3 2022: 0.0602
MAE for Q4 2022: 0.1189
MAE for Q1 2023: 0.1512
MAE for Q2 2023: 0.2287
MAE for Q3 2023: 0.0930
MAE for Q4 2023: 0.0050
