In [1]:
# 0. Import the libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error

In [2]:
# 1. Load the updated dataset
file_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\merged_tables_new.xlsx"
df = pd.read_excel(file_path)

In [3]:
# 2. Filter for the specific branch 'G Trade'
branch_name = 'G Trade'
df = df[df['BedrijfstakkenBranchesSBI2008'].str.strip().str.lower() == branch_name.strip().lower()]

In [4]:
# 3. Verify and process the 'Year' and 'Quarter' columns for date alignment
if 'Year' not in df.columns or 'Quarter' not in df.columns:
    raise KeyError("The 'Year' and 'Quarter' columns are required in the dataset.")

In [5]:
# 4. Create a 'Date' column representing the start of each quarter
df['Year'] = df['Year'].astype(int)
df['Quarter'] = df['Quarter'].astype(int)
df['Date'] = pd.PeriodIndex.from_fields(year=df['Year'], quarter=df['Quarter'], freq='Q').to_timestamp()

In [6]:
# 5. Set 'Date' as the index for time-series analysis
df.set_index('Date', inplace=True, drop=True)

# Remove duplicate index values if they exist
df = df.loc[~df.index.duplicated(keep='first')]

# Ensure the frequency of the Date index is explicitly set
df.index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='QS')

In [7]:
# 6. Define parameters for SARIMAX model
branch_name = 'G Trade'
target_column = '80072ned_Ziekteverzuimpercentage_1'

In [19]:
# 7. Function for rolling forecasts
def rolling_forecast(train_data, test_data, target_column, order, seasonal_order):
    predictions = []
    rolling_train = train_data.copy()

    for date in test_data.index:
        rolling_train.index.freq = 'QS'  # Ensure frequency is set
        model = sm.tsa.SARIMAX(
            np.log(rolling_train[target_column] + 1),
            order=order,
            seasonal_order=seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False
        )
        results = model.fit(disp=False, maxiter=5000, method='powell')  # Increase maxiter to 5000
        forecast_log = results.get_forecast(steps=1).predicted_mean
        forecast = np.exp(forecast_log) - 1  # Back-transform
        predictions.append(forecast.iloc[0])
        rolling_train = pd.concat([rolling_train, test_data.loc[[date]]])

    return pd.Series(predictions, index=test_data.index)

In [27]:
# 8. Split the data into training and testing sets
train_end_year = 2021  # Include data up to 2021 in training
train_df = df[df['Year'] <= train_end_year].copy()  # Training dataset
test_df = df[df['Year'].isin([2022, 2023])].copy()  # Testing dataset

# Define the target variable for the test dataset
y_test = test_df[target_column]

In [None]:
# 9. Hyperparameter tuning during validation (2022)
best_mae = float('inf')
best_order = None
best_seasonal_order = None

# Define the search grid for parameters
order_grid = [(1, 1, 1), (2, 1, 1)]
seasonal_order_grid = [(2, 1, 1, 4), (2, 1, 2, 4)]

# Loop through the parameter grid
for order in order_grid:
    for seasonal_order in seasonal_order_grid:
        y_pred_2022 = rolling_forecast(train_df, test_df[test_df['Year'] == 2022], 
                                       target_column, order, seasonal_order)
        mae_all_2022 = mean_absolute_error(
            y_test[test_df['Year'] == 2022], y_pred_2022)
        if mae_all_2022 < best_mae:
            best_mae = mae_all_2022
            best_order = order
            best_seasonal_order = seasonal_order

print(f"Best parameters: order={best_order}, seasonal_order={best_seasonal_order}, MAE: {best_mae:.4f}")

Best parameters: order=(2, 1, 1), seasonal_order=(2, 1, 1, 4), MAE: 0.1713


In [29]:
# 10. Predictions for 2023 using tuned parameters
y_pred_2023 = rolling_forecast(
    pd.concat([train_df, test_df[test_df['Year'] == 2022]]),
    test_df[test_df['Year'] == 2023],
    target_column,
    best_order,
    best_seasonal_order
)

mae_q1_2023 = mean_absolute_error(
y_test[(test_df['Year'] == 2023) & (test_df['Quarter'] == 1)],
y_pred_2023[test_df['Quarter'] == 1])
mae_all_2023 = mean_absolute_error(y_test[test_df['Year'] == 2023], y_pred_2023)
print(f"MAE for Q1 2023: {mae_q1_2023:.4f}")
print(f"MAE for all quarters of 2023: {mae_all_2023:.4f}")

MAE for Q1 2023: 0.0201
MAE for all quarters of 2023: 0.2183


In [30]:
# 11. Predictions for 2024 (Q1-Q3 only) using tuned parameters
forecast_df = df[(df['Year'] == 2024) & (df['Quarter'] <= 3)].copy()

y_pred_2024 = rolling_forecast(
    pd.concat([train_df, test_df]),
    forecast_df,
    target_column,
    best_order,
    best_seasonal_order
)

# Define the actual target values for forecasting evaluation
y_forecast = forecast_df[target_column]

mae_q1_2024 = mean_absolute_error(y_forecast[forecast_df['Quarter'] == 1], y_pred_2024[forecast_df['Quarter'] == 1])
mae_all_2024 = mean_absolute_error(y_forecast, y_pred_2024)
print(f"MAE for Q1 2024: {mae_q1_2024:.4f}")
print(f"MAE for all Quarters of 2024: {mae_all_2024:.4f}")

MAE for Q1 2024: 0.7387
MAE for all Quarters of 2024: 0.3641


In [None]:
# 12. Visualizations
# Visualization for 2022
fig_2022 = go.Figure()
fig_2022.add_trace(go.Scatter(
    x=test_df[test_df['Year'] == 2022].index,
    y=y_test[test_df['Year'] == 2022],
    mode='lines+markers',
    name='Actual (2022)',
    line=dict(color='#0078d2', width=2)
))
fig_2022.add_trace(go.Scatter(
    x=test_df[test_df['Year'] == 2022].index,
    y=y_pred_2022,
    mode='lines+markers',
    name='Predictions (2022)',
    line=dict(color='orange', width=2, dash='dash')
))
fig_2022.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.13, showarrow=False,
    text=f"MAE Q1 2022: {mae_q1_2022:.4f}",
    font=dict(size=12, color="black")
)
fig_2022.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.08, showarrow=False,
    text=f"MAE All 2022: {mae_all_2022:.4f}",
    font=dict(size=12, color="black")
)
fig_2022.update_layout(
    title=f'Sick Leave Test - G Trade (2022)',
    xaxis_title='Date',
    yaxis_title='Sick Leave Percentage',
    plot_bgcolor='white',
    xaxis=dict(showgrid=False, tickformat="%Y-%m"),
    yaxis=dict(showgrid=True, gridcolor='lightgrey', showline=True, linewidth=0.5, linecolor='black'),
    font=dict(family="Roboto", size=14),
    margin=dict(l=50, r=50, t=100, b=50),
    width=1100, height=500
)
fig_2022.show()

NameError: name 'mae_q1_2022' is not defined

In [49]:
# 13. Visualization for 2023
fig_2023 = go.Figure()
fig_2023.add_trace(go.Scatter(
    x=test_df[test_df['Year'] == 2023].index,
    y=y_test[test_df['Year'] == 2023],
    mode='lines+markers',
    name='Actual (2023)',
    line=dict(color='#0078d2', width=2)
))
fig_2023.add_trace(go.Scatter(
    x=test_df[test_df['Year'] == 2023].index,
    y=y_pred_2023,
    mode='lines+markers',
    name='Predictions (2023)',
    line=dict(color='green', width=2, dash='dot')
))
fig_2023.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.13, showarrow=False,
    text=f"MAE Q1 2023: {mae_q1_2023:.4f}",
    font=dict(size=12, color="black")
)
fig_2023.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.08, showarrow=False,
    text=f"MAE All 2023: {mae_all_2023:.4f}",
    font=dict(size=12, color="black")
)
fig_2023.update_layout(
    title=f'Sick Leave Validation - G Trade (2023)',
    xaxis_title='Date',
    yaxis_title='Sick Leave Percentage',
    plot_bgcolor='white',
    xaxis=dict(showgrid=False, tickformat="%Y-%m"),
    yaxis=dict(showgrid=True, gridcolor='lightgrey', showline=True, linewidth=0.5, linecolor='black'),
    font=dict(family="Roboto", size=14),
    margin=dict(l=50, r=50, t=100, b=50),
    width=1100, height=500
)
fig_2023.show()

In [50]:
# 14. Visualization for 2024 (Q1-Q3)
fig_2024 = go.Figure()
fig_2024.add_trace(go.Scatter(
    x=forecast_df.index,
    y=y_forecast,
    mode='lines+markers',
    name='Actual (2024 Q1-Q3)',
    line=dict(color='#0078d2', width=2)
))
fig_2024.add_trace(go.Scatter(
    x=forecast_df.index,
    y=y_pred_2024,
    mode='lines+markers',
    name='Predictions (2024)',
    line=dict(color='orange', width=2, dash='dash')
))
fig_2024.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.13, showarrow=False,
    text=f"MAE Q1 2024: {mae_q1_2024:.4f}",
    font=dict(size=12, color="black")
)
fig_2024.add_annotation(
    xref="paper", yref="paper", x=0.00, y=1.08, showarrow=False,
    text=f"MAE All (Q1-Q3) 2024: {mae_all_2024:.4f}",
    font=dict(size=12, color="black")
)
fig_2024.update_layout(
    title=f'Sick Leave Forecast - G Trade (2024 Q1-Q3)',
    xaxis_title='Date',
    yaxis_title='Sick Leave Percentage',
    plot_bgcolor='white',
    xaxis=dict(showgrid=False, tickformat="%Y-%m"),
    yaxis=dict(showgrid=True, gridcolor='lightgrey', showline=True, linewidth=0.5, linecolor='black'),
    font=dict(family="Roboto", size=14),
    margin=dict(l=50, r=50, t=100, b=50),
    width=1100, height=500
)
fig_2024.show()


In [51]:
# 15. Overview of MAE per quarter for 2022, 2023, and 2024

# Initialize a dictionary to store MAE values
mae_overview = {
    "Year": [],
    "Quarter": [],
    "MAE": []
}

# Function to calculate MAE for a specific year and quarter
def calculate_mae(year, quarter, actual, predicted):
    mask = (df['Year'] == year) & (df['Quarter'] == quarter)
    if mask.any():
        return mean_absolute_error(actual[mask], predicted[mask])
    return None

# Add MAE for 2022
for quarter in [1, 2, 3, 4]:
    mae_value = calculate_mae(2022, quarter, y_test, y_pred_2022)
    mae_overview["Year"].append(2022)
    mae_overview["Quarter"].append(quarter)
    mae_overview["MAE"].append(mae_value)

# Add MAE for 2023
for quarter in [1, 2, 3, 4]:
    mae_value = calculate_mae(2023, quarter, y_test, y_pred_2023)
    mae_overview["Year"].append(2023)
    mae_overview["Quarter"].append(quarter)
    mae_overview["MAE"].append(mae_value)

# Add MAE for 2024 (Q1-Q3 only)
for quarter in [1, 2, 3]:
    mae_value = calculate_mae(2024, quarter, y_forecast, y_pred_2024)
    mae_overview["Year"].append(2024)
    mae_overview["Quarter"].append(quarter)
    mae_overview["MAE"].append(mae_value)

# Convert to a DataFrame for easier visualization
mae_df = pd.DataFrame(mae_overview)

# Print the overview
print(mae_df)

# Save the overview as a CSV file
mae_df.to_csv("mae_overview.csv", index=False)


    Year  Quarter       MAE
0   2022        1  0.007430
1   2022        2  0.237451
2   2022        3  0.196701
3   2022        4  0.244285
4   2023        1  0.038545
5   2023        2  0.274506
6   2023        3  0.379878
7   2023        4  0.210999
8   2024        1  0.457429
9   2024        2  0.103436
10  2024        3  0.104835
