In [55]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer
from prettytable import PrettyTable

# Model selection based on complete dataset

In [25]:
# Load csv into a dataframe
df = pd.read_csv("data/merged_table.csv")

# Set the Year_Quarter column as the index for easy operations
df.set_index('Year_Quarter', inplace=True)

# Filter the index to include only values between 2008 and 2021 (inclusive)
df_filtered = df[(df.index >= '2008') & (df.index <= '2022')]

df_filtered.head()

Unnamed: 0_level_0,BedrijfstakkenBranchesSBI2008,80072ned_Ziekteverzuimpercentage_1,83451NED_BanenMetSeizoenscorrectie_1,83451NED_BanenZonderSeizoenscorrectie_2,83451NED_ArbeidsvolumeMetSeizoenscorrectie_3,83451NED_ArbeidsvolumeZonderSeizoenscorrectie_4,83451NED_MaandloonInclusiefOverwerk_5,83451NED_MaandloonExclusiefOverwerk_6,85928NED_PrijsindexArbeid_1,85928NED_JaarmutatiePrijsVanArbeid_2,...,80072ned_Ziekteverzuimpercentage_1_lag_5,80072ned_Ziekteverzuimpercentage_1_lag_6,80072ned_Ziekteverzuimpercentage_1_lag_7,80072ned_Ziekteverzuimpercentage_1_lag_8,covid_19,airpressure,maximum_temperatures,mean_temperatures,minimum_temperatures,precipitation
Year_Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-Q1,C Industrie,5.4,12198.0,12186.0,10997.0,10994.0,133808.0,130894.0,78.5,3.9,...,5.9,5.0,5.4,6.1,0,10143.0,91.333333,58.333333,23.666667,758.0
2008-Q1,G Handel,3.7,18644.0,18547.0,13323.0,13201.0,111633.0,109999.0,80.4,2.4,...,3.3,2.9,3.1,3.5,0,10143.0,91.333333,58.333333,23.666667,758.0
2008-Q1,Q Gezondheids- en welzijnszorg,6.0,17568.0,17535.0,12161.0,12131.0,107804.0,107351.0,74.5,3.3,...,5.4,4.5,5.2,6.0,0,10143.0,91.333333,58.333333,23.666667,758.0
2008-Q2,C Industrie,4.8,12218.0,12238.0,11005.0,11006.0,134681.0,131518.0,79.8,5.0,...,5.4,5.9,5.0,5.4,0,10143.0,188.666667,137.0,80.0,354.333333
2008-Q2,G Handel,3.2,18771.0,18772.0,13384.0,13360.0,112189.0,110428.0,81.2,2.9,...,3.6,3.3,2.9,3.1,0,10143.0,188.666667,137.0,80.0,354.333333


In [26]:
df_timeseries = df_filtered[['BedrijfstakkenBranchesSBI2008'] + ["covid_19"] + [col for col in df_filtered.columns if '80072ned_Ziekteverzuimpercentage_1' in col]]
df_timeseries.head()

Unnamed: 0_level_0,BedrijfstakkenBranchesSBI2008,covid_19,80072ned_Ziekteverzuimpercentage_1,80072ned_Ziekteverzuimpercentage_1_lag_1,80072ned_Ziekteverzuimpercentage_1_lag_2,80072ned_Ziekteverzuimpercentage_1_lag_3,80072ned_Ziekteverzuimpercentage_1_lag_4,80072ned_Ziekteverzuimpercentage_1_lag_5,80072ned_Ziekteverzuimpercentage_1_lag_6,80072ned_Ziekteverzuimpercentage_1_lag_7,80072ned_Ziekteverzuimpercentage_1_lag_8
Year_Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-Q1,C Industrie,0,5.4,5.2,4.3,4.7,5.4,5.9,5.0,5.4,6.1
2008-Q1,G Handel,0,3.7,3.4,3.0,3.2,3.6,3.3,2.9,3.1,3.5
2008-Q1,Q Gezondheids- en welzijnszorg,0,6.0,5.3,4.4,5.1,5.9,5.4,4.5,5.2,6.0
2008-Q2,C Industrie,0,4.8,5.4,5.2,4.3,4.7,5.4,5.9,5.0,5.4
2008-Q2,G Handel,0,3.2,3.7,3.4,3.0,3.2,3.6,3.3,2.9,3.1


## Linear regression

In [69]:
# Define a function to perform training/validation and save results
def train_and_validate(df, target_column, model, model_name, save_csv=False):
    """
    Perform time-series cross-validation using the specified model and optionally save results to a CSV.
    
    Parameters:
        df (DataFrame): Time-series data for all industries.
        target_column (str): Target column for prediction.
        model: A scikit-learn-compatible model instance (e.g., LinearRegression, RandomForest).
        model_name (str): Name of the model to use in file naming.
        save_csv (bool): Whether to save the results as a CSV. Default is False.
    """
    industries = df['BedrijfstakkenBranchesSBI2008'].unique()

    # Placeholder for storing results
    cv_results = []

    # Perform the analysis for each industry
    for industry in industries:
        result = perform_time_series_cv(df, target_column, industry, model)
        cv_results.append(result)

    # Prepare the results for saving to CSV
    output_rows = []
    for result in cv_results:
        for res in result["results"]:
            output_rows.append({
                "Industry": result["industry"],
                "Train Start": res["train_range"][0],
                "Train End": res["train_range"][1],
                "Validation Start": res["validation_range"][0],
                "Validation End": res["validation_range"][1],
                "MAE": res["mae"],
                "Actual Values": res["actual_values"],
                "Predicted Values": res["predicted_values"]
            })

    # Optionally save to CSV
    if save_csv:
        output_df = pd.DataFrame(output_rows)
        output_folder = "data"
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, f"{model_name}_training_results.csv")
        output_df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}\n")

    # Log the results in table format
    log_results(cv_results, model_name)

# Updated function to include model as an input
def perform_time_series_cv(df, target_column, industry, model):
    """
    Perform time-series cross-validation using the specified model.
    
    Parameters:
        df (DataFrame): Time-series data for all industries.
        target_column (str): Target column for prediction.
        industry (str): Specific industry to filter data for.
        model: A scikit-learn-compatible model instance.
    
    Returns:
        dict: Results including average MAE, predictions, and actual values per fold.
    """
    # Filter for the specific industry
    industry_data = df[df['BedrijfstakkenBranchesSBI2008'] == industry].copy()

    # Ensure the index is in Period format
    industry_data.index = pd.PeriodIndex(industry_data.index, freq='Q')

    # Define X (features) and y (target)
    X = industry_data.drop(columns=[target_column, 'BedrijfstakkenBranchesSBI2008'])
    y = industry_data[target_column]

    # Generate train-validation splits
    def generate_splits():
        train_start = pd.Period("2008Q1", freq="Q")
        train_end = pd.Period("2010Q4", freq="Q")
        validation_start = pd.Period("2011Q1", freq="Q")
        validation_end = pd.Period("2021Q4", freq="Q")  # Ensure four-quarter validation remains in range

        splits = []

        while validation_start + 3 <= validation_end:
            splits.append({
                "train_start": train_start,
                "train_end": train_end,
                "validation_start": validation_start,
                "validation_end": validation_start + 3  # Four quarters ahead
            })

            train_end += 1
            validation_start += 1

        return splits

    splits = generate_splits()

    # Perform cross-validation
    results = []
    quarterly_mae = {f"Q{i}": [] for i in range(1, 5)}  # Track MAEs by quarter for this industry

    for split in splits:
        # Identify train and validation indices
        train_index = (industry_data.index >= split["train_start"]) & (industry_data.index <= split["train_end"])
        validation_index = (industry_data.index >= split["validation_start"]) & (industry_data.index <= split["validation_end"])

        # Ensure there is data in both sets
        if train_index.sum() == 0 or validation_index.sum() == 0:
            print(f"Skipping split: Train ({split['train_start']} to {split['train_end']}), Validation ({split['validation_start']} to {split['validation_end']})")
            continue

        # Extract train and validation sets
        X_train, X_validation = X[train_index], X[validation_index]
        y_train, y_validation = y[train_index], y[validation_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict on validation set
        y_pred = model.predict(X_validation)

        # Store actual and predicted values
        actual_values = [float(round(val, 2)) for val in y_validation.values.tolist()]
        predicted_values = [float(round(pred, 2)) for pred in y_pred]  # Explicitly cast to float

        # Calculate MAE for each quarter in the validation period
        validation_periods = industry_data.index[validation_index]
        quarter_mae = {
            str(period): float(round(mean_absolute_error([y_validation.loc[period]], [y_pred[i]]), 2))
            for i, period in enumerate(validation_periods)
        }

        # Append MAE to quarterly tracking
        for period, mae in quarter_mae.items():
            quarter = int(period[-1])  # Extract quarter from "YYYYQx"
            quarterly_mae[f"Q{quarter}"].append(mae)

        results.append({
            "train_range": (split["train_start"], split["train_end"]),
            "validation_range": (split["validation_start"], split["validation_end"]),
            "mae": quarter_mae,
            "actual_values": actual_values,
            "predicted_values": predicted_values
        })

    # Calculate average MAE for this industry by quarter
    avg_mae_by_quarter = {q: float(round(np.mean(maes), 2)) if maes else None for q, maes in quarterly_mae.items()}
    return {"industry": industry, "avg_mae": avg_mae_by_quarter, "results": results}

def log_results(cv_results, model_name):
    """
    Logs and formats the results of cross-validation.
    
    Parameters:
        cv_results (list): Results of the cross-validation for each industry.
        model_name (str): Name of the model used.
    """

    # Create a table for formatted results
    table = PrettyTable()
    table.field_names = ["Industry", "Q1 MAE", "Q2 MAE", "Q3 MAE", "Q4 MAE"]

    # Add each industry's results to the table
    for result in cv_results:
        avg_mae = result["avg_mae"]
        table.add_row([
            result["industry"],
            avg_mae.get("Q1", "N/A"),
            avg_mae.get("Q2", "N/A"),
            avg_mae.get("Q3", "N/A"),
            avg_mae.get("Q4", "N/A")
        ])

    # Print the formatted table
    print("Cross-Validation Results:")
    print(table)

def compute_feature_importance_linear(X, model):
    """
    Compute feature importance using coefficients for a linear model.
    
    Parameters:
        X (DataFrame): Input features used for training the model.
        model: A fitted linear model with coefficients.
    
    Returns:
        DataFrame: A DataFrame with features and their corresponding importance values.
    """
    if not hasattr(model, "coef_"):
        raise ValueError("Model must have coefficients (e.g., LinearRegression).")

    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Extract feature names and coefficients
    feature_importance = pd.DataFrame({
        "Feature": X.columns,
        "Importance": model.coef_
    })

    # Sort by absolute importance
    feature_importance["Abs_Importance"] = feature_importance["Importance"].abs()
    feature_importance = feature_importance.sort_values(by="Abs_Importance", ascending=False)

    return feature_importance[["Feature", "Importance"]]

In [66]:
train_and_validate(df_timeseries, target_column='80072ned_Ziekteverzuimpercentage_1', model=LinearRegression(), model_name="LinearRegression", save_csv=True)

Results saved to data/LinearRegression_training_results.csv

Cross-Validation Results:
+--------------------------------+--------+--------+--------+--------+
|            Industry            | Q1 MAE | Q2 MAE | Q3 MAE | Q4 MAE |
+--------------------------------+--------+--------+--------+--------+
|          C Industrie           |  0.44  |  0.2   |  0.22  |  0.2   |
|            G Handel            |  0.3   |  0.18  |  0.17  |  0.18  |
| Q Gezondheids- en welzijnszorg |  0.37  |  0.22  |  0.26  |  0.27  |
+--------------------------------+--------+--------+--------+--------+


In [70]:
feature_importance_df = compute_feature_importance_linear(X_train, fitted_model)


NameError: name 'X_train' is not defined

In [67]:
train_and_validate(df_timeseries, target_column='80072ned_Ziekteverzuimpercentage_1', model=Ridge(), model_name="Ridge", save_csv=False)

Cross-Validation Results:
+--------------------------------+--------+--------+--------+--------+
|            Industry            | Q1 MAE | Q2 MAE | Q3 MAE | Q4 MAE |
+--------------------------------+--------+--------+--------+--------+
|          C Industrie           |  0.42  |  0.2   |  0.23  |  0.15  |
|            G Handel            |  0.3   |  0.19  |  0.18  |  0.15  |
| Q Gezondheids- en welzijnszorg |  0.31  |  0.18  |  0.22  |  0.27  |
+--------------------------------+--------+--------+--------+--------+


In [68]:
train_and_validate(df_timeseries, target_column='80072ned_Ziekteverzuimpercentage_1', model=Lasso(), model_name="Lasso", save_csv=False)

Cross-Validation Results:
+--------------------------------+--------+--------+--------+--------+
|            Industry            | Q1 MAE | Q2 MAE | Q3 MAE | Q4 MAE |
+--------------------------------+--------+--------+--------+--------+
|          C Industrie           |  0.67  |  0.39  |  0.43  |  0.32  |
|            G Handel            |  0.54  |  0.36  |  0.39  |  0.35  |
| Q Gezondheids- en welzijnszorg |  0.8   |  0.45  |  0.52  |  0.51  |
+--------------------------------+--------+--------+--------+--------+


## Met crossfall

In [31]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer

In [38]:
# Function to perform cross-validation and hyperparameter tuning
def perform_tuning(df, target_column, lead, industry):
    # Filter data for the specific industry
    df = df[df['BedrijfstakkenBranchesSBI2008'] == industry].copy()

    # Convert year_quarter to a proper format
    df.loc[:, 'Year_Quarter'] = pd.PeriodIndex(df['Year_Quarter'], freq='Q')

    year_quarter = df['Year_Quarter']

    # Ensure alignment of indices
    X = df.drop(columns=[target_column, 'Year_Quarter', 'BedrijfstakkenBranchesSBI2008']).reset_index(drop=True)
    y = df[target_column].reset_index(drop=True)
    year_quarter = year_quarter.reset_index(drop=True)

    # Generate train/validation splits
    def generate_splits():
        train_start = pd.Period("2008Q1", freq="Q")
        train_end = pd.Period("2010Q4", freq="Q")
        validation_start = pd.Period("2011Q1", freq="Q")
        validation_end = pd.Period("2021Q4", freq="Q")

        splits = []

        while validation_start <= validation_end:
            splits.append({
                "train_start": "2008Q1",  # Always start training from 2008Q1
                "train_end": str(train_end),
                "validation": str(validation_start)
            })

            train_end += 1
            validation_start += 1

        return splits

    splits = generate_splits()

    # Hyperparameter grid
    alphas = [0.1, 1.0, 10.0, 100.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0]
    tuning_results = []

    print(f"Time Series Cross-Validation with Hyperparameter Tuning for Q+{lead} ({industry}):")

    # Perform cross-validation with hyperparameter tuning
    for alpha in alphas:
        fold_results = []
        print(f"Testing alpha: {alpha}\n")

        for fold, split in enumerate(splits, 1):
            # Convert string dates to Period
            train_start = pd.Period(split["train_start"], freq='Q')
            train_end = pd.Period(split["train_end"], freq='Q')
            validation_date = pd.Period(split["validation"], freq='Q')

            # Get train and validation indices
            train_index = year_quarter[(year_quarter >= train_start) & (year_quarter <= train_end)].index
            validation_index = year_quarter[year_quarter == validation_date].index

            # Extract train and validation sets
            X_train, X_validation = X.iloc[train_index], X.iloc[validation_index]
            y_train, y_validation = y.iloc[train_index], y.iloc[validation_index]
            train_dates, validation_dates = year_quarter.iloc[train_index], year_quarter.iloc[validation_index]

            # Train the model with the current alpha
            model = Ridge(alpha=alpha)
            model.fit(X_train, y_train)

            # Predict and evaluate
            y_pred = model.predict(X_validation)
            mae = mean_absolute_error(y_validation, y_pred)

            # Save results for this fold
            fold_results.append(mae)

            print(f"Fold {fold}:")
            print(f"Train Date Range: {train_dates.min()} to {train_dates.max()}")
            print(f"Validation Date: {validation_dates.iloc[0]}")
            print(f"MAE: {mae:.4f}")

        # Calculate average MAE for this alpha
        avg_mae = np.mean(fold_results) if fold_results else float('inf')
        tuning_results.append({"alpha": alpha, "average_mae": avg_mae})
        print(f"\nAverage MAE for alpha {alpha}: {avg_mae:.4f}\n")

    # Find the best alpha
    best_result = min(tuning_results, key=lambda x: x["average_mae"])

    # Summary of all hyperparameters
    print(f"\nHyperparameter Tuning Summary for Q+{lead} ({industry}):")
    for result in tuning_results:
        print(f"Alpha: {result['alpha']}, Average MAE: {result['average_mae']:.4f}")

    print(f"\nBest alpha for Q+{lead} ({industry}): {best_result['alpha']} with Average MAE: {best_result['average_mae']:.4f}\n")

    return tuning_results, best_result

# Initialize variables for industries and data files
industries = ['C Industrie', 'G Handel', 'Q Gezondheids- en welzijnszorg']

# Data files for Q+1 to Q+4
data_files = [
    ('data/80072ned_Ziekteverzuimpercentage_1_lead_1.csv', '80072ned_Ziekteverzuimpercentage_1_lead_1', 1),
    ('data/80072ned_Ziekteverzuimpercentage_1_lead_2.csv', '80072ned_Ziekteverzuimpercentage_1_lead_2', 2),
    ('data/80072ned_Ziekteverzuimpercentage_1_lead_3.csv', '80072ned_Ziekteverzuimpercentage_1_lead_3', 3),
    ('data/80072ned_Ziekteverzuimpercentage_1_lead_4.csv', '80072ned_Ziekteverzuimpercentage_1_lead_4', 4)
]

# Placeholder for all summaries
all_summaries = []

# Perform tuning for each lead and each industry
for file_path, target_column, lead in data_files:
    print(f"Running tuning for Q+{lead}")
    df_lead = pd.read_csv(file_path)
    for industry in industries:
        print(f"\nIndustry: {industry}")
        tuning_results, best_result = perform_tuning(df_lead, target_column, lead, industry)
        all_summaries.append({
            "lead": lead,
            "industry": industry,
            "tuning_results": tuning_results,
            "best_result": best_result
        })

Running tuning for Q+1

Industry: C Industrie
Time Series Cross-Validation with Hyperparameter Tuning for Q+1 (C Industrie):
Testing alpha: 0.1



ValueError: Input X contains NaN.
Ridge does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [30]:
# Function to apply the best model to the test set
def apply_best_model(df, target_column, best_alpha, industry, lead):
    # Filter data for the specific industry and create a copy
    df = df[df['BedrijfstakkenBranchesSBI2008'] == industry].copy()

    # Convert 'Year_Quarter' to PeriodIndex
    df['Year_Quarter'] = pd.PeriodIndex(df['Year_Quarter'], freq='Q')

    # Define the test period
    test_start = pd.Period("2022Q1", freq="Q")
    test_end = pd.Period("2023Q4", freq="Q")

    results = []

    # Apply models for each quarter in the test period
    while test_start <= test_end:
        # The Prediction Period is the current `test_start`
        prediction_period = test_start

        # Train data up to the current `test_start - lead`
        train_end = prediction_period - lead
        if train_end < pd.Period("2008Q1", freq="Q"):
            print(f"Skipping {prediction_period} as training period {train_end} is out of range.")
            test_start += 1
            continue

        train_data = df[df['Year_Quarter'] <= train_end]

        # Test data corresponds to the `prediction_period`
        test_data = df[df['Year_Quarter'] == prediction_period]

        if test_data.empty:
            print(f"No test data available for {prediction_period}. Skipping.")
            test_start += 1
            continue

        # Ensure alignment of indices
        X_train = train_data.drop(columns=[target_column, 'Year_Quarter', 'BedrijfstakkenBranchesSBI2008'])
        y_train = train_data[target_column]
        X_test = test_data.drop(columns=[target_column, 'Year_Quarter', 'BedrijfstakkenBranchesSBI2008'])
        y_test = test_data[target_column]  # Actual values for `prediction_period`

        # Handle missing values
        imputer = SimpleImputer(strategy='mean')
        X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
        y_train = y_train.fillna(y_train.mean())  # Fill missing target values in training set
        X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

        # Train the model with the best alpha
        model = Ridge(alpha=best_alpha)
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Evaluate the model
        mae = mean_absolute_error(y_test, y_pred)

        # Append results for this period
        results.append({
            "Industry": industry,
            "Model": f"Q+{lead}",
            "Input Period": f"2008Q1 to {train_end}",
            "Prediction Period": str(prediction_period),
            "Predicted Value": y_pred.tolist(),
            "Actual Value": y_test.tolist(),  # Actual values match the prediction period
            "MAE": mae
        })

        # Move to the next test period
        test_start += 1

    return results


# Updated script to include Q+1 and ensure testing works as expected
summaries = []
industries = ['C Industrie', 'G Handel', 'Q Gezondheids- en welzijnszorg']

# Perform tuning for Q+1 for each industry
df_lead_1 = pd.read_csv('data/80072ned_Ziekteverzuimpercentage_1_lead_1.csv')
for industry in industries:
    summary_q1, best_result_q1 = perform_tuning(df_lead_1, '80072ned_Ziekteverzuimpercentage_1_lead_1', lead=1, industry=industry)
    summaries.append({"lead": 1, "industry": industry, "summary": summary_q1, "best_result": best_result_q1})

# Perform tuning for Q+2, Q+3, and Q+4 for each industry
data_files = [
    ('data/80072ned_Ziekteverzuimpercentage_1_lead_2.csv', '80072ned_Ziekteverzuimpercentage_1_lead_2', 2),
    ('data/80072ned_Ziekteverzuimpercentage_1_lead_3.csv', '80072ned_Ziekteverzuimpercentage_1_lead_3', 3),
    ('data/80072ned_Ziekteverzuimpercentage_1_lead_4.csv', '80072ned_Ziekteverzuimpercentage_1_lead_4', 4)
]

for file_path, target_column, lead in data_files:
    df_lead = pd.read_csv(file_path)
    for industry in industries:
        summary, best_result = perform_tuning(df_lead, target_column, lead=lead, industry=industry)
        summaries.append({"lead": lead, "industry": industry, "summary": summary, "best_result": best_result})

# Apply best models to the test set
final_results = []

# Include Q+1 predictions explicitly
data_files.insert(0, ('data/80072ned_Ziekteverzuimpercentage_1_lead_1.csv', '80072ned_Ziekteverzuimpercentage_1_lead_1', 1))

for file_path, target_column, lead in data_files:
    df_lead = pd.read_csv(file_path)
    for summary in summaries:
        if summary['lead'] == lead:
            industry = summary['industry']
            best_alpha = summary['best_result']['alpha']
            results = apply_best_model(df_lead, target_column, best_alpha, industry, lead)
            final_results.extend(results)

# Store final results in a DataFrame for analysis
results_df = pd.DataFrame(final_results)

# Save results to CSV
results_df.to_csv('test_results.csv', index=False)

# Calculate and display average MAE per industry and model
average_mae_summary = results_df.groupby(['Industry', 'Model'])['MAE'].mean().reset_index()
print("\nAverage MAE per Industry and Model:")
print(average_mae_summary)

print("\nTest results saved to 'test_results.csv'.")


Time Series Cross-Validation with Hyperparameter Tuning for Q+1 (C Industrie):
Testing alpha: 0.1

  Fold 1:
    Train Date Range: 2008Q1 to 2010Q4
    Validation Date: 2011Q1
    MAE: 1.0798
  Fold 2:
    Train Date Range: 2008Q1 to 2011Q1
    Validation Date: 2011Q2
    MAE: 0.2500
  Fold 3:
    Train Date Range: 2008Q1 to 2011Q2
    Validation Date: 2011Q3
    MAE: 0.0807
  Fold 4:
    Train Date Range: 2008Q1 to 2011Q3
    Validation Date: 2011Q4
    MAE: 0.0602
  Fold 5:
    Train Date Range: 2008Q1 to 2011Q4
    Validation Date: 2012Q1
    MAE: 0.1730
  Fold 6:
    Train Date Range: 2008Q1 to 2012Q1
    Validation Date: 2012Q2
    MAE: 0.4396
  Fold 7:
    Train Date Range: 2008Q1 to 2012Q2
    Validation Date: 2012Q3
    MAE: 0.5739
  Fold 8:
    Train Date Range: 2008Q1 to 2012Q3
    Validation Date: 2012Q4
    MAE: 0.9442
  Fold 9:
    Train Date Range: 2008Q1 to 2012Q4
    Validation Date: 2013Q1
    MAE: 0.3248
  Fold 10:
    Train Date Range: 2008Q1 to 2013Q1
    Validation

In [None]:
import pandas as pd

# Initialize the start and end points for training and validation
train_start = pd.Period("2008Q1", freq="Q")
train_end = pd.Period("2010Q4", freq="Q")
validation_start = pd.Period("2011Q1", freq="Q")
validation_end = pd.Period("2022Q4", freq="Q")

# List to store the splits
splits = []

# Loop until validation exceeds the desired end date
while validation_start <= validation_end:
    splits.append({
        "train_start": str(train_start),
        "train_end": str(train_end),
        "validation": str(validation_start)
    })

    # Move the train_end and validation forward by one quarter
    train_end += 1
    validation_start += 1

# Print the splits
for split in splits:
    print(split)


In [None]:
print(f"Fold {fold}: train_index = {train_index}, test_index = {test_index}")


## Hieronder een test zonder Crossval

In [None]:
df.tail()

In [None]:
df = df.dropna()

In [None]:
train_df = df[df['Year_Quarter'] < '2021-Q4']
test_df = df[df['Year_Quarter'] == '2021-Q4']

In [None]:
train_df.tail()

In [None]:
test_df.tail()

In [None]:
# Initialize the model
model = Ridge()  # Or use another model like RandomForestRegressor

In [None]:
# Prepare the test data
X_test = test_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1_lead_1', 'Year_Quarter'])
y_test = test_df['80072ned_Ziekteverzuimpercentage_1_lead_1']

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Combine actual vs predicted into a DataFrame for comparison
comparison_df = test_df[['Year_Quarter']].copy()
comparison_df['Actual'] = y_test.values
comparison_df['Predicted'] = y_pred

# Print the comparison DataFrame
print("Actual vs Predicted:")
print(comparison_df)

# Optional: Save the comparison to a CSV for review
comparison_df.to_csv('actual_vs_predicted.csv', index=False)


In [None]:
df_lead_2 = df_lead_2.dropna()

In [None]:
train_df_lead_2 = df_lead_2[df_lead_2['Year_Quarter'] < '2021-Q4']
test_df_lead_2 = df_lead_2[df_lead_2['Year_Quarter'] == '2021-Q4']

In [None]:
train_df_lead_2.tail()

In [None]:
test_df_lead_2.tail()

In [None]:
X_train_df_lead_2 = train_df_lead_2.drop(columns=['80072ned_Ziekteverzuimpercentage_1_lead_2', 'Year_Quarter'])
y_train_df_lead_2 = train_df_lead_2['80072ned_Ziekteverzuimpercentage_1_lead_2']

In [None]:
# Initialize the model
model = Ridge()  # Or use another model like RandomForestRegressor

# Train the model on the training data
model.fit(X_train_df_lead_2, y_train_df_lead_2)

In [None]:
# Prepare the test data
X_test_df_lead_2 = test_df_lead_2.drop(columns=['80072ned_Ziekteverzuimpercentage_1_lead_2', 'Year_Quarter'])
y_test_df_lead_2 = test_df_lead_2['80072ned_Ziekteverzuimpercentage_1_lead_2']

# Make predictions on the test set
y_pred_df_lead_2 = model.predict(X_test_df_lead_2)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test_df_lead_2, y_pred_df_lead_2))
print(f"RMSE: {rmse}")

# Combine actual vs predicted into a DataFrame for comparison
comparison_df_lead_2 = test_df[['Year_Quarter']].copy()
comparison_df_lead_2['Actual'] = y_test_df_lead_2.values
comparison_df_lead_2['Predicted'] = y_pred_df_lead_2

# Print the comparison DataFrame
print("Actual vs Predicted:")
print(comparison_df_lead_2)

# Optional: Save the comparison to a CSV for review
comparison_df_lead_2.to_csv('actual_vs_predicted_lead_2.csv', index=False)


In [None]:
df_lead_3 = df_lead_3.dropna()

In [None]:
train_df_lead_3 = df_lead_3[df_lead_3['Year_Quarter'] < '2021-Q4']
test_df_lead_3 = df_lead_3[df_lead_3['Year_Quarter'] == '2021-Q4']

In [None]:
train_df_lead_3.tail()

In [None]:
test_df_lead_3.tail()

In [None]:
X_train_df_lead_3 = train_df_lead_3.drop(columns=['80072ned_Ziekteverzuimpercentage_1_lead_3', 'Year_Quarter'])
y_train_df_lead_3 = train_df_lead_3['80072ned_Ziekteverzuimpercentage_1_lead_3']

In [None]:
# Initialize the model
model = Ridge()  # Or use another model like RandomForestRegressor

# Train the model on the training data
model.fit(X_train_df_lead_3, y_train_df_lead_3)

In [None]:
# Prepare the test data
X_test_df_lead_3 = test_df_lead_3.drop(columns=['80072ned_Ziekteverzuimpercentage_1_lead_3', 'Year_Quarter'])
y_test_df_lead_3 = test_df_lead_3['80072ned_Ziekteverzuimpercentage_1_lead_3']

# Make predictions on the test set
y_pred_df_lead_3 = model.predict(X_test_df_lead_3)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test_df_lead_3, y_pred_df_lead_3))
print(f"RMSE: {rmse}")

# Combine actual vs predicted into a DataFrame for comparison
comparison_df_lead_3 = test_df[['Year_Quarter']].copy()
comparison_df_lead_3['Actual'] = y_test_df_lead_3.values
comparison_df_lead_3['Predicted'] = y_pred_df_lead_3

# Print the comparison DataFrame
print("Actual vs Predicted:")
print(comparison_df_lead_3)

# Optional: Save the comparison to a CSV for review
comparison_df_lead_3.to_csv('actual_vs_predicted_lead_3.csv', index=False)

In [None]:
df_lead_4 = df_lead_4.dropna()

In [None]:
train_df_lead_4 = df_lead_4[df_lead_4['Year_Quarter'] < '2021-Q4']
test_df_lead_4 = df_lead_4[df_lead_4['Year_Quarter'] == '2021-Q4']

In [None]:
train_df_lead_4.tail()

In [None]:
test_df_lead_4.tail()

In [None]:
X_train_df_lead_4 = train_df_lead_4.drop(columns=['80072ned_Ziekteverzuimpercentage_1_lead_4', 'Year_Quarter'])
y_train_df_lead_4 = train_df_lead_4['80072ned_Ziekteverzuimpercentage_1_lead_4']

In [None]:
# Initialize the model
model = Ridge()  # Or use another model like RandomForestRegressor

# Train the model on the training data
model.fit(X_train_df_lead_4, y_train_df_lead_4)

In [None]:
# Prepare the test data
X_test_df_lead_4 = test_df_lead_4.drop(columns=['80072ned_Ziekteverzuimpercentage_1_lead_4', 'Year_Quarter'])
y_test_df_lead_4 = test_df_lead_4['80072ned_Ziekteverzuimpercentage_1_lead_4']

# Make predictions on the test set
y_pred_df_lead_4 = model.predict(X_test_df_lead_4)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test_df_lead_4, y_pred_df_lead_4))
print(f"RMSE: {rmse}")

# Combine actual vs predicted into a DataFrame for comparison
comparison_df_lead_4 = test_df[['Year_Quarter']].copy()
comparison_df_lead_4['Actual'] = y_test_df_lead_4.values
comparison_df_lead_4['Predicted'] = y_pred_df_lead_4

# Print the comparison DataFrame
print("Actual vs Predicted:")
print(comparison_df_lead_4)

# Optional: Save the comparison to a CSV for review
comparison_df_lead_4.to_csv('actual_vs_predicted_lead_4.csv', index=False)

## Verder gaan

In [None]:
# Filter the dataset for "C Industrie"
industry = 'C Industrie'
industry_df = df[df['BedrijfstakkenBranchesSBI2008'] == industry]

# Split into training and testing based on 'Year_Quarter'
train_df = industry_df[industry_df['Year_Quarter'] < '2022-Q1']
test_df = industry_df[industry_df['Year_Quarter'] >= '2022-Q1']

# Separate features and target for training
X_train = train_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1', 'Year_Quarter', 'BedrijfstakkenBranchesSBI2008'])
y_train = train_df['80072ned_Ziekteverzuimpercentage_1']

# Separate the initial features and target for testing
# We’ll use X_test_initial for recursive predictions
X_test_initial = train_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1', 'Year_Quarter', 'BedrijfstakkenBranchesSBI2008']).iloc[-1]


In [None]:
import pandas as pd

# Set display options for Pandas to show all columns if it's a DataFrame or Series
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

# Print the full content of X_test_initial
print(X_test_initial)


In [None]:


# Initialize the model
model = Ridge()  # Or use another model like RandomForestRegressor

# Train the model on the training data
model.fit(X_train, y_train)


In [None]:
# Create lagged features for the target variable
for lag in [1, 2, 3, 4]:  # Create lags for the last 4 quarters
    df[f'80072ned_Ziekteverzuimpercentage_1_lag_{lag}'] = df['80072ned_Ziekteverzuimpercentage_1'].shift(lag)

# Drop rows with missing values due to lagging
df = df.dropna().reset_index(drop=True)


In [None]:
# Define features and target for training
X_train = train_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1', 'Year_Quarter', 'BedrijfstakkenBranchesSBI2008'])
y_train = train_df['80072ned_Ziekteverzuimpercentage_1']


In [None]:
# Set the initial data for recursive forecasting
X_test_initial = X_train.iloc[-1].copy()  # Use the last row of training data as the starting point
X_test_initial = pd.DataFrame([X_test_initial], columns=X_train.columns)


In [None]:
import numpy as np
import pandas as pd

# Placeholder to store predictions for each quarter in 2022
predictions = []

# Number of future quarters we want to predict (e.g., all quarters in 2022)
future_periods = 4

# Start with a copy of the initial test data for recursive predictions
X_current = X_test_initial.copy()

# Ensure X_current is a DataFrame with the correct feature names
X_current = pd.DataFrame([X_current], columns=X_train.columns)

for i in range(future_periods):
    # Predict for the next quarter
    y_pred = model.predict(X_current)[0]
    predictions.append(y_pred)
    
    # Update lag features for the next prediction
    for lag in range(4, 1, -1):  # Update lags 4 -> 3 -> 2 -> 1
        X_current.loc[:, f'80072ned_Ziekteverzuimpercentage_1_lag_{lag}'] = X_current[f'80072ned_Ziekteverzuimpercentage_1_lag_{lag-1}']
    X_current.loc[:, '80072ned_Ziekteverzuimpercentage_1_lag_1'] = y_pred  # Set lag 1 to the current prediction

# Display predictions for each quarter in 2022
print("Predicted sick leave percentages for 'C Industrie' in 2022:", predictions)


## Samenvoegen modellen

In [None]:
# List of unique industries
industries = df['BedrijfstakkenBranchesSBI2008'].unique()

# Dictionary to store train and test sets for each industry
industry_splits = {}

for industry in industries:
    # Filter data for the specific industry
    industry_df = df[df['BedrijfstakkenBranchesSBI2008'] == industry]
    
    # Split into train and test based on Year_Quarter
    train_df = industry_df[industry_df['Year_Quarter'] < '2022-Q1']
    test_df = industry_df[industry_df['Year_Quarter'] >= '2022-Q1']
    
    # Separate features and target for training and testing
    X_train = train_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1'])
    y_train = train_df['80072ned_Ziekteverzuimpercentage_1']
    X_test = test_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1'])
    y_test = test_df['80072ned_Ziekteverzuimpercentage_1']
    
    # Store train and test sets in the dictionary
    industry_splits[industry] = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

    print(f"Data for {industry}:")
    print("  Training data:", X_train.shape, y_train.shape)
    print("  Testing data:", X_test.shape, y_test.shape)

# First, select numeric columns for grouping
df_numeric = df.select_dtypes(include=[float, int])

# Group by 'Year_Quarter' and calculate the mean only for numeric columns
df_grouped = df_numeric.groupby(df['Year_Quarter']).mean().reset_index()

# Now split into train and test based on 'Year_Quarter'
train_df = df_grouped[df_grouped['Year_Quarter'] < '2022-Q1']
test_df = df_grouped[df_grouped['Year_Quarter'] >= '2022-Q1']

# Separate features and target for the combined dataset
X_train_combined = train_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1'])
y_train_combined = train_df['80072ned_Ziekteverzuimpercentage_1']
X_test_combined = test_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1'])
y_test_combined = test_df['80072ned_Ziekteverzuimpercentage_1']

print("\nCombined data (after grouping by Year_Quarter):")
print("  Training data:", X_train_combined.shape, y_train_combined.shape)
print("  Testing data:", X_test_combined.shape, y_test_combined.shape)

In [None]:
industry_splits['C Industrie']['X_train']

In [None]:
industry_splits['C Industrie']['y_train']

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define function to train and evaluate a model, and capture predictions vs actuals
def train_and_evaluate(X_train, y_train, X_test, y_test, industry_name):
    model = LinearRegression()  # Initialize the model
    model.fit(X_train, y_train)  # Train the model
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)
    
    print(f"\n{industry_name} Model Evaluation:")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  Mean Squared Error (MSE): {mse}")
    print(f"  Root Mean Squared Error (RMSE): {rmse}")
    print(f"  R-squared (R2): {r2}")
    
    # Create a DataFrame with predictions and actuals for comparison
    results_df = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred
    })
    
    return model, results_df

# Dictionary to store models and results for each industry
industry_models = {}
industry_results = {}

# 1. Train and evaluate models for each industry and store results
for industry, data in industry_splits.items():
    print(f"\nTraining model for industry: {industry}")
    
    # Ensure only numeric columns are used
    X_train = data['X_train'].copy().select_dtypes(include=[float, int])
    y_train = data['y_train']
    X_test = data['X_test'].copy().select_dtypes(include=[float, int])
    y_test = data['y_test']
    
    # Train and evaluate model for this industry
    model, results_df = train_and_evaluate(X_train, y_train, X_test, y_test, industry)
    industry_models[industry] = model
    industry_results[industry] = results_df

# 2. Train and evaluate the combined model
print("\nTraining combined model:")

# Ensure only numeric columns are in combined training and testing sets
X_train_combined = X_train_combined.select_dtypes(include=[float, int])
X_test_combined = X_test_combined.select_dtypes(include=[float, int])

combined_model, combined_results_df = train_and_evaluate(X_train_combined, y_train_combined, X_test_combined, y_test_combined, "Combined")

# Store the combined model and results separately for easy reference
industry_models["Combined"] = combined_model
industry_results["Combined"] = combined_results_df

# Display results for each industry
for industry, results_df in industry_results.items():
    print(f"\nPredictions and Actuals for {industry}:\n", results_df)
