In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Build one global ordinal encoding for 'Year_Quarter' so that every row of df
# is encoded with the same label -> integer table.
# NOTE(review): `df` is read here, but in this notebook it is first assigned in
# a later cell -- confirm the intended cell order for Restart & Run All.

# Step 1: sorted unique labels. Missing labels are left out so that sorting
# never has to compare NaN with a string.
unique_year_quarters = sorted(df['Year_Quarter'].dropna().unique())

# Step 2: forward and reverse lookup tables (label <-> position in the sorted list)
year_quarter_to_ordinal = {val: idx for idx, val in enumerate(unique_year_quarters)}
ordinal_to_year_quarter = dict(enumerate(unique_year_quarters))

# Step 3: encode the column; rows whose label is missing stay NaN
df['Year_Quarter_Encoded'] = df['Year_Quarter'].map(year_quarter_to_ordinal)

# Step 4: verify the encoding -- every non-missing label must have received a code
assert (df['Year_Quarter_Encoded'].notna() == df['Year_Quarter'].notna()).all()


In [None]:
# df = pd.read_json('data/df_final.json')
# Read the four CSV inputs in one pass. The first table is bound to `df`, the
# remaining ones keep their "lead_<k>" suffix in the variable name.
df, df_lead_2, df_lead_3, df_lead_4 = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/80072ned_Ziekteverzuimpercentage_1_lead_1.csv',
        'data/80072ned_Ziekteverzuimpercentage_1_lead_2.csv',
        'data/80072ned_Ziekteverzuimpercentage_1_lead_3.csv',
        'data/80072ned_Ziekteverzuimpercentage_1_lead_4.csv',
    )
]

In [None]:
# Keep only complete rows: a row is removed as soon as any of its columns is missing
df = df.dropna(axis='index', how='any')

In [None]:
# Hold-out split on the 'Year_Quarter' label: rows that compare below '2022-Q1'
# form the training set, rows at or above it form the test set.
quarter_label = df['Year_Quarter']
train_df = df.loc[quarter_label.lt('2022-Q1')]
test_df = df.loc[quarter_label.ge('2022-Q1')]

In [None]:
# Preview the first rows of the test split
test_df.head()

In [None]:
# Training target is the lead_1 column; the features are every remaining column
# except the 'Year_Quarter' label.
lead_1_target = '80072ned_Ziekteverzuimpercentage_1_lead_1'
y_train = train_df[lead_1_target]
X_train = train_df.drop(columns=[lead_1_target, 'Year_Quarter'])

In [None]:
# Fit a ridge regression on the training split.
# `Ridge` comes from the import cell at the top of the notebook, so this cell
# also runs on a fresh kernel (Restart & Run All) instead of depending on an
# import that only appears further down.
model = Ridge()  # default hyper-parameters; another regressor could be swapped in here

# The fitted estimator is the cell's last expression and is therefore displayed
model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the test split
lead_1_target = '80072ned_Ziekteverzuimpercentage_1_lead_1'
y_test = test_df[lead_1_target]
X_test = test_df.drop(columns=[lead_1_target, 'Year_Quarter'])

y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# One row per test observation: its quarter label, the true value and the prediction
comparison_df = test_df[['Year_Quarter']].assign(Actual=y_test.values, Predicted=y_pred)

print("Actual vs Predicted:")
print(comparison_df)

# Also keep the table on disk for review (row index not written)
comparison_df.to_csv('actual_vs_predicted.csv', index=False)


In [None]:
# Switch the working frame to the lead_2 table, keeping only rows without any missing value
df = df_lead_2.dropna(axis='index', how='any')

In [None]:
# Same hold-out rule as before: labels below '2022-Q1' train, labels at or above it test
quarter_label = df['Year_Quarter']
train_df = df.loc[quarter_label.lt('2022-Q1')]
test_df = df.loc[quarter_label.ge('2022-Q1')]

In [None]:
# Inspect the last rows of the training split (the rows closest to the split boundary
# if the frame is ordered by quarter -- NOTE(review): ordering not verified here)
train_df.tail()

In [None]:
# Inspect the last rows of the test split
test_df.tail()

In [None]:
# Training target is the lead_2 column; the features are every remaining column
# except the 'Year_Quarter' label.
lead_2_target = '80072ned_Ziekteverzuimpercentage_1_lead_2'
y_train = train_df[lead_2_target]
X_train = train_df.drop(columns=[lead_2_target, 'Year_Quarter'])

In [None]:
# A fresh ridge regression (default settings) replaces the previous `model`
model = Ridge()

# Fit on the current training split; the estimator is shown as the cell output
model.fit(X=X_train, y=y_train)

In [None]:
# Score the lead_2 model on the test split
lead_2_target = '80072ned_Ziekteverzuimpercentage_1_lead_2'
X_test = test_df.drop(columns=[lead_2_target, 'Year_Quarter'])
y_test = test_df[lead_2_target]

# Make predictions on the test set
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# One row per test observation: its quarter label, the true value and the prediction
comparison_df = test_df[['Year_Quarter']].copy()
comparison_df['Actual'] = y_test.values
comparison_df['Predicted'] = y_pred

print("Actual vs Predicted:")
print(comparison_df)

# Save under a lead-specific file name. The lead_1 evaluation writes
# 'actual_vs_predicted.csv'; reusing that name here would silently overwrite
# the lead_1 results.
comparison_df.to_csv('actual_vs_predicted_lead_2.csv', index=False)


## Verder gaan

In [None]:
# Restrict the data to one industry and split it on the 'Year_Quarter' label
industry = 'C Industrie'
industry_df = df.loc[df['BedrijfstakkenBranchesSBI2008'] == industry]

industry_quarter = industry_df['Year_Quarter']
train_df = industry_df.loc[industry_quarter.lt('2022-Q1')]
test_df = industry_df.loc[industry_quarter.ge('2022-Q1')]

# The target column and the two label columns are not used as features
non_feature_cols = ['80072ned_Ziekteverzuimpercentage_1', 'Year_Quarter', 'BedrijfstakkenBranchesSBI2008']
y_train = train_df['80072ned_Ziekteverzuimpercentage_1']
X_train = train_df.drop(columns=non_feature_cols)

# Feature values of the LAST TRAINING row (a Series, not test data); kept as the
# starting point for the recursive predictions made further down.
X_test_initial = train_df.drop(columns=non_feature_cols).iloc[-1]


In [None]:
import pandas as pd

# Print every entry of X_test_initial without truncation.
# The display limits are lifted only inside this `with` block; changing them
# globally with pd.set_option would make every later cell dump complete
# DataFrames instead of pandas' normal truncated preview.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(X_test_initial)


In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with default settings for the single-industry data
model = Ridge()

# Fit on the industry's training split; the estimator is shown as the cell output
model.fit(X=X_train, y=y_train)


In [None]:
# Add the target's previous 1-4 values as lag features.
# The shift is done per industry: a plain shift over the whole frame would give
# the first rows of one industry the last values of the industry stored above it.
# NOTE(review): assumes the rows of each industry are already in chronological
# order with one row per quarter -- confirm against the source data.
target_col = '80072ned_Ziekteverzuimpercentage_1'
target_by_industry = df.groupby('BedrijfstakkenBranchesSBI2008')[target_col]
df = df.assign(**{
    f'{target_col}_lag_{lag}': target_by_industry.shift(lag)
    for lag in [1, 2, 3, 4]  # lags for the last 4 quarters
})

# Drop rows with any missing value; this includes the rows that have no full
# lag history yet.
df = df.dropna().reset_index(drop=True)


In [None]:
# Rebuild the selected industry's training data from the lag-augmented `df`.
# The train_df created earlier was sliced before the lag columns were added, so
# reusing it would leave the lag features out of X_train -- and out of the model.
target_col = '80072ned_Ziekteverzuimpercentage_1'
non_feature_cols = [target_col, 'Year_Quarter', 'BedrijfstakkenBranchesSBI2008']

industry_df = df[df['BedrijfstakkenBranchesSBI2008'] == industry]
train_df = industry_df[industry_df['Year_Quarter'] < '2022-Q1']

# Define features and target for training
X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df[target_col]

# Refit so that the model's inputs match the columns of the rebuilt X_train
model = Ridge()
model.fit(X_train, y_train)


In [None]:
# Starting point for the recursive forecast: take the last training row (a
# Series) and turn it back into a one-row DataFrame with X_train's columns.
last_train_row = X_train.iloc[-1].copy()
X_test_initial = pd.DataFrame([last_train_row], columns=X_train.columns)


In [None]:
import numpy as np
import pandas as pd

# Recursive multi-step forecast: every prediction is fed back in as the newest
# lag feature for the next step. All non-lag features keep their seed values.
future_periods = 4  # number of consecutive future quarters to predict
target_col = '80072ned_Ziekteverzuimpercentage_1'
lag_cols = [f'{target_col}_lag_{lag}' for lag in (1, 2, 3, 4)]


def _advance_lags(row_frame, newest_value):
    """Move lag_3 -> lag_4, lag_2 -> lag_3, lag_1 -> lag_2, then store newest_value as lag_1 (in place)."""
    for lag in range(4, 1, -1):  # oldest first, so no value is overwritten before it is copied
        row_frame[lag_cols[lag - 1]] = row_frame[lag_cols[lag - 2]]
    row_frame[lag_cols[0]] = newest_value


# X_test_initial may be a Series or already a one-row DataFrame. Wrapping a
# DataFrame in a list once more makes the DataFrame constructor fail
# ("Must pass 2-d input"), so only a Series is converted.
if isinstance(X_test_initial, pd.DataFrame):
    X_current = X_test_initial.copy()
else:
    X_current = pd.DataFrame([X_test_initial])
X_current = X_current[list(X_train.columns)]  # use X_train's column order

missing_lags = [col for col in lag_cols if col not in X_current.columns]
if missing_lags:
    raise KeyError(f"X_train has no lag features {missing_lags}; rebuild it from the lag-augmented df")

# The seed row describes the last training quarter, whose actual value is known.
# Advance the lags once with that value so that the first prediction is for the
# quarter AFTER the training period instead of for the seed quarter itself.
# NOTE(review): assumes X_test_initial is the last row of X_train and that
# y_train is row-aligned with X_train -- confirm.
_advance_lags(X_current, y_train.iloc[-1])

# One prediction per future quarter, in chronological order
predictions = []
for _ in range(future_periods):
    y_pred = model.predict(X_current)[0]
    predictions.append(y_pred)
    _advance_lags(X_current, y_pred)  # the prediction becomes lag 1 of the next step

# Display predictions for each quarter in 2022
print("Predicted sick leave percentages for 'C Industrie' in 2022:", predictions)


## Samenvoegen modellen

In [None]:
# Train/test split per industry, collected in a dict keyed by industry name
industries = df['BedrijfstakkenBranchesSBI2008'].unique()
industry_splits = {}

for industry in industries:
    industry_df = df.loc[df['BedrijfstakkenBranchesSBI2008'] == industry]

    # Labels below '2022-Q1' train the model; labels at or above it are held out
    quarter_label = industry_df['Year_Quarter']
    train_df = industry_df.loc[quarter_label.lt('2022-Q1')]
    test_df = industry_df.loc[quarter_label.ge('2022-Q1')]

    # Only the target column is removed here; every other column stays in X
    y_train = train_df['80072ned_Ziekteverzuimpercentage_1']
    X_train = train_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1'])
    y_test = test_df['80072ned_Ziekteverzuimpercentage_1']
    X_test = test_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1'])

    industry_splits[industry] = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
    }

    print(f"Data for {industry}:")
    print("  Training data:", X_train.shape, y_train.shape)
    print("  Testing data:", X_test.shape, y_test.shape)

# Combined data set: average every float/int column over all rows that share a
# 'Year_Quarter' label, giving one row per label.
df_numeric = df.select_dtypes(include=[float, int])
df_grouped = df_numeric.groupby(df['Year_Quarter']).mean().reset_index()

# Same split rule on the averaged frame
grouped_label = df_grouped['Year_Quarter']
train_df = df_grouped.loc[grouped_label.lt('2022-Q1')]
test_df = df_grouped.loc[grouped_label.ge('2022-Q1')]

y_train_combined = train_df['80072ned_Ziekteverzuimpercentage_1']
X_train_combined = train_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1'])
y_test_combined = test_df['80072ned_Ziekteverzuimpercentage_1']
X_test_combined = test_df.drop(columns=['80072ned_Ziekteverzuimpercentage_1'])

print("\nCombined data (after grouping by Year_Quarter):")
print("  Training data:", X_train_combined.shape, y_train_combined.shape)
print("  Testing data:", X_test_combined.shape, y_test_combined.shape)

In [None]:
# Display the stored training features for 'C Industrie'
industry_splits['C Industrie']['X_train']

In [None]:
# Display the stored training target for 'C Industrie'
industry_splits['C Industrie']['y_train']

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit a linear regression, report its test metrics and return predictions next to the actual values
def train_and_evaluate(X_train, y_train, X_test, y_test, industry_name):
    """Fit a LinearRegression, print its test-set metrics and return the results.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``fit``.
    X_test, y_test : held-out features and target used for the metrics.
    industry_name : label used in the printed report header.

    Returns
    -------
    (model, results_df) : the fitted estimator and a DataFrame with the columns
        'Actual' (``y_test`` as given) and 'Predicted' (the model output).
    """
    regressor = LinearRegression().fit(X_train, y_train)
    predicted = regressor.predict(X_test)

    squared_error = mean_squared_error(y_test, predicted)
    report = [
        ("Mean Absolute Error (MAE)", mean_absolute_error(y_test, predicted)),
        ("Mean Squared Error (MSE)", squared_error),
        ("Root Mean Squared Error (RMSE)", squared_error ** 0.5),
        ("R-squared (R2)", r2_score(y_test, predicted)),
    ]

    print(f"\n{industry_name} Model Evaluation:")
    for label, value in report:
        print(f"  {label}: {value}")

    # Side-by-side table of true and predicted values
    comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predicted})
    return regressor, comparison

# Fitted estimators and prediction tables, keyed by industry name (plus "Combined")
industry_models, industry_results = {}, {}


def _numeric_only(frame):
    """Return a copy of `frame` restricted to its float/int columns."""
    return frame.copy().select_dtypes(include=[float, int])


# 1. One model per industry
for industry, data in industry_splits.items():
    print(f"\nTraining model for industry: {industry}")

    # Non-numeric columns left in the stored splits are filtered out here
    X_train, X_test = _numeric_only(data['X_train']), _numeric_only(data['X_test'])
    y_train, y_test = data['y_train'], data['y_test']

    model, results_df = train_and_evaluate(X_train, y_train, X_test, y_test, industry)
    industry_models[industry], industry_results[industry] = model, results_df

# 2. One model on the combined (per-quarter averaged) data
print("\nTraining combined model:")

X_train_combined = X_train_combined.select_dtypes(include=[float, int])
X_test_combined = X_test_combined.select_dtypes(include=[float, int])

combined_model, combined_results_df = train_and_evaluate(
    X_train_combined, y_train_combined, X_test_combined, y_test_combined, "Combined"
)

# The combined model is stored next to the industry models under its own key
industry_models["Combined"] = combined_model
industry_results["Combined"] = combined_results_df

# Print every stored prediction table
for industry, results_df in industry_results.items():
    print(f"\nPredictions and Actuals for {industry}:\n", results_df)
