# Importing libraries

In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from IPython.display import display
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Loading dataframe

In [105]:
# Read the merged source table from disk into the working DataFrame
DATA_PATH = "data/merged_table.csv"
df = pd.read_csv(DATA_PATH)

# Selecting time range (exclude the test set data)

In [106]:
# Use Year_Quarter as the index so rows can be selected by period label.
# The guard keeps this cell re-runnable: once the column has become the index,
# calling set_index('Year_Quarter') a second time would raise a KeyError.
if df.index.name != 'Year_Quarter':
    df = df.set_index('Year_Quarter')

# Keep every row whose label lies between '2008' and '2022' (label comparison).
# NOTE(review): assumes the labels are strings that start with the year
# (e.g. '2021Q4') - confirm. Under that assumption '2022Q1' sorts after '2022',
# so every 2022 quarter is excluded and the window is effectively 2008-2021.
in_window = (df.index >= '2008') & (df.index <= '2022')
df_filtered = df[in_window]

# General view of data & missing data
There is no missing data, the selected time frame has been applied successfully, and nothing unusual stands out.

In [None]:
# Structural overview: column dtypes and non-null counts
df_filtered.info()

# Null statistics, both derived from a single boolean mask
null_mask = df_filtered.isnull()
missing_counts = null_mask.sum()
missing_percentage = null_mask.mean().mul(100).sort_values(ascending=False)

# Report only the columns that actually contain missing values
print("\nColumns with missing values and their percentages:")
print(missing_percentage.loc[missing_percentage.gt(0)])

# Descriptive statistics for every column, numeric or not
print("\nStatistical summary of the dataset:")
display(df_filtered.describe(include='all'))

# Preview the first rows and then the last rows of the filtered data
previews = (
    ("\nFirst few rows of the dataset:", df_filtered.head()),
    ("\nLast few rows of the dataset:", df_filtered.tail()),
)
for heading, preview in previews:
    print(heading)
    display(preview)

# Dimensions of the filtered data
n_rows, n_cols = df_filtered.shape
print("\nShape of the dataset:")
print(f"Rows: {n_rows}, Columns: {n_cols}")


# Visualization
- Sick leave data shows a clear seasonal pattern
- Temperature data reflects a similar seasonal pattern
- Sick leave also seems to have a slowly upward trend
- The numbers of natural persons and legal entities (rechtspersonen) show a similar pattern
- Many features show a completely different pattern than sick leave

In [None]:
batch_size = 5
numerical_columns = df_filtered.select_dtypes(include='number').columns

# Start offsets of each batch of numeric columns (a range can be re-iterated)
batch_starts = range(0, len(numerical_columns), batch_size)

# One set of figures per industry, each figure showing one batch of columns
for industry, group in df_filtered.groupby('BedrijfstakkenBranchesSBI2008'):
    print(f"Processing industry: {industry}")

    for batch_number, start in enumerate(batch_starts, start=1):
        batch_columns = numerical_columns[start:start + batch_size]
        plot_title = f'{industry} - Time Series Trends for Batch {batch_number}'

        # One subplot per column in the batch, for this industry only
        group[batch_columns].plot(subplots=True, figsize=(12, 8), title=plot_title)
        plt.tight_layout()
        plt.show()

## Correlation

In [None]:
# Target variable and the column that identifies the industry of each row
target_column = '80072ned_Ziekteverzuimpercentage_1'
industry_column = 'BedrijfstakkenBranchesSBI2008'

# Minimum absolute correlation with the target for a column to be selected
threshold = 0.7

# Selected column names per industry
selected_columns_by_industry = {}

# Selected column names collected over all industries (de-duplicated below)
all_selected_columns = []

# Industries present in the filtered data
industries = df_filtered[industry_column].unique()

# For each industry, keep the numeric columns that correlate strongly with the target
for industry in industries:
    # Rows belonging to the current industry
    industry_data = df_filtered[df_filtered[industry_column] == industry]

    # Correlations are only defined for numeric columns
    numeric_data = industry_data.select_dtypes(include='number')
    correlation_matrix = numeric_data.corr()

    # Absolute correlation of every numeric column with the target, strongest first.
    # The target correlates 1.0 with itself, so it passes the threshold whenever
    # its correlation is defined.
    correlation_with_target = correlation_matrix[target_column].abs().sort_values(ascending=False)

    # NaN correlations (e.g. constant columns) compare False and are dropped here
    filtered_columns = correlation_with_target[correlation_with_target > threshold]
    print(f"\nColumns with correlation > {threshold} for {industry}:\n")
    print(filtered_columns)

    industry_selection = filtered_columns.index.tolist()
    selected_columns_by_industry[industry] = industry_selection
    all_selected_columns.extend(industry_selection)

# Remove duplicates while keeping first-seen order. A set would reorder the names
# differently on every kernel start (string hashing is randomised per process),
# which makes the column order of downstream frames and tables irreproducible.
all_selected_columns = list(dict.fromkeys(all_selected_columns))

# Display the combined list of selected columns
print("\nAll selected columns across industries:")
print(all_selected_columns)


In [None]:
# The industry identifier must travel with the selected feature columns
industry_column = 'BedrijfstakkenBranchesSBI2008'
if all_selected_columns.count(industry_column) == 0:
    all_selected_columns += [industry_column]

# Restrict the filtered data to the selected columns (industry column included)
selected_df = df_filtered[all_selected_columns]

# Report the size of the selection, then preview its first rows
print(f"New DataFrame shape: {selected_df.shape}")
selected_df.head()

In [None]:
import scipy.cluster.hierarchy as sch

# Pairwise correlations between the numeric columns of the selection
correlation_matrix = selected_df.select_dtypes(include='number').corr()

# Ward clustering over the rows of the correlation matrix; the leaf order
# is then applied to both axes so similar features end up next to each other
linkage = sch.linkage(correlation_matrix, method='ward')
dendrogram_order = sch.leaves_list(linkage)
ordered_corr_matrix = correlation_matrix.iloc[dendrogram_order].iloc[:, dendrogram_order]

# Rendering options for the heatmap, collected in one place
heatmap_options = dict(
    annot=True,                # write the correlation value in each cell
    cmap='coolwarm',           # diverging colour scheme
    fmt='.2f',                 # two decimals per annotation
    vmin=-1,                   # lower end of the colour scale
    vmax=1,                    # upper end of the colour scale
    square=True,               # square cells
    cbar_kws={"shrink": .8},   # slightly smaller colour bar
    annot_kws={"size": 8},     # annotation font size
)

# Compact figure; seaborn draws into the current axes
plt.figure(figsize=(8, 6))
sns.heatmap(ordered_corr_matrix, **heatmap_options)

# Title and tick label styling
plt.title('Clustered Feature Correlation Heatmap', fontsize=14)
plt.xticks(fontsize=10, rotation=45)
plt.yticks(fontsize=10)
plt.show()


# VIF

In [None]:
# Function to calculate Variance Inflation Factor (VIF)
def calculate_vif(dataframe, add_intercept=True):
    """
    Calculate Variance Inflation Factor (VIF) for each feature in the dataframe.

    Each VIF comes from regressing one column on all the other columns of the
    design matrix. statsmodels does not add an intercept to that regression,
    so without a constant column the values are "uncentred" VIFs, which are
    usually heavily inflated. By default a constant column is therefore added
    to the design matrix; it is used in the regressions but not reported.

    Parameters:
        dataframe (DataFrame): DataFrame containing numerical features.
        add_intercept (bool): Prepend a constant column before computing the
            VIFs. Pass False if the dataframe already contains a constant
            column or if the uncentred values are explicitly wanted.

    Returns:
        DataFrame: DataFrame with features and their corresponding VIF values,
        one row per column of ``dataframe`` in the original column order.
    """
    feature_names = list(dataframe.columns)

    if add_intercept:
        # insert() broadcasts the scalar without index alignment, so it is
        # safe for frames whose index contains duplicate labels
        design = dataframe.copy()
        design.insert(0, "const", 1.0, allow_duplicates=True)
        first_feature = 1  # position 0 holds the constant
    else:
        design = dataframe
        first_feature = 0

    exog = design.values
    vif_values = [
        variance_inflation_factor(exog, position)
        for position in range(first_feature, exog.shape[1])
    ]
    return pd.DataFrame({"Feature": feature_names, "VIF": vif_values})

# Largest VIF a feature may have to be kept. The filter, the comments and the
# printed message all use this one value so they cannot drift apart.
vif_threshold = 100

# Numeric columns of the selected features.
# NOTE(review): this frame still contains the target column whenever the
# correlation step selected it, so the target takes part in the VIF
# regressions - confirm that this is intended.
numeric_columns = selected_df.select_dtypes(include='number')

# Calculate VIF for numeric columns
vif_df = calculate_vif(numeric_columns)

# Exclude rows with NaN or infinite VIF values
infinite_values = [float('inf'), float('-inf')]
vif_df = vif_df[vif_df['VIF'].notna() & ~vif_df['VIF'].isin(infinite_values)]

# Keep the features whose VIF does not exceed the threshold
low_vif_df = vif_df[vif_df['VIF'] <= vif_threshold]

print(f"Features with VIF <= {vif_threshold}:")
print(low_vif_df)

# Restrict the filtered data to the retained features
selected_df_no_multicollinearity = df_filtered[low_vif_df['Feature']]

# Display updated DataFrame shape
print(f"Updated DataFrame shape after VIF filtering: {selected_df_no_multicollinearity.shape}")

# Time series analysis

In [None]:
# Column identifying the industry each row belongs to
industry_column = 'BedrijfstakkenBranchesSBI2008'

# Every column whose name contains the sick-leave percentage identifier
sick_leave_columns = [
    name
    for name in df_filtered.columns
    if '80072ned_Ziekteverzuimpercentage_1' in name
]

# Industry label first, followed by the matching sick-leave columns
df_timeseries = df_filtered[[industry_column, *sick_leave_columns]]

# Distinct industries present in the time-series frame
industries = df_timeseries[industry_column].unique()

df_timeseries.head()

## ACF
This is an Autocorrelation Function (ACF) plot. Here’s how to interpret it:

Initial Observation:

The first bar (lag 0) is always 1, as it represents the autocorrelation of the series with itself.
Subsequent bars represent the correlation of the series with lagged versions of itself.
High Correlation at Specific Lags:

You see significant spikes at regular intervals (e.g., lags 1, 5, 9, 13, 17). This suggests a seasonal pattern in the data with a periodicity of approximately 4 lags.
Gradual Decline:

The values decline gradually (but not strictly exponentially). This could indicate a trend or non-stationarity in the data. The presence of a trend suggests the need for differencing to achieve stationarity before modeling.
Confidence Intervals (Shaded Region):

The blue shaded region represents the 95% confidence interval. Any spikes outside this region indicate statistically significant autocorrelations.
Spikes at lags like 1, 5, and so on are significant, reinforcing the idea of seasonality.
Key Takeaways for This Plot:
Seasonality:

Regular spikes suggest that the data exhibits seasonality. If this is quarterly data, the periodicity may align with yearly cycles (e.g., sales, weather, etc.).
Non-Stationarity:

The slow decay in autocorrelation suggests the presence of a trend, implying non-stationarity. You may need to apply differencing (e.g., first difference) to remove the trend and stabilize the mean.
Next Steps:

Check the Partial Autocorrelation Function (PACF) to determine the order of the AR component.
Consider applying seasonal differencing (e.g., SARIMA) to account for the observed periodicity.

## PACF
This is a Partial Autocorrelation Function (PACF) plot. Here’s how to interpret it and how it relates to the ACF plot discussed above:

Key Observations from the PACF Plot:
Lag 1 Spike:

The first significant spike is at lag 1, indicating a strong relationship between the current value and the value at lag 1.
This suggests that an AR(1) (AutoRegressive model of order 1) component may be useful in modeling.
Seasonal Pattern:

There are other significant spikes at lags such as 5, 9, and possibly 13, which align with the seasonal pattern observed in the ACF.
This reinforces the idea of seasonality with periodicity around 4 lags.
Decay:

Unlike the ACF, the PACF doesn't show gradual decay; instead, it drops to near zero after lag 1 (for non-seasonal components). This indicates that AR terms (rather than MA terms) are likely more relevant for the non-seasonal part of the model.
Confidence Intervals (Shaded Region):

Spikes within the blue region are not statistically significant. The significant spikes (outside this region) should guide the choice of AR terms and seasonal components.

## Next step
Based on this PACF plot, consider fitting an ARIMA model, starting with AR(1) for the non-seasonal part.
Use seasonal decomposition or additional diagnostics to confirm seasonal effects before adding seasonal components to the model.

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Target variable, industry identifier and the largest lag to display
target_column = '80072ned_Ziekteverzuimpercentage_1'
industry_column = 'BedrijfstakkenBranchesSBI2008'
max_lags = 20

# Target values of all industries together. Kept under its original name in
# case other cells reference `y`; the plots below do not use it pooled.
y = df_filtered[target_column]

# Autocorrelation is only meaningful within one time series. The filtered data
# holds one series per industry, so each industry is plotted separately;
# otherwise the lags would run across industry boundaries.
for industry, series in y.groupby(df_filtered[industry_column]):
    # NOTE(review): assumes sorting the index labels gives chronological
    # order (labels starting with the year) - confirm.
    series = series.sort_index().dropna()

    # PACF can only be estimated for lags below half the sample size
    lags = min(max_lags, len(series) // 2 - 1)
    if lags < 1:
        print(f"Skipping {industry}: only {len(series)} observations")
        continue

    # Pass the axes explicitly: without `ax`, plot_acf/plot_pacf open a figure
    # of their own and a preceding plt.figure(...) is left behind empty.
    fig, ax = plt.subplots(figsize=(10, 6))
    plot_acf(series, lags=lags, ax=ax, title=f'Autocorrelation Function (ACF) - {industry}')
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 6))
    plot_pacf(series, lags=lags, ax=ax, title=f'Partial Autocorrelation Function (PACF) - {industry}')
    plt.show()
