In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the merged industry table and the KNMI table from the local data folder
merged_tables_path = 'data/merged_tables.csv'
knmi_path = 'data/knmi_data.csv'

df = pd.read_csv(merged_tables_path)
df_knmi = pd.read_csv(knmi_path)

In [None]:
# Report (rows, columns) of the merged table, then preview its first rows
df_shape = df.shape
print(df_shape)
df.head()

In [None]:
# Report (rows, columns) of the KNMI table, then preview its first rows
knmi_shape = df_knmi.shape
print(knmi_shape)
df_knmi.head()

In [None]:
# Build the 'YYYY-QX' label from the integer year and quarter parts
year_part = df['Year'].astype(int).astype(str)
quarter_part = df['Quarter'].astype(int).astype(str)

# Put the combined label in front of all other columns
df.insert(0, 'Year_Quarter', year_part + '-Q' + quarter_part)

# 'Year' and 'Quarter' are now redundant; drop them and order the rows by
# industry first, then chronologically ('YYYY-QX' strings sort in time order)
df = (
    df.drop(columns=['Year', 'Quarter'])
    .sort_values(by=['BedrijfstakkenBranchesSBI2008', 'Year_Quarter'])
    .reset_index(drop=True)
)

# Show the resulting shape and the first rows
print(df.shape)
df.head()

In [None]:
# Add the sickness-absence percentage of the previous 1, 2, 3 and 4 rows of the
# same industry as new '<target>_lag_<n>' columns (rows are one quarter apart
# within an industry because df was sorted by industry and Year_Quarter)
target_col = '80072ned_Ziekteverzuimpercentage_1'
target_by_industry = df.groupby('BedrijfstakkenBranchesSBI2008')[target_col]

for n_quarters in range(1, 5):
    df[f'{target_col}_lag_{n_quarters}'] = target_by_industry.shift(n_quarters)

# Show the resulting shape and the first rows to verify the new columns
print(df.shape)
df.head()

In [None]:
# Attach the KNMI columns to every industry row of the matching quarter.
#
# The KNMI table is joined on 'Year_Quarter' only, so each industry row of a
# quarter receives that quarter's KNMI values. This gives the same result as
# first expanding df_knmi across all industries and then joining on
# (Year_Quarter, industry), without building the industries x quarters
# intermediate table.
#
# how='left' keeps every row of df (in its current order); quarters missing
# from df_knmi get NaN in the KNMI columns.
# NOTE(review): if df_knmi ever holds more than one row for the same
# Year_Quarter, the join duplicates the matching df rows - verify uniqueness.
df = df.merge(df_knmi, on='Year_Quarter', how='left')

# Display the result
print(df.shape)
df.head()

In [None]:
# Define a function to label COVID-19 years
def label_covid_period(row):
    """Return 1 if the row's 'Year_Quarter' starts with 2020, 2021 or 2022, else 0.

    `row` only needs to support `row['Year_Quarter']` and yield a string.
    """
    covid_year_prefixes = ('2020', '2021', '2022')
    is_covid_quarter = row['Year_Quarter'].startswith(covid_year_prefixes)
    return int(is_covid_quarter)

# Evaluate the COVID label row by row and store it as the covid_19 column
covid_flags = df.apply(label_covid_period, axis=1)
df['covid_19'] = covid_flags

# Print the first rows of the two relevant columns to confirm the result
covid_preview = df[['Year_Quarter', 'covid_19']].head()
print(covid_preview)

In [None]:
# Preview the last rows of df (tail() shows five rows by default)
df.tail()

In [None]:
# Summarise every column with its dtype and non-null count.
# verbose=True forces the full column list (pandas otherwise abbreviates it);
# show_counts=True forces the non-null counts to be printed.
df.info(verbose=True, show_counts=True)

## Correlation

In [None]:
# Correlate every numeric column with the sickness-absence target over the
# whole table and keep the ones with |r| > 0.5
target_col = '80072ned_Ziekteverzuimpercentage_1'

# Restrict to float/int columns so corr() only sees numeric data
df_numeric = df.select_dtypes(include=['float', 'int'])

if target_col not in df_numeric.columns:
    # Nothing to correlate against
    print("The target column '80072ned_Ziekteverzuimpercentage_1' is not in the numeric columns.")
else:
    # Column of the correlation matrix that belongs to the target
    df_num_corr = df_numeric.corr()[target_col]

    # Strong correlations only, strongest positive first
    is_strong = df_num_corr.abs() > 0.5
    golden_features_list = df_num_corr[is_strong].sort_values(ascending=False)

    print("There are {} strongly correlated values with ziekteverzuim:\n{}".format(len(golden_features_list), golden_features_list))


In [None]:
# Strong correlations with the sickness-absence target, computed separately
# for each industry. Result: {industry: Series of correlations with |r| > 0.5}
industry_correlations = {}

# Loop-local names are used on purpose: reusing `df_num_corr` /
# `golden_features_list` here would overwrite the whole-table results of the
# same name, which later cells read.
for industry, group_df in df.groupby('BedrijfstakkenBranchesSBI2008'):
    # Select only numeric columns to avoid non-numeric data in correlation calculations
    group_df_numeric = group_df.select_dtypes(include=[float, int])

    # Correlation of every numeric column with the target within this industry
    industry_corr = group_df_numeric.corr()['80072ned_Ziekteverzuimpercentage_1']

    # Keep strongly correlated features (absolute correlation > 0.5), strongest positive first
    industry_golden_features = industry_corr[abs(industry_corr) > 0.5].sort_values(ascending=False)

    # Store and report the result only if there are strongly correlated features
    if not industry_golden_features.empty:
        industry_correlations[industry] = industry_golden_features
        print(f"There are {len(industry_golden_features)} strongly correlated values with ziekteverzuim for industry '{industry}':\n{industry_golden_features}\n")


In [None]:
# Define the columns for which we want lagged correlations
columns_to_lag = [
    'airpressure', 'maximum_temperatures', 'mean_temperatures',
    'minimum_temperatures', 'precipitation', 'covid_19'
]

# Create 1- and 2-row lags for each column in `columns_to_lag`.
# The shift is done per industry: df holds one block of rows per industry, so
# an ungrouped shift would copy the last quarters of one industry into the
# first quarters of the next. Grouping leaves those first rows as NaN instead,
# matching how the target's own lag columns are built.
for col in columns_to_lag:
    for n_quarters in (1, 2):
        df[f'{col}_lag_{n_quarters}'] = (
            df.groupby('BedrijfstakkenBranchesSBI2008')[col].shift(n_quarters)
        )

# Recompute numeric-only DataFrame to include new lagged columns
df_numeric = df.select_dtypes(include=[float, int])

# Correlation of the target with each base column and its two lags
correlation_columns = columns_to_lag + [f"{col}_lag_1" for col in columns_to_lag] + [f"{col}_lag_2" for col in columns_to_lag]
correlations = df_numeric.corr()['80072ned_Ziekteverzuimpercentage_1'][correlation_columns]

print("Correlations with 80072ned_Ziekteverzuimpercentage_1:")
print(correlations)


In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np

# Adjust pandas display options to show up to 200 rows
pd.set_option('display.max_rows', 200)


def calculate_vif(dataframe):
    """Return a DataFrame with columns 'Feature' and 'VIF', one row per column of `dataframe`.

    Each VIF is computed by `variance_inflation_factor` on `dataframe.values`,
    i.e. on the matrix exactly as passed in.
    NOTE(review): no intercept column is added before the call, and the caller
    passes every numeric column (including the target) - confirm this is the
    intended design matrix.
    """
    vif_data = pd.DataFrame()
    vif_data['Feature'] = dataframe.columns
    vif_data['VIF'] = [
        variance_inflation_factor(dataframe.values, i)
        for i in range(dataframe.shape[1])
    ]
    return vif_data


# Select only numeric columns (float and int types) from the DataFrame
df_numeric = df.select_dtypes(include=['float', 'int'])

# Treat infinite values as missing, then keep only complete rows
df_numeric = df_numeric.replace([np.inf, -np.inf], np.nan).dropna()

# The emptiness checks run AFTER cleaning: dropping incomplete rows can leave
# nothing behind even when the raw numeric frame had data.
if df_numeric.shape[1] == 0:
    print("No numeric columns available for VIF calculation.")
elif df_numeric.empty:
    print("No complete rows left after dropping NaN/inf values; VIF cannot be calculated.")
else:
    # Calculate VIF for the cleaned numeric DataFrame
    vif_results = calculate_vif(df_numeric)

    # Sort the results by VIF values in ascending order
    vif_results = vif_results.sort_values(by='VIF', ascending=True)

    # Print the results
    print("Variance Inflation Factor (VIF) for numeric columns, sorted by VIF:\n")
    print(vif_results)


In [None]:
# Column names of the strongly correlated features (index of the correlation Series).
# NOTE(review): this reads whatever was most recently assigned to
# `golden_features_list`; confirm it is the whole-table correlation result.
column_names = golden_features_list.index.tolist()

# Keep features whose VIF is at most the threshold, plus every lag feature
# regardless of its VIF
vif_threshold = 50
vif_filtered = vif_results[(vif_results['VIF'] <= vif_threshold) | (vif_results['Feature'].str.contains('lag'))]

# Get the column names that meet the VIF and lag condition
vif_filtered_columns = vif_filtered['Feature'].tolist()

# Intersect the correlation-filtered and VIF-filtered columns.
# The order of `column_names` is kept so the selected columns come out in the
# same order on every run (a plain set intersection has no stable order).
vif_allowed = set(vif_filtered_columns)
selected_columns = [name for name in column_names if name in vif_allowed]

# Add the identifier columns needed alongside the features
additional_columns = ['Year_Quarter', 'BedrijfstakkenBranchesSBI2008']
all_columns = selected_columns + additional_columns

# Filter the DataFrame
df_filtered = df[all_columns]

# Display the filtered DataFrame
df_filtered.head()


In [None]:
# Keep only the rows that have a value in every selected column
df_final = df_filtered.dropna(axis=0, how='any')

# Report the remaining shape and preview the first rows
final_shape = df_final.shape
print(final_shape)
df_final.head()

In [None]:
# Write the filtered, NaN-free table to disk as JSON (pandas' default
# orientation for a DataFrame, since no `orient` is given)
df_final.to_json('data/df_final.json')

## Ziekteverzuim

In [None]:
# Print the summary statistics of the Ziekteverzuimpercentage_1 column
print(df['80072ned_Ziekteverzuimpercentage_1'].describe())

# Plot the distribution on an explicit 9x8 Axes.
# sns.histplot draws on the Axes it is given; the figure-level sns.displot
# would open a figure of its own and leave a separately created one empty.
fig, ax = plt.subplots(figsize=(9, 8))
sns.histplot(df['80072ned_Ziekteverzuimpercentage_1'], color='g', bins=10, kde=True, ax=ax)
ax.set_title('Distribution of Ziekteverzuimpercentage_1')
ax.set_xlabel('Ziekteverzuimpercentage_1')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot of the sickness-absence percentage per quarter, with the
# per-quarter median drawn on top as a trend line
fig, ax = plt.subplots(figsize=(14, 8))

# Fix the category order explicitly so the boxes and the median line share the
# same x positions ('YYYY-QX' strings sort chronologically)
quarter_order = sorted(df['Year_Quarter'].unique())

# Boxplot
sns.boxplot(
    data=df,
    x='Year_Quarter',
    y='80072ned_Ziekteverzuimpercentage_1',
    order=quarter_order,
    color='lightblue',
    ax=ax,
)

# Median trend: box i is drawn at x position i, so the medians are plotted
# against 0..n-1 in the same order. The label gives the legend an entry.
median_by_quarter = (
    df.groupby('Year_Quarter')['80072ned_Ziekteverzuimpercentage_1']
    .median()
    .reindex(quarter_order)
)
ax.plot(
    range(len(quarter_order)),
    median_by_quarter.to_numpy(),
    color='darkblue',
    marker='o',
    linewidth=1.5,
    label='Median',
)

# Adding labels and title
ax.set_title('Distribution of Ziekteverzuimpercentage_1 by Year-Quarter with Median Trend')
ax.set_xlabel('Year-Quarter')
ax.set_ylabel('Ziekteverzuimpercentage_1')

# Rotate x-axis labels for better readability
ax.tick_params(axis='x', rotation=90)

# Show the plot
ax.legend()
fig.tight_layout()
plt.show()

## Numerical data distribution

In [None]:
# List the distinct column dtypes present in df (set() removes duplicates, so
# the order of the result is not meaningful)
list(set(df.dtypes.tolist()))

In [None]:
# Numeric subset used by the distribution plots: 64-bit float and int columns only
numeric_dtypes = ['float64', 'int64']
df_num = df.select_dtypes(include=numeric_dtypes)
df_num.head()

In [None]:
# Grid layout: 3 histograms per row, as many rows as needed for all numeric columns
num_cols = 3  # Number of columns in the grid
num_rows = (len(df_num.columns) - 1) // num_cols + 1  # Ceiling of n_columns / num_cols

# squeeze=False always returns a 2D array of axes, even for a single row
fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, num_rows * 4), squeeze=False)
axes = axes.flatten()  # Flatten the 2D array of axes for easier indexing

# Plot each numeric column on its own axes
for ax, col in zip(axes, df_num.columns):
    sns.histplot(df_num[col], bins=50, kde=True, ax=ax)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the axes left over in the last row when the number of columns is not a
# multiple of num_cols; otherwise they show up as empty frames
for unused_ax in axes[len(df_num.columns):]:
    unused_ax.set_visible(False)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Scatter every numeric column against the sickness-absence target,
# five x-variables per pairplot figure
features_per_plot = 5
numeric_feature_names = df_num.columns

for start in range(0, len(numeric_feature_names), features_per_plot):
    feature_chunk = numeric_feature_names[start:start + features_per_plot]
    sns.pairplot(
        data=df_num,
        x_vars=feature_chunk,
        y_vars=['80072ned_Ziekteverzuimpercentage_1'],
    )