In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the concatenated dataset from the working directory into the main frame
df = pd.read_json(path_or_buf='concatenated_data.json')

In [None]:
# Preview the first five rows as a quick sanity check of the load
df.head(n=5)

In [None]:
# verbose=True forces the full per-column listing (otherwise the list of
# columns is not printed in its entirety); show_counts adds non-null counts.
df.info(show_counts=True, verbose=True)

## Ziekteverzuim

In [None]:
# Summary statistics of the Ziekteverzuimpercentage_1 column
print(df['Ziekteverzuimpercentage_1'].describe())

# Histogram + KDE of the column.
# sns.displot is a figure-level function that always creates its own figure,
# so pairing it with plt.figure(figsize=...) leaves an empty extra figure and
# the requested size never reaches the actual plot. The axes-level histplot
# draws onto the axes we create here, so figsize takes effect.
fig, ax = plt.subplots(figsize=(9, 8))
sns.histplot(df['Ziekteverzuimpercentage_1'], color='g', bins=10, kde=True, ax=ax)
ax.set_title('Distribution of Ziekteverzuimpercentage_1')
ax.set_xlabel('Ziekteverzuimpercentage_1')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Create 'Year-Quarter' column (string label combining Jaar and Kwartaal)
df['Year-Quarter'] = df['Jaar'].astype(str) + '-' + df['Kwartaal'].astype(str)

# Median per period, kept in order of first appearance (sort=False) so the
# trend line and the boxes share one explicit category order.
median_by_period = df.groupby('Year-Quarter', sort=False)['Ziekteverzuimpercentage_1'].median()
period_order = median_by_period.index.tolist()

fig, ax = plt.subplots(figsize=(14, 8))

# Boxplot of the distribution per period
sns.boxplot(
    data=df,
    x='Year-Quarter',
    y='Ziekteverzuimpercentage_1',
    order=period_order,
    color='lightblue',
    ax=ax,
)

# Median trend: categorical boxes sit at integer positions 0..n-1, so the
# medians are plotted against those positions. The label gives the legend
# something to show (previously plt.legend() had no labelled artists).
ax.plot(
    range(len(period_order)),
    median_by_period.to_numpy(),
    color='darkblue',
    marker='o',
    linewidth=1.5,
    zorder=3,
    label='Median',
)

# Adding labels and title
ax.set_title('Distribution of Ziekteverzuimpercentage_1 by Year-Quarter with Median Trend')
ax.set_xlabel('Year-Quarter')
ax.set_ylabel('Ziekteverzuimpercentage_1')

# Rotate x-axis labels for better readability
ax.tick_params(axis='x', rotation=90)

# Add the legend before tight_layout so the layout accounts for it
ax.legend()
fig.tight_layout()
plt.show()

## Numerical data distribution

In [None]:
# Distinct column dtypes present in the frame
list(set(df.dtypes))

In [None]:
# Restrict the frame to its float64 / int64 columns for the numeric analysis
df_num = df.select_dtypes(
    include=['float64', 'int64'],
)
df_num.head(n=5)

In [None]:
# Grid layout: fixed number of columns, as many rows as needed.
num_cols = 3  # Number of columns in the grid
numeric_columns = list(df_num.columns)
# Ceiling division; clamp to one row so an empty df_num does not ask
# plt.subplots for a zero-row grid.
num_rows = max(1, -(-len(numeric_columns) // num_cols))

# squeeze=False always returns a 2D array of axes, whatever the grid shape,
# so flattening is safe.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, num_rows * 4), squeeze=False)
axes = axes.flatten()  # Flatten the 2D array of axes for easier indexing

# One histogram (with KDE) per numeric column
for ax, col in zip(axes, numeric_columns):
    sns.histplot(df_num[col], bins=50, kde=True, ax=ax)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the leftover axes when the column count is not a multiple of num_cols,
# otherwise they render as empty frames at the end of the grid.
for unused_ax in axes[len(numeric_columns):]:
    unused_ax.set_visible(False)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Scatter the target against every numeric column, five columns per figure
numeric_cols = df_num.columns
for start in range(0, len(numeric_cols), 5):
    stop = start + 5
    sns.pairplot(
        data=df_num,
        x_vars=numeric_cols[start:stop],
        y_vars=['Ziekteverzuimpercentage_1'],
    )

## Correlation

In [None]:
# Correlation of every numeric column with the target column
df_num_corr = df_num.corr()['Ziekteverzuimpercentage_1']

# Keep the columns whose absolute correlation exceeds 0.5, strongest positive first
is_strong = df_num_corr.abs().gt(0.5)
golden_features_list = df_num_corr[is_strong].sort_values(ascending=False)

n_strong = len(golden_features_list)
print("There is {} strongly correlated values with ziekteverzuim:\n{}".format(n_strong, golden_features_list))

In [None]:
# Print the filtered correlation Series (column name -> correlation with the target)
print(golden_features_list)

In [None]:
# Access the index (column names) from the Series
column_names = golden_features_list.index.tolist()
print(column_names)

# Extra columns to carry along regardless of their correlation
additional_columns = ['Jaar', 'Kwartaal', 'BedrijfskenmerkenSBI2008']

# De-duplicate while preserving order: if one of the additional columns is
# numeric and also passed the correlation threshold, plain list concatenation
# would select it twice, producing duplicate column labels (which, among other
# things, DataFrame.to_json with the default orient rejects).
all_columns = list(dict.fromkeys(column_names + additional_columns))

df_filtered = df[all_columns]
df_filtered.head()

In [None]:
# Drop every row that has a missing value in any of the selected columns
df_final = df_filtered.dropna(axis=0, how='any')
print(df_final.shape)
df_final.head(n=5)

In [None]:
# Write the cleaned frame to df_final.json in the working directory
df_final.to_json(path_or_buf='df_final.json')