In [16]:
import numpy as np
import pandas as pd
import plotly.express as px

# Per-branch screening of the numeric variables that correlate strongly with
# the sick-leave percentage column.  For every branch listed below the cell:
#   1. selects the rows of `df` belonging to that branch,
#   2. computes the correlation matrix of the numeric columns,
#   3. keeps the variables whose correlation with the target exceeds the
#      threshold in absolute value,
#   4. shows them as a single-row heatmap, and
#   5. finally writes one Excel sheet per branch with the kept correlations.
TARGET_COLUMN = '80072ned_Ziekteverzuimpercentage_1'
BRANCH_COLUMN = 'BedrijfstakkenBranchesSBI2008'
CORRELATION_THRESHOLD = 0.6  # minimum |r| for a variable to be reported

branches = ['Q Gezondheids- en welzijnszorg', 'G Handel', 'C Industrie']
correlation_results = {}

# Normalise the branch labels once; the result does not depend on the loop
# variable, so there is no need to recompute it for every branch.
normalized_branch_labels = df[BRANCH_COLUMN].str.strip().str.lower()

for branch in branches:
    # Case- and whitespace-insensitive match on the branch label
    branch_data = df[normalized_branch_labels == branch.strip().lower()]
    if branch_data.empty:
        print(f"Warning: no rows found for branch '{branch}'")

    # Correlations are only defined for numeric columns
    numeric_branch = branch_data.select_dtypes(include=[np.number])
    if TARGET_COLUMN not in numeric_branch.columns:
        raise KeyError(
            f"Target column '{TARGET_COLUMN}' is missing or not numeric in df"
        )
    correlation_matrix = numeric_branch.corr()

    # Correlation of every variable with the target, strongest positive first.
    # The target's correlation with itself is always 1.0 and carries no
    # information, so it is removed before thresholding.
    correlation_with_target = (
        correlation_matrix[TARGET_COLUMN]
        .drop(labels=TARGET_COLUMN, errors="ignore")
        .sort_values(ascending=False)
    )

    # NaN correlations (e.g. constant columns) compare False and drop out here
    significant_correlations = correlation_with_target[
        correlation_with_target.abs() > CORRELATION_THRESHOLD
    ]

    # Keep the result (even when empty) so every branch gets an Excel sheet
    correlation_results[branch] = significant_correlations

    if significant_correlations.empty:
        # Nothing to draw: an empty heatmap would only be confusing
        print(f"No correlations above {CORRELATION_THRESHOLD} for branch '{branch}'")
        continue

    # Single-row heatmap of the significant correlations
    fig = px.imshow(
        significant_correlations.to_frame().T,  # Series -> 1 x N frame
        labels=dict(x="Variables", y="Correlation with Target", color="Correlation"),
        x=significant_correlations.index,
        y=["SickLeave"],  # Single row heatmap
        title=f"Correlation with Sick Leave ({branch})",
        color_continuous_scale="Bluered",
        zmin=-1,  # fixed range so colours are comparable across branches
        zmax=1,
        width=1200,
        height=400
    )

    # Rotate the variable names so long column labels stay readable
    fig.update_xaxes(tickangle=45, tickfont=dict(size=10))
    fig.show()

# Save correlation results for reference, one sheet per branch.
# NOTE(review): absolute, user-specific path - consider a configurable data
# directory so the notebook runs on other machines.
output_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\significant_correlations_per_branch.xlsx"
with pd.ExcelWriter(output_path) as writer:
    for branch, correlations in correlation_results.items():
        correlations.to_excel(writer, sheet_name=branch)

print(f"Significant correlations saved to: {output_path}")


Significant correlations saved to: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\significant_correlations_per_branch.xlsx


In [18]:
import numpy as np
import pandas as pd
import plotly.express as px

# Per-branch full correlation matrices, with the row belonging to the
# sick-leave percentage column annotated with its numeric values.  For every
# branch listed below the cell:
#   1. selects the rows of `df` belonging to that branch,
#   2. computes the correlation matrix of the numeric columns,
#   3. shows the whole matrix as a heatmap and prints the correlation values
#      on the target's row, and
#   4. finally writes one Excel sheet per branch with the full matrix.
TARGET_COLUMN = '80072ned_Ziekteverzuimpercentage_1'
BRANCH_COLUMN = 'BedrijfstakkenBranchesSBI2008'

branches = ['Q Gezondheids- en welzijnszorg', 'G Handel', 'C Industrie']
correlation_results = {}

# Normalise the branch labels once; the result does not depend on the loop
# variable, so there is no need to recompute it for every branch.
normalized_branch_labels = df[BRANCH_COLUMN].str.strip().str.lower()

for branch in branches:
    # Case- and whitespace-insensitive match on the branch label
    branch_data = df[normalized_branch_labels == branch.strip().lower()]

    # Correlations are only defined for numeric columns
    numeric_branch = branch_data.select_dtypes(include=[np.number])
    if TARGET_COLUMN not in numeric_branch.columns:
        raise KeyError(
            f"Target column '{TARGET_COLUMN}' is missing or not numeric in df"
        )
    correlation_matrix = numeric_branch.corr()

    # Save the full correlation matrix for this branch (even when it is all
    # NaN) so every branch gets an Excel sheet
    correlation_results[branch] = correlation_matrix

    if branch_data.empty:
        # The matrix is entirely NaN; a blank heatmap would only be confusing
        print(f"Warning: no rows found for branch '{branch}'")
        continue

    # Correlation of every variable with the target
    correlation_with_target = correlation_matrix[TARGET_COLUMN]

    # One text label per cell on the target's row.  The target's own cell is
    # skipped (always 1.0), as are undefined correlations, which would
    # otherwise be rendered as the literal text "nan".
    annotations = [
        dict(
            x=col, y=TARGET_COLUMN, text=f"{correlation_with_target[col]:.2f}",
            showarrow=False, font=dict(color="black", size=10)
        )
        for col in correlation_matrix.columns
        if col != TARGET_COLUMN and pd.notna(correlation_with_target[col])
    ]

    # Heatmap of the entire correlation matrix
    fig = px.imshow(
        correlation_matrix,
        labels=dict(x="Variables", y="Variables", color="Correlation"),
        x=correlation_matrix.columns,
        y=correlation_matrix.index,
        title=f"Correlation Matrix with Sick Leave Highlighted ({branch})",
        color_continuous_scale="RdBu",  # Diverging color scale for correlations
        zmin=-1,  # symmetric range: puts the scale's midpoint at r = 0 and
        zmax=1,   # keeps colours comparable across branches
        width=800,
        height=800
    )

    # Overlay the numeric values on the target's row
    fig.update_layout(annotations=annotations)

    # Rotate/shrink tick labels so long column names stay readable
    fig.update_xaxes(tickangle=45, tickfont=dict(size=10))
    fig.update_yaxes(tickfont=dict(size=10))
    fig.show()

# Save correlation results for reference, one sheet per branch.
# NOTE(review): absolute, user-specific path - consider a configurable data
# directory so the notebook runs on other machines.
output_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\correlation_matrices_with_target_per_branch.xlsx"
with pd.ExcelWriter(output_path) as writer:
    for branch, correlation_matrix in correlation_results.items():
        correlation_matrix.to_excel(writer, sheet_name=branch)

print(f"Correlation matrices with sick leave saved to: {output_path}")


Correlation matrices with sick leave saved to: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\correlation_matrices_with_target_per_branch.xlsx
