In [13]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

In [14]:
# Load the dataset
# 1. Paths to data files
file_path = r"C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\Data\merged_tables.csv"
df = pd.read_csv(file_path)

In [15]:
# Filter for specific branches
branches = {
    'G Handel': 'Handel',
    'C Industrie': 'Industrie',
    'Q Gezondheids- en welzijnszorg': 'Gezondheids- en welzijnszorg'
}

In [16]:
# Define target column
target_column = '80072ned_Ziekteverzuimpercentage_1'

In [27]:
# Create an Excel writer
with pd.ExcelWriter('strong_correlations.xlsx') as writer:
    # Iterate over the selected branches for correlation analysis
    for branch_code, branch_name in branches.items():
        # Filter the dataset for the current branch
        branch_df = df[df['BedrijfstakkenBranchesSBI2008'] == branch_code]
        
        # Select only numeric columns for correlation calculation
        numeric_branch_df = branch_df.select_dtypes(include=[np.number])
        
        # Ensure the target column is included in the correlation calculation
        if target_column not in numeric_branch_df.columns:
            numeric_branch_df[target_column] = branch_df[target_column]
        
        # Compute correlations
        correlation_matrix = numeric_branch_df.corr()
        
        # Extract correlations with the target column
        if target_column in correlation_matrix.columns:
            target_correlations = correlation_matrix[target_column]
        else:
            target_correlations = pd.Series(dtype=float)
        
        # Identify strongly correlated columns (above 0.5 or below -0.5)
        strong_correlations = target_correlations[(target_correlations.abs() > 0.5) & (target_correlations.index != target_column)]
        
        # Write correlated columns to Excel for the current branch
        if not strong_correlations.empty:
            strong_correlations_df = pd.DataFrame({
                'Column': strong_correlations.index,
                'Correlation': strong_correlations.values,
                'Branch': branch_name
            })
            strong_correlations_df.to_excel(writer, sheet_name=f'{branch_name}_{branch_code}', index=False)
        else:
            print(f"\nNo strong correlations found for branch {branch_name} ({branch_code})")

        


Title is more than 31 characters. Some applications may not be able to read the file



In [28]:
# Plot correlation heatmap for the branch including the target column
columns_to_plot = numeric_branch_df.columns
        
fig = go.Figure(data=go.Heatmap(
            z=correlation_matrix.loc[columns_to_plot, columns_to_plot].values,
            x=columns_to_plot,
            y=columns_to_plot,
            colorscale='Viridis',
            colorbar=dict(title="Correlation")
        ))
fig.update_layout(
            title=f"Correlation Heatmap for Branch {branch_name} ({branch_code})",
            xaxis_title="Features",
            yaxis_title="Features",
            xaxis_tickangle=-45
        )
fig.show()