# Correlation Analysis

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Root directory holding all result folders. The path is relative, so it is
# resolved against the working directory the notebook kernel runs in.
RESULTS_DIR = Path('../../results')

## Available Results

In [None]:
# Walk the results tree one level at a time. The commented lines after each
# print keep a sample of the printed output for reference.


def _entry_names(directory, dirs_only=True):
    """Return the names of the entries directly inside ``directory``.

    With ``dirs_only`` set, only sub-directories are reported; otherwise every
    entry (files included) is listed.
    """
    return [entry.name for entry in directory.iterdir() if not dirs_only or entry.is_dir()]


print(f'Folders in results directory: {_entry_names(RESULTS_DIR)}')
# Folders in results directory: ['PhD_Benchmark_results_2023-12-11',
# 'PhD_electricity_2023-12-11', 'PhD_electricity_2024-07-19', 'sca']

# Static code analysis (SCA) results
SCA_DIR = RESULTS_DIR / 'sca'
print(f'Folders in SCA directory: {_entry_names(SCA_DIR)}')
# Folders in SCA directory: ['bandit', 'prospector', 'pylint', 'radon-cc',
# 'radon-hal', 'radon-mi', 'radon-raw', 'ruff', 'sonar', 'sonar_parsed', 'SUMMARY']

SCA_SUBDIR = SCA_DIR / 'SUMMARY'
print(f'Folders in SCA "SUMMARY": {_entry_names(SCA_SUBDIR)}')
# Folders in SCA "SUMMARY": ['correlations', 'csv', 'plots', 'tables', 'tex']

# Electricity forecasting results
ELEC_DIR = RESULTS_DIR / 'PhD_electricity_2024-07-19'
print(f'Folders in ELEC directory: {_entry_names(ELEC_DIR)}')
# Folders in ELEC directory: ['autots_10800_time_limit', 'autots_7200_time_limit',
# 'fedot_10800_time_limit', 'fedot_7200_time_limit', 'nproc_-1', 'original_autots_times',
# 'univariate_forecasting', 'univariate_statistics', 'unused']

# This last listing includes files as well as folders
ELEC_SUBDIR = ELEC_DIR / 'univariate_statistics'
print(f'ELEC_dir: univariate_statistics: {_entry_names(ELEC_SUBDIR, dirs_only=False)}')
# ELEC_dir: univariate_statistics: [
# '0_autokeras_early_stopping.csv', '0_increased_time_limit.csv', '0_original_autots_times.csv', '0_original_autots_times_comparison.xlsx',
# '1_all_scores.csv', '1_all_scores.tex',
# '3_failed_counts.png', '3_failed_counts_by_library.png', '3_mean_scores_by_library.csv',
# '4_R2_mean_by_library.png',
# '5_MAE_box.png', '5_MAE_mean_by_library.png', '5_MSE_box.png',
# '6_MSE_mean_by_library.png', '6_RMSE_box.png', '6_Spearman_Correlation_box.png',
# '7_duration_mean_by_library.png', '8_duration_box.png',
# 'heatmap.csv', 'heatmap.png', 'metrics_corr_heatmap.csv', 'metrics_corr_heatmap.tex',
# 'metrics_corr_heatmap_pvalues.csv', 'Pearson Correlation Heatmap.docx', 'Pearson Correlation Heatmap.PNG'
# ]

## Correlation Analysis Between Forecasting and SCA

In [None]:
def _read_and_report(csv_path, label):
    """Read ``csv_path`` into a DataFrame, print its shape under ``label``, and return it."""
    frame = pd.read_csv(csv_path)
    print(f'{label}.shape: {frame.shape}')
    return frame


# SCA data
df_sca_all = _read_and_report(SCA_SUBDIR / 'csv' / 'ranks.csv', 'df_sca_all')

# SCA data grouped into categories
df_sca_summary = _read_and_report(SCA_SUBDIR / 'csv' / 'summary_ranks.csv', 'df_sca_summary')

# Forecasting results
df_forecasting_all = _read_and_report(ELEC_SUBDIR / '1_all_scores.csv', 'df_forecasting_all')

# Average forecasting scores by library
df_forecasting_mean = _read_and_report(ELEC_SUBDIR / '3_mean_scores_by_library.csv', 'df_forecasting_mean')


In [None]:
# Display the SCA frame loaded from 'ranks.csv' for inspection
df_sca_all

In [None]:
# Display the mean forecasting scores loaded from '3_mean_scores_by_library.csv'
df_forecasting_mean

In [None]:
# Derive a lowercase 'library' key on both frames so they can be merged.
# The guard keeps this cell re-runnable: 'name' is dropped right below, so
# without it a second execution would fail with a KeyError on 'name'.
if 'name' in df_sca_all.columns:
    df_sca_all['library'] = df_sca_all['name'].str.lower()
df_sca_all = df_sca_all.drop(columns=['Library', 'name'], errors='ignore')
df_forecasting_mean['library'] = df_forecasting_mean['library'].str.lower()

# Merge the full SCA ranks (df_sca_all) with the forecasting mean scores on library.
# pd.merge defaults to an inner join, so only libraries present in both frames remain.
df_merged = pd.merge(df_sca_all, df_forecasting_mean, on='library')

# Set library as index
df_merged = df_merged.set_index('library')

# Drop columns that are not needed for correlation analysis, in a single call.
# We deliberately keep "_min" columns due to high correlation with sca metrics
unneeded_cols = [
    col for col in df_merged.columns
    if any(s in col for s in ['Unnamed', 'iterations', 'failed', '_max']) or col in ['Median Rank']
]
df_merged = df_merged.drop(columns=unneeded_cols)

df_merged

In [None]:
# Spearman correlation across the columns of the merged frame, used to
# compare model rankings under the different metrics
corr_full = df_merged.corr(method='spearman')

# Keep SCA metrics as rows and forecasting metrics as columns
sca_rows = [label for label in corr_full.index if label in df_sca_all.columns]
forecasting_cols = [label for label in corr_full.columns if label in df_forecasting_mean.columns]
corr = corr_full.loc[sca_rows, forecasting_cols]

# Persist the matrix in both orientations
corr.to_csv(RESULTS_DIR / 'metrics_spearman_corr_heatmap.csv')
corr.transpose().to_csv(RESULTS_DIR / 'T_metrics_spearman_corr_heatmap.csv')
print(corr.shape)

# Optional: hide values between -0.5 and 0.5 for better visualization
# corr = corr.mask(abs(corr) < 0.5)

fig, ax = plt.subplots(figsize=(26, 24))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cbar=False,
    cmap='coolwarm',
    square=True,
    cbar_kws={'shrink': .8},
    ax=ax,
)
ax.set_title('Spearman Correlation Heatmap Between SCA Metrics and Forecasting Metrics')
plt.show()

## Same Correlation Analysis Between SCA and Forecasting Metrics Using SCA Categories

In [None]:
# Display the category-grouped SCA frame loaded from 'summary_ranks.csv'
df_sca_summary

In [None]:
# Derive a lowercase 'library' key on both frames so they can be merged.
# The guard keeps this cell re-runnable: 'Library' is dropped right below, so
# without it a second execution would fail with a KeyError on 'Library'.
if 'Library' in df_sca_summary.columns:
    df_sca_summary['library'] = df_sca_summary['Library'].str.lower()
df_sca_summary = df_sca_summary.drop(columns=['Library', 'name'], errors='ignore')
df_forecasting_mean['library'] = df_forecasting_mean['library'].str.lower()

# Merge SCA summary with forecasting mean scores on library.
# pd.merge defaults to an inner join, so only libraries present in both frames remain.
df_merged = pd.merge(df_sca_summary, df_forecasting_mean, on='library')

# Set library as index
df_merged = df_merged.set_index('library')

# Drop columns that are not needed for correlation analysis, in a single call.
# We deliberately keep "_min" columns due to high correlation with sca metrics
unneeded_cols = [
    col for col in df_merged.columns
    if any(s in col for s in ['Unnamed', 'iterations', 'failed', '_max']) or col in ['Median Rank']
]
df_merged = df_merged.drop(columns=unneeded_cols)

In [None]:
# Spearman correlation across the columns of the merged frame, used to
# compare model rankings under the different metrics
corr_full = df_merged.corr(method='spearman')

# Keep SCA metrics as rows and forecasting metrics as columns
sca_rows = [label for label in corr_full.index if label in df_sca_summary.columns]
forecasting_cols = [label for label in corr_full.columns if label in df_forecasting_mean.columns]
corr = corr_full.loc[sca_rows, forecasting_cols]

# Persist the matrix in both orientations
corr.to_csv(RESULTS_DIR / 'metrics_spearman_corr_heatmap_shorter.csv')
corr.transpose().to_csv(RESULTS_DIR / 'T_metrics_spearman_corr_heatmap_shorter.csv')
print(corr.shape)

# Optional: hide values between -0.5 and 0.5 for better visualization
# corr = corr.mask(abs(corr) < 0.5)

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cbar=False,
    cmap='coolwarm',
    square=True,
    cbar_kws={'shrink': .8},
    ax=ax,
)
ax.set_title('Spearman Correlation Heatmap Between Grouped SCA Metrics and Forecasting Metrics')
plt.show()