In [6]:
import pandas as pd
import numpy as np
from scipy import stats

def compare_clusters(df1, df2, alpha=0.05):
    """Compare per-feature distributions between two cluster dataframes.

    For every column that is numeric in *both* frames (the ``file_name``
    column is always skipped), compute the mean in each frame, the signed
    difference of the means (``df1`` minus ``df2``) and the p-value of
    ``scipy.stats.ttest_ind`` with NaNs omitted.

    Args:
        df1: Samples of the first cluster (reported as ``cluster_0``).
        df2: Samples of the second cluster (reported as ``cluster_1``).
        alpha: Threshold below which a p-value is flagged as significant.

    Returns:
        A DataFrame indexed by feature name with the columns
        ``cluster_0_mean``, ``cluster_1_mean``, ``mean_difference``,
        ``p_value`` and ``significant``, sorted by ``mean_difference`` in
        descending (signed) order. The frame is empty, but still has these
        columns, when there is no comparable feature.
    """
    result_columns = [
        "cluster_0_mean",
        "cluster_1_mean",
        "mean_difference",
        "p_value",
        "significant",
    ]

    # Only columns that exist in both frames and hold numeric data can be
    # averaged and t-tested; anything else would raise further down.
    cols_to_compare = [
        col
        for col in df1.columns
        if col != "file_name"
        and col in df2.columns
        and pd.api.types.is_numeric_dtype(df1[col])
        and pd.api.types.is_numeric_dtype(df2[col])
    ]

    rows = []
    for col in cols_to_compare:
        mean_0 = df1[col].mean()
        mean_1 = df2[col].mean()

        # T-test to check statistical significance of the mean difference.
        _, p_value = stats.ttest_ind(df1[col], df2[col], nan_policy="omit")
        p_value = float(p_value)

        rows.append(
            {
                "cluster_0_mean": mean_0,
                "cluster_1_mean": mean_1,
                "mean_difference": mean_0 - mean_1,
                "p_value": p_value,
                # A NaN p-value compares False, i.e. "not significant".
                "significant": bool(p_value < alpha),
            }
        )

    # Building from row records (rather than transposing a dict of dicts)
    # keeps a proper dtype per column instead of collapsing to ``object``.
    table = pd.DataFrame(rows, index=cols_to_compare, columns=result_columns)
    return table.sort_values("mean_difference", ascending=False)

if __name__ == "__main__":
    # Read the representative samples of the two spectral clusters.
    df1, df2 = (
        pd.read_csv(csv_path)
        for csv_path in (
            "../visuals/rep_samples/Spectral_Cluster/0/cluster_0.csv",
            "../visuals/rep_samples/Spectral_Cluster/1/cluster_1.csv",
        )
    )

    # Build the per-feature comparison table.
    differences = compare_clusters(df1, df2)

    # Full table first (ordered by signed mean difference), then only the
    # rows flagged as significant.
    print(differences)
    print("\nMost significant features (p < 0.05):")
    significant_mask = differences["significant"]
    print(differences[significant_mask])

                        cluster_0_mean cluster_1_mean mean_difference  \
a1_spectral_centroid_y     2679.894102    2225.149821      454.744281   
a1_spectral_centroid_x     2679.894102    2225.149821      454.744281   
a0_spectral_centroid_y     2446.049968    2252.620994      193.428974   
a0_spectral_centroid_x     2446.049968    2252.620994      193.428974   
a1_spectral_entropy_x         5.806427       5.373629        0.432799   
...                                ...            ...             ...   
a0_peaks_per_second_x         3.610205       12.27628       -8.666075   
a1_spectral_bandwidth_x    1094.480789    1163.354607      -68.873818   
a1_spectral_bandwidth_y    1094.480789    1163.354607      -68.873818   
a0_spectral_bandwidth_x    1191.573495    1271.778343      -80.204848   
a0_spectral_bandwidth_y    1191.573495    1271.778343      -80.204848   

                          p_value significant  
a1_spectral_centroid_y        0.0        True  
a1_spectral_centroid_x     

  res = hypotest_fun_out(*samples, **kwds)
