In [5]:
import pandas as pd
import numpy as np
from scipy import stats

# Compare multiple DataFrames that contain the same features but come from different clusters; report which features differ most between the clusters and whether those differences are statistically significant.

def compare_clusters(dfs, alpha=0.05):
    """Rank features by how much their mean differs between clusters.

    Each DataFrame in ``dfs`` holds the samples of one cluster. The feature
    list is taken from the columns of the first DataFrame (minus
    ``"file_name"``), and every other DataFrame is indexed with those same
    column names. For each feature a one-way ANOVA (``scipy.stats.f_oneway``)
    is run across the clusters, and the spread between the largest and the
    smallest cluster mean is recorded.

    Clusters with fewer than two non-NaN values for a feature are left out of
    both the ANOVA and the mean spread for that feature; passing an empty
    group to the test yields a NaN p-value. A feature is skipped entirely
    when fewer than two clusters remain.

    Note: the p-values are not corrected for multiple comparisons.

    Args:
        dfs: Sequence of DataFrames, one per cluster.
        alpha: Significance threshold applied to the ANOVA p-value.

    Returns:
        DataFrame with columns ``feature``, ``difference``, ``p_value`` and
        ``significant``, sorted by ``difference`` in descending order. It is
        empty (but still has these columns) when no feature could be tested.
    """
    columns = ["feature", "difference", "p_value", "significant"]
    if not dfs:
        return pd.DataFrame(columns=columns)

    # get rid of file_name
    features = [f for f in dfs[0].columns if f != "file_name"]

    rows = []  # one result dict per tested feature

    for feature in features:
        cluster_values = [df[feature].dropna() for df in dfs]

        # Keep only clusters that can contribute a within-group variance.
        usable = [values for values in cluster_values if len(values) >= 2]
        if len(usable) < 2:
            continue

        _, p_value = stats.f_oneway(*usable)

        means = [np.mean(values) for values in usable]
        mean_difference = max(means) - min(means)

        rows.append({
            "feature": feature,
            "difference": mean_difference,
            "p_value": p_value,
            # A NaN p-value (e.g. constant input) compares False here.
            "significant": bool(p_value < alpha),
        })

    # Passing ``columns`` keeps the sort key present even when ``rows`` is empty.
    differences_df = pd.DataFrame(rows, columns=columns)
    return differences_df.sort_values(by="difference", ascending=False)

if __name__ == "__main__":
    # Each cluster is stored as <base_dir>/<label>/cluster_<label>.csv.
    # Labels run from -1 to 6 (-1 is presumably the HDBSCAN noise cluster;
    # verify against the clustering step).
    base_dir = "../visuals/rep_samples/UMAP-HDBSCAN"
    cluster_labels = range(-1, 7)

    # Load the dataframes
    dfs = [
        pd.read_csv(f"{base_dir}/{label}/cluster_{label}.csv")
        for label in cluster_labels
    ]

    # Compare the dataframes and get the differences
    differences = compare_clusters(dfs)
    # save differences to csv
    differences.to_csv(f"{base_dir}/cluster_differences.csv", index=False)

    # Print features sorted by biggest differences
    print(differences)
    print("\nMost significant features (p < 0.05):")
    print(differences[differences["significant"]])

                              feature   difference       p_value  significant
16             a1_spectral_centroid_x  1386.847228  6.198646e-65         True
50             a1_spectral_centroid_y  1386.847228  6.198646e-65         True
1              a0_spectral_centroid_x  1221.656083  2.137690e-56         True
35             a0_spectral_centroid_y  1221.656083  2.137690e-56         True
3             a0_spectral_bandwidth_x   699.735351  3.119813e-29         True
..                                ...          ...           ...          ...
43                      a0_std_peak_y     0.037313  4.062396e-40         True
61  a1_percent_time_above_threshold_y     0.004806  3.303489e-32         True
27  a1_percent_time_above_threshold_x     0.004806  3.303489e-32         True
12  a0_percent_time_above_threshold_x     0.004450  1.835634e-38         True
46  a0_percent_time_above_threshold_y     0.004450  1.835634e-38         True

[67 rows x 4 columns]

Most significant features (p < 0.05):
  

  res = hypotest_fun_out(*samples, **kwds)
