In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Leakage check: rank every numeric feature by |Pearson r| against the
# integer-encoded genre label and flag any that are (almost) perfectly correlated.

# 1) Load the feature table and integer-encode the target column.
df = pd.read_csv("../../feature_engineering/data_with_new_features_v3.csv")
le = LabelEncoder()
df["genre_encoded"] = le.fit_transform(df["genre"])

# 2) Keep numeric columns only, leaving out the columns named in `non_num`
#    and the freshly added encoded target itself.
non_num = ["participant", "timestamp", "genre"]
excluded = set(non_num) | {"genre_encoded"}
numeric_feats = [c for c in df.columns
                 if pd.api.types.is_numeric_dtype(df[c]) and c not in excluded]

# 3) Absolute Pearson correlation of each feature with the target.
#    corrwith() correlates each column against the target directly, so the full
#    feature-by-feature matrix is never built just to read a single column.
#    The rename keeps the Series name that appears in the printed summary.
# NOTE(review): genre codes are integers assigned to a nominal label, so Pearson r
# depends on the code ordering; a feature that separates genres non-monotonically
# could score low here. Confirm whether a per-class check is also needed.
corr_with_target = (
    df[numeric_feats]
    .corrwith(df["genre_encoded"])
    .rename("genre_encoded")
    .abs()
    .sort_values(ascending=False)
)

print("Top 10 |corr| with genre:")
print(corr_with_target.head(10))

# 4) Features whose |corr| reaches LEAK_THRESHOLD are near-perfect predictors of
#    the encoded target and are treated as suspected leakage. Lower the threshold
#    (e.g. to 0.9) for a stricter screen; the message below follows the constant.
LEAK_THRESHOLD = 0.99
leaky = corr_with_target[corr_with_target >= LEAK_THRESHOLD].index.tolist()
print(f"\nSuspect (|corr|>={LEAK_THRESHOLD}):", leaky)


Top 10 |corr| with genre:
pupil_iris_ratio_fft_band_0_0.1        0.770793
pupil_diameter_mm_fft_band_0_0.1       0.770793
pupil_iris_ratio_fft_max_amp           0.713226
pupil_diameter_mm_fft_max_amp          0.713226
pupil_iris_ratio_fft_band_0.1_0.25     0.540690
pupil_diameter_mm_fft_band_0.1_0.25    0.540690
heart_rate_high_normal                 0.521763
pupil_diameter_mm_fft_entropy          0.518583
pupil_iris_ratio_fft_entropy           0.518583
heart_rate_normal_high                 0.507076
Name: genre_encoded, dtype: float64

Suspect (|corr|>=0.99): []
