In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import os

# ---------------------------------------------------------------------------------------------------

# Input: merged feature table (CSV). Output: folder that the correlation heatmaps are saved into.
# NOTE(review): both are absolute, machine-specific paths — the notebook only runs where this
# volume is mounted; consider a configurable data directory.
path = "/Volumes/dax-hd/project-data/search-files/merged-data.csv"
base_save_folder = "/Volumes/dax-hd/project-data/corr_features_2/"

# Load the full table once; every column group below is filtered from this same frame.
df = pd.read_csv(path)

# ---------------------------------------------------------------------------------------------------

# Feature group 1: general physico-chemical descriptors plus energy-function terms
# (budeff_*, evoef2_*, dfire2_*, rosetta_*, aggrescan3d_*).
# NOTE(review): presumably these are the DE-STRESS output columns — confirm against the data source.
# Order matters: remove_highly_correlated_features keeps the EARLIER-listed column of any pair
# whose absolute Spearman correlation exceeds the tolerance and drops the later one.
destress_columns = [
    "hydrophobic_fitness",
    "isoelectric_point",
    "charge",
    "mass",
    "num_residues",
    "packing_density",
    "budeff_total",
    "budeff_steric",
    "budeff_desolvation",
    "budeff_charge",
    "evoef2_total",
    "evoef2_ref_total",
    "evoef2_intraR_total",
    "evoef2_interS_total",
    "evoef2_interD_total",
    "dfire2_total",
    "rosetta_total",
    "rosetta_fa_atr",
    "rosetta_fa_rep",
    "rosetta_fa_intra_rep",
    "rosetta_fa_elec",
    "rosetta_fa_sol",
    "rosetta_lk_ball_wtd",
    "rosetta_fa_intra_sol_xover4",
    "rosetta_hbond_lr_bb",
    "rosetta_hbond_sr_bb",
    "rosetta_hbond_bb_sc",
    "rosetta_hbond_sc",
    "rosetta_dslf_fa13",
    "rosetta_rama_prepro",
    "rosetta_p_aa_pp",
    "rosetta_fa_dun",
    "rosetta_omega",
    "rosetta_pro_close",
    "rosetta_yhh_planarity",
    "aggrescan3d_total_value",
    "aggrescan3d_avg_value",
    "aggrescan3d_min_value",
    "aggrescan3d_max_value"
]

# Feature group 2: the ss_prop_* columns, one per secondary-structure class.
# NOTE(review): presumably each value is the proportion of residues in that class — confirm.
# Order matters: when two columns are correlated above the tolerance, the one listed later
# is the one removed by remove_highly_correlated_features.
ss_columns= [
    "ss_prop_alpha_helix",
    "ss_prop_beta_bridge",
    "ss_prop_beta_strand",
    "ss_prop_3_10_helix",
    "ss_prop_pi_helix",
    "ss_prop_hbonded_turn",
    "ss_prop_bend",
    "ss_prop_loop"
    ]

# Feature group 3: composition_* columns, one per residue type, grouped by side-chain class.
# Order matters: when two columns are correlated above the tolerance, the one listed later
# is the one removed by remove_highly_correlated_features.
aa_comp_columns= [
    # Positively charged (basic side chains) #
    "composition_ARG", "composition_HIS", "composition_LYS", 
    # Negatively charged (acidic side chains) #
    "composition_ASP", "composition_GLU", 
    # Polar, uncharged #
    "composition_SER", "composition_THR", "composition_ASN", "composition_GLN", 
    # Special cases #
    "composition_CYS", "composition_GLY", "composition_PRO",
    # Hydrophobic #
    "composition_ALA", "composition_VAL", "composition_ILE", "composition_LEU", "composition_MET", "composition_PHE", "composition_TYR", "composition_TRP",
    # Unknown residue type #
    "composition_UNK",
]

# ---------------------------------------------------------------------------------------------------

def remove_highly_correlated_features(df, tolerance=0.6, columns=None):
    """Greedily drop features that are strongly rank-correlated with an earlier feature.

    The candidate columns are walked in the order given. A column is dropped when its
    absolute Spearman correlation with any column that has been kept so far is strictly
    greater than ``tolerance``; otherwise it is kept. The earlier-listed member of a
    correlated pair is therefore always the survivor.

    Args:
        df: Source DataFrame. It is not modified.
        tolerance: Absolute Spearman correlation above which (strictly) a feature is
            considered redundant.
        columns: Iterable of candidate column names. Names that are missing from ``df``
            or are not numeric are ignored. Defaults to all columns of ``df``.

    Returns:
        A tuple ``(reduced_df, dropped_features)``: a new DataFrame equal to ``df``
        without the dropped columns (all non-candidate columns are retained), and the
        list of dropped column names in the order they were dropped.
    """
    if columns is None:
        columns = df.columns

    # Ensure only valid numerical columns are included. dict.fromkeys keeps the caller's
    # order while collapsing repeated names, so each label addresses exactly one
    # row/column of the correlation matrix.
    valid_columns = [
        col for col in dict.fromkeys(columns)
        if col in df.columns and np.issubdtype(df[col].dtype, np.number)
    ]
    if not valid_columns:
        # Nothing to compare: return an unmodified copy rather than failing.
        return df.copy(), []

    # Spearman works on ranks, so standardising the features first cannot change the
    # result and is skipped. pandas evaluates every pair on that pair's own complete
    # observations, so removing a feature never changes the remaining entries: the
    # matrix is computed once instead of once per dropped feature.
    corr_matrix = df[valid_columns].corr(method='spearman').abs()

    kept_features = []
    dropped_features = []
    for feature in valid_columns:
        # NaN correlations (e.g. a constant column) compare as False, so they never
        # trigger a drop.
        if kept_features and (corr_matrix.loc[kept_features, feature] > tolerance).any():
            dropped_features.append(feature)
        else:
            kept_features.append(feature)

    # Return the original dataframe without the dropped features.
    return df.drop(columns=dropped_features), dropped_features

# ---------------------------------------------------------------------------------------------------

def plot_correlation_matrix(df, columns, img_path, method='spearman'):
    """Save an annotated heatmap of the correlation matrix of ``columns`` to ``img_path``.

    Args:
        df: DataFrame holding the features.
        columns: Iterable of column names to include. Names that are missing from
            ``df`` or are not numeric are ignored.
        img_path: Destination image file. Its parent directory is created if needed.
        method: Correlation method passed to ``DataFrame.corr``. Defaults to
            ``'spearman'`` so the figure shows the same statistic that
            ``remove_highly_correlated_features`` filters on; pass ``'pearson'`` for
            the linear correlation.

    Raises:
        ValueError: If no usable numeric column is left to plot.
    """
    valid_columns = [
        col for col in dict.fromkeys(columns)
        if col in df.columns and np.issubdtype(df[col].dtype, np.number)
    ]
    if not valid_columns:
        raise ValueError("No numeric columns from `columns` are present in `df`; nothing to plot.")

    # Both Pearson and Spearman are unaffected by per-column standardisation, so the
    # features are correlated directly.
    corr_matrix = df[valid_columns].corr(method=method)

    # A feature whose correlation is undefined with everything (e.g. a constant column)
    # yields an all-NaN row/column; it carries no information and makes the heatmap
    # annotation code format masked values, so it is left out of the figure.
    defined = corr_matrix.notna().any(axis=0).to_numpy()
    corr_matrix = corr_matrix.loc[defined, defined]
    if corr_matrix.empty:
        raise ValueError("Correlation is undefined for every selected column; nothing to plot.")

    # savefig does not create missing directories.
    out_dir = os.path.dirname(img_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(12, 10))
    try:
        sns.heatmap(corr_matrix, annot=True, cmap='viridis', cbar=True, square=True, linewidths=.5, ax=ax)
        ax.set_title('Correlation Matrix Heatmap')
        ax.tick_params(axis='x', labelrotation=90)
        ax.tick_params(axis='y', labelrotation=0)
        fig.tight_layout()
        fig.savefig(img_path)
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)

# ---------------------------------------------------------------------------------------------------

# Heatmap file name for each feature group.
column_groups = {
    'destress_columns': 'destress_corr_matrix.png',
    'ss_columns': 'ss_corr_matrix.png',
    'aa_comp_columns': 'aa_comp_corr_matrix.png'
}

# Explicit group name -> column list lookup. Replaces the globals() lookup so that a
# renamed or missing list fails here, at definition time, instead of inside the loop.
group_columns = {
    'destress_columns': destress_columns,
    'ss_columns': ss_columns,
    'aa_comp_columns': aa_comp_columns,
}

# Absolute Spearman correlation above which the later-listed feature of a pair is dropped.
CORRELATION_TOLERANCE = 0.6

# savefig does not create missing directories, so make sure the target folder exists.
os.makedirs(base_save_folder, exist_ok=True)

# Each group is filtered independently from the full `df`: `reduced_df` is rebuilt on every
# iteration and the drops of different groups are not accumulated.
for group_name, img_filename in column_groups.items():
    columns = group_columns[group_name]
    reduced_df, dropped_features = remove_highly_correlated_features(
        df, tolerance=CORRELATION_TOLERANCE, columns=columns
    )
    print(f"{group_name} reduced DataFrame shape: {reduced_df.shape}")
    print(f"Dropped features for {group_name}: {dropped_features}")

    # Dropped columns are no longer in reduced_df, so the plot only shows the survivors.
    corr_matrix_output_path = os.path.join(base_save_folder, img_filename)
    plot_correlation_matrix(reduced_df, columns, corr_matrix_output_path)
    print(f"{group_name} correlation matrix saved to {corr_matrix_output_path}")
# ---------------------------------------------------------------------------------------------------

destress_columns reduced DataFrame shape: (32388, 55)
Dropped features for destress_columns: ['charge', 'num_residues', 'packing_density', 'budeff_total', 'budeff_steric', 'budeff_desolvation', 'budeff_charge', 'evoef2_total', 'evoef2_ref_total', 'evoef2_intraR_total', 'evoef2_interS_total', 'evoef2_interD_total', 'dfire2_total', 'rosetta_fa_atr', 'rosetta_fa_rep', 'rosetta_fa_intra_rep', 'rosetta_fa_elec', 'rosetta_fa_sol', 'rosetta_lk_ball_wtd', 'rosetta_fa_intra_sol_xover4', 'rosetta_hbond_lr_bb', 'rosetta_hbond_sr_bb', 'rosetta_hbond_bb_sc', 'rosetta_hbond_sc', 'rosetta_p_aa_pp', 'rosetta_fa_dun', 'rosetta_omega', 'rosetta_pro_close', 'aggrescan3d_total_value']
destress_columns correlation matrix saved to /Volumes/dax-hd/project-data/corr_features_2/destress_corr_matrix.png
ss_columns reduced DataFrame shape: (32388, 83)
Dropped features for ss_columns: ['ss_prop_beta_strand']
ss_columns correlation matrix saved to /Volumes/dax-hd/project-data/corr_features_2/ss_corr_matrix.png
aa_

  annotation = ("{:" + self.fmt + "}").format(val)


aa_comp_columns correlation matrix saved to /Volumes/dax-hd/project-data/corr_features_2/aa_comp_corr_matrix.png
