In [26]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_selection import mutual_info_classif
from scipy.stats import spearmanr
from google.colab import files

# mRMR approximation function
def approximate_mrmr(X, y, feature_groups, n_clusters=50, top_k=100):
    """
    Approximate Minimum Redundancy Maximum Relevance (mRMR) feature selection.

    The procedure has two stages:

    1. Within every feature group, features are clustered hierarchically
       (complete linkage) on the distance ``1 - |Spearman rho|`` and the
       feature with the highest mutual information (MI) with ``y`` is kept
       as the representative of each cluster.
    2. The representatives of all groups are ranked together by MI and the
       ``top_k`` best are returned.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples,)
        Target labels (e.g., binary classification).

    feature_groups : dict
        Maps a group name to an iterable of column indices of ``X``
        (list, range, array, ...). Empty groups are skipped.

    n_clusters : int, optional (default=50)
        Maximum number of clusters per group; capped at the group size.

    top_k : int, optional (default=100)
        Maximum number of features returned after the global MI ranking.

    Returns
    -------
    selected_final : list of int
        Column indices of the selected features, ordered by increasing
        mutual information (the most informative feature is last).
        Empty when no group contributes any feature.

    Raises
    ------
    ValueError
        If ``n_clusters`` or ``top_k`` is smaller than 1, or if a group
        contains an index outside ``[0, n_features)``.
    """

    if n_clusters < 1:
        raise ValueError(f"n_clusters must be at least 1 (got {n_clusters})")
    # A non-positive top_k would turn the `[-top_k:]` slice below into
    # "everything" (0) or "drop the first items" (negative).
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1 (got {top_k})")

    X = np.asarray(X)
    n_features = X.shape[1]

    # Materialise every group once so that ranges, lists and arrays are
    # handled uniformly, and validate before any expensive work starts.
    groups = {}
    for group_name, indices in feature_groups.items():
        idx = [int(i) for i in indices]
        if not idx:
            continue
        if min(idx) < 0:
            raise ValueError(f"Feature group '{group_name}' contains negative indices (min index: {min(idx)})")
        if max(idx) >= n_features:
            raise ValueError(f"Feature group '{group_name}' contains indices beyond matrix size (max index: {max(idx)}, matrix size: {n_features})")
        groups[group_name] = idx

    selected_features = []

    for group_name, idx in groups.items():
        # A single feature is trivially its own representative.
        if len(idx) == 1:
            selected_features.append(idx[0])
            continue

        X_group = X[:, idx]

        corr_matrix = np.asarray(spearmanr(X_group, axis=0).correlation, dtype=float)
        if corr_matrix.ndim == 0:
            # With exactly two columns spearmanr returns a scalar, not a matrix.
            rho = float(corr_matrix)
            corr_matrix = np.array([[1.0, rho], [rho, 1.0]])
        # Constant columns yield NaN correlations; treat them as uncorrelated
        # so the precomputed distance matrix stays finite.
        corr_matrix = np.where(np.isnan(corr_matrix), 0.0, corr_matrix)

        # Clip guards against tiny negative distances from rounding (|rho| > 1).
        corr_dist = np.clip(1 - np.abs(corr_matrix), 0.0, None)
        np.fill_diagonal(corr_dist, 0)

        clustering = AgglomerativeClustering(
            n_clusters=min(n_clusters, len(idx)),
            metric='precomputed',
            linkage='complete'
        )
        clusters = clustering.fit_predict(corr_dist)

        mi_scores = mutual_info_classif(X_group, y, random_state=42)

        # Keep the most relevant feature of every cluster.
        for cluster_id in np.unique(clusters):
            cluster_indices = np.where(clusters == cluster_id)[0]
            best_local_idx = cluster_indices[np.argmax(mi_scores[cluster_indices])]
            selected_features.append(idx[best_local_idx])

    if not selected_features:
        return []

    # Fixed seed: without it the final ranking (and therefore the selected
    # feature set) changes from run to run.
    mi_scores_global = mutual_info_classif(X[:, selected_features], y, random_state=42)
    top_indices = np.argsort(mi_scores_global)[-top_k:]
    return [selected_features[i] for i in top_indices]

In [34]:
def run_train_test_feature_selection(train_csv, test_csv, output_train_csv, output_test_csv,
                                     feature_groups, n_clusters=50, k_features=50,
                                     shannon_index=434):
    """
    Select features on the training set with approximate mRMR and write
    reduced train/test CSV files that share the same selected columns.

    Selection uses the training data only; the test set is merely
    re-indexed with the resulting column names. The Shannon entropy
    feature is appended to the selection when mRMR did not pick it, so the
    output can hold ``k_features + 1`` feature columns.

    Parameters:
    -----------
    train_csv : str
        Path to the training dataset CSV file. The file must have columns in the order:
        [ID, features..., Taxonomic Group, Label]

    test_csv : str
        Path to the test dataset CSV file. Same column structure as train_csv;
        it must contain every selected feature column by name.

    output_train_csv : str
        Path to save the transformed training dataset with selected features.

    output_test_csv : str
        Path to save the transformed test dataset with selected features.

    feature_groups : dict
        Dictionary mapping group names to lists of feature indices
        (positions within the feature block, i.e. CSV column number minus one).

    n_clusters : int, optional (default=50)
        Number of clusters to use for intra-group feature redundancy reduction.

    k_features : int, optional (default=50)
        Number of top features to keep after the global mRMR ranking.

    shannon_index : int or None, optional (default=434)
        Feature index of Shannon entropy, included even if mRMR does not
        select it. Pass None to disable the forced inclusion.

    Returns:
    --------
    None
    """

    # Load datasets
    df_train = pd.read_csv(train_csv)
    df_test = pd.read_csv(test_csv)

    # Feature block = everything between the ID column and the two trailing
    # metadata columns (taxonomic group, label).
    feature_columns = df_train.columns[1:-2]
    X_train = df_train.iloc[:, 1:-2].values
    y_train = df_train.iloc[:, -1].values

    # Run approximate mRMR feature selection on training data only
    selected_indices = list(approximate_mrmr(X_train, y_train, feature_groups, n_clusters, k_features))

    # Ensure Shannon entropy is included (unless explicitly disabled)
    if shannon_index is not None and shannon_index not in selected_indices:
        selected_indices.append(shannon_index)

    # Translate feature positions into column names so the test set is
    # indexed by name rather than by position.
    selected_feature_names = [feature_columns[i] for i in selected_indices]

    def _with_metadata(df):
        # ID, selected features, taxonomic group, label - in that order.
        return pd.concat(
            [df.iloc[:, 0], df[selected_feature_names], df.iloc[:, -2], df.iloc[:, -1]],
            axis=1,
        )

    # Save to output CSV files
    _with_metadata(df_train).to_csv(output_train_csv, index=False)
    _with_metadata(df_test).to_csv(output_test_csv, index=False)

    print(f"✅ Saved: {output_train_csv} and {output_test_csv} with {len(selected_feature_names)} features.")
    print(f"Selected features (including Shannon entropy if needed): {selected_feature_names}")


In [35]:
def should_keep_entropy(X, y, entropy_idx):
    """
    Decide whether Shannon Entropy (at a given column index) should be retained as a feature.

    The feature has to pass three successive filters; the first failing
    filter returns False immediately, so the Random Forest is only trained
    when the two cheap checks pass.

    Steps:
    1. Variance check:
        - Discard the feature if its variance is below 1e-5.

    2. Redundancy check (correlation):
        - Compute the absolute Pearson correlation between the entropy
          column and every other column.
        - Correlations that are undefined (NaN, e.g. against a constant
          column) are ignored.
        - Discard the feature if the largest remaining value exceeds 0.9.

    3. Feature importance check (Random Forest):
        - Fit a Random Forest classifier (50 trees, random_state=42) on all features.
        - Keep the feature only if its importance is strictly above the
          median importance of all features.

    Args:
        X (np.ndarray): Feature matrix, shape (n_samples, n_features).
        y (np.ndarray): Labels, shape (n_samples,).
        entropy_idx (int): Column index of the Shannon Entropy feature in X.
            Negative indices count from the last column.

    Returns:
        bool: True if the feature passes all checks and should be retained, False otherwise.
    """

    X = np.asarray(X)

    # 1. Variance check: discard if variance too small
    entropy_values = X[:, entropy_idx]
    if np.var(entropy_values) < 1e-5:
        return False

    # 2. Correlation (redundancy) check: discard if highly correlated with other features.
    # np.delete also handles negative indices, unlike a `!= entropy_idx` mask.
    other_features = np.delete(X, entropy_idx, axis=1)
    if other_features.shape[1] > 0:
        stacked = np.column_stack([entropy_values, other_features])
        # Constant columns produce 0/0 inside corrcoef; silence the warning
        # and drop the resulting NaNs explicitly below.
        with np.errstate(divide='ignore', invalid='ignore'):
            abs_corr = np.abs(np.corrcoef(stacked, rowvar=False)[0, 1:])
        # A plain .max() would return NaN as soon as one entry is NaN, and
        # `NaN > 0.9` is False - the check would then pass silently.
        defined_corr = abs_corr[~np.isnan(abs_corr)]
        if defined_corr.size > 0 and defined_corr.max() > 0.9:
            return False

    # 3. Feature importance check via Random Forest: discard if not above median importance.
    # Imported here so the two cheap checks above do not require scikit-learn.
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
    model.fit(X, y)

    # Convert np.bool_ to Python bool explicitly for consistency
    importances = model.feature_importances_
    return bool(importances[entropy_idx] > np.median(importances))

In [36]:
# 2️. Define your feature groups (adjust indices based on your CSV)
# Each entry maps a descriptor family to its column positions in the feature
# matrix X = df.iloc[:, 1:-2]; a position is the CSV column number minus one
# because CSV column 0 holds the ID.
feature_groups = dict([
    ('Dipeptide', range(0, 399)),       # AA .. YY                                  (CSV cols 1-399)
    ('PseAAC', range(399, 419)),        # AAC_A .. AAC_Y                            (CSV cols 400-419)
    ('PseAAC_theta', range(419, 424)),  # Theta_1 .. Theta_5                        (CSV cols 420-424)
    ('PhysChem', range(424, 434)),      # Avg_Hydrophobicity .. Avg_Bulkiness       (CSV cols 425-434)
    ('Entropy', [434]),                 # Shannon_Entropy                           (CSV col 435)
    ('CTD', range(435, 582)),           # _PolarizabilityC1 .. _HydrophobicityD3100 (CSV cols 436-582)
    ('Z-scale', range(582, 587)),       # Z1 .. Z5                                  (CSV cols 583-587)
])

In [68]:
# Colab file picker: upload the training table into the working directory
# (in the recorded run this saved std_train_spo11_aa.csv, see the output below).
uploaded = files.upload()

Saving std_train_spo11_aa.csv to std_train_spo11_aa.csv


In [69]:
# Colab file picker: upload the test table into the working directory
# (in the recorded run this saved std_test_spo11_aa.csv, see the output below).
# Note: this rebinds `uploaded`, replacing the value returned by the previous upload.
uploaded = files.upload()

Saving std_test_spo11_aa.csv to std_test_spo11_aa.csv


In [70]:
# Inputs: the standardized train/test tables uploaded earlier in the notebook.
train_csv, test_csv = 'std_train_spo11_aa.csv', 'std_test_spo11_aa.csv'

# Outputs: reduced tables for the 50-feature configuration.
output_train_csv = 'mrmr_selected_train_spo11_aa_50.csv'
output_test_csv = 'mrmr_selected_test_spo11_aa_50.csv'

# Select on the training set, then write both reduced tables.
# The result holds k_features columns plus Shannon entropy when mRMR did not pick it.
run_train_test_feature_selection(
    train_csv=train_csv,
    test_csv=test_csv,
    output_train_csv=output_train_csv,
    output_test_csv=output_test_csv,
    feature_groups=feature_groups,
    n_clusters=50,
    k_features=50,
)

✅ Saved: mrmr_selected_train_spo11_aa_50.csv and mrmr_selected_test_spo11_aa_50.csv with 51 features.
Selected features (including Shannon entropy if needed): ['LG', '_ChargeC2', 'RF', '_SecondaryStrD3001', 'AE', '_PolarityD1025', 'KG', '_PolarizabilityD2001', 'IE', 'YK', 'KF', '_PolarityD3100', 'Z4', 'GL', '_ChargeD3100', 'PL', 'VG', '_PolarityD3001', '_HydrophobicityD2100', 'IT', '_SolventAccessibilityD3100', '_SecondaryStrD3100', 'MA', 'RD', 'VP', '_NormalizedVDWVD3100', 'GS', 'TG', 'LS', 'FG', 'AAC_C', 'EE', 'PS', 'Avg_Hydrophobicity', 'TR', 'ML', 'PD', 'DC', 'Z2', 'Avg_Hydrophilicity', 'AAC_L', 'IL', 'Avg_Solvent_Accessibility', 'Z5', 'Z1', 'Avg_Bulkiness', '_PolarizabilityD3001', 'WL', 'Avg_Polarity', '_HydrophobicityC3', 'Shannon_Entropy']


In [76]:
# Inputs: the standardized train/test tables uploaded earlier in the notebook.
train_csv, test_csv = 'std_train_spo11_aa.csv', 'std_test_spo11_aa.csv'

# Outputs: reduced tables for the 100-feature configuration.
output_train_csv = 'mrmr_selected_train_spo11_aa_100.csv'
output_test_csv = 'mrmr_selected_test_spo11_aa_100.csv'

# Same pipeline as the 50-feature run, with up to 100 clusters per group
# and 100 features kept after the global ranking.
run_train_test_feature_selection(
    train_csv=train_csv,
    test_csv=test_csv,
    output_train_csv=output_train_csv,
    output_test_csv=output_test_csv,
    feature_groups=feature_groups,
    n_clusters=100,
    k_features=100,
)

✅ Saved: mrmr_selected_train_spo11_aa_100.csv and mrmr_selected_test_spo11_aa_100.csv with 101 features.
Selected features (including Shannon entropy if needed): ['FT', 'VN', 'IF', 'PA', 'IK', 'PG', 'AL', 'Avg_Polarizability', 'FV', 'ME', 'LY', 'GI', 'NI', 'LD', 'RK', 'ND', 'NL', 'AH', 'QL', 'MV', '_ChargeC2', '_ChargeC3', 'LR', 'KD', 'GA', 'GE', 'LA', 'PR', 'GQ', '_PolarizabilityD1075', '_SecondaryStrD1100', 'VQ', 'MG', 'AE', 'IQ', 'LG', 'AD', 'ED', '_PolarityD2100', '_PolarityD1050', 'AP', 'RF', 'RV', '_SecondaryStrD3001', 'YR', '_ChargeD3001', '_SolventAccessibilityD1001', '_PolarityD1025', 'MA', 'IE', 'YK', 'KS', 'KG', '_PolarizabilityD2001', '_PolarizabilityD1100', 'GL', 'Z4', 'VG', '_HydrophobicityD2100', '_PolarityD3001', '_ChargeD3100', 'IT', 'PL', 'KF', 'VP', 'RD', '_SecondaryStrD3100', '_ChargeD1100', '_PolarityD3100', '_SolventAccessibilityD3100', '_HydrophobicityD2001', 'GS', 'FG', '_NormalizedVDWVD3100', '_HydrophobicityT12', 'PS', 'TG', 'LS', 'AAC_C', 'EE', 'Avg_Hydrophob

In [72]:
# Send the four reduced tables to the browser, one download per file.
# All four files must already exist, so both selection runs (50 and 100
# features) have to be executed before this cell.
for selected_csv in (
    'mrmr_selected_train_spo11_aa_50.csv',
    'mrmr_selected_test_spo11_aa_50.csv',
    'mrmr_selected_train_spo11_aa_100.csv',
    'mrmr_selected_test_spo11_aa_100.csv',
):
    files.download(selected_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [75]:
# Re-read the training table named by `train_csv` (set in the selection cells).
df = pd.read_csv(train_csv)

# Position of Shannon entropy inside the feature block.
entropy_idx = 434

# Same slicing as the selection pipeline: drop the leading ID column and the
# two trailing metadata columns; the last column is taken as the label.
y = df.iloc[:, -1].values
X = df.iloc[:, 1:-2].values

# Bare last expression so the notebook displays the boolean verdict.
should_keep_entropy(X=X, y=y, entropy_idx=entropy_idx)

True

In [82]:
import pandas as pd

# Read back the four reduced tables written by the two selection runs.
df_train_50, df_test_50, df_train_100, df_test_100 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'mrmr_selected_train_spo11_aa_50.csv',
        'mrmr_selected_test_spo11_aa_50.csv',
        'mrmr_selected_train_spo11_aa_100.csv',
        'mrmr_selected_test_spo11_aa_100.csv',
    )
)

# Sanity check: every table holds ID + selected features + taxonomic group + label,
# so train and test of the same configuration must have the same column count.
for caption, frame in (
    ("Train 50 shape:", df_train_50),
    ("Test 50 shape:", df_test_50),
    ("Train 100 shape:", df_train_100),
    ("Test 100 shape:", df_test_100),
):
    print(caption, frame.shape)


Train 50 shape: (2109, 54)
Test 50 shape: (905, 54)
Train 100 shape: (2109, 104)
Test 100 shape: (905, 104)
