In [2]:
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_selection import mutual_info_classif

def approximate_mrmr_save(X, y, seq_ids, labels, df_original, n_clusters=50, k_features=50,
                          output_csv='mrmr_selected_features.csv', feature_names=None,
                          random_state=None):
    """
    Approximate mRMR with feature clustering + save selected features dataset.

    Features are grouped by absolute-correlation distance, the feature with the
    highest mutual information (MI) with `y` is kept from each group, and the
    `k_features` best of those representatives are written to `output_csv`
    together with the sequence IDs and labels.

    Parameters:
    - X: feature matrix (2-D numpy array, rows = samples, columns = features)
    - y: labels (numpy array)
    - seq_ids: original sequence IDs (Series)
    - labels: labels (Series)
    - df_original: original DataFrame (accepted for call compatibility; not used)
    - n_clusters: number of clusters for feature grouping (capped at the number
      of features)
    - k_features: number of final features to select
    - output_csv: file name for output CSV
    - feature_names: optional list of column names, one per column of X.
      Defaults to the column indices as strings ('0', '1', ...), so the
      function no longer depends on a notebook-level `feature_names` global.
    - random_state: optional seed forwarded to `mutual_info_classif`; pass an
      int to make the selection reproducible.

    Returns:
    - selected_features: list of selected feature names (column indices as
      strings when `feature_names` is not given)

    Raises:
    - ValueError: if X is not 2-D, has no feature columns, or `feature_names`
      does not have exactly one entry per column of X.
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] == 0:
        raise ValueError("X must be a 2-D array with at least one feature column")
    n_features = X.shape[1]

    if feature_names is None:
        feature_names = [str(i) for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but X has {n_features} columns"
        )

    # Step 1: Compute correlation distance matrix between features.
    # Constant columns have zero variance, so np.corrcoef returns NaN for them;
    # treat those as maximally distant (1.0) so clustering does not reject the matrix.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.atleast_2d(np.corrcoef(X.T))
    corr_dist = np.nan_to_num(1.0 - np.abs(corr), nan=1.0)
    # Guard against tiny negative values from floating-point round-off.
    np.clip(corr_dist, 0.0, 1.0, out=corr_dist)
    np.fill_diagonal(corr_dist, 0.0)

    # Step 2: Cluster features using correlation distance.
    # More clusters than features cannot be extracted, so cap the request.
    n_clusters = min(n_clusters, n_features)
    if n_features == 1:
        # Clustering needs at least two items; a lone feature is its own cluster.
        clusters = np.zeros(1, dtype=int)
    else:
        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            metric='precomputed',
            linkage='complete'
        )
        clusters = clustering.fit_predict(corr_dist)

    # Step 3: Compute Mutual Information
    mi_scores = mutual_info_classif(X, y, random_state=random_state)

    # Step 4: Select top-MI feature from each cluster
    selected = []
    for cluster_id in np.unique(clusters):
        cluster_features = np.where(clusters == cluster_id)[0]
        top_feature = cluster_features[np.argmax(mi_scores[cluster_features])]
        selected.append(top_feature)

    # Step 5: Select top-k features (by MI)
    selected = sorted(selected, key=lambda x: mi_scores[x], reverse=True)[:k_features]
    selected_features = [feature_names[i] for i in selected]

    # Step 6: Save new dataset with sequence IDs, selected features, and labels
    df_selected = pd.concat([
        seq_ids.reset_index(drop=True),
        pd.DataFrame(X[:, selected], columns=selected_features),
        labels.reset_index(drop=True)
    ], axis=1)

    df_selected.to_csv(output_csv, index=False)
    print(f"✅ Saved selected features dataset to {output_csv}")
    print(f"✅ Selected features (column indices): {selected_features}")

    return selected_features





In [None]:
# 1. Load the CSV
# uploaded = files.upload()
# df = pd.read_csv('std_combined_dmc1_non_meiosis_cls_embeddings.csv')

# 2. Prepare Data
# X = df.iloc[:, 1:-1].values  # Feature matrix (exclude ID and label)
# y = df.iloc[:, -1].values    # Labels
# seq_ids = df.iloc[:, 0]      # Sequence IDs
# labels = df.iloc[:, -1]      # Labels (again, for later use)
# feature_names = [str(i) for i in range(X.shape[1])]  # Clean feature names (no 'f' prefix)

# 3. Run the function
# selected_features = approximate_mrmr_save(
#    X, y, seq_ids, labels, df,
#    n_clusters=50, k_features=50,
#    output_csv='dmc1_mrmr_selected_embeddings_50.csv'
#)
# files.download('dmc1_mrmr_selected_embeddings_50.csv')


In [8]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_selection import mutual_info_classif

def approximate_mrmr_from_train(X_train, y_train, feature_names, k_features=50, n_clusters=50,
                                random_state=None):
    """
    Perform approximate mRMR feature selection on the training data.

    Features are grouped by absolute-correlation distance, the feature with the
    highest mutual information (MI) with `y_train` is kept from each group, and
    the `k_features` best of those representatives are returned.

    Parameters:
    - X_train: 2-D numpy array of training features (rows = samples)
    - y_train: labels for training set
    - feature_names: list of feature column names, one per column of X_train
    - k_features: final number of features to select
    - n_clusters: number of clusters to group features (capped at the number
      of features)
    - random_state: optional seed forwarded to `mutual_info_classif`; pass an
      int to make the selection reproducible.

    Returns:
    - selected_feature_names: list of selected feature column names
    - selected_indices: list of selected feature column indices

    Raises:
    - ValueError: if X_train is not 2-D, has no feature columns, or
      `feature_names` does not have exactly one entry per column of X_train.
    """
    X_train = np.asarray(X_train)
    if X_train.ndim != 2 or X_train.shape[1] == 0:
        raise ValueError("X_train must be a 2-D array with at least one feature column")
    n_features = X_train.shape[1]
    if len(feature_names) != n_features:
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but X_train has {n_features} columns"
        )

    # Step 1: Correlation distance matrix.
    # Constant columns have zero variance, so np.corrcoef returns NaN for them;
    # treat those as maximally distant (1.0) so clustering does not reject the matrix.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.atleast_2d(np.corrcoef(X_train.T))
    corr_dist = np.nan_to_num(1.0 - np.abs(corr), nan=1.0)
    # Guard against tiny negative values from floating-point round-off.
    np.clip(corr_dist, 0.0, 1.0, out=corr_dist)
    np.fill_diagonal(corr_dist, 0.0)

    # Step 2: Clustering.
    # More clusters than features cannot be extracted, so cap the request.
    n_clusters = min(n_clusters, n_features)
    if n_features == 1:
        # Clustering needs at least two items; a lone feature is its own cluster.
        clusters = np.zeros(1, dtype=int)
    else:
        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            metric='precomputed',
            linkage='average'  # use 'average' with 'precomputed'
        )
        clusters = clustering.fit_predict(corr_dist)

    # Step 3: Mutual information
    mi_scores = mutual_info_classif(X_train, y_train, random_state=random_state)

    # Step 4: Select top feature per cluster
    selected_indices = []
    for cluster_id in np.unique(clusters):
        cluster_features = np.where(clusters == cluster_id)[0]
        top_feature = cluster_features[np.argmax(mi_scores[cluster_features])]
        selected_indices.append(top_feature)

    # Step 5: Keep top-k features by MI score
    selected_indices = sorted(selected_indices, key=lambda i: mi_scores[i], reverse=True)[:k_features]
    selected_feature_names = [feature_names[i] for i in selected_indices]

    print(f"✅ Selected features: {selected_feature_names}")
    return selected_feature_names, selected_indices


In [42]:
# Load training data
# Read the train/test CSVs
train_df = pd.read_csv('std_train_spo11_esm.csv')
test_df = pd.read_csv('std_test_spo11_esm.csv')

# Assumed column layout: ID | features ... | taxa | label
X_train = train_df.iloc[:, 1:-2].values          # feature columns only
y_train = train_df.iloc[:, -1].values            # last column is the label
feature_names = train_df.columns[1:-2].tolist()  # names aligned with X_train columns

# Selection is computed from the training split only
selected_feature_names, selected_indices = approximate_mrmr_from_train(
    X_train,
    y_train,
    feature_names,
    k_features=50,
)

# Rebuild each split as: ID, selected features, then the last two columns (taxa, label)
selected_train_df = pd.concat(
    [train_df.iloc[:, :1], train_df.loc[:, selected_feature_names], train_df.iloc[:, -2:]],
    axis=1,
)
selected_test_df = pd.concat(
    [test_df.iloc[:, :1], test_df.loc[:, selected_feature_names], test_df.iloc[:, -2:]],
    axis=1,
)

✅ Selected features: ['181', '201', '418', '289', '2', '624', '539', '549', '268', '334', '319', '172', '247', '78', '263', '243', '225', '327', '271', '133', '303', '337', '285', '113', '290', '476', '356', '535', '453', '157', '36', '404', '228', '250', '183', '48', '294', '44', '448', '575', '396', '634', '383', '392', '399', '490', '429', '196', '602', '152']


In [43]:
# Save output
# Write both reduced splits to disk, then hand each file to the Colab downloader
train_csv = 'mrmr_selected_train_spo11_esm_50.csv'
test_csv = 'mrmr_selected_test_spo11_esm_50.csv'

selected_train_df.to_csv(train_csv, index=False)
selected_test_df.to_csv(test_csv, index=False)

for csv_path in (train_csv, test_csv):
    files.download(csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [44]:
# Load training data
# Read the train/test CSVs
train_df = pd.read_csv('std_train_spo11_esm.csv')
test_df = pd.read_csv('std_test_spo11_esm.csv')

# Assumed column layout: ID | features ... | taxa | label
X_train = train_df.iloc[:, 1:-2].values          # feature columns only
y_train = train_df.iloc[:, -1].values            # last column is the label
feature_names = train_df.columns[1:-2].tolist()  # names aligned with X_train columns

# Selection is computed from the training split only (100 clusters, keep 100)
selected_feature_names, selected_indices = approximate_mrmr_from_train(
    X_train,
    y_train,
    feature_names,
    k_features=100,
    n_clusters=100,
)

# Rebuild each split as: ID, selected features, then the last two columns (taxa, label)
selected_train_df = pd.concat(
    [train_df.iloc[:, :1], train_df.loc[:, selected_feature_names], train_df.iloc[:, -2:]],
    axis=1,
)
selected_test_df = pd.concat(
    [test_df.iloc[:, :1], test_df.loc[:, selected_feature_names], test_df.iloc[:, -2:]],
    axis=1,
)

✅ Selected features: ['181', '536', '201', '418', '107', '289', '553', '2', '624', '539', '390', '549', '268', '117', '592', '334', '319', '371', '559', '182', '503', '434', '172', '520', '286', '247', '78', '263', '243', '225', '327', '271', '133', '303', '211', '337', '147', '214', '285', '113', '353', '638', '290', '13', '6', '110', '71', '476', '516', '356', '535', '453', '119', '72', '39', '45', '475', '157', '258', '36', '630', '404', '228', '250', '426', '7', '30', '183', '92', '227', '427', '48', '240', '255', '294', '44', '394', '287', '448', '555', '575', '237', '396', '634', '79', '70', '383', '392', '399', '42', '490', '429', '196', '602', '430', '66', '318', '501', '152', '637']


In [45]:
# Save output
# Write both reduced splits to disk, then hand each file to the Colab downloader
train_csv = 'mrmr_selected_train_spo11_esm_100.csv'
test_csv = 'mrmr_selected_test_spo11_esm_100.csv'

selected_train_df.to_csv(train_csv, index=False)
selected_test_df.to_csv(test_csv, index=False)

for csv_path in (train_csv, test_csv):
    files.download(csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>