In [5]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC


from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, recall_score, f1_score

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel

from sklearn.pipeline import Pipeline

from collections import Counter
import matplotlib.pyplot as plt

import joblib

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import preprocessing_utils as preproc
from preprocessing_utils import generate_recurrence_labels
import config


# Load the pre-split train/test features and labels from the "no_boot" data directory.
# NOTE(review): joblib.load unpickles its input, so these files must come from a trusted source.
X_train, y_train, X_test, y_test = (
    joblib.load(pkl_path)
    for pkl_path in (
        "../data/no_boot/X_train.pkl",
        "../data/no_boot/y_train.pkl",
        "../data/no_boot/X_test.pkl",
        "../data/no_boot/y_test.pkl",
    )
)

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils import resample

def bootstrap_feature_stability(X, y, k=50, n_bootstrap=100, random_state=42):
    """
    Estimate how often each feature is selected by SelectKBest (f_classif)
    across bootstrap resamples of the data.

    Parameters:
        X : pd.DataFrame
            Feature matrix; its columns become the index of the result.
        y : pd.Series or np.array
            Labels, resampled together with the rows of X.
        k : int
            Number of features to select per run (forwarded to SelectKBest).
        n_bootstrap : int
            Number of bootstrap samples; must be at least 1.
        random_state : int
            Seed for reproducibility

    Returns:
        pd.Series of selection frequencies in [0, 1] (index = feature names)

    Raises:
        ValueError
            If n_bootstrap is smaller than 1 (the frequency would be 0/0).
    """
    if n_bootstrap < 1:
        raise ValueError(f"n_bootstrap must be >= 1, got {n_bootstrap}")

    rng = np.random.RandomState(random_state)

    # Count positionally (one slot per column) instead of by label, so that
    # duplicated column names cannot cause mis-counting or indexing errors.
    selection_counts = np.zeros(X.shape[1], dtype=int)

    for _ in range(n_bootstrap):
        # Bootstrap resample: draw rows with replacement, using a fresh
        # per-iteration seed taken from the master RNG (integer upper bound).
        X_res, y_res = resample(
            X, y, replace=True, random_state=rng.randint(10**6)
        )

        # Fit SelectKBest on the resampled data.
        # NOTE(review): a resample may leave some features with no within-class
        # variance, which presumably is what triggers the "f = msb / msw"
        # RuntimeWarning from f_classif -- confirm and consider dropping
        # constant features beforehand.
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X_res, y_res)

        # get_support() is a boolean mask over columns; adding it increments
        # the counter of every feature selected in this run.
        selection_counts += selector.get_support()

    # Normalize to frequency
    return pd.Series(selection_counts / n_bootstrap, index=X.columns)


In [6]:
# Estimate selection stability on the training split (X_train / y_train must already be loaded).
# With n_bootstrap=5 the resulting frequencies can only be multiples of 0.2.
stability = bootstrap_feature_stability(
    X=X_train,
    y=y_train,
    k=100,
    n_bootstrap=5,
)

# Show the 50 features with the highest selection frequency
stability.sort_values(ascending=False).iloc[:50]


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


mrna__CCDC86       1.0
mrna__C11orf86     1.0
mrna__MSX2         0.8
mrna__NAA50        0.8
mrna__RALGDS       0.8
mrna__SMG6         0.8
mrna__USP43        0.8
mrna__GABARAPL3    0.6
mrna__CSE1L        0.6
mrna__EPB41L1      0.6
mrna__GALNT12      0.6
mrna__AURKA        0.6
mrna__HDGF         0.6
mrna__LBX2         0.6
mrna__HSPD1        0.6
mrna__CIRBP        0.6
mrna__MYC          0.6
mrna__RASGRF1      0.6
mrna__SLC1A1       0.6
mrna__SLC47A1      0.6
mrna__SLC6A17      0.6
mrna__SST          0.6
mrna__UMPS         0.6
mrna__YARS         0.6
mrna__HSP90AB1     0.4
mrna__GLT8D1       0.4
mrna__C9orf103     0.4
mrna__DUSP6        0.4
mrna__GTF2H4       0.4
mrna__BAT3         0.4
mrna__C20orf11     0.4
mrna__DCTPP1       0.4
mrna__C20orf24     0.4
mrna__DHCR24       0.4
mrna__FZD4         0.4
mrna__ASPDH        0.4
mrna__KIAA0913     0.4
mrna__CHD3         0.4
mrna__ECE2         0.4
mrna__CEBPA-DT     0.4
mrna__MGAM         0.4
mrna__MRPL37       0.4
mrna__DERL1        0.4
mrna__MTCH1