In [1]:
import numpy as np
import pandas as pd
import joblib
from preprocessing_utils import *
from sklearn.base import BaseEstimator, TransformerMixin
import config

def _as_feature_set(features):
    """Normalise one feature selection (names, indices, or boolean mask) to a set."""
    if isinstance(features, (set, frozenset)):
        return set(features)

    # Materialise once so lists, tuples, generators, numpy arrays and pandas
    # Series/Index objects are all treated the same way.
    values = list(features)

    # An all-boolean sequence is a selection mask: keep the positions that are
    # switched on. Without this, set(mask) would collapse to {True, False}.
    if all(isinstance(value, (bool, np.bool_)) for value in values):
        return {position for position, flag in enumerate(values) if flag}

    return set(values)


def compare_feature_sets(features_a, features_b, name_a="Method A", name_b="Method B"):
    """
    Compare overlap between two feature sets.

    Each input may be a collection of feature identifiers (names or integer
    indices) or a boolean selection mask. Masks are converted to the positions
    of their True entries before comparison, whatever container holds them
    (list, tuple, np.ndarray, pd.Series, ...).

    Parameters
    ----------
    features_a : iterable of feature identifiers, or boolean mask
        Features selected by the first method
    features_b : iterable of feature identifiers, or boolean mask
        Features selected by the second method
    name_a : str
        Label for the first method
    name_b : str
        Label for the second method

    Returns
    -------
    pd.DataFrame
        Single-row table with the size of each set, the size of their
        intersection, the number of features unique to each set, and the
        percentage of each set that lies in the intersection (0 when that
        set is empty).

    Notes
    -----
    The column labels are built from ``name_a`` and ``name_b``; passing the
    same label twice makes the second method's columns overwrite the first's.
    """
    set_a = _as_feature_set(features_a)
    set_b = _as_feature_set(features_b)

    overlap = set_a & set_b
    only_a = set_a - set_b
    only_b = set_b - set_a

    summary = {
        "Total " + name_a: len(set_a),
        "Total " + name_b: len(set_b),
        "Overlap": len(overlap),
        "Unique to " + name_a: len(only_a),
        "Unique to " + name_b: len(only_b),
        # Guard the divisions so an empty selection reports 0 instead of raising.
        "% of " + name_a + " in overlap": len(overlap) / len(set_a) * 100 if set_a else 0,
        "% of " + name_b + " in overlap": len(overlap) / len(set_b) * 100 if set_b else 0
    }

    return pd.DataFrame([summary])




In [2]:
# Load the pickled training features and labels, then report their shapes
# as a quick sanity check.
X_train, y_train = map(
    joblib.load,
    ("../data/mutation/X_train.pkl", "../data/mutation/y_train.pkl"),
)
print(X_train.shape, y_train.shape)


(361, 18134) (361,)


In [3]:
# Fit both feature selectors on the same training data so their selections can
# be compared afterwards. Both are constructed with their default arguments;
# the classes are not defined in this notebook (presumably provided by the
# star-import of preprocessing_utils — confirm).
# NOTE(review): the recorded output is flooded with `f = msb / msw` warning
# echoes — presumably a division warning raised while computing an F-statistic
# (e.g. on constant columns); confirm the cause and consider dropping such
# columns beforehand or capturing the warnings.
# NOTE(review): no seed is passed here; if these selectors resample at random,
# the kept-feature counts may differ between runs — confirm how they are seeded.
boot_selector = BootstrappedSelectKBest()
boot_selector.fit(X_train, y_train)
stability_selector = StabilitySelection()
stability_selector.fit(X_train, y_train)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


[BootstrappedSelectKBest] Kept 187 features (threshold=0.4, k=500, bootstraps=50)


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = ms

[StabilitySelection] Kept 1555 / 18134 features (threshold=0.85, boots=100, alpha=0.1)


  f = msb / msw


In [4]:

# Compare the two fitted selectors: take what each get_support() returns
# (presumably a boolean support mask) as a plain list, then tabulate how much
# the two selections overlap.
boot_features, stability_features = (
    list(selector.get_support())
    for selector in (boot_selector, stability_selector)
)
result = compare_feature_sets(
    boot_features,
    stability_features,
    name_a="Bootstrapped",
    name_b="Stability",
)
print(result)


   Total Bootstrapped  Total Stability  Overlap  Unique to Bootstrapped  \
0                 187             1555      187                       0   

   Unique to Stability  % of Bootstrapped in overlap  \
0                 1368                         100.0   

   % of Stability in overlap  
0                  12.025723  


In [5]:
# Print the FPR_ALPHA constant from the project config module.
# NOTE(review): the recorded value (0.1) equals the alpha reported in the
# StabilitySelection log line above — presumably the same setting; confirm.
print(config.FPR_ALPHA)

0.1
