# Probabilistic Feature Subset Selection with Ensemble Models

In [1]:
import numpy as np
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from src.utils import compute_feature_frequency, load_uci_dataset, train_ensemble_models

## UCI Breast Cancer Clinical Records Dataset

In [2]:
# Location of the local CSV copy of the dataset.
# Change this to the path of the dataset.
FILE_PATH = "../data/breast_cancer_clinical_records.csv"

# Identifier passed to load_uci_dataset when the CSV has to be fetched.
# Change this to the repository ID of the dataset.
REPO_ID = 15

### Load and Preprocess Dataset

In [3]:
# Load dataset from UCI Machine Learning Repository if it does not exist.
# The download only happens when no local CSV copy is found at FILE_PATH;
# the fetched frame is then written there so later runs skip this step.
if not os.path.exists(FILE_PATH):
    clinical_records_metadata, clinical_records = load_uci_dataset(repo_id=REPO_ID)
    print(clinical_records_metadata)
    # Create the target directory first: to_csv does not create missing parent
    # directories and would fail on a fresh checkout. dirname() is empty when
    # FILE_PATH is a bare filename, so fall back to the current directory.
    os.makedirs(os.path.dirname(FILE_PATH) or ".", exist_ok=True)
    clinical_records.to_csv(FILE_PATH, index=False)

In [4]:
# Read the dataset back from the CSV file at FILE_PATH
clinical_records = pd.read_csv(FILE_PATH)
print("Number of samples:", clinical_records.shape[0])

# Preview the first rows as the cell's displayed output
clinical_records.head()

Number of samples: 699


Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [5]:
# Handle missing values: discard every row that has at least one missing entry
clinical_records = clinical_records.dropna(axis=0, how="any")
print("Number of samples:", clinical_records.shape[0])

Number of samples: 683


In [6]:
# Separate features and target after dropping missing values.
# The split is positional: the last column is taken as the target and every
# other column is treated as a feature.
X = clinical_records.iloc[:, :-1]  # All columns except the last
y = clinical_records.iloc[:, -1]  # Last column as target

# Encode target labels as integer codes. sort=True assigns the codes in sorted
# label order, so the encoding no longer depends on which class happens to
# appear first in the file.
y = pd.factorize(y, sort=True)[0]

# Split the dataset: stratified 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

### Train Multiple Ensembles on All Features

In [7]:
# Baseline run: no probabilities were inserted.
# NOTE(review): n_features_sample=5 is still passed here, so this presumably
# samples feature subsets without weights rather than using every feature;
# confirm against train_ensemble_models.
ensemble_results = train_ensemble_models(
    X_train,
    X_test,
    y_train,
    y_test,
    n_ensembles=5,
    n_features_sample=5,
    random_state=42,
    verbose=False,
)

# Average the "Accuracy" column within each "Classifier" group
results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()

                                                                              

Classifier
AdaBoost               0.956098
Gradient Boosting      0.965854
LDA                    0.936585
Logistic Regression    0.956098
Random Forest          0.960976
SVM                    0.965854
Name: Accuracy, dtype: float64

### Train Multiple Ensembles on Probabilistic Features (Mutual Information)


In [8]:
# Estimate the mutual information between each training feature and the target.
# The estimator uses random number generation internally, so the seed is fixed
# (42, the same value passed as random_state in the other cells) to keep the
# resulting probabilities reproducible across kernel restarts.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
probabilities = mutual_info / np.sum(mutual_info)  # Normalize to create a probability distribution

print("Feature Probabilities (Mutual Information):")
print(probabilities)

Feature Probabilities (Mutual Information):
[0.08951867 0.16151573 0.14406373 0.09988988 0.10796143 0.13461173
 0.10775483 0.10370329 0.05098071]


In [9]:
# Same call as the baseline run, but with `probabilities` passed as the fifth
# positional argument.
ensemble_results = train_ensemble_models(
    X_train,
    X_test,
    y_train,
    y_test,
    probabilities,
    n_ensembles=5,
    n_features_sample=5,
    random_state=42,
    verbose=False,
)

# Average the "Accuracy" column within each "Classifier" group
results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()

                                                                              

Classifier
AdaBoost               0.949268
Gradient Boosting      0.955122
LDA                    0.943415
Logistic Regression    0.956098
Random Forest          0.955122
SVM                    0.956098
Name: Accuracy, dtype: float64

In [10]:
# Analyze uncertainty in feature importance.
# The helper receives the latest ensemble results and the number of feature
# columns in X_train; presumably it returns the per-feature selection rate
# (confirm against compute_feature_frequency).
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:", feature_frequency, sep="\n")


Feature Selection Frequency:
[0.6 0.6 0.2 0.4 0.6 0.4 0.8 0.8 0.6]


### Train Multiple Ensembles on Probabilistic Features (Interaction Scores)

In [11]:
# Fit a seeded random forest on the full training set
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_

# Calculate pairwise interaction scores.
# For every tree, the importance of feature i is added to every column of row i
# of a square (n_features x n_features) matrix.
# NOTE(review): all columns of a row receive the same value, so the row sums are
# just a constant multiple of the summed per-tree importances and no pairwise
# information is encoded; confirm whether a true interaction measure was intended.
interaction_matrix = np.zeros((X_train.shape[1],) * 2)
for tree in rf.estimators_:
    # Broadcasting the column vector adds importance[i] across the whole row i
    interaction_matrix += tree.feature_importances_[:, np.newaxis]

# Collapse each row, average over the trees, then normalize to sum to 1
interaction_scores = np.sum(interaction_matrix, axis=1) / rf.n_estimators
probabilities = interaction_scores / interaction_scores.sum()

print("Feature Probabilities (Interaction Scores):", probabilities, sep="\n")

Feature Probabilities (Interaction Scores):
[0.04244837 0.29303644 0.29023037 0.0174573  0.0567721  0.17691701
 0.05443319 0.05968247 0.00902275]


In [12]:
# Train Multiple Ensembles on Probabilistically Sampled Subsets.
# `probabilities` is whatever was assigned most recently in the kernel and is
# passed as the fifth positional argument.
ensemble_results = train_ensemble_models(
    X_train,
    X_test,
    y_train,
    y_test,
    probabilities,
    n_ensembles=5,
    n_features_sample=5,
    random_state=42,
    verbose=False,
)

# Average the "Accuracy" column within each "Classifier" group
results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()

                                                                              

Classifier
AdaBoost               0.955122
Gradient Boosting      0.954146
LDA                    0.937561
Logistic Regression    0.952195
Random Forest          0.955122
SVM                    0.956098
Name: Accuracy, dtype: float64

In [13]:
# Analyze uncertainty in feature importance.
# The helper receives the latest ensemble results and the number of feature
# columns in X_train; presumably it returns the per-feature selection rate
# (confirm against compute_feature_frequency).
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:", feature_frequency, sep="\n")


Feature Selection Frequency:
[0.2 0.8 1.  0.  0.2 0.6 0.8 1.  0.4]


### Train Multiple Ensembles on Probabilistic Features (Method 3 — TODO: not yet implemented)

### Train Multiple Ensembles on Probabilistic Features (METHOD 4)