# Probabilistic Feature Subset Selection with Ensemble Models

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from src.utils import compute_feature_frequency, load_uci_dataset, train_ensemble_models

## UCI Heart Failure Clinical Records Dataset

### Load Dataset

In [2]:
# Load the UCI Heart Failure Clinical Records dataset.
# The recorded output of this cell reports uci_id 519 (299 patients), which is the
# dataset this section is about, so that id is requested here; the earlier
# repo_id=15 did not match the section heading or the recorded output.
# NOTE(review): the variable name and CSV path still say "breast_cancer" and look like
# leftovers from another notebook. They are kept unchanged because later cells reference
# the variable and the role of the path inside load_uci_dataset is not visible from
# here -- confirm and rename both in a follow-up.
breast_cancer_clinical_records = load_uci_dataset("../data/breast_cancer.csv", repo_id=519, verbose=True)
print("Number of samples:", len(breast_cancer_clinical_records))
breast_cancer_clinical_records.head()

{'uci_id': 519, 'name': 'Heart Failure Clinical Records', 'repository_url': 'https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records', 'data_url': 'https://archive.ics.uci.edu/static/public/519/data.csv', 'abstract': 'This dataset contains the medical records of 299 patients who had heart failure, collected during their follow-up period, where each patient profile has 13 clinical features.', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 299, 'num_features': 12, 'feature_types': ['Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['death_event'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Feb 26 2024', 'dataset_doi': '10.24432/C5Z89R', 'creators': [], 'intro_paper': {'ID': 286, 'type': 'NATIVE', 'title': 'Machine learning can predict survival of patients with heart failure f

TypeError: 'NoneType' object is not callable

### Preprocess Dataset

In [3]:
# Discard every record (row) that contains at least one missing value
breast_cancer_clinical_records = breast_cancer_clinical_records.dropna(axis=0, how="any")
print(f"Number of samples: {len(breast_cancer_clinical_records)}")

Number of samples: 299


In [4]:
# Separate features and target after dropping missing values.
# The split is positional: the last column is used as the target and every
# column before it as a feature.
X = breast_cancer_clinical_records.iloc[:, :-1]
y = breast_cancer_clinical_records.iloc[:, -1]

# Encode target labels as integer codes. sort=True assigns the codes in sorted order of
# the unique labels, so the encoding does not depend on which label happens to appear in
# the first row (plain factorize numbers labels by order of first appearance, which can
# silently swap the meaning of 0 and 1).
y = pd.factorize(y, sort=True)[0]

# Hold out 30% of the samples for testing. Stratifying on y keeps the class ratio the
# same in both splits, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

### Assign Probabilities to Features using Mutual Information


In [5]:
# Estimate the mutual information between each feature and the target. The estimator
# uses random numbers internally, so the seed is pinned to keep the resulting
# probabilities reproducible across Restart & Run All.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)

# Normalize to create a probability distribution. If every estimate were zero the
# division would silently produce NaNs, so fail loudly instead.
total_mutual_info = np.sum(mutual_info)
if total_mutual_info <= 0:
    raise ValueError("All mutual information estimates are zero; cannot build feature probabilities.")
probabilities = mutual_info / total_mutual_info

print("Feature Probabilities (Mutual Information):")
print(probabilities)

Feature Probabilities (Mutual Information):
[0.08333563 0.02014873 0.00677836 0.         0.18991753 0.
 0.         0.19162453 0.         0.         0.         0.50819522]


### Train Multiple Ensembles on Probabilistically Sampled Subsets


In [6]:
# Train the ensembles, passing the mutual-information feature probabilities
# computed in the previous cell.
ensemble_results = train_ensemble_models(
    X_train,
    X_test,
    y_train,
    y_test,
    probabilities,
    n_ensembles=5,
    n_features_sample=5,
    random_state=42,
    verbose=True,
)

# Mean accuracy per classifier across all returned result rows
results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()


Training Ensemble 1/5...
Training Random Forest...
Random Forest Accuracy: 0.8333
Training Gradient Boosting...
Gradient Boosting Accuracy: 0.8667
Training AdaBoost...
AdaBoost Accuracy: 0.8556
Training Logistic Regression...
Logistic Regression Accuracy: 0.8111
Training SVM...
SVM Accuracy: 0.7444
Training LDA...
LDA Accuracy: 0.8000

Training Ensemble 2/5...
Training Random Forest...
Random Forest Accuracy: 0.8444
Training Gradient Boosting...
Gradient Boosting Accuracy: 0.8667
Training AdaBoost...
AdaBoost Accuracy: 0.8556
Training Logistic Regression...
Logistic Regression Accuracy: 0.8111
Training SVM...
SVM Accuracy: 0.7444
Training LDA...
LDA Accuracy: 0.8000

Training Ensemble 3/5...
Training Random Forest...
Random Forest Accuracy: 0.8222
Training Gradient Boosting...
Gradient Boosting Accuracy: 0.8556
Training AdaBoost...
AdaBoost Accuracy: 0.8333
Training Logistic Regression...
Logistic Regression Accuracy: 0.8000
Training SVM...
SVM Accuracy: 0.8333
Training LDA...
LDA Acc

Classifier
AdaBoost               0.846667
Gradient Boosting      0.862222
LDA                    0.800000
Logistic Regression    0.806667
Random Forest          0.837778
SVM                    0.780000
Name: Accuracy, dtype: float64

In [7]:
# Summarize the feature selection frequency over the ensemble results; the second
# argument is the number of feature columns in X_train.
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

# Single print call: header and values separated by a newline
print("\nFeature Selection Frequency:", feature_frequency, sep="\n")


Feature Selection Frequency:
[1.  0.4 0.6 0.  1.  0.  0.  1.  0.  0.  0.  1. ]


### Assign Probabilities to Features using Interaction Scores

In [8]:
# Fit a random forest on all training features
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_

# Average each feature's importance over the individual trees of the forest.
# NOTE: despite the "interaction" naming, these scores are per-feature importances, not
# pairwise interaction strengths. Accumulating them into an n_features x n_features
# matrix (one broadcast value per row) and summing the rows only rescales every score
# by the same constant, which cancels in the normalization below -- so the direct
# per-tree mean yields the same probabilities without the redundant matrix.
interaction_scores = np.mean([tree.feature_importances_ for tree in rf.estimators_], axis=0)

# Normalize to create a probability distribution over the features
probabilities = interaction_scores / interaction_scores.sum()

print("Feature Probabilities (Interaction Scores):")
print(probabilities)

Feature Probabilities (Interaction Scores):
[0.08910293 0.01823446 0.06889207 0.01136961 0.10625269 0.01338873
 0.07031262 0.15944837 0.08441427 0.01277598 0.01005281 0.35575545]


### Train Multiple Ensembles on Probabilistically Sampled Subsets


In [9]:
# Train the ensembles again, this time with the feature probabilities derived
# from the random forest in the previous cell.
ensemble_results = train_ensemble_models(
    X_train,
    X_test,
    y_train,
    y_test,
    probabilities,
    n_ensembles=5,
    n_features_sample=5,
    random_state=42,
    verbose=True,
)

# Mean accuracy per classifier across all returned result rows
results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()


Training Ensemble 1/5...
Training Random Forest...
Random Forest Accuracy: 0.7778
Training Gradient Boosting...
Gradient Boosting Accuracy: 0.8111
Training AdaBoost...
AdaBoost Accuracy: 0.8556
Training Logistic Regression...
Logistic Regression Accuracy: 0.8222
Training SVM...
SVM Accuracy: 0.6778
Training LDA...
LDA Accuracy: 0.8111

Training Ensemble 2/5...
Training Random Forest...
Random Forest Accuracy: 0.7333
Training Gradient Boosting...
Gradient Boosting Accuracy: 0.7111
Training AdaBoost...
AdaBoost Accuracy: 0.7000
Training Logistic Regression...
Logistic Regression Accuracy: 0.7333
Training SVM...
SVM Accuracy: 0.6778
Training LDA...
LDA Accuracy: 0.7333

Training Ensemble 3/5...
Training Random Forest...
Random Forest Accuracy: 0.8444
Training Gradient Boosting...
Gradient Boosting Accuracy: 0.8778
Training AdaBoost...
AdaBoost Accuracy: 0.8222
Training Logistic Regression...
Logistic Regression Accuracy: 0.8222
Training SVM...
SVM Accuracy: 0.7444
Training LDA...
LDA Acc

Classifier
AdaBoost               0.804444
Gradient Boosting      0.815556
LDA                    0.784444
Logistic Regression    0.795556
Random Forest          0.802222
SVM                    0.715556
Name: Accuracy, dtype: float64

### Analyze Uncertainty in Feature Importance

In [10]:
# Summarize the feature selection frequency over the latest ensemble results; the
# second argument is the number of feature columns in X_train.
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

# Single print call: header and values separated by a newline
print("\nFeature Selection Frequency:", feature_frequency, sep="\n")


Feature Selection Frequency:
[0.2 0.2 0.8 0.  0.8 0.2 0.4 0.8 0.8 0.  0.  0.8]
