# Probabilistic Feature Subset Selection with Ensemble Models

In [1]:
import numpy as np
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from src.utils import compute_feature_frequency, load_uci_dataset, train_ensemble_models

## UCI Heart Failure Clinical Records Dataset

In [2]:
FILE_PATH = "../data/heart_failure_clinical_records.csv" # Change this to the path of the dataset
REPO_ID = 519 # Change this to the repository ID of the dataset

### Load and Preprocess Dataset

In [3]:
# Load dataset from UCI Machine Learning Repository if it does not exist
if not os.path.exists(FILE_PATH):
    clinical_records_metadata, clinical_records = load_uci_dataset(repo_id=REPO_ID)
    print(clinical_records_metadata)
    clinical_records.to_csv(FILE_PATH, index=False)

In [4]:
# Load dataset from CSV file
clinical_records = pd.read_csv(FILE_PATH)
print("Number of samples:", len(clinical_records))
clinical_records.head()

Number of samples: 299


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death_event
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [5]:
# Handle missing values by dropping rows with any missing values
clinical_records = clinical_records.dropna()
print("Number of samples:", len(clinical_records))

Number of samples: 299


In [6]:
# Separate features and target after dropping missing values
X = clinical_records.iloc[:, :-1]  # All columns except the last
y = clinical_records.iloc[:, -1]  # Last column as target

# Encode target labels if necessary
y = pd.factorize(y)[0]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

### Train Multiple Ensembles on All Features

In [7]:
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False) # no probabilities were inserted

results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier
AdaBoost               0.844444
Gradient Boosting      0.844444
LDA                    0.788889
Logistic Regression    0.800000
Random Forest          0.855556
SVM                    0.677778
Name: Accuracy, dtype: float64

### Train Multiple Ensembles on Probabilistic Features (Mutual Information)


In [8]:
mutual_info = mutual_info_classif(X_train, y_train)
probabilities = mutual_info / np.sum(mutual_info)  # Normalize to create a probability distribution

print("Feature Probabilities (Mutual Information):")
print(probabilities)

Feature Probabilities (Mutual Information):
[0.08702435 0.06403895 0.00719986 0.08437133 0.10787557 0.
 0.         0.17482071 0.00584059 0.         0.         0.46882864]


In [9]:
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, probabilities, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False)

results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()

                                                                              

Classifier
AdaBoost               0.837778
Gradient Boosting      0.837778
LDA                    0.804444
Logistic Regression    0.815556
Random Forest          0.844444
SVM                    0.808889
Name: Accuracy, dtype: float64

In [10]:
# Analyze uncertainty in feature importance
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:")
print(feature_frequency)


Feature Selection Frequency:
[0.2 0.8 0.4 0.6 0.8 0.  0.  1.  0.2 0.  0.  1. ]


### Train Multiple Ensembles on Probabilistic Features (Interaction Scores)

In [11]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_

# Calculate pairwise interaction scores
interaction_matrix = np.zeros((X_train.shape[1], X_train.shape[1]))
for tree in rf.estimators_:
    for feature_idx, importance in enumerate(tree.feature_importances_):
        interaction_matrix[feature_idx] += importance

interaction_scores = interaction_matrix.sum(axis=1) / rf.n_estimators
probabilities = interaction_scores / interaction_scores.sum()

print("Feature Probabilities (Interaction Scores):")
print(probabilities)

Feature Probabilities (Interaction Scores):
[0.08910293 0.01823446 0.06889207 0.01136961 0.10625269 0.01338873
 0.07031262 0.15944837 0.08441427 0.01277598 0.01005281 0.35575545]


In [12]:
# Train Multiple Ensembles on Probabilistically Sampled Subsets
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, probabilities, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False)

results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()

                                                                              

Classifier
AdaBoost               0.826667
Gradient Boosting      0.831111
LDA                    0.813333
Logistic Regression    0.813333
Random Forest          0.822222
SVM                    0.740000
Name: Accuracy, dtype: float64

In [13]:
# Analyze uncertainty in feature importance
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:")
print(feature_frequency)


Feature Selection Frequency:
[0.6 0.  0.6 0.  0.6 0.2 0.4 1.  0.4 0.  0.2 1. ]


### Train Multiple Ensembles on Probabilistic Features (METHOD 3)

### Train Multiple Ensembles on Probabilistic Features (METHOD 4)