# Probabilistic Feature Subset Selection with Ensemble Models

In [1]:
import numpy as np
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from utils import compute_feature_frequency, load_uci_dataset, train_ensemble_models

## UCI Breast Cancer Clinical Records Dataset

In [2]:
FILE_PATH = "../data/hcvdat0.csv" # Change this to the path of the dataset
REPO_ID = 571 # Change this to the repository ID of the dataset

### Load and Preprocess Dataset

In [3]:
# Load dataset from UCI Machine Learning Repository if it does not exist
if not os.path.exists(FILE_PATH):
    clinical_records_metadata, clinical_records = load_uci_dataset(repo_id=REPO_ID)
    print(clinical_records_metadata)
    clinical_records.to_csv(FILE_PATH, index=False)

In [4]:
# Load dataset from CSV file
clinical_records = pd.read_csv(FILE_PATH)
print("Number of samples:", len(clinical_records))
clinical_records.head()

Number of samples: 615


Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [5]:
# Handle missing values by dropping rows with any missing values
clinical_records = clinical_records.dropna()
print("Number of samples:", len(clinical_records))

Number of samples: 589


In [6]:
# Separate features and target after dropping missing values
X = clinical_records.drop(columns=["Unnamed: 0", "Category"])  # Drop the first column and target

# map the gender column to integers m->1, f->0
X['Sex'] = X['Sex'].map({'m': 1, 'f': 0})

#select the category column as the target
y = clinical_records["Category"]
#select only the first character of the string for each value in y and convert it to an integer
y = y.apply(lambda x: int(x[0]))

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

### Train Multiple Ensembles on All Features

In [7]:
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False) # no probabilities were inserted

results_df = pd.DataFrame(ensemble_results)
results_default = results_df.groupby("Classifier")["Accuracy"].mean()
print(results_default)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier
AdaBoost               0.960452
Gradient Boosting      0.926554
LDA                    0.966102
Logistic Regression    0.949153
Random Forest          0.954802
SVM                    0.960452
Name: Accuracy, dtype: float64




### Train Multiple Ensembles on Probabilistic Features (Mutual Information)


In [8]:
mutual_info = mutual_info_classif(X_train, y_train)
probabilities = mutual_info / np.sum(mutual_info)  # Normalize to create a probability distribution

print("Feature Probabilities (Mutual Information):")
print(probabilities)

Feature Probabilities (Mutual Information):
[0.         0.         0.12080712 0.11571771 0.12601803 0.19307922
 0.10198185 0.12161827 0.0783143  0.07500287 0.04969066 0.01776997]


In [9]:
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, probabilities, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False)

results_df = pd.DataFrame(ensemble_results)
results_mutual_information = results_df.groupby("Classifier")["Accuracy"].mean()
print(results_mutual_information)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier
AdaBoost               0.944633
Gradient Boosting      0.938983
LDA                    0.943503
Logistic Regression    0.943503
Random Forest          0.946893
SVM                    0.940113
Name: Accuracy, dtype: float64




In [10]:
# Analyze uncertainty in feature importance
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:")
print(feature_frequency)


Feature Selection Frequency:
[0.  0.  0.8 0.4 0.2 0.8 0.4 0.6 0.4 0.6 0.4 0.4]


### Train Multiple Ensembles on Probabilistic Features (Interaction Scores)

In [11]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_

# Calculate pairwise interaction scores
interaction_matrix = np.zeros((X_train.shape[1], X_train.shape[1]))
for tree in rf.estimators_:
    for feature_idx, importance in enumerate(tree.feature_importances_):
        interaction_matrix[feature_idx] += importance

interaction_scores = interaction_matrix.sum(axis=1) / rf.n_estimators
probabilities = interaction_scores / interaction_scores.sum()

print("Feature Probabilities (Interaction Scores):")
print(probabilities)

Feature Probabilities (Interaction Scores):
[0.03907333 0.00498679 0.05015069 0.13591432 0.14373398 0.21013785
 0.06284569 0.15013282 0.04185235 0.04683836 0.07288722 0.0414466 ]


In [12]:
# Train Multiple Ensembles on Probabilistically Sampled Subsets
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, probabilities, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False)

results_df = pd.DataFrame(ensemble_results)
results_interaction_score = results_df.groupby("Classifier")["Accuracy"].mean()
print(results_interaction_score)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier
AdaBoost               0.941243
Gradient Boosting      0.929944
LDA                    0.938983
Logistic Regression    0.952542
Random Forest          0.951412
SVM                    0.938983
Name: Accuracy, dtype: float64




In [13]:
# Analyze uncertainty in feature importance
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:")
print(feature_frequency)


Feature Selection Frequency:
[0.4 0.  0.6 0.8 0.4 0.6 0.4 0.8 0.4 0.4 0.2 0. ]


### Train Multiple Ensembles on Probabilistic Features (Linear Correlation)

In [14]:
linear_correlation = np.abs(X_train.corrwith(y_train))
probabilities_correlation = linear_correlation / linear_correlation.sum()

print("Feature Probabilities (Linear Correlation):")
print(probabilities_correlation)

probabilities = probabilities_correlation

Feature Probabilities (Linear Correlation):
Age     0.044778
Sex     0.010117
ALB     0.089236
ALP     0.010838
ALT     0.056558
AST     0.195632
BIL     0.167794
CHE     0.116255
CHOL    0.099848
CREA    0.076071
GGT     0.127368
PROT    0.005505
dtype: float64


In [15]:
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, probabilities, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False)

results_df = pd.DataFrame(ensemble_results)
results_linear_correlation = results_df.groupby("Classifier")["Accuracy"].mean()
print(results_linear_correlation)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier
AdaBoost               0.932203
Gradient Boosting      0.934463
LDA                    0.938983
Logistic Regression    0.942373
Random Forest          0.942373
SVM                    0.934463
Name: Accuracy, dtype: float64




In [16]:
# Analyze uncertainty in feature importance
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:")
print(feature_frequency)


Feature Selection Frequency:
[0.2 0.2 1.  0.2 0.2 0.8 0.6 0.8 0.4 0.2 0.4 0. ]


### Train Multiple Ensembles on Probabilistic Features (Indormation Gain)

In [17]:
from sklearn.feature_selection import mutual_info_classif
info_gain = mutual_info_classif(X_train, y_train, discrete_features=False)
probabilities_info_gain = info_gain / np.sum(info_gain)

print("Feature Probabilities (Information Gain):")
print(probabilities_info_gain)

probabilities = probabilities_info_gain

Feature Probabilities (Information Gain):
[0.         0.00505105 0.11867885 0.11519388 0.13128368 0.18896891
 0.09852386 0.12065471 0.08074242 0.07627976 0.04828186 0.01634104]


In [18]:
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, probabilities, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False)

results_df = pd.DataFrame(ensemble_results)
results_information_gain = results_df.groupby("Classifier")["Accuracy"].mean()
print(results_information_gain)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
                                                                              

Classifier
AdaBoost               0.941243
Gradient Boosting      0.936723
LDA                    0.941243
Logistic Regression    0.946893
Random Forest          0.946893
SVM                    0.948023
Name: Accuracy, dtype: float64




In [19]:
# Analyze uncertainty in feature importance
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:")
print(feature_frequency)


Feature Selection Frequency:
[0.  0.2 0.8 0.4 0.8 1.  0.6 0.4 0.2 0.4 0.  0.2]


### Train Multiple Ensembles on Probabilistic Features (Spearman Rank Correlation)

In [20]:
from scipy.stats import spearmanr

spearman_correlation = np.abs(X_train.apply(lambda x: spearmanr(x, y_train)[0]))
probabilities_spearman = spearman_correlation / spearman_correlation.sum()

print("Feature Probabilities (Spearman Rank Correlation):")
print(probabilities_spearman)

probabilities = probabilities_spearman

Feature Probabilities (Spearman Rank Correlation):
Age     0.034428
Sex     0.020163
ALB     0.056143
ALP     0.076337
ALT     0.122039
AST     0.177107
BIL     0.129505
CHE     0.077133
CHOL    0.111665
CREA    0.042203
GGT     0.134201
PROT    0.019076
dtype: float64


In [21]:
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, probabilities, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False)

results_df = pd.DataFrame(ensemble_results)
results_spearman_correlation = results_df.groupby("Classifier")["Accuracy"].mean()
print(results_spearman_correlation)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier
AdaBoost               0.942373
Gradient Boosting      0.933333
LDA                    0.937853
Logistic Regression    0.945763
Random Forest          0.940113
SVM                    0.935593
Name: Accuracy, dtype: float64




In [22]:
# Analyze uncertainty in feature importance
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:")
print(feature_frequency)


Feature Selection Frequency:
[0.2 0.2 0.2 0.4 0.2 0.8 0.6 0.8 0.4 0.  0.6 0.6]


### Train Multiple Ensembles on Probabilistic Features (Shannon Entropy)

In [23]:
from scipy.stats import entropy

# Calculate Shannon entropy for each feature
def calculate_shannon_entropy(feature):
    # Compute the probability distribution of unique values
    values, counts = np.unique(feature, return_counts=True)
    probabilities = counts / np.sum(counts)
    return entropy(probabilities, base=2)  # Base 2 for Shannon entropy

# Check if X_train is a DataFrame and adjust accordingly
if isinstance(X_train, pd.DataFrame):
    shannon_entropies = [calculate_shannon_entropy(X_train.iloc[:, i]) for i in range(X_train.shape[1])]
else:
    shannon_entropies = [calculate_shannon_entropy(X_train[:, i]) for i in range(X_train.shape[1])]



probabilities_entropy = shannon_entropies / np.sum(shannon_entropies)
probabilities = probabilities_entropy

print("Feature Probabilities (Shannon Entropy):")
print(probabilities_entropy)


Feature Probabilities (Shannon Entropy):
[0.06360281 0.01192012 0.08713173 0.10177804 0.09669875 0.09462856
 0.08652191 0.10089442 0.09589797 0.07511366 0.09760885 0.08820319]


In [24]:
ensemble_results = train_ensemble_models(X_train, X_test, y_train, y_test, probabilities, n_ensembles=5, n_features_sample=5, random_state=42, verbose=False)

results_df = pd.DataFrame(ensemble_results)
results_shannon_entropy = results_df.groupby("Classifier")["Accuracy"].mean()
print(results_shannon_entropy)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier
AdaBoost               0.940113
Gradient Boosting      0.928814
LDA                    0.932203
Logistic Regression    0.937853
Random Forest          0.941243
SVM                    0.924294
Name: Accuracy, dtype: float64




In [25]:
# Analyze uncertainty in feature importance
feature_frequency = compute_feature_frequency(ensemble_results, X_train.shape[1])

print("\nFeature Selection Frequency:")
print(feature_frequency)


Feature Selection Frequency:
[0.6 0.  0.4 0.4 0.2 0.4 0.4 0.6 0.8 0.4 0.4 0.4]


### Group all the results for each similarity measurement

In [None]:
#%pip install matplotlib

import matplotlib.pyplot as plt

# Combine measurement results into a table and plot
measurements_results = {
    "Default": results_default,
    "Mutual Information": results_mutual_information,
    "Interaction Scores": results_interaction_score,
    "Linear Correlation": results_linear_correlation,
    "Information Gain": results_information_gain,
    "Spearman Rank Correlation": results_spearman_correlation,
    "Shannon Entropy": results_shannon_entropy,
}

combined_df = pd.DataFrame()
for name, result in measurements_results.items():
    result_df = result.rename(name)  # Rename series for clarity
    combined_df = pd.concat([combined_df, result_df], axis=1)

print("Combined Results Table:")
print(combined_df)


# # Convert each result series to a DataFrame and merge them
# combined_df = pd.DataFrame()
# for name, result in measurements_results.items():
#     result_df = result.rename(name)  # Rename series for clarity
#     combined_df = pd.concat([combined_df, result_df], axis=1)

# print("Combined Results Table:")
# print(combined_df)

# # Plot the table
# fig, ax = plt.subplots(figsize=(10, 6))
# ax.axis('tight')
# ax.axis('off')
# table = ax.table(cellText=combined_df.round(3).values,
#                   colLabels=combined_df.columns,
#                   rowLabels=combined_df.index,
#                   loc='center',
#                   cellLoc='center')
# plt.title("Measurement Results Table")
# plt.show()

Combined Results Table:
                      Default  Mutual Information  Interaction Scores  \
AdaBoost             0.960452            0.944633            0.941243   
Gradient Boosting    0.926554            0.938983            0.929944   
LDA                  0.966102            0.943503            0.938983   
Logistic Regression  0.949153            0.943503            0.952542   
Random Forest        0.954802            0.946893            0.951412   
SVM                  0.960452            0.940113            0.938983   

                     Linear Correlation  Information Gain  \
AdaBoost                       0.932203          0.941243   
Gradient Boosting              0.934463          0.936723   
LDA                            0.938983          0.941243   
Logistic Regression            0.942373          0.946893   
Random Forest                  0.942373          0.946893   
SVM                            0.934463          0.948023   

                     Spearman Rank C

In [27]:
measurements_improvements = {
    "Mutual Information": results_mutual_information - results_default,
    "Interaction Scores": results_interaction_score - results_default,
    "Linear Correlation": results_linear_correlation - results_default,
    "Information Gain": results_information_gain - results_default,
    "Spearman Rank Correlation": results_spearman_correlation - results_default,
    "Shannon Entropy": results_shannon_entropy - results_default,
}

combined_improvements_df = pd.DataFrame()
for name, result in measurements_improvements.items():
    result_df_impr = result.rename(name)  # Rename series for clarity
    combined_improvements_df = pd.concat([combined_improvements_df, result_df_impr], axis=1)

print("Combined Improvements Table:")
print(combined_improvements_df)


Combined Improvements Table:
                     Mutual Information  Interaction Scores  \
AdaBoost                      -0.015819           -0.019209   
Gradient Boosting              0.012429            0.003390   
LDA                           -0.022599           -0.027119   
Logistic Regression           -0.005650            0.003390   
Random Forest                 -0.007910           -0.003390   
SVM                           -0.020339           -0.021469   

                     Linear Correlation  Information Gain  \
AdaBoost                      -0.028249         -0.019209   
Gradient Boosting              0.007910          0.010169   
LDA                           -0.027119         -0.024859   
Logistic Regression           -0.006780         -0.002260   
Random Forest                 -0.012429         -0.007910   
SVM                           -0.025989         -0.012429   

                     Spearman Rank Correlation  Shannon Entropy  
AdaBoost                           