# Probabilistic Feature Subset Selection with Ensemble Models

In [1]:
import numpy as np
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from src.utils import compute_feature_frequency, load_uci_dataset, train_ensemble_models

## UCI HCV (Hepatitis C Virus) Dataset

### Load Dataset

In [3]:
# # Load dataset from UCI Machine Learning Repository if it does not exist
# from ucimlrepo import fetch_ucirepo
#
# # fetch dataset
# hcv_data = fetch_ucirepo(id=571)
#
# # data (as pandas dataframes)
# X = hcv_data.data.features
# y = hcv_data.data.targets
#
# # metadata
# print(hcv_data.metadata)
#
# # variable information
# print(hcv_data.variables)

In [6]:
# Read the HCV records from the local CSV copy of the dataset
FILE_PATH = "../data/hcvdat0.csv"

hep_c_records = pd.read_csv(FILE_PATH)
print("Number of samples:", hep_c_records.shape[0])
hep_c_records.head()

Number of samples: 615


Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


### Preprocess Dataset

In [7]:
# Discard every record (row) that contains at least one missing value
hep_c_records = hep_c_records.dropna(axis=0, how="any")
print("Number of samples:", hep_c_records.shape[0])

Number of samples: 589


In [14]:
# Feature matrix: everything except the row-id column and the target,
# with the Sex column encoded as integers (m -> 1, f -> 0)
X = (
    hep_c_records
    .drop(columns=["Unnamed: 0", "Category"])
    .assign(Sex=lambda frame: frame['Sex'].map({'m': 1, 'f': 0}))
)

# Target: the leading character of each Category label, as an integer
# (e.g. "0=Blood Donor" -> 0)
y = hep_c_records["Category"].map(lambda label: int(label[0]))

# Stratified 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

0      1
1      1
2      1
3      1
4      1
      ..
608    0
609    0
610    0
611    0
612    0
Name: Sex, Length: 589, dtype: int64


### Assign Probabilities to Features using Mutual Information


In [15]:
# Score every feature by its estimated mutual information with the class label.
# The estimator is stochastic (it adds small random noise to continuous features),
# so the seed is fixed to keep the resulting probabilities reproducible across runs.
# NOTE(review): with the default discrete_features="auto", every column of a dense X
# (including the binary Sex column) is treated as continuous; consider passing an
# explicit mask if Sex should be handled as discrete.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
probabilities_mi = mutual_info / np.sum(mutual_info)  # Normalize to create a probability distribution

print("Feature Probabilities (Mutual Information):")
print(probabilities_mi)

Feature Probabilities (Mutual Information):
[0.00594148 0.01085856 0.12249352 0.11484162 0.13240261 0.18738324
 0.09841207 0.1156652  0.07711062 0.07893319 0.04422028 0.01173761]


### Assign Probabilities to Features using Linear Correlation


In [20]:
# Absolute linear correlation of each training feature with the target,
# rescaled so the values sum to one
linear_correlation = X_train.corrwith(y_train).abs()
probabilities_correlation = linear_correlation.div(linear_correlation.sum())

print("Feature Probabilities (Linear Correlation):")
print(probabilities_correlation)

### Assign Probabilities to Features using Coherence

In [None]:
# TODO: coherence-based feature probabilities are not implemented yet.
# Explicit placeholder so the notebook still runs under "Restart Kernel -> Run All"
# (a dangling `res_coherence =` is a SyntaxError).
res_coherence = None

### Assign Probabilities to Features using Recurrence Rate

### Assign Probabilities to Features using Symbolic Entropy

### Train Multiple Ensembles on Probabilistically Sampled Subsets


In [None]:
# Choose which feature-probability vector the ensemble-training cell below uses
# (here: the mutual-information based probabilities).
probabilities = probabilities_mi

In [16]:
# Train the ensembles using the currently selected `probabilities` vector
ensemble_results = train_ensemble_models(
    X_train,
    X_test,
    y_train,
    y_test,
    probabilities,
    n_ensembles=5,
    n_features_sample=5,
    random_state=42,
    verbose=False,
)

# Average the Accuracy column per classifier
results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier
AdaBoost               0.943503
Gradient Boosting      0.933333
LDA                    0.937853
Logistic Regression    0.952542
Random Forest          0.943503
SVM                    0.946893
Name: Accuracy, dtype: float64

In [17]:
# Summarise how often each feature appears in the ensemble results
feature_frequency = compute_feature_frequency(ensemble_results, len(X_train.columns))

print("\nFeature Selection Frequency:", feature_frequency, sep="\n")


Feature Selection Frequency:
[0.2 0.  0.2 0.6 0.4 0.8 0.6 0.6 0.8 0.6 0.2 0. ]


### Assign Probabilities to Features using Interaction Scores

In [18]:
# Fit a reference random forest on all training features
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_

# NOTE(review): despite the section title, no pairwise interaction is measured here.
# Adding a tree's importance for a feature to every column of that feature's row and
# then summing the row only multiplies the importance by n_features, a constant that
# cancels when the scores are normalised. The scores are therefore computed directly
# as the mean per-tree importance, without the redundant n_features x n_features matrix.
per_tree_importances = np.array([tree.feature_importances_ for tree in rf.estimators_])
interaction_scores = per_tree_importances.mean(axis=0)

# Normalise to a probability distribution; keep a dedicated name (like probabilities_mi)
# and also select it as the active `probabilities` for the training cell that follows.
probabilities_interaction = interaction_scores / interaction_scores.sum()
probabilities = probabilities_interaction

print("Feature Probabilities (Interaction Scores):")
print(probabilities)

Feature Probabilities (Interaction Scores):
[0.03907333 0.00498679 0.05015069 0.13591432 0.14373398 0.21013785
 0.06284569 0.15013282 0.04185235 0.04683836 0.07288722 0.0414466 ]


### Train Multiple Ensembles on Probabilistically Sampled Subsets


In [19]:
# Re-train the ensembles using the currently selected `probabilities` vector
ensemble_results = train_ensemble_models(
    X_train,
    X_test,
    y_train,
    y_test,
    probabilities,
    n_ensembles=5,
    n_features_sample=5,
    random_state=42,
    verbose=False,
)

# Average the Accuracy column per classifier
results_df = pd.DataFrame(ensemble_results)
results_df.groupby("Classifier")["Accuracy"].mean()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
                                                                              

Classifier
AdaBoost               0.944633
Gradient Boosting      0.935593
LDA                    0.946893
Logistic Regression    0.957062
Random Forest          0.952542
SVM                    0.950282
Name: Accuracy, dtype: float64

### Analyze uncertainty in feature importance

In [11]:
# Summarise how often each feature appears in the latest ensemble results
feature_frequency = compute_feature_frequency(ensemble_results, len(X_train.columns))

print("\nFeature Selection Frequency:", feature_frequency, sep="\n")


Feature Selection Frequency:
[0.2 1.  1.  0.2 0.6 1.  0.4 0.4 0.2]
