# 01 - SelectKBest, Fisher score & RFE - comparison

In [3]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np

In [4]:
# Read the numerically encoded bank-marketing table from disk
df = pd.read_csv("bank_numeric.csv")

# Target is the "deposit" column; every remaining column is a candidate feature
y = df["deposit"]
X = df.drop(columns="deposit")

In [5]:
# Keep the column names: the scaler returns a plain array without labels
feature_names = list(X.columns)

# Rescale every feature to zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Univariate selection: keep the 10 features with the highest ANOVA F-statistic
select_kbest = SelectKBest(f_classif, k=10)
X_kbest = select_kbest.fit_transform(X_scaled, y)

# The support mask is aligned with the input columns, so the selected names
# come out in dataset column order, not ranked by score.
kbest_features = [
    name
    for name, keep in zip(feature_names, select_kbest.get_support())
    if keep
]

In [7]:
# Feature Selection: Fisher Score
def fisher_score(X, y):
    """Compute a two-class Fisher score for every column of ``X``.

    For each feature the score is::

        (mean_a - mean_b) ** 2 / (var_a + var_b)

    where ``a`` and ``b`` are the two distinct classes found in ``y`` and the
    variances are population variances (``ddof=0``). A larger score means
    the two class distributions are better separated along that feature.

    The class labels are discovered from ``y`` rather than assumed to be
    ``0`` and ``1``, so any binary encoding (``"yes"``/``"no"``, ``1``/``2``,
    booleans, ...) yields the same scores.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    y : array-like of shape (n_samples,)
        Class label of each row; must contain exactly two distinct values.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        Fisher score per feature. A feature whose summed within-class
        variance is zero gets a score of 0.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly two distinct classes.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)

    classes = np.unique(y)
    if classes.size != 2:
        # With a missing class the per-class mean is taken over an empty
        # slice and every score silently becomes NaN; fail loudly instead.
        raise ValueError(
            "fisher_score expects exactly two classes in y, "
            f"found {classes.size}: {classes.tolist()}"
        )

    group_a = X[y == classes[0]]
    group_b = X[y == classes[1]]

    num = (group_a.mean(axis=0) - group_b.mean(axis=0)) ** 2
    denom = group_a.var(axis=0) + group_b.var(axis=0)

    # Leave the score at 0 wherever the denominator is 0 (constant feature
    # within both classes) instead of dividing by zero.
    scores = np.zeros(X.shape[1], dtype=float)
    np.divide(num, denom, out=scores, where=denom != 0)
    return scores

# Score every feature, then keep the ten highest-scoring ones, best first
fisher_scores = fisher_score(X_scaled, y)
fisher_indices = np.argsort(fisher_scores)[::-1][:10]
fisher_features = [feature_names[idx] for idx in fisher_indices]

In [8]:
# Wrapper-based selection: Recursive Feature Elimination (RFE) repeatedly
# refits a Random Forest until only 10 features remain
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=10)
X_rfe = rfe.fit_transform(X_scaled, y)

# The support mask is aligned with the input columns, so the surviving names
# are listed in dataset column order.
rfe_features = [
    name
    for name, keep in zip(feature_names, rfe.get_support())
    if keep
]

In [10]:
# Put the three selections side by side, one column per method
comparison_df = pd.DataFrame({
    method: pd.Series(names)
    for method, names in (
        ("SelectKBest", kbest_features),
        ("FisherScore", fisher_features),
        ("RFE", rfe_features),
    )
})

# Show the resulting table
print(comparison_df)

  SelectKBest FisherScore       RFE
0   education    duration       age
1     balance    poutcome       job
2     housing     contact   balance
3        loan    previous   contact
4     contact       pdays       day
5    duration     housing     month
6    campaign     balance  duration
7       pdays    campaign  campaign
8    previous   education     pdays
9    poutcome        loan  poutcome


In [None]:
# I used three feature-selection methods
# to determine which features in the dataset
# are the most important or relevant for
# predicting the target variable (deposit)

# Each tool has its own logic.
# SelectKBest: Picks features based on their statistical
# relationship with the target variable
# (here the ANOVA F-statistic, i.e. how strongly each
# feature on its own is associated with the target)

# Fisher Score: Selects features based on how well 
# they separate different classes in the target
# variable (e.g., good vs. bad outcomes).

# RFE (Recursive Feature Elimination): 
# Selects features by considering their 
# impact on a predictive model's performance 
# (in this case, a Random Forest classifier).

# We can see that some features, like campaign,
# poutcome, and contact, appear in all three lists
# (as do balance, duration, and pdays), which suggests
# they are highly relevant to the target variable

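# SelectKBest:
# it selects features like education, housing, and loan,
# which RFE leaves out.
# These might have a statistical relationship
# with the target on their own but might not work well
# together in a model.
# Note: in this run SelectKBest and FisherScore pick the
# same ten features; only the listing order differs
# (SelectKBest is shown in dataset column order,
# FisherScore from highest to lowest score)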

# FisherScore:
# it emphasizes features like duration, pdays, and previous
# These features likely have strong class-separating 
# power but may not work well if their interaction 
# with other features isn’t strong

# RFE selects features like age, job, and month, 
# which are chosen based on how well 
# they help a Random Forest model make predictions

# it is a great opportunity to compare the same data
# using tools with different logics behind them

# there are also some variables that appear
# in all three analyses, which indicates that
# they are very valuable for predicting the target,
# for instance, campaign or poutcome