In [1]:
# ===============================
# FEATURE SELECTION NOTEBOOK
# Fully working MI + Permutation + SHAP
# Compatible with your pipeline structure
# ===============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import mutual_info_classif
from sklearn.inspection import permutation_importance

# ===============================
# 1. Load dataset
# ===============================
# The CSV is semicolon-delimited. The TARGET column is the class label and
# every other column is kept as a candidate feature.
TARGET = "Output"
df = pd.read_csv("student_data.csv", sep=";")
y = df[TARGET]
X = df.drop(columns=TARGET)

# Partition the feature columns by dtype: numeric vs. everything else.
num_cols = list(X.select_dtypes(include="number").columns)
cat_cols = list(X.select_dtypes(exclude="number").columns)

# ===============================
# 2. Preprocessor + Pipeline
# ===============================
# Numeric columns are standardised and the remaining columns are one-hot
# encoded (handle_unknown="ignore" keeps unseen categories from raising).
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

model = RandomForestClassifier(random_state=42)

# Preprocessing and the classifier are chained into a single estimator; the
# step names are used later to retrieve the fitted preprocessor.
clf = Pipeline([("preprocessor", preprocessor), ("model", model)])

# ===============================
# 3. Train-test split + Fit
# ===============================
# 80/20 hold-out split, stratified on the label so both parts keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline on the training part, then score the held-out part.
y_pred = clf.fit(X_train, y_train).predict(X_test)

for report in (
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
):
    print(report)

# ===============================
# 4. Transform full X using the fitted preprocessor
# ===============================
# Reuse the preprocessor that was fitted inside the pipeline (no refit) and
# apply it to every row of X.
fitted_preprocessor = clf.named_steps["preprocessor"]
X_preprocessed = fitted_preprocessor.transform(X)

# If the result exposes `toarray` (as sparse matrices do), densify it.
to_dense = getattr(X_preprocessed, "toarray", None)
if to_dense is not None:
    X_preprocessed = to_dense()

# ===============================
# 5. Get feature names safely
# ===============================
# Build one label per column of the preprocessed matrix, in the order the
# ColumnTransformer emits them: numeric columns first, then one-hot columns.
num_feature_names = num_cols
cat_transformer = fitted_preprocessor.named_transformers_["cat"]

if not cat_cols:
    # No categorical columns means no one-hot output to name; handle this
    # explicitly instead of relying on the encoder raising an exception.
    cat_feature_names = []
elif hasattr(cat_transformer, "get_feature_names_out"):
    # sklearn >= 1.0
    cat_feature_names = cat_transformer.get_feature_names_out(cat_cols)
else:
    # Older sklearn: the legacy accessor still returns the expanded one-hot
    # names. Falling back to the raw `cat_cols` would give too few labels.
    cat_feature_names = cat_transformer.get_feature_names(cat_cols)

all_feature_names = list(num_feature_names) + list(cat_feature_names)

# Fail early if the labels do not line up with the transformed matrix.
assert len(all_feature_names) == X_preprocessed.shape[1], (
    "feature-name count does not match the number of preprocessed columns"
)

# ===============================
# 6. Mutual Information
# ===============================
# The preprocessor emits the scaled numeric columns first, followed by the
# one-hot columns, so every column at position >= len(num_cols) is a 0/1
# indicator. Flag those as discrete: with the default ("auto") a dense matrix
# is treated as entirely continuous, which is the wrong estimator for
# indicator columns. With no categorical columns the mask is all False and the
# scores are the same as with the default.
discrete_mask = np.arange(X_preprocessed.shape[1]) >= len(num_cols)
mi_scores = mutual_info_classif(
    X_preprocessed, y, discrete_features=discrete_mask, random_state=42
)
mi_series = pd.Series(mi_scores, index=all_feature_names).sort_values(ascending=False)
print("\nTop MI Features:")
print(mi_series.head(20))

# ===============================
# 7. Permutation Importance
# ===============================
# `clf` is the full pipeline, so the columns being shuffled are the raw input
# columns of X_test (before scaling / one-hot encoding). The result therefore
# holds one importance per original column and must be labelled with
# X_test.columns. Labelling it with the expanded one-hot names fails with a
# length mismatch as soon as any categorical column is present.
perm = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
perm_series = pd.Series(perm.importances_mean, index=X_test.columns).sort_values(ascending=False)
print("\nTop Permutation Importance Features:")
print(perm_series.head(20))

# ===============================
# 8. Combined Feature Selection Score
# ===============================
# Put both rankings side by side; rows are aligned on the feature label.
fs_df = pd.DataFrame({"MI": mi_series, "Permutation": perm_series})

# Min-max rescale each column to 0-1 so the two measures are comparable,
# then average them per feature into a single score.
col_min = fs_df.min()
col_range = fs_df.max() - col_min
fs_norm = fs_df.sub(col_min).div(col_range)
fs_norm["score"] = fs_norm.mean(axis=1)

top_features = fs_norm.sort_values("score", ascending=False).head(20)
print("\nTop 20 Selected Features:")
print(top_features)

              precision    recall  f1-score   support

     Dropout       0.81      0.75      0.78       284
    Enrolled       0.57      0.38      0.45       159
    Graduate       0.80      0.94      0.86       442

    accuracy                           0.78       885
   macro avg       0.73      0.69      0.70       885
weighted avg       0.76      0.78      0.76       885

[[212  29  43]
 [ 38  60  61]
 [ 11  17 414]]

Top MI Features:
Curricular units 2nd sem (approved)               0.310648
Curricular units 2nd sem (grade)                  0.246423
Curricular units 1st sem (approved)               0.236106
Curricular units 1st sem (grade)                  0.195508
Curricular units 2nd sem (evaluations)            0.098820
Tuition fees up to date                           0.078347
Curricular units 1st sem (evaluations)            0.076508
Application mode                                  0.061618
Course                                            0.059741
Scholarship holder      