In [None]:
# ===============================
# FEATURE SELECTION NOTEBOOK
# Fully working MI + Permutation + SHAP
# Compatible with your pipeline structure
# ===============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import mutual_info_classif
from sklearn.inspection import permutation_importance
import shap

# ===============================
# 1. Load dataset
# ===============================
# Read the semicolon-delimited dataset and separate predictors from the label.
TARGET = "Output"
df = pd.read_csv("student_data.csv", sep=";")
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Partition predictor columns by dtype: numeric ones versus everything else.
numeric_part = X.select_dtypes(include=[np.number])
other_part = X.select_dtypes(exclude=[np.number])
num_cols = numeric_part.columns.tolist()
cat_cols = other_part.columns.tolist()

# ===============================
# 2. Preprocessor + Model Pipeline
# ===============================
# Numeric columns are standardised; the remaining columns are one-hot encoded.
# handle_unknown="ignore" lets the encoder accept categories at transform time
# that it did not see during fit.
numeric_branch = ("num", StandardScaler(), num_cols)
categorical_branch = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
preprocessor = ColumnTransformer(transformers=[numeric_branch, categorical_branch])

# Seeded random forest so repeated runs give the same model.
model = RandomForestClassifier(random_state=42)

# Chain preprocessing and the classifier into a single estimator.
clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# ===============================
# 3. Train-test split + Fit
# ===============================
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# Fit the whole pipeline on the training rows, then predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Print the classification report followed by the confusion matrix.
for summary in (
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
):
    print(summary)

# ===============================
# 4. MUTUAL INFORMATION
# ===============================

# Mutual information between each preprocessed feature and the target.
#
# The preprocessor inside `clf` was already fitted on the training split by
# clf.fit(). Pipeline does not clone its steps, so calling fit_transform()
# here would refit the very transformer the trained model depends on (new
# scaler statistics, possibly a different set of one-hot columns). Reuse the
# fitted transformer instead, and score on the training split only so the
# held-out rows do not influence feature selection.
fitted_preprocessor = clf.named_steps["preprocessor"]
X_preprocessed = fitted_preprocessor.transform(X_train)
if hasattr(X_preprocessed, "toarray"):
    # The transformer may return a sparse matrix; densify it for MI.
    X_preprocessed = X_preprocessed.toarray()

# Feature names follow the transformer order: numeric block, then one-hot block.
ohe = fitted_preprocessor.named_transformers_["cat"]
if cat_cols:
    cat_feature_names = ohe.get_feature_names_out(cat_cols)
else:
    # No categorical columns: the encoder was never fitted, so do not query it.
    cat_feature_names = []
all_feature_names = list(num_cols) + list(cat_feature_names)

# One-hot indicator columns are discrete; flag them so the estimator does not
# treat 0/1 indicators as continuous variables.
discrete_mask = np.array(
    [False] * len(num_cols) + [True] * len(cat_feature_names), dtype=bool
)
mi_scores = mutual_info_classif(
    X_preprocessed,
    y_train,
    discrete_features=discrete_mask,
    random_state=42,
)
mi_series = pd.Series(mi_scores, index=all_feature_names).sort_values(ascending=False)

print("Top MI Features:")
print(mi_series.head(20))

# ===============================
# 5. PERMUTATION IMPORTANCE
# ===============================

# Permutation importance on the held-out split, per *transformed* feature.
#
# Running permutation_importance on the full pipeline with the raw X_test
# yields one importance per raw input column, which cannot be labelled with
# the one-hot-expanded `all_feature_names` (length mismatch as soon as a
# categorical column expands to more than one feature). Permute the
# transformed matrix against the fitted model instead, so there is exactly
# one importance per entry in `all_feature_names`.
X_test_trans = clf.named_steps["preprocessor"].transform(X_test)
if hasattr(X_test_trans, "toarray"):
    # Densify a sparse result so columns can be permuted and sliced directly.
    X_test_trans = X_test_trans.toarray()

perm = permutation_importance(
    clf.named_steps["model"],
    X_test_trans,
    y_test,
    n_repeats=10,
    random_state=42,
)
perm_series = pd.Series(perm.importances_mean, index=all_feature_names)
perm_series = perm_series.sort_values(ascending=False)

print("Top Permutation Importance Features:")
print(perm_series.head(20))

# ===============================
# 6. SHAP VALUES
# ===============================

# Explain at most the first 200 transformed test rows to keep SHAP tractable.
X_test_small = X_test_trans[:200]

# Auto-select SHAP explainer based on the fitted estimator inside the pipeline.
tree_model = clf.named_steps["model"]

if hasattr(tree_model, "estimators_") or hasattr(tree_model, "feature_importances_"):
    explainer = shap.TreeExplainer(tree_model)
    shap_values = explainer.shap_values(X_test_small)
else:
    # Model-agnostic fallback. The data here is already transformed, so the
    # explainer must call the bare estimator; the full pipeline's
    # predict_proba expects the raw, untransformed columns.
    explainer = shap.KernelExplainer(tree_model.predict_proba, X_test_small[:50])
    shap_values = explainer.shap_values(X_test_small)

# Depending on the installed SHAP release, multi-output models yield either a
# list with one (samples, features) array per class or a single 3-D array of
# shape (samples, features, outputs). Normalise to the per-class list so
# summary_plot draws per-class importances in both cases.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

# SHAP summary plot
shap.summary_plot(shap_values, X_test_small, feature_names=all_feature_names)

# ===============================
# 7. COMBINED FEATURE SELECTION SCORE
# ===============================

# Align the two rankings on feature name (the Series indexes).
fs_df = pd.DataFrame({
    "MI": mi_series,
    "Permutation": perm_series
})

# Min-max normalise each column to the 0-1 range. A column whose values are
# all equal has zero range; substitute 1 for that range so the column becomes
# all zeros instead of an all-NaN result from dividing 0 by 0.
col_min = fs_df.min()
col_range = (fs_df.max() - col_min).replace(0, 1)
fs_norm = (fs_df - col_min) / col_range

# Combined score: unweighted mean of the normalised columns for each feature.
fs_norm["score"] = fs_norm.mean(axis=1)

print("Top 20 selected features:")
print(fs_norm.sort_values("score", ascending=False).head(20))
