In [7]:
import pandas as pd
import numpy as np

df = pd.read_csv("student_performance_cleaned.csv")

# Step 1 - flag columns whose *name* suggests an outcome measure.
# This is a case-insensitive substring match on the column name only; it does
# not inspect the values.
# NOTE(review): per the printed output this also flags Previous_Scores, which
# looks like a pre-exam feature rather than leakage - confirm it should go.
LEAKAGE_KEYWORDS = (
    "score", "grade", "result", "gpa", "mark", "points", "performance", "exam",
)
leakage_like = [
    col for col in df.columns
    if any(keyword in col.lower() for keyword in LEAKAGE_KEYWORDS)
]
print("Possible leakage columns:", leakage_like)

# Step 2 - derive a 3-class target by cutting Exam_Score at its tertiles.
# The bins are only approximately equal in size: the printed distribution is
# roughly 44% / 32% / 25% (presumably because many rows share the same score
# at a bin edge - TODO confirm). Fixed cutoffs (e.g. 80/50) are an alternative.
assert "Exam_Score" in df.columns, "Exam_Score not found. Tell me the score column name."
labels = ["Low", "Medium", "High"]
df["Performance_Level"] = pd.qcut(df["Exam_Score"], q=3, labels=labels)

# Step 3 - split into features and target. Every flagged column is removed
# from X together with the target itself; errors="ignore" tolerates names
# that are not present in the frame.
TARGET = "Performance_Level"
drop_cols = set(leakage_like) | {TARGET}
X = df.drop(columns=list(drop_cols), errors="ignore")
y = df[TARGET]

print("X shape:", X.shape)
print("Target distribution:\n", y.value_counts(normalize=True).round(3))

Possible leakage columns: ['Previous_Scores', 'Exam_Score']
X shape: (6607, 18)
Target distribution:
 Performance_Level
Low       0.436
Medium    0.318
High      0.246
Name: proportion, dtype: float64


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix
)

# Encode the string class labels as integers. The classification report below
# lists le.classes_ in the order High, Low, Medium, so that is also the
# row/column order of the confusion matrix (not Low < Medium < High).
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Hold out 20% of the rows for testing, stratified on the encoded target so
# both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# Route columns by dtype: numeric columns are standardised, everything else is
# one-hot encoded. Categories unseen during fit are encoded as all zeros.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5 and scheduled for removal. With the default lbfgs solver a
# multinomial (softmax) model is already what gets fitted for 3+ classes, so
# the model is unchanged.
logreg = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    random_state=42
)

pipe_lr = Pipeline([
    ("pre", preprocess),
    ("clf", logreg)
])

pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro F1:", f1_score(y_test, y_pred, average="macro"))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(
    y_test, y_pred, target_names=le.classes_, zero_division=0
))

Accuracy: 0.7813918305597579
Macro F1: 0.7761702127659574

Confusion matrix:
 [[266   2  57]
 [  0 480  97]
 [ 67  66 287]]

Classification report:
               precision    recall  f1-score   support

        High       0.80      0.82      0.81       325
         Low       0.88      0.83      0.85       577
      Medium       0.65      0.68      0.67       420

    accuracy                           0.78      1322
   macro avg       0.78      0.78      0.78      1322
weighted avg       0.79      0.78      0.78      1322



In [9]:
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold

# 5-fold stratified CV on the full data set; the fixed seed makes the folds
# reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate both metrics from the same five fits. Calling cross_val_score once
# per metric would refit the pipeline on identical folds a second time.
cv_results_lr = cross_validate(
    pipe_lr, X, y_enc, cv=cv, scoring=["f1_macro", "accuracy"], n_jobs=-1
)
cv_macro_f1 = cv_results_lr["test_f1_macro"]
cv_acc      = cv_results_lr["test_accuracy"]

print("CV Macro-F1:", f"{cv_macro_f1.mean():.4f} ± {cv_macro_f1.std():.4f}")
print("CV Accuracy:", f"{cv_acc.mean():.4f} ± {cv_acc.std():.4f}")

CV Macro-F1: 0.7748 ± 0.0051
CV Accuracy: 0.7801 ± 0.0050


In [12]:
import joblib, os

# Single source of truth for the output directory, so the directory that gets
# created is the one the artifacts are written to. Previously a different
# directory ("../models") was created, which did not guarantee that the dump
# target existed.
MODEL_DIR = "../student-dropout/models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the fitted pipeline together with the label encoder needed to map
# predicted integer codes back to class names.
joblib.dump(pipe_lr, f"{MODEL_DIR}/logreg_balanced_pipeline.joblib")
joblib.dump(le, f"{MODEL_DIR}/performance_label_encoder.joblib")

['../student-dropout/models/performance_label_encoder.joblib']

In [17]:
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

dt = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=None,          # unlimited depth for now; tune later if needed
    random_state=42,
    class_weight="balanced"  # helps smaller classes get attention
)

# Give this pipeline its own unfitted copy of the preprocessing step.
# Pipeline.fit fits its steps in place, so passing the `preprocess` object
# directly would refit the very transformer that pipe_lr also holds.
pipe_dt = Pipeline([
    ("pre", clone(preprocess)),
    ("clf", dt)
])

pipe_dt.fit(X_train, y_train)
y_pred_dt = pipe_dt.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree Macro-F1:", f1_score(y_test, y_pred_dt, average="macro"))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report:\n",
      classification_report(y_test, y_pred_dt, target_names=le.classes_, zero_division=0))


Decision Tree Accuracy: 0.6936459909228442
Decision Tree Macro-F1: 0.6817008287066821

Confusion Matrix:
 [[232  14  79]
 [ 13 460 104]
 [ 83 112 225]]

Classification Report:
               precision    recall  f1-score   support

        High       0.71      0.71      0.71       325
         Low       0.78      0.80      0.79       577
      Medium       0.55      0.54      0.54       420

    accuracy                           0.69      1322
   macro avg       0.68      0.68      0.68      1322
weighted avg       0.69      0.69      0.69      1322



In [18]:
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold

# Same fold definition as used for the logistic-regression CV (5 stratified,
# shuffled folds with seed 42), so the two models are compared on equal terms.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate both metrics from the same five fits rather than running the
# cross-validation once per metric.
cv_results_dt = cross_validate(
    pipe_dt, X, y_enc, cv=cv, scoring=["f1_macro", "accuracy"], n_jobs=-1
)
cv_f1_dt = cv_results_dt["test_f1_macro"]
cv_acc_dt = cv_results_dt["test_accuracy"]

print("Decision Tree CV Macro-F1:", f"{cv_f1_dt.mean():.4f} ± {cv_f1_dt.std():.4f}")
print("Decision Tree CV Accuracy:", f"{cv_acc_dt.mean():.4f} ± {cv_acc_dt.std():.4f}")

Decision Tree CV Macro-F1: 0.6757 ± 0.0120
Decision Tree CV Accuracy: 0.6881 ± 0.0122


In [19]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Pull the two fitted steps out of the decision-tree pipeline.
fitted_pre = pipe_dt.named_steps["pre"]
fitted_tree = pipe_dt.named_steps["clf"]

# Ask the ColumnTransformer itself for the output column names. Its "cat"
# transformer is a bare OneHotEncoder, not a nested Pipeline, so it has no
# .named_steps to drill into.
feature_names = fitted_pre.get_feature_names_out()
importances = fitted_tree.feature_importances_

# Rank features from most to least important.
fi = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Horizontal bar chart of the top 15; the y-axis is flipped so the most
# important feature is drawn at the top.
fig, ax = plt.subplots(figsize=(8, 6))
fi.head(15).plot(kind="barh", ax=ax)
ax.set_title("Top 15 Most Important Features (Decision Tree)")
ax.invert_yaxis()
fig.tight_layout()
plt.show()


AttributeError: 'OneHotEncoder' object has no attribute 'named_steps'