In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
    precision_recall_curve,
)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Loading data
df = pd.read_csv("../FINAL_FINAL_FINAL.csv")
df.columns = df.columns.str.lower()

In [2]:
# Feature engineering
absent_cols = [f"absent_pc_grade_{i}" for i in range(3, 9)]
pc_cols = [
    "pc_ma04",
    "pc_ma05",
    "pc_ma06",
    "pc_ma07",
    "pc_ma08",
    "pc_rd04",
    "pc_rd05",
    "pc_rd06",
    "pc_rd07",
    "pc_rd08",
    "pc_sc05",
    "pc_sc08",
]
df["avg_absent"] = df[absent_cols].mean(axis=1)
df["avg_pc"] = df[pc_cols].mean(axis=1)

In [3]:
# Droping unimportant columns
df.drop(columns=["mastid", "lea", "schlcode"], inplace=True)

In [4]:
# Selecting features andd dropping missing rows
features = [
    "avg_absent",
    "avg_pc",
    "act",
    "iep",
    "eds",
    "fcs",
    "hms",
    "mig",
    "els",
    "swd",
    "mil",
    "aig",
]
df = df[features + ["ethnic", "sex", "exit_code_desc"]].dropna()

In [5]:
# Converting “Y”/“N” to 1/0
for c in features[2:]:
    df[c] = df[c].map({"Y": 1, "N": 0})

In [6]:
# One-hot encodeing demographics
df = pd.get_dummies(df, columns=["ethnic", "sex"], drop_first=True)

In [7]:
# Encoding target
df["y"] = LabelEncoder().fit_transform(df["exit_code_desc"])
df.drop(columns=["exit_code_desc"], inplace=True)

In [8]:
# Spliting out 10% for the test set:
X = df.drop(columns="y")
y = df["y"]
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.10, stratify=y, random_state=42
)

In [9]:
# From the rest 90%, split off 10% for validation
val_frac = 0.10 / 0.90
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_frac, stratify=y_temp, random_state=42
)

print(f"Splits → train: {len(X_train)}, val: {len(X_val)}, test: {len(X_test)}")

Splits → train: 238680, val: 29836, test: 29836


In [10]:
# Cleaning infinities and imputing with median
for D in (X_train, X_val, X_test):
    D.replace([np.inf, -np.inf], np.nan, inplace=True)
median = X_train.median()
X_train.fillna(median, inplace=True)
X_val.fillna(median, inplace=True)
X_test.fillna(median, inplace=True)

In [11]:
# Converting to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test)
# Computing imbalance weight
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
# Defining the hyperparameter grid
param_grid = {
    "max_depth": [3, 5, 7],
    "eta": [0.1, 0.01],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "objective": "binary:logistic",
    "scale_pos_weight": scale_pos_weight,
    "eval_metric": "auc",
}

best_score = 0
best_params = None
best_rounds = 0
# Manual grid search with early stopping
for max_depth in param_grid["max_depth"]:
    for eta in param_grid["eta"]:
        for subsample in param_grid["subsample"]:
            for colsample in param_grid["colsample_bytree"]:
                params = {
                    "max_depth": max_depth,
                    "eta": eta,
                    "subsample": subsample,
                    "colsample_bytree": colsample,
                    "objective": "binary:logistic",
                    "scale_pos_weight": scale_pos_weight,
                    "eval_metric": "auc",
                }
                evals = [(dtrain, "train"), (dval, "val")]
                bst = xgb.train(
                    params,
                    dtrain,
                    num_boost_round=500,
                    evals=evals,
                    early_stopping_rounds=10,
                    verbose_eval=False,
                )
                if bst.best_score > best_score:
                    best_score = bst.best_score
                    best_params = params
                    best_rounds = bst.best_iteration

print("Best validation AUC:", best_score)
print("Best params:", best_params)
print("Best rounds:", best_rounds)
# Retraining the final model on train + validation
dtrain_full = xgb.DMatrix(
    pd.concat([X_train, X_val], axis=0), label=pd.concat([y_train, y_val], axis=0)
)
bst_final = xgb.train(best_params, dtrain_full, num_boost_round=best_rounds)

In [16]:
# Evaluating on the test set
y_test_proba = bst_final.predict(dtest)
y_test_pred = (y_test_proba > 0.5).astype(int)

print("\nTest Metrics:")
print(classification_report(y_test, y_test_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_test_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
# Computing confusion matrix
cm = confusion_matrix(y_test, y_test_pred)

# Ploting
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=True,
    xticklabels=["Graduated (0)", "Dropped out (1)"],
    yticklabels=["Graduated (0)", "Dropped out (1)"],
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Variables")
plt.ylabel("Actual Variables")
plt.tight_layout()
plt.show()


# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
plt.figure()
plt.plot(fpr, tpr, lw=2)
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.title("ROC Curve (AUC = {:.3f})".format(roc_auc_score(y_test, y_test_proba)))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

# Precision-Recall Curve
prec, rec, _ = precision_recall_curve(y_test, y_test_proba)
plt.figure()
plt.plot(rec, prec, lw=2)
plt.title("Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()

# Feature Importance (gain)
importance = bst_final.get_score(importance_type="gain")
items = sorted(importance.items(), key=lambda x: x[1], reverse=True)
feat, score = zip(*items)
plt.figure(figsize=(6, 8))
plt.barh(feat, score)
plt.gca().invert_yaxis()
plt.title("Feature Importance (gain)")
plt.xlabel("Gain")
plt.show()


Test Metrics:
              precision    recall  f1-score   support

           0       0.07      0.79      0.14       776
           1       0.99      0.74      0.85     29060

    accuracy                           0.74     29836
   macro avg       0.53      0.76      0.49     29836
weighted avg       0.97      0.74      0.83     29836

Test ROC AUC: 0.8502567563732344
Confusion Matrix:
 [[  610   166]
 [ 7594 21466]]
