In [1]:
import argparse
import joblib
import numpy as np
import pandas as pd


MODEL_PATH_DEFAULT = "student_success_multiclass.pkl"


def clean_column_names(df: pd.DataFrame, rename_dict: dict) -> pd.DataFrame:
    df = df.copy()
    return df.rename(columns=rename_dict)


def add_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # sem1 rates
    if {"sem1_enrolled", "sem1_approved"}.issubset(df.columns):
        df["sem1_approval_rate"] = df["sem1_approved"] / df["sem1_enrolled"].replace({0: np.nan})

    if {"sem1_evaluations", "sem1_approved"}.issubset(df.columns):
        df["sem1_success_ratio"] = df["sem1_approved"] / df["sem1_evaluations"].replace({0: np.nan})

    if {"sem1_without_eval", "sem1_enrolled"}.issubset(df.columns):
        df["sem1_noeval_rate"] = df["sem1_without_eval"] / df["sem1_enrolled"].replace({0: np.nan})

    # sem2 rates
    if {"sem2_enrolled", "sem2_approved"}.issubset(df.columns):
        df["sem2_approval_rate"] = df["sem2_approved"] / df["sem2_enrolled"].replace({0: np.nan})

    if {"sem2_evaluations", "sem2_approved"}.issubset(df.columns):
        df["sem2_success_ratio"] = df["sem2_approved"] / df["sem2_evaluations"].replace({0: np.nan})

    if {"sem2_without_eval", "sem2_enrolled"}.issubset(df.columns):
        df["sem2_noeval_rate"] = df["sem2_without_eval"] / df["sem2_enrolled"].replace({0: np.nan})

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df


def predict_one(bundle: dict, student_features: dict) -> dict:
    pipe = bundle["pipeline"]
    classes = bundle["classes"]

    X = pd.DataFrame([student_features])
    X = add_feature_engineering(X)

    prob = pipe.predict_proba(X)[0]  # order aligns with `classes`
    idx = int(np.argmax(prob))

    prediction = classes[idx]
    confidence = float(prob[idx]) * 100

    dropout_idx = classes.index("Dropout")
    dropout_risk = float(prob[dropout_idx]) * 100

    return {
        "prediction": prediction,
        "confidence": round(confidence, 1),
        "dropout_risk_score": round(dropout_risk, 1),
        "probs_percent": {classes[i]: round(float(prob[i]) * 100, 1) for i in range(len(classes))},
    }


def predict_csv(bundle: dict, csv_in: str, csv_out: str) -> None:
    pipe = bundle["pipeline"]
    classes = bundle["classes"]
    rename_dict = bundle.get("rename_dict", {})

    df = pd.read_csv(csv_in)

    # If CSV still has original column names, rename them.
    # If it already has snake_case, rename will just do nothing for missing keys.
    df = clean_column_names(df, rename_dict)
    df = add_feature_engineering(df)

    # Drop target if present
    if "target" in df.columns:
        df_features = df.drop(columns=["target"])
    else:
        df_features = df

    probs = pipe.predict_proba(df_features)  # (n,3)
    preds = np.argmax(probs, axis=1)

    pred_labels = [classes[i] for i in preds]
    confidence = np.max(probs, axis=1) * 100

    dropout_idx = classes.index("Dropout")
    dropout_risk = probs[:, dropout_idx] * 100

    out = df.copy()
    out["prediction"] = pred_labels
    out["confidence"] = np.round(confidence, 1)
    out["dropout_risk_score"] = np.round(dropout_risk, 1)

    # add per-class probs
    for i, c in enumerate(classes):
        out[f"prob_{c.lower()}"] = np.round(probs[:, i] * 100, 1)

    out.to_csv(csv_out, index=False)
    print(f"âœ… Saved predictions to: {csv_out}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=MODEL_PATH_DEFAULT, help="Path to saved model bundle .pkl")
    parser.add_argument("--csv_in", default=None, help="Optional: input CSV for batch prediction")
    parser.add_argument("--csv_out", default="/mnt/data/predictions.csv", help="Output CSV path")
    # parse_known_args allows us to run inside notebooks where extra args (e.g. --f=...) are passed
    args, _ = parser.parse_known_args()

    bundle = joblib.load(args.model)

    if args.csv_in:
        predict_csv(bundle, args.csv_in, args.csv_out)
        return

    # ---- Example single prediction ----
    # IMPORTANT: use snake_case keys (recommended for your website).
    example_student = {
        "marital_status": 1,
        "application_mode": 17,
        "application_order": 1,
        "course": 33,
        "attendance_type": 1,
        "previous_qualification": 1,
        "nationality": 1,
        "mother_qualification": 1,
        "father_qualification": 1,
        "mother_occupation": 0,
        "father_occupation": 0,
        "displaced": 0,
        "special_needs": 0,
        "debtor": 1,
        "tuition_up_to_date": 0,
        "gender": 1,
        "scholarship": 0,
        "age": 21,
        "international": 1,
        "sem1_credited": 0,
        "sem1_enrolled": 6,
        "sem1_evaluations": 6,
        "sem1_approved": 2,
        "sem1_grade": 10.0,
        "sem1_without_eval": 0,
        "sem2_credited": 0,
        "sem2_enrolled": 6,
        "sem2_evaluations": 6,
        "sem2_approved": 3,
        "sem2_grade": 11.0,
        "sem2_without_eval": 0,
        "unemployment_rate": 7.5,
        "inflation_rate": 1.2,
        "gdp": 1.8
    }

    res = predict_one(bundle, example_student)
    print("\n=== Single student prediction ===")
    print(res)


if __name__ == "__main__":
    main()


=== Single student prediction ===
{'prediction': 'Dropout', 'confidence': 99.9, 'dropout_risk_score': 99.9, 'probs_percent': {'Dropout': 99.9, 'Enrolled': 0.1, 'Graduate': 0.0}}


In [4]:
# quick sanity check of the FastAPI backend
import requests

url = "http://127.0.0.1:8000/predict"
example = {
    "marital_status": 1,
    "application_mode": 17,
    "application_order": 1,
    "course": 33,
    "attendance_type": 1,
    "previous_qualification": 1,
    "nationality": 1,
    "mother_qualification": 1,
    "father_qualification": 1,
    "mother_occupation": 0,
    "father_occupation": 0,
    "displaced": 0,
    "special_needs": 0,
    "debtor": 1,
    "tuition_up_to_date": 0,
    "gender": 1,
    "scholarship": 0,
    "age": 21,
    "international": 1,
    "sem1_credited": 0,
    "sem1_enrolled": 6,
    "sem1_evaluations": 6,
    "sem1_approved": 2,
    "sem1_grade": 10.0,
    "sem1_without_eval": 0,
    "sem2_credited": 0,
    "sem2_enrolled": 6,
    "sem2_evaluations": 6,
    "sem2_approved": 3,
    "sem2_grade": 11.0,
    "sem2_without_eval": 0,
    "unemployment_rate": 7.5,
    "inflation_rate": 1.2,
    "gdp": 1.8
}

resp = requests.post(url, json=example)
print(resp.status_code)
print(resp.json())

ModuleNotFoundError: No module named 'requests'