In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

from xgboost import XGBClassifier

In [2]:
# Load the raw dataset and print its column summary (names, non-null counts, dtypes).
df = pd.read_csv("archive/dataset.csv")
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nacionality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                             4424

In [3]:
DATA_PATH = "archive/dataset.csv"   # change this to your own path
MODEL_PATH = "student_success_multiclass.pkl"

In [4]:
def clean_column_names(df):
    """Return a copy of ``df`` whose raw dataset headers are renamed to snake_case.

    Columns that are not listed in the mapping keep their current names, and
    the frame passed in is left unmodified.
    """
    # (raw CSV header, snake_case name) pairs, grouped by topic.
    header_pairs = (
        # demographics / application data
        ("Marital status", "marital_status"),
        ("Application mode", "application_mode"),
        ("Application order", "application_order"),
        ("Course", "course"),
        ("Daytime/evening attendance", "attendance_type"),
        ("Previous qualification", "previous_qualification"),
        ("Nacionality", "nationality"),
        ("Mother's qualification", "mother_qualification"),
        ("Father's qualification", "father_qualification"),
        ("Mother's occupation", "mother_occupation"),
        ("Father's occupation", "father_occupation"),
        ("Displaced", "displaced"),
        ("Educational special needs", "special_needs"),
        ("Debtor", "debtor"),
        ("Tuition fees up to date", "tuition_up_to_date"),
        ("Gender", "gender"),
        ("Scholarship holder", "scholarship"),
        ("Age at enrollment", "age"),
        ("International", "international"),
        # first-semester curricular units
        ("Curricular units 1st sem (credited)", "sem1_credited"),
        ("Curricular units 1st sem (enrolled)", "sem1_enrolled"),
        ("Curricular units 1st sem (evaluations)", "sem1_evaluations"),
        ("Curricular units 1st sem (approved)", "sem1_approved"),
        ("Curricular units 1st sem (grade)", "sem1_grade"),
        ("Curricular units 1st sem (without evaluations)", "sem1_without_eval"),
        # second-semester curricular units
        ("Curricular units 2nd sem (credited)", "sem2_credited"),
        ("Curricular units 2nd sem (enrolled)", "sem2_enrolled"),
        ("Curricular units 2nd sem (evaluations)", "sem2_evaluations"),
        ("Curricular units 2nd sem (approved)", "sem2_approved"),
        ("Curricular units 2nd sem (grade)", "sem2_grade"),
        ("Curricular units 2nd sem (without evaluations)", "sem2_without_eval"),
        # macro-economic indicators
        ("Unemployment rate", "unemployment_rate"),
        ("Inflation rate", "inflation_rate"),
        ("GDP", "gdp"),
        # label column
        ("Target", "target"),
    )

    renamed = df.copy().rename(columns=dict(header_pairs))
    return renamed


def add_feature_engineering(df):
    """Return a copy of ``df`` extended with per-semester ratio features.

    For each semester, when both source columns exist, this adds:

    * ``semN_approval_rate`` = ``semN_approved / semN_enrolled``
    * ``semN_success_ratio`` = ``semN_approved / semN_evaluations``

    A denominator of 0 is swapped for NaN before dividing, so such rows get
    NaN. Finally every +/-inf value anywhere in the frame is turned into NaN.
    """
    result = df.copy()

    # (new feature, numerator column, denominator column)
    ratio_specs = (
        # --- Semester 1 ---
        ("sem1_approval_rate", "sem1_approved", "sem1_enrolled"),
        ("sem1_success_ratio", "sem1_approved", "sem1_evaluations"),
        # --- Semester 2 ---
        ("sem2_approval_rate", "sem2_approved", "sem2_enrolled"),
        ("sem2_success_ratio", "sem2_approved", "sem2_evaluations"),
    )

    for feature, numerator, denominator in ratio_specs:
        if numerator not in result.columns or denominator not in result.columns:
            continue  # source columns missing -> skip this feature
        nonzero_denominator = result[denominator].replace({0: np.nan})
        result[feature] = result[numerator] / nonzero_denominator

    # Replace infinities with NaN across the whole frame
    return result.replace([np.inf, -np.inf], np.nan)

In [None]:
# train_student_success.py

import json
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

from xgboost import XGBClassifier


# Input CSV read by main().
DATA_PATH = "archive/dataset.csv"
# Destination of the joblib-serialized model bundle written by main().
MODEL_PATH = "student_success_multiclass.pkl"
# Destination of the JSON metadata file written by main().
META_PATH  = "student_success_meta.json"


# ----------------------------
# 1) Column rename (snake_case)
# ----------------------------
# Maps each raw CSV header to the snake_case name used by the rest of the code.
# It is applied by clean_column_names() and also stored in the saved bundle.
RENAME_DICT = {
    # demographics / application data
    "Marital status": "marital_status",
    "Application mode": "application_mode",
    "Application order": "application_order",
    "Course": "course",
    "Daytime/evening attendance": "attendance_type",
    "Previous qualification": "previous_qualification",
    # "Nacionality" (sic) matches the header spelling in the raw CSV
    "Nacionality": "nationality",
    "Mother's qualification": "mother_qualification",
    "Father's qualification": "father_qualification",
    "Mother's occupation": "mother_occupation",
    "Father's occupation": "father_occupation",
    "Displaced": "displaced",
    "Educational special needs": "special_needs",
    "Debtor": "debtor",
    "Tuition fees up to date": "tuition_up_to_date",
    "Gender": "gender",
    "Scholarship holder": "scholarship",
    "Age at enrollment": "age",
    "International": "international",

    # first-semester curricular units
    "Curricular units 1st sem (credited)": "sem1_credited",
    "Curricular units 1st sem (enrolled)": "sem1_enrolled",
    "Curricular units 1st sem (evaluations)": "sem1_evaluations",
    "Curricular units 1st sem (approved)": "sem1_approved",
    "Curricular units 1st sem (grade)": "sem1_grade",
    "Curricular units 1st sem (without evaluations)": "sem1_without_eval",

    # second-semester curricular units
    "Curricular units 2nd sem (credited)": "sem2_credited",
    "Curricular units 2nd sem (enrolled)": "sem2_enrolled",
    "Curricular units 2nd sem (evaluations)": "sem2_evaluations",
    "Curricular units 2nd sem (approved)": "sem2_approved",
    "Curricular units 2nd sem (grade)": "sem2_grade",
    "Curricular units 2nd sem (without evaluations)": "sem2_without_eval",

    # macro-economic indicators
    "Unemployment rate": "unemployment_rate",
    "Inflation rate": "inflation_rate",
    "GDP": "gdp",

    # label column
    "Target": "target",
}


def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Rename raw dataset headers to snake_case using ``RENAME_DICT``.

    The rename happens on a copy, so the caller's frame is not modified.
    Columns that are not keys of ``RENAME_DICT`` keep their names.
    """
    renamed = df.copy().rename(columns=RENAME_DICT)
    return renamed


# ----------------------------
# 2) Feature engineering
# ----------------------------
def add_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with per-semester ratio features.

    For each semester, when both source columns exist, this adds:

    * ``semN_approval_rate`` = ``semN_approved / semN_enrolled``
    * ``semN_success_ratio`` = ``semN_approved / semN_evaluations``
    * ``semN_noeval_rate``   = ``semN_without_eval / semN_enrolled``

    A denominator of 0 is swapped for NaN before dividing, so such rows get
    NaN. Finally every +/-inf value anywhere in the frame is turned into NaN.
    """
    result = df.copy()

    # (new feature, numerator column, denominator column)
    ratio_specs = (
        # sem1 rates
        ("sem1_approval_rate", "sem1_approved", "sem1_enrolled"),
        ("sem1_success_ratio", "sem1_approved", "sem1_evaluations"),
        ("sem1_noeval_rate", "sem1_without_eval", "sem1_enrolled"),
        # sem2 rates
        ("sem2_approval_rate", "sem2_approved", "sem2_enrolled"),
        ("sem2_success_ratio", "sem2_approved", "sem2_evaluations"),
        ("sem2_noeval_rate", "sem2_without_eval", "sem2_enrolled"),
    )

    for feature, numerator, denominator in ratio_specs:
        if not {denominator, numerator}.issubset(result.columns):
            continue  # source columns missing -> skip this feature
        nonzero_denominator = result[denominator].replace({0: np.nan})
        result[feature] = result[numerator] / nonzero_denominator

    # clean: infinities become NaN across the whole frame
    return result.replace([np.inf, -np.inf], np.nan)


# ----------------------------
# 3) Target mapping (3-class)
# ----------------------------
def normalize_target(s: pd.Series) -> pd.Series:
    """Map free-form target labels onto ``Dropout`` / ``Enrolled`` / ``Graduate``.

    Each value is converted to ``str``, stripped of surrounding whitespace and
    lower-cased before lookup, so e.g. ``'dropout'`` or ``'Enrolled '`` are
    accepted.

    Raises:
        ValueError: if any value does not match one of the three labels; the
            message lists the offending original values.
    """
    canonical = {
        "dropout": "Dropout",
        "enrolled": "Enrolled",
        "graduate": "Graduate",
    }
    normalized = s.astype(str).str.strip().str.lower().map(canonical)

    unmapped = normalized.isna()
    if not unmapped.any():
        return normalized

    # Report the original (un-normalized) spellings, de-duplicated and sorted.
    bad = sorted(set(s.loc[unmapped].astype(str)))
    raise ValueError(f"Unexpected target values: {bad}")


# ----------------------------
# 4) Build training pipeline
# ----------------------------
def build_pipeline(X: pd.DataFrame) -> Pipeline:
    """Assemble the unfitted preprocessing + XGBoost pipeline for ``X``.

    Column roles are derived from the dtypes of ``X``:

    * object / category / bool columns -> most-frequent imputation followed
      by one-hot encoding (unknown categories are ignored at transform time);
    * every other column -> median imputation.

    The returned pipeline has two steps, ``"prep"`` (the column transformer)
    and ``"model"`` (a 3-class soft-probability ``XGBClassifier``).
    """
    # Note: after renaming, most cols are numeric ints/floats -> treat as numeric
    categorical_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    numeric_cols = [name for name in X.columns if name not in categorical_cols]

    numeric_branch = Pipeline([("imputer", SimpleImputer(strategy="median"))])
    categorical_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_branch, numeric_cols),
            ("cat", categorical_branch, categorical_cols),
        ],
        remainder="drop",
    )

    # Hyperparameters collected in one place so they are easy to scan and tune.
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": 3,
        "n_estimators": 700,
        "max_depth": 6,
        "learning_rate": 0.05,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "reg_lambda": 1.0,
        "random_state": 42,
        "eval_metric": "mlogloss",
    }
    classifier = XGBClassifier(**xgb_params)

    return Pipeline([
        ("prep", preprocessor),
        ("model", classifier),
    ])


# ----------------------------
# 5) Prediction helper
# ----------------------------
def predict_one(bundle: dict, student_features: dict) -> dict:
    """Score a single student with a trained model bundle.

    Args:
        bundle: dict as saved by ``main()``. Must contain ``"pipeline"``
            (fitted pipeline exposing ``predict_proba``) and ``"classes"``
            (list of class names ordered by encoded label, including
            ``"Dropout"``). If ``"rename_dict"`` is present it is applied to
            the input columns.
        student_features: one student's feature values keyed by column name.
            Keys may be the snake_case names or the raw dataset headers.

    Returns:
        dict with ``prediction`` (class name), ``confidence`` and
        ``dropout_risk_score`` (percentages rounded to 1 decimal) and
        ``probs_percent`` (per-class percentages).

    Raises:
        ValueError: if ``"Dropout"`` is not in ``bundle["classes"]``.
    """
    pipe = bundle["pipeline"]
    classes = bundle["classes"]

    X = pd.DataFrame([student_features])

    # The bundle carries the training-time header mapping; applying it here
    # lets callers pass raw dataset headers as well. Keys that are already
    # snake_case are not in the mapping, so for them this is a no-op.
    rename_dict = bundle.get("rename_dict")
    if rename_dict:
        X = X.rename(columns=rename_dict)

    X = add_feature_engineering(X)

    prob = pipe.predict_proba(X)[0]  # order aligns with classes
    idx = int(np.argmax(prob))

    prediction = classes[idx]
    confidence = float(prob[idx]) * 100

    # (optional) dropout risk score for your website
    dropout_idx = classes.index("Dropout")
    dropout_risk = float(prob[dropout_idx]) * 100

    return {
        "prediction": prediction,
        "confidence": round(confidence, 1),
        "dropout_risk_score": round(dropout_risk, 1),
        "probs_percent": {classes[i]: round(float(prob[i]) * 100, 1) for i in range(len(classes))},
    }


# ----------------------------
# 6) MAIN
# ----------------------------
def main():
    """Train, evaluate and persist the 3-class student-outcome model.

    Steps, in order:
      1. read ``DATA_PATH`` and apply column renaming + feature engineering;
      2. encode the target as 0/1/2 for Dropout/Enrolled/Graduate;
      3. make a stratified 80/20 train/test split;
      4. build and fit the pipeline on the training part;
      5. print the confusion matrix and classification report for the test part;
      6. dump the model bundle to ``MODEL_PATH`` and metadata to ``META_PATH``;
      7. run a demo prediction for one example student and print it.

    Raises:
        ValueError: if the target column is missing after renaming, or if it
            contains unexpected labels (raised by ``normalize_target``).
    """
    df = pd.read_csv(DATA_PATH)

    # rename + FE
    df = clean_column_names(df)
    df = add_feature_engineering(df)

    if "target" not in df.columns:
        raise ValueError("Column 'Target' was not found (after renaming expected 'target').")

    df.info()

    # y (3 classes)
    y_text = normalize_target(df["target"])
    classes = ["Dropout", "Enrolled", "Graduate"]
    label_map = {c: i for i, c in enumerate(classes)}
    y = y_text.map(label_map).astype(int)

    # X
    X = df.drop(columns=["target"])

    # split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # pipeline (must be built and fitted before any evaluation below)
    pipe = build_pipeline(X_train)
    pipe.fit(X_train, y_train)

    # eval: argmax over class probabilities yields the encoded labels 0/1/2
    probs = pipe.predict_proba(X_test)
    preds = np.argmax(probs, axis=1)

    print("\n=== Confusion Matrix ===")
    print(confusion_matrix(y_test, preds))

    print("\n=== Classification Report ===")
    print(classification_report(y_test, preds, target_names=classes, digits=4))

    # save bundle
    bundle = {
        "pipeline": pipe,
        "classes": classes,
        "label_map": label_map,
        "rename_dict": RENAME_DICT,
        "engineered_features": [
            "sem1_approval_rate", "sem1_success_ratio", "sem1_noeval_rate",
            "sem2_approval_rate", "sem2_success_ratio", "sem2_noeval_rate",
        ],
    }
    joblib.dump(bundle, MODEL_PATH)

    # save human-readable metadata next to the model
    meta = {
        "data_path": DATA_PATH,
        "model_path": MODEL_PATH,
        "classes": classes,
        "notes": "Use clean_column_names + add_feature_engineering BEFORE training and BEFORE prediction on website."
    }
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Saved model bundle to: {MODEL_PATH}")
    print(f"✅ Saved meta to: {META_PATH}")

    # quick demo prediction
    example_student = {
        # one value for every renamed input column (all of RENAME_DICT except "target")
        "marital_status": 1,
        "application_mode": 17,
        "application_order": 1,
        "course": 33,
        "attendance_type": 1,
        "previous_qualification": 1,
        "nationality": 1,
        "mother_qualification": 1,
        "father_qualification": 1,
        "mother_occupation": 0,
        "father_occupation": 0,
        "displaced": 0,
        "special_needs": 0,
        "debtor": 1,
        "tuition_up_to_date": 0,
        "gender": 1,
        "scholarship": 0,
        "age": 21,
        "international": 1,
        "sem1_credited": 0,
        "sem1_enrolled": 6,
        "sem1_evaluations": 6,
        "sem1_approved": 2,
        "sem1_grade": 10.0,
        "sem1_without_eval": 0,
        "sem2_credited": 0,
        "sem2_enrolled": 6,
        "sem2_evaluations": 6,
        "sem2_approved": 3,
        "sem2_grade": 11.0,
        "sem2_without_eval": 0,
        "unemployment_rate": 7.5,
        "inflation_rate": 1.2,
        "gdp": 1.8
    }

    res = predict_one(bundle, example_student)
    print("\n=== Demo: single student ===")
    print(res)


# Run the full training flow when this code is executed as the main module.
if __name__ == "__main__":
    main()

<class 'pandas.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   marital_status          4424 non-null   int64  
 1   application_mode        4424 non-null   int64  
 2   application_order       4424 non-null   int64  
 3   course                  4424 non-null   int64  
 4   attendance_type         4424 non-null   int64  
 5   previous_qualification  4424 non-null   int64  
 6   nationality             4424 non-null   int64  
 7   mother_qualification    4424 non-null   int64  
 8   father_qualification    4424 non-null   int64  
 9   mother_occupation       4424 non-null   int64  
 10  father_occupation       4424 non-null   int64  
 11  displaced               4424 non-null   int64  
 12  special_needs           4424 non-null   int64  
 13  debtor                  4424 non-null   int64  
 14  tuition_up_to_date      4424 non-null   int64  
 15