In [10]:
import os, joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score

from pygam import LogisticGAM


def infer_columns(df: pd.DataFrame, max_int_categories: int = 20):
    """Split the columns of ``df`` into numeric and categorical name lists.

    A column is treated as categorical when either

    * its dtype is ``object`` or ``category``, or
    * its dtype is one of ``int64``/``int32``/``int16``/``int8`` and it has at
      most ``max_int_categories`` distinct values (``Series.nunique`` with its
      default settings, so missing values are not counted).

    Every other column is reported as numeric.

    Args:
        df: Feature frame to inspect. It is not modified.
        max_int_categories: Largest number of distinct values an integer
            column may have and still be considered categorical. The default
            of 20 reproduces the previously hard-coded cutoff.

    Returns:
        ``(num_cols, cat_cols)``: ``num_cols`` keeps the column order of
        ``df``; ``cat_cols`` is sorted and free of duplicates.
    """
    categorical = set(df.select_dtypes(include=["object", "category"]).columns)

    # Integer codes with few distinct values are handled as categories.
    int_columns = df.select_dtypes(include=["int64", "int32", "int16", "int8"]).columns
    categorical.update(
        name for name in int_columns if df[name].nunique() <= max_int_categories
    )

    cat_cols = sorted(categorical)
    num_cols = [name for name in df.columns if name not in categorical]
    return num_cols, cat_cols


def plot_confusion_matrix(cm, labels, outpath="gam_confusion_matrix_student2.png",
                          title="Confusion Matrix – GAM (student_data_2)"):
    """Render a confusion matrix as an annotated image and save it to disk.

    Args:
        cm: 2-D confusion matrix (rows = true class, columns = predicted
            class). Anything ``np.asarray`` accepts, e.g. a NumPy array or a
            nested list.
        labels: Class names used as tick labels on both axes.
        outpath: Destination passed to ``Figure.savefig``.
        title: Title drawn above the plot.

    Raises:
        ValueError: If ``cm`` is not two-dimensional.
    """
    cm = np.asarray(cm)
    if cm.ndim != 2:
        raise ValueError(f"cm must be 2-dimensional, got shape {cm.shape}")

    tick_positions = range(len(labels))
    fig, ax = plt.subplots()
    try:
        ax.imshow(cm)
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticklabels(labels)

        # imshow puts columns on the x axis and rows on the y axis, so the
        # cell (row, col) is annotated at x=col, y=row.
        for (row, col), count in np.ndenumerate(cm):
            ax.text(col, row, count, ha="center", va="center")

        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(outpath)
    finally:
        # Close the figure even when drawing or saving fails, so it does not
        # stay registered with pyplot.
        plt.close(fig)


def _detect_target_column(columns, requested=None):
    """Pick the target column name out of ``columns``.

    Args:
        columns: Column names, already normalised (spaces replaced by ``_``).
        requested: Optional user-supplied target name; spaces are replaced by
            underscores before it is looked up.

    Returns:
        ``requested`` (normalised) when it is present in ``columns``;
        otherwise the first column whose lower-cased name is one of the
        well-known target names.

    Raises:
        ValueError: If no column qualifies.
    """
    if requested is not None:
        normalized = requested.replace(" ", "_")
        if normalized in columns:
            return normalized

    known_targets = {"output", "target", "status", "outcome", "result", "label"}
    for name in columns:
        if name.lower() in known_targets:
            return name
    raise ValueError("Could not identify target column. Set args.target manually.")


def main():
    """Train a LogisticGAM on the student CSV and write all artefacts.

    Steps, in order:

    1. Read the CSV named in ``Args`` from the current working directory.
    2. Normalise the column names and locate the target column.
    3. Keep only rows whose target is ``"Dropout"`` or ``"Graduate"``.
    4. Stratified train/test split; fit the preprocessing on the training
       part only, then fit the GAM.
    5. Print a classification report and Cohen's kappa for the test part and
       save the confusion-matrix image.
    6. Predict on every filtered row (training rows included), write the
       predictions CSV and dump model + preprocessing + label encoder with
       joblib.

    Raises:
        ValueError: If the target column cannot be identified, or if fewer
            than two classes remain after filtering.
    """
    class Args:
        csv = "student_data_2.csv"           # input file, relative to the CWD
        target = None                        # None -> auto-detect the target column
        sep = ";"                            # CSV field separator
        test_size = 0.2                      # fraction held out for testing
        seed = 42                            # random_state of the split
        save_model = "gam_model_student2.joblib"
        pred_out = "gam_predictions_student2.csv"
        cm_out = "gam_confusion_matrix_student2.png"

    args = Args()
    base_path = Path().resolve()
    csv_path = base_path / args.csv
    args.save_model = base_path / args.save_model
    args.pred_out = base_path / args.pred_out
    args.cm_out = base_path / args.cm_out

    print("Loading CSV from:", csv_path)
    df = pd.read_csv(csv_path, sep=args.sep, engine="python", encoding="utf-8-sig")
    print(f"Loaded {len(df)} rows.")

    # Strip stray BOMs / whitespace and make the names underscore-separated.
    df.columns = [str(c).replace("\ufeff", "").strip().replace(" ", "_") for c in df.columns]

    target_col = _detect_target_column(list(df.columns), args.target)
    print("Detected target column:", target_col)

    df_binary = df[df[target_col].isin(["Dropout", "Graduate"])].copy()
    print(f"Filtered to binary target: {len(df_binary)} rows")
    if df_binary[target_col].nunique() < 2:
        # Fail here with a clear message instead of deep inside the
        # stratified split or the model fit.
        raise ValueError(
            f"Column {target_col!r} must contain both 'Dropout' and 'Graduate' rows."
        )

    y = df_binary[target_col]
    X = df_binary.drop(columns=[target_col])

    label_enc = LabelEncoder()
    y_enc = label_enc.fit_transform(y)

    num_cols, cat_cols = infer_columns(X)

    pre = ColumnTransformer(
        [
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ],
        remainder="drop",
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=args.test_size, random_state=args.seed, stratify=y_enc
    )

    # The preprocessing is fitted on the training split only.
    X_train_pre = pre.fit_transform(X_train).astype(np.float64)
    X_test_pre = pre.transform(X_test).astype(np.float64)

    print("Fitting GAM model (no CV)…")
    gam = LogisticGAM(n_splines=5, max_iter=200).fit(X_train_pre, y_train)

    # NOTE(review): pyGAM's LogisticGAM.predict appears to return booleans
    # (mu > 0.5). Cast to the integer codes produced by the label encoder so
    # the metrics and inverse_transform receive class indices rather than
    # something that could be interpreted as a boolean mask.
    y_pred = np.asarray(gam.predict(X_test_pre)).astype(int)
    print("\n=== Classification Report (test set) ===")
    print(classification_report(y_test, y_pred, digits=3, target_names=label_enc.classes_))

    cm = confusion_matrix(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    print(f"\n⭐ Cohen Kappa Score: {kappa:.4f}")

    plot_confusion_matrix(cm, label_enc.classes_, args.cm_out)
    print(f"🖼️ Saved confusion matrix to {args.cm_out}")

    # full dataset (training rows included, so these are not held-out scores)
    X_full_pre = pre.transform(X).astype(np.float64)
    probs = gam.predict_proba(X_full_pre)
    full_pred = np.asarray(gam.predict(X_full_pre)).astype(int)

    out = df_binary.copy()
    out["prediction"] = label_enc.inverse_transform(full_pred)
    if probs.ndim == 1:
        # A single probability per row is taken as the probability of the
        # class encoded as 1; expand it to one column per class.
        probs = np.column_stack([1 - probs, probs])
    for i, c in enumerate(label_enc.classes_):
        out[f"p_{c}"] = probs[:, i]
    out.to_csv(args.pred_out, index=False)
    print(f"✅ Wrote predictions to {args.pred_out}")

    joblib.dump({"model": gam, "preprocess": pre, "label_encoder": label_enc}, args.save_model)
    print(f"💾 Saved GAM model to {args.save_model}")


# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading CSV from: /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/gam/student_data_2.csv
Loaded 1000 rows.
Detected target column: Output
Filtered to binary target: 825 rows
Fitting GAM model (no CV)…

=== Classification Report (test set) ===
              precision    recall  f1-score   support

     Dropout      0.878     0.741     0.804        58
    Graduate      0.871     0.944     0.906       107

    accuracy                          0.873       165
   macro avg      0.874     0.843     0.855       165
weighted avg      0.873     0.873     0.870       165


⭐ Cohen Kappa Score: 0.7106
🖼️ Saved confusion matrix to /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/gam/gam_confusion_matrix_student2.png
✅ Wrote predictions to /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/gam/gam_predictions_student2.csv
üíæ Saved GAM model to /Users/defneelagoz/Desktop/practical course/practical-cou