In [24]:
# %%
import os, argparse, joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path


In [25]:
# %%
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from pygam import LogisticGAM, s, f
from sklearn.metrics import classification_report, confusion_matrix

In [26]:
# %%
# Resolve the current working directory to an absolute path.
# NOTE(review): this assumes the kernel was started in the notebook's own folder — confirm.
base_path = Path().resolve()
print("Notebook folder:", base_path)

# The CSV is expected one directory above the folder resolved above.
csv_path = base_path.parent / "student_data.csv"

# Load the semicolon-separated CSV; the "utf-8-sig" codec drops a leading
# byte-order mark if the file has one.
df = pd.read_csv(csv_path, sep=";", engine="python", encoding="utf-8-sig")
print(f"Loaded {len(df)} rows from {csv_path}")


Notebook folder: C:\Users\beste\OneDrive\Masaüstü\praktikum\practical-course\predictive_model\gam
Loaded 4424 rows from C:\Users\beste\OneDrive\Masaüstü\praktikum\practical-course\predictive_model\student_data.csv


In [27]:
# %%
def infer_columns(df: pd.DataFrame, target: str, max_cardinality: int = 20):
    """Split the feature columns of ``df`` into numeric and categorical lists.

    A feature column is categorical when its dtype is ``object`` or
    ``category``, or when it is a signed-integer column (int8/16/32/64) with
    at most ``max_cardinality`` distinct values. Every other feature column
    is reported as numeric.

    Args:
        df: Frame holding the features and the ``target`` column.
        target: Name of the label column; it is excluded from both lists.
        max_cardinality: Largest number of distinct values an integer column
            may have and still be treated as categorical (default 20).

    Returns:
        ``(num_cols, cat_cols)``: ``num_cols`` keeps the column order of
        ``df``; ``cat_cols`` is sorted by column name.

    Raises:
        KeyError: if ``target`` is not a column of ``df``.
    """
    X = df.drop(columns=[target])

    categorical = set(X.select_dtypes(include=["object", "category"]).columns)

    # Integer-coded categories: low-cardinality signed-integer columns.
    int_like = X.select_dtypes(include=["int64", "int32", "int16", "int8"])
    categorical.update(
        c for c in int_like.columns if X[c].nunique() <= max_cardinality
    )

    cat_cols = sorted(categorical)
    # Set membership keeps this linear in the number of columns.
    num_cols = [c for c in X.columns if c not in categorical]
    return num_cols, cat_cols


In [28]:
# %%
def plot_confusion_matrix(cm, labels, outpath="confusion_matrix.png", title="Confusion Matrix ‚Äì GAM"):
    """Render a confusion matrix as an annotated image and save it to disk.

    Args:
        cm: 2-D array-like of counts; row ``i`` / column ``j`` is drawn at
            y=``i`` / x=``j``. Nested lists are accepted.
        labels: Tick labels, used for both axes.
        outpath: Destination passed to ``Figure.savefig``.
        title: Axes title.

    Raises:
        ValueError: if ``cm`` is not two-dimensional.
    """
    # Coerce once so tuple indexing works for list input as well.
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape

    fig, ax = plt.subplots()
    try:
        ax.imshow(cm)

        tick_positions = range(len(labels))
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticklabels(labels)

        # Annotate every cell; iterate over the real shape rather than
        # assuming the matrix is square.
        for i, j in np.ndindex(n_rows, n_cols):
            ax.text(j, i, cm[i, j], ha="center", va="center")

        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(outpath)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

In [1]:
def _detect_target_column(df, requested):
    """Return the name of the label column in ``df``.

    Resolution order: the requested name (spaces replaced by underscores),
    then the first column whose lower-cased name is a known alias, then the
    first column whose stripped string values all lie in
    {"Dropout", "Graduate", "Enrolled"} (empty strings are tolerated).

    Raises:
        ValueError: if no column qualifies.
    """
    cand_target = requested.replace(" ", "_")
    if cand_target in df.columns:
        return cand_target

    aliases = {"target", "status", "outcome", "result", "label"}
    for col in df.columns:
        if col.lower() in aliases:
            return col

    known = {"Dropout", "Graduate", "Enrolled"}
    for col in df.columns:
        values = set(df[col].astype(str).str.strip().unique())
        if not (values - known - {""}):
            return col
    raise ValueError("Couldn't find target column.")


def _build_spline_terms(n_features):
    """Return a fresh pyGAM term list with one spline term per feature index."""
    terms = s(0)
    for i in range(1, n_features):
        terms = terms + s(i)
    return terms


def _fit_one_vs_rest(X_train_p, y_train, classes):
    """Fit one binary LogisticGAM per class label (that class vs. all others).

    Returns a dict mapping each label in ``classes`` to its fitted model.
    """
    n_features = X_train_p.shape[1]
    models = {}
    for cls in classes:
        # Each model gets its own term objects so no fitted state is shared.
        gam = LogisticGAM(_build_spline_terms(n_features))
        is_cls = (y_train == cls).to_numpy().astype(int)
        gam.gridsearch(X_train_p, is_cls)
        models[cls] = gam
    return models


def _ovr_predict_proba(models, classes, X_p):
    """Stack the per-class probabilities column-wise and normalise each row to sum to 1."""
    scores = np.column_stack([models[c].predict_proba(X_p) for c in classes])
    totals = scores.sum(axis=1, keepdims=True)
    # Guard against an all-zero row so the division below stays finite.
    totals = np.where(totals == 0, 1.0, totals)
    return scores / totals


def main():
    """Train and evaluate a GAM classifier on the student CSV and write its artifacts.

    Steps: load ``student_data.csv`` from the parent of the current working
    directory, normalise the headers, locate the target column, make a
    stratified train/test split, impute and one-hot encode the features, fit
    one LogisticGAM per class (one-vs-rest), print a classification report,
    and write ``confusion_matrix.png``, ``predictions.csv`` and
    ``gam_model.joblib`` into the current working directory.

    The saved model is the tuple ``(preprocessor, {class_label: fitted GAM})``.

    Raises:
        ValueError: if no target column is found, if preprocessing yields no
            features, or if the target has fewer than two classes.
    """
    parser = argparse.ArgumentParser(description="GAM student outcome model (single-file).")
    parser.add_argument("--csv", default=os.path.join("predictive_model", "student_data.csv"),
                        help="CSV yolu (default: predictive_model/student_data.csv)")
    parser.add_argument("--target", default="Target", help="Hedef s√ºtun adƒ± (default: Target)")
    parser.add_argument("--sep", default=";", help="CSV ayra√ß (default: ;)")
    parser.add_argument("--test_size", type=float, default=0.20, help="Test oranƒ± (default: 0.20)")
    parser.add_argument("--seed", type=int, default=42, help="Rastgele tohum (default: 42)")
    parser.add_argument("--save_model", default=os.path.join("predictive_model", "gam_model.joblib"),
                        help="Model kaydetme yolu")
    parser.add_argument("--pred_out", default=os.path.join("predictive_model", "predictions.csv"),
                        help="Tahmin √ßƒ±ktƒ±sƒ± CSV yolu")
    parser.add_argument("--cm_out", default=os.path.join("predictive_model", "confusion_matrix.png"),
                        help="Karƒ±≈üƒ±klƒ±k matrisi g√∂rsel yolu")
    # An empty argument list is parsed, so sys.argv is ignored and every option
    # takes its default (presumably because the kernel's own argv would not parse).
    args = parser.parse_args(args=[])

    # Output paths are redirected to the current working directory; the
    # path defaults declared on the parser above are overridden here.
    base_path = Path().resolve()
    args.pred_out = base_path / "predictions.csv"
    args.save_model = base_path / "gam_model.joblib"
    args.cm_out = base_path / "confusion_matrix.png"

    # Load data (note: --csv is not consulted; the path is derived from base_path).
    csv_path = base_path.parent / "student_data.csv"
    print("Loading CSV from:", csv_path)
    df = pd.read_csv(csv_path, sep=args.sep, engine="python", encoding="utf-8-sig")
    print(f"Loaded {len(df)} rows from {csv_path}")

    # Normalize headers: drop stray BOM characters, trim, replace spaces with underscores.
    df.columns = [str(c).replace("\ufeff", "").strip().replace(" ", "_") for c in df.columns]

    target_col = _detect_target_column(df, args.target)

    # Stratified split on the string-typed labels.
    y = df[target_col].astype(str)
    X = df.drop(columns=[target_col])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed, stratify=y
    )

    # Preprocess. sparse_threshold=0.0 forces a dense array: the one-hot block is
    # sparse and pyGAM needs a plain ndarray.
    num_cols, cat_cols = infer_columns(df, target_col)
    pre = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ], remainder="drop", sparse_threshold=0.0)
    X_train_p = pre.fit_transform(X_train)
    X_test_p = pre.transform(X_test)
    X_full_p = pre.transform(X)

    if X_train_p.shape[1] == 0:
        raise ValueError("No feature columns left after preprocessing.")

    # LogisticGAM is a binary model on numeric targets, so the multi-class string
    # target is handled with one model per class (one-vs-rest).
    # NOTE(review): spline terms on one-hot indicator columns are expensive;
    # linear terms may be a better fit for those columns — confirm with the author.
    classes = sorted(y_train.unique())
    if len(classes) < 2:
        raise ValueError("Target column needs at least two classes.")
    class_array = np.asarray(classes, dtype=object)
    models = _fit_one_vs_rest(X_train_p, y_train, classes)

    # Evaluate on the held-out split: predicted label = class with the highest probability.
    proba_test = _ovr_predict_proba(models, classes, X_test_p)
    y_pred = class_array[proba_test.argmax(axis=1)]
    print("=== Classification Report (test set) ===")
    print(classification_report(y_test, y_pred, digits=3))
    labels_sorted = sorted(y.unique())
    cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
    plot_confusion_matrix(cm, labels_sorted, outpath=args.cm_out)
    print(f"üñºÔ∏è Saved confusion matrix to {args.cm_out}")

    # Predictions for every row of the CSV (training rows included).
    proba = _ovr_predict_proba(models, classes, X_full_p)
    out = df.copy()
    out["prediction"] = class_array[proba.argmax(axis=1)]
    for i, c in enumerate(classes):
        out[f"p_{c}"] = proba[:, i]
    out.to_csv(args.pred_out, index=False)
    print(f"‚úÖ Wrote predictions to {args.pred_out}")

    # Save the fitted preprocessor together with the per-class models.
    if args.save_model:
        joblib.dump((pre, models), args.save_model)
        print(f"üíæ Saved model to {args.save_model}")


# Run the pipeline when this cell/script is executed directly. The call used to sit
# inside main() itself, which never started it and would have recursed if called.
if __name__ == "__main__":
    main()