In [1]:
import pandas as pd
import os, argparse, joblib
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    cohen_kappa_score,
    make_scorer,
)

In [4]:
# Absolute path of the kernel's current working directory
# (presumably the folder that contains this notebook -- confirm when running elsewhere).
base_path = Path.cwd().resolve()
print("Notebook folder:", base_path)

# The dataset is looked up one level above that folder; print the full path
# so a missing or misplaced file is easy to spot before the pipeline runs.
csv_path = base_path.parent.joinpath("student_data_2.csv")
print("Expected CSV location:", csv_path)

Notebook folder: C:\Users\beste\OneDrive\Masaüstü\praktikum\practical-course\predictive_model\log_regression
Expected CSV location: C:\Users\beste\OneDrive\Masaüstü\praktikum\practical-course\predictive_model\student_data_2.csv


In [5]:
def infer_columns(df: pd.DataFrame, target: str, max_cat_cardinality: int = 20):
    """Split the feature columns of ``df`` into numeric and categorical groups.

    A feature is treated as categorical when either
      * its dtype is ``object`` or ``category``, or
      * its dtype is a signed integer (int8/16/32/64) and it has at most
        ``max_cat_cardinality`` distinct non-null values (integer-coded categories).
    Every remaining feature column is reported as numeric.

    Args:
        df: Full data frame, including the target column.
        target: Name of the target column; it is excluded from both lists.
        max_cat_cardinality: Largest number of distinct values an integer column
            may have and still be treated as categorical. Defaults to 20.

    Returns:
        ``(num_cols, cat_cols)`` where ``num_cols`` keeps the column order of
        ``df`` and ``cat_cols`` is sorted alphabetically.

    Raises:
        KeyError: if ``target`` is not a column of ``df`` (raised by ``DataFrame.drop``).
    """
    X = df.drop(columns=[target])

    # Text-like columns are always categorical.
    cat_set = set(X.select_dtypes(include=["object", "category"]).columns)

    # Low-cardinality integer columns are assumed to be encoded categories.
    int_cols = X.select_dtypes(include=["int64", "int32", "int16", "int8"]).columns
    cat_set.update(c for c in int_cols if X[c].nunique() <= max_cat_cardinality)

    cat_cols = sorted(cat_set)
    # Membership is checked against the set, not the sorted list.
    num_cols = [c for c in X.columns if c not in cat_set]
    return num_cols, cat_cols

In [6]:
def plot_confusion_matrix(cm, labels, outpath="confusion_matrix.png", title="Confusion Matrix – Baseline"):
    """Draw a confusion matrix as an annotated image and save it to ``outpath``.

    Args:
        cm: 2-D confusion matrix (array-like); element ``[i, j]`` is written
            into the cell at row ``i`` (true class) and column ``j`` (predicted class).
        labels: Class labels used for both axes' tick labels.
        outpath: File path the figure is written to.
        title: Title shown above the plot.

    Returns:
        None. The figure is written to disk and then closed.
    """
    # Accept nested lists as well as arrays, and take both dimensions from the
    # data instead of assuming the matrix is square.
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape

    fig, ax = plt.subplots()
    try:
        ax.imshow(cm)

        ticks = range(len(labels))
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticklabels(labels)

        # Write each count into its cell (x = column index, y = row index).
        for i in range(n_rows):
            for j in range(n_cols):
                ax.text(j, i, cm[i, j], ha="center", va="center")

        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(outpath)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

In [21]:
def main():
    """Train, cross-validate and evaluate a logistic-regression model on student_data_2.csv.

    Steps:
      1. Load ``student_data_2.csv`` from the parent of the current working
         directory and normalise its column names.
      2. Detect the target column (by name, then by alias, then by its values).
      3. Make a stratified train/test split.
      4. Build an impute / one-hot + LogisticRegression pipeline and report
         5-fold and 10-fold cross-validation accuracy and Cohen's kappa on the
         training split.
      5. Fit on the training split, print test-set metrics and save a
         confusion-matrix image.
      6. Write predictions and class probabilities for every row of the CSV.
      7. Save the fitted pipeline with joblib.

    All output files are written to the current working directory.

    Raises:
        ValueError: if no target column can be identified.
    """
    parser = argparse.ArgumentParser(description="Logistic regression model on student_data_2 with CV.")

    # === use student_data_2.csv ===
    # NOTE: --csv is declared but never read; the CSV path is derived from base_path below.
    parser.add_argument("--csv", default=os.path.join("predictive_model", "student_data_2.csv"))
    parser.add_argument("--target", default="Target")
    parser.add_argument("--sep", default=";")
    parser.add_argument("--test_size", type=float, default=0.20)
    parser.add_argument("--seed", type=int, default=42)

    # === separate output names so you don't overwrite the old ones ===
    parser.add_argument("--save_model", default=os.path.join("predictive_model", "baseline_model_student2.joblib"))
    parser.add_argument("--pred_out", default=os.path.join("predictive_model", "predictions_student2.csv"))
    parser.add_argument("--cm_out", default=os.path.join("predictive_model", "confusion_matrix_student2.png"))

    # In a notebook there are no CLI args, so only the defaults are used.
    args = parser.parse_args(args=[])

    # The output locations declared above are replaced by paths in the working directory.
    base_path = Path().resolve()
    args.pred_out = base_path / "predictions_student2.csv"
    args.save_model = base_path / "baseline_model_student2.joblib"
    args.cm_out = base_path / "confusion_matrix_student2.png"

    # ===== 1) Load CSV =====
    csv_path = base_path.parent / "student_data_2.csv"
    print("Loading CSV from:", csv_path)

    df = pd.read_csv(csv_path, sep=args.sep, engine="python", encoding="utf-8-sig")
    print(f"Loaded {len(df)} rows")

    # Strip BOM remnants / surrounding whitespace and replace spaces with underscores.
    df.columns = [str(c).replace("\ufeff", "").strip().replace(" ", "_") for c in df.columns]

    # ===== Target detection =====
    cand_target = args.target.replace(" ", "_")
    if cand_target in df.columns:
        target_col = cand_target
    else:
        # Second attempt: case-insensitive match against common target names.
        aliases = ["Target", "target", "Status", "Outcome", "Result", "Label"]
        alias_lower = {a.replace(" ", "_").lower() for a in aliases}
        found = [c for c in df.columns if c.lower() in alias_lower]
        if not found:
            # Last attempt: a column whose values are only the known class
            # labels (blank strings allowed) and that holds at least two of them.
            known = {"Dropout", "Graduate", "Enrolled"}
            for c in df.columns:
                vals = set(df[c].astype(str).str.strip().unique())
                if not (vals - known - {""}) and len(vals & known) >= 2:
                    found.append(c)
            if not found:
                raise ValueError("Target column not found.")
        target_col = found[0]

    print("Detected target column:", target_col)

    # ===== 2) Split =====
    y = df[target_col].astype(str)
    X = df.drop(columns=[target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed, stratify=y
    )

    # ===== 3) Preprocess + Model =====
    num_cols, cat_cols = infer_columns(df, target_col)

    pre = ColumnTransformer(
        [
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ],
        remainder="drop"
    )

    clf = LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        random_state=args.seed
    )

    pipe = Pipeline([
        ("pre", pre),
        ("clf", clf)
    ])

    # ===== 3b) Cross-Validation on training data (same format as other models) =====
    cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=args.seed)
    cv10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=args.seed)

    scoring = {
        "accuracy": "accuracy",
        "kappa": make_scorer(cohen_kappa_score),
    }

    print("\n=== 5-fold Cross-Validation (training set) ===")
    cv5_results = cross_validate(
        pipe,
        X_train,
        y_train,
        cv=cv5,
        scoring=scoring,
        n_jobs=-1,
    )
    print(f"Accuracy: mean={cv5_results['test_accuracy'].mean():.3f}, std={cv5_results['test_accuracy'].std():.3f}")
    print(f"Kappa:    mean={cv5_results['test_kappa'].mean():.3f}, std={cv5_results['test_kappa'].std():.3f}")

    print("\n=== 10-fold Cross-Validation (training set) ===")
    cv10_results = cross_validate(
        pipe,
        X_train,
        y_train,
        cv=cv10,
        scoring=scoring,
        n_jobs=-1,
    )
    print(f"Accuracy: mean={cv10_results['test_accuracy'].mean():.3f}, std={cv10_results['test_accuracy'].std():.3f}")
    print(f"Kappa:    mean={cv10_results['test_kappa'].mean():.3f}, std={cv10_results['test_kappa'].std():.3f}")

    # ===== 4) Fit final model on the full training set =====
    # Fitted once; a second identical fit would only repeat the same work.
    pipe.fit(X_train, y_train)

    # ===== 5) Evaluate on hold-out test set =====
    y_pred = pipe.predict(X_test)

    print("\n=== Classification Report (test set) ===")
    print(classification_report(y_test, y_pred, digits=3))

    # Kappa score
    kappa = cohen_kappa_score(y_test, y_pred)
    print(f"\n⭐ Cohen Kappa Score (test set): {kappa:.4f}")

    # Confusion matrix (label order fixed so rows/columns match the tick labels)
    labels_sorted = sorted(y.unique())
    cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
    plot_confusion_matrix(cm, labels_sorted, outpath=args.cm_out,
                          title="Confusion Matrix – Logistic Regression (student_data_2)")
    print(f"🖼️ Confusion matrix saved to: {args.cm_out}")

    # ===== 6) Predict on full CSV =====
    # These rows include the training split, so this file is for inspection,
    # not for measuring generalisation.
    proba = pipe.predict_proba(X)
    out = df.copy()
    out["prediction"] = pipe.predict(X)
    classes = pipe.named_steps["clf"].classes_

    # One probability column per class, in the classifier's class order.
    for i, c in enumerate(classes):
        out[f"p_{c}"] = proba[:, i]

    out.to_csv(args.pred_out, index=False)
    print(f"📁 Predictions saved to: {args.pred_out}")

    # ===== 7) Save Model =====
    joblib.dump(pipe, args.save_model)
    print(f"💾 Model saved to: {args.save_model}")

# %%
# Run the pipeline with its built-in defaults; output files are written to the working directory.
main()

Loading CSV from: /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/student_data_2.csv
Loaded 1000 rows
Detected target column: Output

=== 5-fold Cross-Validation (training set) ===
Accuracy: mean=0.731, std=0.029
Kappa:    mean=0.556, std=0.050

=== 10-fold Cross-Validation (training set) ===
Accuracy: mean=0.731, std=0.040
Kappa:    mean=0.555, std=0.067

=== Classification Report (test set) ===
              precision    recall  f1-score   support

     Dropout      0.765     0.672     0.716        58
    Enrolled      0.311     0.400     0.350        35
    Graduate      0.827     0.804     0.815       107

    accuracy                          0.695       200
   macro avg      0.634     0.625     0.627       200
weighted avg      0.719     0.695     0.705       200


⭐ Cohen Kappa Score (test set): 0.4987
🖼️ Confusion matrix saved to: /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/log_regression/confusion_matrix_st

In [12]:
def main():
    """Train and evaluate the logistic-regression model and report per-subgroup error rates.

    Steps:
      1. Load ``student_data_2.csv`` from the parent of the current working
         directory and normalise its column names.
      2. Detect the target column (by name, then by alias, then by its values).
      3. Make a stratified train/test split.
      4. Build an impute / one-hot + LogisticRegression pipeline and report
         5-fold cross-validation accuracy and Cohen's kappa on the training split.
      5. Fit on the training split, print test-set metrics and save a
         confusion-matrix image.
      6. For each value of ``Application_mode`` in the test split, print the
         one-vs-rest false-positive and false-negative rate of every class.
      7. Write predictions and class probabilities for every row of the CSV and
         save the fitted pipeline with joblib.

    All output files are written to the current working directory.

    Raises:
        ValueError: if no target column can be identified.
    """
    parser = argparse.ArgumentParser(description="Logistic regression model on student_data_2 with CV.")
    # NOTE: --csv is declared but never read; the CSV path is derived from base_path below.
    parser.add_argument("--csv", default=os.path.join("predictive_model", "student_data_2.csv"))
    parser.add_argument("--target", default="Target")
    parser.add_argument("--sep", default=";")
    parser.add_argument("--test_size", type=float, default=0.20)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--save_model", default=os.path.join("predictive_model", "baseline_model_student2.joblib"))
    parser.add_argument("--pred_out", default=os.path.join("predictive_model", "predictions_student2.csv"))
    parser.add_argument("--cm_out", default=os.path.join("predictive_model", "confusion_matrix_student2.png"))
    # In a notebook there are no CLI args, so only the defaults are used.
    args = parser.parse_args(args=[])

    # The output locations declared above are replaced by paths in the working directory.
    base_path = Path().resolve()
    args.pred_out = base_path / "predictions_student2.csv"
    args.save_model = base_path / "baseline_model_student2.joblib"
    args.cm_out = base_path / "confusion_matrix_student2.png"

    # ===== Load CSV =====
    csv_path = base_path.parent / "student_data_2.csv"
    print("Loading CSV from:", csv_path)
    df = pd.read_csv(csv_path, sep=args.sep, engine="python", encoding="utf-8-sig")
    print(f"Loaded {len(df)} rows")

    # Strip BOM remnants / surrounding whitespace and replace spaces with underscores.
    df.columns = [str(c).replace("\ufeff", "").strip().replace(" ", "_") for c in df.columns]

    # ===== Detect target =====
    cand_target = args.target.replace(" ", "_")
    if cand_target in df.columns:
        target_col = cand_target
    else:
        # Second attempt: case-insensitive match against common target names.
        aliases = ["Target", "target", "Status", "Outcome", "Result", "Label"]
        alias_lower = {a.replace(" ", "_").lower() for a in aliases}
        found = [c for c in df.columns if c.lower() in alias_lower]
        if not found:
            # Last attempt: a column whose values are only the known class
            # labels (blank strings allowed) and that holds at least two of them.
            known = {"Dropout", "Graduate", "Enrolled"}
            for c in df.columns:
                vals = set(df[c].astype(str).str.strip().unique())
                if not (vals - known - {""}) and len(vals & known) >= 2:
                    found.append(c)
            if not found:
                raise ValueError("Target column not found.")
        target_col = found[0]
    print("Detected target column:", target_col)

    # ===== Split =====
    y = df[target_col].astype(str)
    X = df.drop(columns=[target_col])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed, stratify=y
    )

    # ===== Preprocess + Model =====
    num_cols, cat_cols = infer_columns(df, target_col)
    pre = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ], remainder="drop")

    clf = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=args.seed)
    pipe = Pipeline([("pre", pre), ("clf", clf)])

    # ===== Cross-Validation (training split only) =====
    cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=args.seed)
    scoring = {"accuracy": "accuracy", "kappa": make_scorer(cohen_kappa_score)}
    cv5_results = cross_validate(pipe, X_train, y_train, cv=cv5, scoring=scoring, n_jobs=-1)
    print(f"5-fold CV Accuracy: {cv5_results['test_accuracy'].mean():.3f}, std={cv5_results['test_accuracy'].std():.3f}")
    print(f"5-fold CV Kappa: {cv5_results['test_kappa'].mean():.3f}, std={cv5_results['test_kappa'].std():.3f}")

    # ===== Fit final model =====
    pipe.fit(X_train, y_train)

    # ===== Evaluate on test set =====
    y_pred = pipe.predict(X_test)
    print("\n=== Classification Report (test set) ===")
    print(classification_report(y_test, y_pred, digits=3))
    kappa = cohen_kappa_score(y_test, y_pred)
    print(f"\n⭐ Cohen Kappa Score (test set): {kappa:.4f}")

    # ===== Overall confusion matrix =====
    labels_sorted = sorted(y.unique())
    cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
    print("Confusion Matrix:\n", cm)
    plot_confusion_matrix(cm, labels_sorted, outpath=args.cm_out,
                          title="Confusion Matrix – Logistic Regression (student_data_2)")
    print(f"🖼️ Confusion matrix saved to: {args.cm_out}")

    # ===== Subgroup fairness (multi-class safe) =====
    group_col = "Application_mode"  # change to any categorical variable
    if group_col in X_test.columns:
        print(f"\n=== Subgroup Fairness by {group_col} ===")
        # The whole loop lives under the guard so a missing column cannot
        # cause a NameError on an undefined group list.
        for g in X_test[group_col].unique():
            # Plain boolean array: y_pred is a NumPy array and must be masked by
            # position, not by the (shuffled) pandas index of X_test.
            idx = (X_test[group_col] == g).to_numpy()
            y_true_g = y_test[idx]
            y_pred_g = y_pred[idx]
            if len(y_true_g) == 0:
                continue
            cm_g = confusion_matrix(y_true_g, y_pred_g, labels=labels_sorted)
            print(f"\nGroup: {g} | Count: {len(y_true_g)}")
            # One-vs-rest rates per class, derived from the group's confusion matrix.
            for i, label in enumerate(labels_sorted):
                TP = cm_g[i, i]
                FN = cm_g[i, :].sum() - TP
                FP = cm_g[:, i].sum() - TP
                TN = cm_g.sum() - (TP + FP + FN)
                # Report 0 when a rate's denominator is empty for this group.
                FPR = FP / (FP + TN) if (FP + TN) > 0 else 0
                FNR = FN / (FN + TP) if (FN + TP) > 0 else 0
                print(f"  Class: {label} | FPR: {FPR:.4f} | FNR: {FNR:.4f}")
    else:
        print(f"\nColumn '{group_col}' not found; skipping subgroup fairness analysis.")

    # ===== Predict on full dataset =====
    # These rows include the training split, so this file is for inspection,
    # not for measuring generalisation.
    proba = pipe.predict_proba(X)
    out = df.copy()
    out["prediction"] = pipe.predict(X)
    classes = pipe.named_steps["clf"].classes_
    # One probability column per class, in the classifier's class order.
    for i, c in enumerate(classes):
        out[f"p_{c}"] = proba[:, i]
    out.to_csv(args.pred_out, index=False)
    print(f"📁 Predictions saved to: {args.pred_out}")

    # ===== Save model =====
    joblib.dump(pipe, args.save_model)
    print(f"💾 Model saved to: {args.save_model}")

# %%
# Run the pipeline with its built-in defaults; output files are written to the working directory.
main()

Loading CSV from: C:\Users\beste\OneDrive\Masaüstü\praktikum\practical-course\predictive_model\student_data_2.csv
Loaded 1000 rows
Detected target column: Output
5-fold CV Accuracy: 0.731, std=0.029
5-fold CV Kappa: 0.556, std=0.050

=== Classification Report (test set) ===
              precision    recall  f1-score   support

     Dropout      0.765     0.672     0.716        58
    Enrolled      0.311     0.400     0.350        35
    Graduate      0.827     0.804     0.815       107

    accuracy                          0.695       200
   macro avg      0.634     0.625     0.627       200
weighted avg      0.719     0.695     0.705       200


⭐ Cohen Kappa Score (test set): 0.4987
Confusion Matrix:
 [[39 14  5]
 [ 8 14 13]
 [ 4 17 86]]
🖼️ Confusion matrix saved to: C:\Users\beste\OneDrive\Masaüstü\praktikum\practical-course\predictive_model\log_regression\confusion_matrix_student2.png

=== Subgroup Fairness by Application_mode ===

Group: 1 | Count: 83
  Class: Dropout