In [1]:
# %%
import os, argparse, joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path


In [2]:
# %%
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [3]:
# %%
# Current notebook folder
base_path = Path().resolve()
print("Notebook folder:", base_path)

# CSV path
csv_path = base_path.parent / "student_data.csv"

# Load CSV
df = pd.read_csv(csv_path, sep=";", engine="python", encoding="utf-8-sig")
print(f"Loaded {len(df)} rows from {csv_path}")


Notebook folder: C:\Users\beste\OneDrive\Masa√ºst√º\praktikum\practical-course\predictive_model
Loaded 4424 rows from C:\Users\beste\OneDrive\Masa√ºst√º\praktikum\practical-course\predictive_model\student_data.csv


In [4]:
# %%
def infer_columns(df: pd.DataFrame, target: str):
    X = df.drop(columns=[target])
    cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)
    # low-cardinality integers -> categorical
    for c in X.select_dtypes(include=["int64", "int32", "int16", "int8"]).columns:
        if X[c].nunique() <= 20:
            cat_cols.append(c)
    cat_cols = sorted(set(cat_cols))
    num_cols = [c for c in X.columns if c not in cat_cols]
    return num_cols, cat_cols

In [5]:
# %%
def plot_confusion_matrix(cm, labels, outpath="confusion_matrix.png", title="Confusion Matrix ‚Äì Baseline"):
    fig, ax = plt.subplots()
    im = ax.imshow(cm)
    ax.set_xticks(range(len(labels))); ax.set_yticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=45, ha="right"); ax.set_yticklabels(labels)
    for i in range(len(cm)):
        for j in range(len(cm)):
            ax.text(j, i, cm[i, j], ha="center", va="center")
    ax.set_xlabel("Predicted"); ax.set_ylabel("True"); ax.set_title(title)
    plt.tight_layout()
    fig.savefig(outpath)
    plt.close(fig)

In [6]:
# %%
def main():
    parser = argparse.ArgumentParser(description="Baseline student outcome model (Jupyter).")
    parser.add_argument("--csv", default=str(csv_path), help="CSV path")
    parser.add_argument("--target", default="Target", help="Target column")
    parser.add_argument("--sep", default=";", help="CSV separator")
    parser.add_argument("--test_size", type=float, default=0.20, help="Test split ratio")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument("--save_model", default=str(base_path / "baseline_model.joblib"), help="Model path")
    parser.add_argument("--pred_out", default=str(base_path / "predictions.csv"), help="Predictions path")
    parser.add_argument("--cm_out", default=str(base_path / "confusion_matrix.png"), help="Confusion matrix path")
    
    # Jupyter-safe parsing
    args = parser.parse_args(args=[])

    # Load CSV
    df = pd.read_csv(args.csv, sep=args.sep, engine="python", encoding="utf-8-sig")

    # Normalize headers
    df.columns = [str(c).replace("\ufeff", "").strip().replace(" ", "_") for c in df.columns]

    # Detect target column
    cand_target = args.target.replace(" ", "_")
    if cand_target in df.columns:
        target_col = cand_target
    else:
        aliases = ["Target", "target", "Status", "Outcome", "Result", "Label"]
        aliases = [a.replace(" ", "_") for a in aliases]
        found = [c for c in df.columns if c in aliases or c.lower() in [a.lower() for a in aliases]]
        if found:
            target_col = found[0]
        else:
            KNOWN = {"Dropout", "Graduate", "Enrolled"}
            found = []
            for c in df.columns:
                vals = set(df[c].astype(str).str.strip().unique())
                if len(vals - KNOWN - {""}) <= 0 and len(vals & KNOWN) >= 2:
                    found.append(c)
            if not found:
                raise ValueError(f"Couldn't find target column. Columns: {df.columns}")
            target_col = found[0]

    # Split
    y = df[target_col].astype(str)
    X = df.drop(columns=[target_col])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed, stratify=y
    )

    # Preprocess & Model
    num_cols, cat_cols = infer_columns(df, target_col)
    pre = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ], remainder="drop")

    clf = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=args.seed)
    pipe = Pipeline([("pre", pre), ("clf", clf)])

    # Fit
    pipe.fit(X_train, y_train)

    # Evaluate
    y_pred = pipe.predict(X_test)
    print("=== Classification Report (test set) ===")
    print(classification_report(y_test, y_pred, digits=3))

    # Confusion matrix
    labels_sorted = sorted(y.unique())
    cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
    plot_confusion_matrix(cm, labels_sorted, outpath=args.cm_out)
    print(f"üñºÔ∏è Saved confusion matrix to {args.cm_out}")

    # Full predictions
    proba = pipe.predict_proba(X)
    out = df.copy()
    out["prediction"] = pipe.predict(X)
    for i, c in enumerate(pipe.named_steps["clf"].classes_):
        out[f"p_{c}"] = proba[:, i]
    out.to_csv(args.pred_out, index=False)
    print(f"‚úÖ Wrote predictions to {args.pred_out}")
    #Save model
    joblib.dump(pipe, args.save_model)
    print(f"üíæ Saved model to {args.save_model}")

# %%
main()


=== Classification Report (test set) ===
              precision    recall  f1-score   support

     Dropout      0.835     0.694     0.758       284
    Enrolled      0.412     0.635     0.500       159
    Graduate      0.866     0.792     0.827       442

    accuracy                          0.732       885
   macro avg      0.704     0.707     0.695       885
weighted avg      0.775     0.732     0.746       885

üñºÔ∏è Saved confusion matrix to C:\Users\beste\OneDrive\Masa√ºst√º\praktikum\practical-course\predictive_model\confusion_matrix.png
‚úÖ Wrote predictions to C:\Users\beste\OneDrive\Masa√ºst√º\praktikum\practical-course\predictive_model\predictions.csv
üíæ Saved model to C:\Users\beste\OneDrive\Masa√ºst√º\praktikum\practical-course\predictive_model\baseline_model.joblib


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
