In [1]:
# %%
import os, argparse, joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# %%
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    cross_val_score,
)
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    cohen_kappa_score,
)

# %%
# helper: infer numeric / categorical columns
def infer_columns(df: pd.DataFrame, target: str, max_int_cardinality: int = 20):
    """Split the feature columns of ``df`` into numeric and categorical lists.

    A feature is considered categorical when its dtype is ``object`` or
    ``category``, or when it is a signed-integer column (int8/16/32/64) with
    at most ``max_int_cardinality`` distinct non-null values. Every other
    feature column is reported as numeric.

    Args:
        df: Frame holding the features and the target column.
        target: Name of the label column; it is excluded from both lists.
        max_int_cardinality: Largest number of distinct values an integer
            column may have and still be treated as categorical.

    Returns:
        ``(num_cols, cat_cols)``: ``num_cols`` keeps the column order of
        ``df``; ``cat_cols`` is sorted by name.

    Raises:
        KeyError: if ``target`` is not a column of ``df``.
    """
    X = df.drop(columns=[target])
    categorical = set(X.select_dtypes(include=["object", "category"]).columns)

    # Treat low-cardinality integers as categorical codes, not magnitudes.
    int_cols = X.select_dtypes(include=["int64", "int32", "int16", "int8"]).columns
    categorical.update(c for c in int_cols if X[c].nunique() <= max_int_cardinality)

    cat_cols = sorted(categorical)
    num_cols = [c for c in X.columns if c not in categorical]
    return num_cols, cat_cols

# %%
def plot_confusion_matrix(
    cm, labels, outpath="decision_tree_confusion_matrix_student2.png",
    title="Confusion Matrix – Decision Tree (student_data_2)"
):
    """Render a confusion matrix as an annotated heat map and save it to disk.

    Args:
        cm: 2-D array-like of counts; row ``i`` / column ``j`` is drawn at
            y-position ``i`` / x-position ``j``.
        labels: Tick labels used for both axes.
        outpath: Destination passed to ``Figure.savefig``.
        title: Figure title.

    The figure is closed before returning, even if drawing or saving fails.
    """
    # Accept nested lists as well as ndarrays; cell lookup below needs an array.
    cm = np.asarray(cm)
    ticks = range(len(labels))

    fig, ax = plt.subplots()
    try:
        ax.imshow(cm)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticklabels(labels)

        # Annotate every cell; iterating the array itself keeps row and
        # column bounds independent instead of assuming a square matrix.
        for (row, col), count in np.ndenumerate(cm):
            ax.text(col, row, count, ha="center", va="center")

        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(outpath)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

# %%
def _detect_target_column(df: pd.DataFrame, requested: str) -> str:
    """Return the name of the label column in ``df``.

    Column names are expected to be strings. Candidates are tried in order:

    1. ``requested`` with spaces replaced by underscores;
    2. the first column whose lower-cased name is a common label name;
    3. the first column whose stripped string values are all known outcome
       labels (empty strings allowed) with at least two of them present.

    Raises:
        ValueError: if no column qualifies.
    """
    candidate = requested.replace(" ", "_")
    if candidate in df.columns:
        return candidate

    alias_names = {"target", "status", "outcome", "result", "label"}
    for col in df.columns:
        if col.lower() in alias_names:
            return col

    known = {"Dropout", "Graduate", "Enrolled"}
    for col in df.columns:
        values = set(df[col].astype(str).str.strip().unique())
        if not (values - known - {""}) and len(values & known) >= 2:
            return col

    raise ValueError("Could not infer target column. Please specify --target.")


def main(argv=None):
    """Train, cross-validate and evaluate a decision tree on student_data_2.

    Steps: load the CSV, detect the target column, make a stratified
    train/test split, run 5- and 10-fold stratified cross-validation on the
    training part, fit the pipeline on the training part, print the test-set
    classification report and Cohen's kappa, save the confusion-matrix image,
    write predictions and class probabilities for every row of the CSV, and
    dump the fitted pipeline with joblib.

    Args:
        argv: Optional sequence of command-line style options (for example
            ``["--seed", "7"]``). ``None`` means "no options": the process
            arguments are deliberately not read, because inside a notebook
            they belong to the Jupyter kernel.

    Raises:
        ValueError: if the target column cannot be determined.
    """
    # Defaults are resolved against the current working directory: outputs go
    # next to the notebook, the input CSV is read from the parent directory.
    base_path = Path().resolve()

    parser = argparse.ArgumentParser(
        description="Decision Tree student outcome model on student_data_2 with CV."
    )
    parser.add_argument("--csv", default=base_path.parent / "student_data_2.csv",
                        help="Input CSV file.")
    parser.add_argument("--target", default="Target")
    parser.add_argument("--sep", default=";")
    parser.add_argument("--test_size", type=float, default=0.20)
    parser.add_argument("--seed", type=int, default=42)

    # Output names are kept distinct from those of the first decision-tree model.
    parser.add_argument("--save_model", default=base_path / "decision_tree_model_student2.joblib",
                        help="Where to dump the fitted pipeline; empty string skips saving.")
    parser.add_argument("--pred_out", default=base_path / "decision_tree_predictions_student2.csv",
                        help="Where to write the per-row predictions CSV.")
    parser.add_argument("--cm_out", default=base_path / "decision_tree_confusion_matrix_student2.png",
                        help="Where to write the confusion-matrix image.")

    args = parser.parse_args([] if argv is None else list(argv))

    # ===== 1) Load Data =====
    csv_path = args.csv
    print("Loading CSV from:", csv_path)

    df = pd.read_csv(csv_path, sep=args.sep, engine="python", encoding="utf-8-sig")
    print(f"Loaded {len(df)} rows")

    # Normalize headers: drop stray BOM characters, trim, spaces -> underscores.
    df.columns = [str(c).replace("\ufeff", "").strip().replace(" ", "_") for c in df.columns]

    target_col = _detect_target_column(df, args.target)
    print("Detected target column:", target_col)

    # ===== 2) Split =====
    y = df[target_col].astype(str)
    X = df.drop(columns=[target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=args.test_size,
        random_state=args.seed,
        stratify=y
    )

    # ===== 3) Preprocessing + Model =====
    num_cols, cat_cols = infer_columns(df, target_col)

    pre = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ],
        remainder="drop"
    )

    clf = DecisionTreeClassifier(
        random_state=args.seed,
        class_weight="balanced",
        max_depth=None,   # unrestricted depth; a candidate for later tuning
    )

    pipe = Pipeline([
        ("pre", pre),
        ("clf", clf)
    ])

    # ===== 3b) Cross-Validation on training set =====
    for n_splits in (5, 10):
        print(f"\n=== {n_splits}-fold Cross-Validation (Accuracy) on training set ===")
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=args.seed)
        scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="accuracy")
        print("Scores per fold:", np.round(scores, 3))
        print(f"Mean accuracy: {scores.mean():.3f}  |  Std: {scores.std():.3f}")

    # ===== 4) Fit final model on training set =====
    pipe.fit(X_train, y_train)

    # ===== 5) Evaluate on test set =====
    y_pred = pipe.predict(X_test)
    print("\n=== Classification Report (test set) ===")
    print(classification_report(y_test, y_pred, digits=3))

    # Cohen's kappa on the held-out test rows.
    kappa = cohen_kappa_score(y_test, y_pred)
    print(f"\n⭐ Cohen Kappa Score (test set): {kappa:.4f}")

    # Fix the label order so matrix rows/columns match the tick labels.
    labels_sorted = sorted(y.unique())

    cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
    plot_confusion_matrix(
        cm, labels_sorted, outpath=args.cm_out,
        title="Confusion Matrix – Decision Tree (student_data_2)"
    )
    print(f"🖼️ Confusion matrix saved to: {args.cm_out}")

    # ===== 6) Predictions on full CSV =====
    # NOTE: this scores every row, including the rows the model was fitted on.
    proba = pipe.predict_proba(X)
    out = df.copy()
    out["prediction"] = pipe.predict(X)
    classes = list(pipe.named_steps["clf"].classes_)

    # One probability column per class, in the classifier's class order.
    for i, c in enumerate(classes):
        out[f"p_{c}"] = proba[:, i]

    out.to_csv(args.pred_out, index=False)
    print(f"📁 Predictions saved to: {args.pred_out}")

    # ===== 7) Save model =====
    if args.save_model:
        joblib.dump(pipe, args.save_model)
        print(f"💾 Model saved to: {args.save_model}")

# %%
# Run the whole pipeline: prints the CV scores and test-set metrics, and writes
# the confusion-matrix image, the predictions CSV and the joblib model file.
main()

Loading CSV from: /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/student_data_2.csv
Loaded 1000 rows
Detected target column: Output

=== 5-fold Cross-Validation (Accuracy) on training set ===
Scores per fold: [0.656 0.662 0.656 0.7   0.719]
Mean accuracy: 0.679  |  Std: 0.026

=== 10-fold Cross-Validation (Accuracy) on training set ===
Scores per fold: [0.562 0.638 0.638 0.725 0.688 0.662 0.738 0.7   0.725 0.688]
Mean accuracy: 0.676  |  Std: 0.050

=== Classification Report (test set) ===
              precision    recall  f1-score   support

     Dropout      0.692     0.621     0.655        58
    Enrolled      0.348     0.457     0.395        35
    Graduate      0.814     0.776     0.794       107

    accuracy                          0.675       200
   macro avg      0.618     0.618     0.615       200
weighted avg      0.697     0.675     0.684       200


‚≠ê Cohen Kappa Score (test set): 0.4685
üñºÔ∏è Confusion matrix saved to: /Users/defneelag