In [1]:
# %%
import os, argparse, joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# %%
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    cohen_kappa_score
)
from sklearn.metrics import make_scorer

# %%
# Resolve the directory the notebook is being executed from.
base_path = Path.cwd().resolve()
print(f"Notebook folder: {base_path}")

# The dataset is read from one level above that directory.
csv_path = base_path.parent.joinpath("student_data.csv")

# Semicolon-delimited file; the utf-8-sig codec strips a leading BOM if present.
df = pd.read_csv(csv_path, sep=";", engine="python", encoding="utf-8-sig")
print("Loaded", len(df), "rows from", csv_path)

Notebook folder: /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/decision_tree
Loaded 4424 rows from /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/student_data.csv


In [3]:
# %%
def infer_columns(df: pd.DataFrame, target: str, max_cardinality: int = 20):
    """Split the feature columns of ``df`` into numeric and categorical groups.

    A feature is considered categorical when

    * its dtype is object, categorical or a pandas string dtype, or
    * its dtype is any integer type (signed, unsigned or nullable) and it has
      at most ``max_cardinality`` distinct non-null values.

    Every other feature (floats, booleans, high-cardinality integers, ...) is
    returned as numeric.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the features and the ``target`` column.
    target : str
        Name of the label column; it is dropped before inspection.
    max_cardinality : int, default 20
        Largest number of distinct values an integer column may have and
        still be treated as categorical.

    Returns
    -------
    (num_cols, cat_cols) : tuple of two lists
        ``num_cols`` keeps the column order of ``df``; ``cat_cols`` is sorted
        and free of duplicates.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df`` (raised by ``DataFrame.drop``).
    """
    X = df.drop(columns=[target])
    types = pd.api.types

    categorical = set()
    for col, dtype in X.dtypes.items():
        if (
            types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            categorical.add(col)
        # Dtype predicates (rather than a fixed list of dtype names) so that
        # uint* and nullable Int* columns are recognised as integers too.
        elif types.is_integer_dtype(dtype) and X[col].nunique() <= max_cardinality:
            categorical.add(col)

    cat_cols = sorted(categorical)
    num_cols = [c for c in X.columns if c not in categorical]
    return num_cols, cat_cols

In [4]:
# %%
def plot_confusion_matrix(cm, labels, outpath="confusion_matrix.png", title="Confusion Matrix – Decision Tree"):
    """Render a confusion matrix as an annotated image and save it to disk.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are drawn on the "True" axis and columns on
        the "Predicted" axis.
    labels : sequence
        Tick labels, used for both axes.
    outpath : str or path-like
        Destination passed to ``Figure.savefig``.
    title : str
        Title drawn above the plot.

    Raises
    ------
    ValueError
        If ``cm`` is not two-dimensional.
    """
    # Accept nested lists as well as arrays; cm[i, j] below needs an ndarray.
    cm = np.asarray(cm)
    if cm.ndim != 2:
        raise ValueError(f"cm must be 2-D, got {cm.ndim} dimension(s)")
    n_rows, n_cols = cm.shape

    fig, ax = plt.subplots()
    try:
        ax.imshow(cm)

        ticks = range(len(labels))
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticklabels(labels)

        # Write each cell's count at its centre (x = column, y = row).
        for i in range(n_rows):
            for j in range(n_cols):
                ax.text(j, i, cm[i, j], ha="center", va="center")

        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title(title)

        # Lay out this figure explicitly instead of pyplot's "current" one.
        fig.tight_layout()
        fig.savefig(outpath)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)

In [5]:
# %%
def _find_target_column(df, requested):
    """Return the name of the label column of ``df`` (headers already normalized).

    Resolution order:
    1. ``requested`` (spaces replaced by underscores) if it is a column;
    2. the first column whose name matches a known alias, case-insensitively;
    3. the first column whose stripped string values are all drawn from
       {"Dropout", "Graduate", "Enrolled"} (empty strings allowed) and include
       at least two of them.

    Raises ``ValueError`` when none of the three steps finds a column.
    """
    candidate = requested.replace(" ", "_")
    if candidate in df.columns:
        return candidate

    alias_keys = {a.lower() for a in ("Target", "target", "Status", "Outcome", "Result", "Label")}
    for col in df.columns:
        if col.lower() in alias_keys:
            return col

    known = {"Dropout", "Graduate", "Enrolled"}
    for col in df.columns:
        values = set(df[col].astype(str).str.strip().unique())
        only_known = not (values - known - {""})
        if only_known and len(values & known) >= 2:
            return col

    raise ValueError("Could not infer target column. Please specify --target.")


def main(argv=None):
    """Train, cross-validate, evaluate and persist the decision-tree model.

    Steps: load the CSV, normalize its headers, locate the target column,
    make a stratified train/test split, run 5- and 10-fold stratified
    cross-validation on the training part, fit on the full training part,
    report test metrics, save a confusion-matrix image, write predictions for
    every CSV row, and dump the fitted pipeline with joblib.

    Parameters
    ----------
    argv : sequence of str, optional
        Command-line style options, e.g. ``["--seed", "7"]``. When ``None``
        an empty list is parsed, so ``sys.argv`` is never read and every
        option takes its default. The defaults read the CSV from the parent
        of the working directory and write all outputs into the working
        directory.

    Raises
    ------
    ValueError
        If the target column cannot be determined.
    """
    notebook_dir = Path().resolve()

    parser = argparse.ArgumentParser(description="Decision Tree student outcome model (single-file).")

    parser.add_argument("--csv", type=Path, default=notebook_dir.parent / "student_data.csv")
    parser.add_argument("--target", default="Target")
    parser.add_argument("--sep", default=";")
    parser.add_argument("--test_size", type=float, default=0.20)
    parser.add_argument("--seed", type=int, default=42)

    # Kept as a string so that an empty value can be used to skip saving the model.
    parser.add_argument("--save_model", default=str(notebook_dir / "decision_tree_model.joblib"))
    parser.add_argument("--pred_out", type=Path, default=notebook_dir / "decision_tree_predictions.csv")
    parser.add_argument("--cm_out", type=Path, default=notebook_dir / "decision_tree_confusion_matrix.png")

    args = parser.parse_args(args=[] if argv is None else list(argv))

    # 1) Load data
    print("Loading CSV from:", args.csv)
    df = pd.read_csv(args.csv, sep=args.sep, engine="python", encoding="utf-8-sig")
    print(f"Loaded {len(df)} rows from {args.csv}")

    # Normalize headers: drop stray BOM characters, trim, spaces -> underscores.
    df.columns = [str(c).replace("\ufeff", "").strip().replace(" ", "_") for c in df.columns]

    target_col = _find_target_column(df, args.target)

    # 2) Split (stratified so both parts keep the class proportions)
    y = df[target_col].astype(str)
    X = df.drop(columns=[target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=args.test_size,
        random_state=args.seed,
        stratify=y
    )

    # 3) Preprocessing + model
    num_cols, cat_cols = infer_columns(df, target_col)

    pre = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ],
        remainder="drop"
    )

    clf = DecisionTreeClassifier(
        random_state=args.seed,
        class_weight="balanced",
        max_depth=None,   # change if needed
    )

    pipe = Pipeline([("pre", pre), ("clf", clf)])

    # 3b) Cross-validation on the training data only (k=5 and k=10);
    #     the test set stays untouched for the final evaluation.
    scoring = {
        "accuracy": "accuracy",
        "kappa": make_scorer(cohen_kappa_score)
    }

    for n_splits in (5, 10):
        folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=args.seed)
        print(f"\n=== {n_splits}-fold Cross-Validation (training set) ===")
        scores = cross_validate(
            pipe,
            X_train,
            y_train,
            cv=folds,
            scoring=scoring,
            n_jobs=-1
        )
        acc, kap = scores["test_accuracy"], scores["test_kappa"]
        print(f"Accuracy: mean={acc.mean():.3f}, std={acc.std():.3f}")
        print(f"Kappa:    mean={kap.mean():.3f}, std={kap.std():.3f}")

    # 4) Fit final model on the full training set
    pipe.fit(X_train, y_train)

    # 5) Evaluate on the held-out test set
    y_pred = pipe.predict(X_test)
    print("=== Classification Report (test set) ===")
    print(classification_report(y_test, y_pred, digits=3))

    # Cohen's kappa on the test set
    kappa = cohen_kappa_score(y_test, y_pred)
    print(f"\n⭐ Cohen Kappa Score: {kappa:.4f}")

    # Fixed label order so the matrix rows/columns match the tick labels.
    labels_sorted = sorted(y.unique())
    cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
    plot_confusion_matrix(cm, labels_sorted, outpath=args.cm_out)
    print(f"🖼️ Saved confusion matrix to {args.cm_out}")

    # 6) Predictions on the full CSV.
    #    NOTE: X also contains the rows the model was fitted on, so this file
    #    is an export of predictions, not a measure of generalization.
    proba = pipe.predict_proba(X)
    out = df.copy()
    out["prediction"] = pipe.predict(X)

    # One probability column per class, in the classifier's own class order.
    for i, c in enumerate(pipe.named_steps["clf"].classes_):
        out[f"p_{c}"] = proba[:, i]

    out.to_csv(args.pred_out, index=False)
    print(f"✅ Wrote predictions to {args.pred_out}")

    # 7) Save model (skipped when --save_model is empty)
    if args.save_model:
        joblib.dump(pipe, args.save_model)
        print(f"💾 Saved model to {args.save_model}")

main()

Loading CSV from: /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/student_data.csv
Loaded 4424 rows from /Users/defneelagoz/Desktop/practical course/practical-course/predictive_model/student_data.csv

=== 5-fold Cross-Validation (training set) ===
Accuracy: mean=0.677, std=0.016
Kappa:    mean=0.477, std=0.023

=== 10-fold Cross-Validation (training set) ===
Accuracy: mean=0.673, std=0.019
Kappa:    mean=0.468, std=0.032
=== Classification Report (test set) ===
              precision    recall  f1-score   support

     Dropout      0.664     0.690     0.677       284
    Enrolled      0.372     0.384     0.378       159
    Graduate      0.791     0.762     0.776       442

    accuracy                          0.671       885
   macro avg      0.609     0.612     0.610       885
weighted avg      0.675     0.671     0.673       885


⭐ Cohen Kappa Score: 0.4691
üñºÔ∏è Saved confusion matrix to /Users/defneelagoz/Desktop/practical course/practical-cour