In [7]:
import os, argparse, joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path




In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
# Directory the kernel is currently running in (i.e. the notebook's folder)
base_path = Path().resolve()
print("Notebook folder:", base_path)

# The data file is expected to sit directly next to the notebook
csv_path = base_path.joinpath("student_data.csv")

# Read the semicolon-separated file; "utf-8-sig" drops a leading BOM if one is present
df = pd.read_csv(
    csv_path,
    encoding="utf-8-sig",
    engine="python",
    sep=";",
)
print(f"Loaded {len(df)} rows from {csv_path}")


Notebook folder: C:\Users\wwwut\practical-course\predictive_model
Loaded 4424 rows from C:\Users\wwwut\practical-course\predictive_model\student_data.csv


In [10]:
def infer_columns(df: pd.DataFrame, target: str, max_cat_cardinality: int = 20):
    """Split the feature columns of ``df`` into numeric and categorical groups.

    A column is considered categorical when its dtype is ``object`` or
    ``category``, or when it is a signed-integer column with at most
    ``max_cat_cardinality`` distinct values. Every other feature column is
    reported as numeric.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the features and the target column.
    target : str
        Name of the target column; it is excluded from both result lists.
    max_cat_cardinality : int, default 20
        Largest number of distinct values an integer column may have and
        still be treated as categorical.

    Returns
    -------
    tuple[list, list]
        ``(num_cols, cat_cols)``. ``num_cols`` keeps the column order of
        ``df``; ``cat_cols`` is sorted by column name.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df`` (raised by ``DataFrame.drop``).
    """
    X = df.drop(columns=[target])
    cat_set = set(X.select_dtypes(include=["object", "category"]).columns)

    # Treat low-cardinality integers as categorical (e.g. a term/semester code).
    int_cols = X.select_dtypes(include=["int64", "int32", "int16", "int8"]).columns
    cat_set.update(c for c in int_cols if X[c].nunique() <= max_cat_cardinality)

    cat_cols = sorted(cat_set)
    # Membership is tested against the set so this stays linear in the column count.
    num_cols = [c for c in X.columns if c not in cat_set]
    return num_cols, cat_cols

In [11]:
def plot_confusion_matrix(cm, labels, outpath="confusion_matrix.png", title="Confusion Matrix – Naive Bayes"):
    """Render a confusion matrix as an annotated heat map and save it to disk.

    Parameters
    ----------
    cm : array-like, 2-D
        Matrix of counts; rows are drawn on the y axis ("True") and columns
        on the x axis ("Predicted"). Nested lists are accepted as well as
        NumPy arrays.
    labels : sequence
        Tick labels, applied to both axes in the given order.
    outpath : str or path-like, default "confusion_matrix.png"
        Destination passed to ``Figure.savefig``.
    title : str
        Title drawn above the plot.

    Returns
    -------
    None
        The figure is written to ``outpath`` and then closed.
    """
    cm = np.asarray(cm)  # allows plain nested lists; cm[i, j] below needs an ndarray
    n_rows, n_cols = cm.shape

    fig, ax = plt.subplots()
    try:
        ax.imshow(cm)

        tick_positions = range(len(labels))
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticklabels(labels)

        # Write every cell's value at its centre; text() takes (x, y) = (column, row).
        for i in range(n_rows):
            for j in range(n_cols):
                ax.text(j, i, cm[i, j], ha="center", va="center")

        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(outpath)
    finally:
        # Close even when drawing or saving fails, so the figure does not
        # stay registered with pyplot.
        plt.close(fig)


In [12]:
def _resolve_target_column(df, requested):
    """Return the name of the label column in ``df``.

    ``df`` is expected to already have normalised headers (spaces replaced by
    underscores). Lookup order: the requested name, then a case-insensitive
    match against common aliases, then the first column whose non-missing
    values all belong to the known outcome classes.

    Raises
    ------
    ValueError
        If no column qualifies.
    """
    candidate = requested.replace(" ", "_")
    if candidate in df.columns:
        return candidate

    # Common alternative header names, compared case-insensitively.
    alias_keys = {a.lower() for a in ("Target", "target", "Status", "Outcome", "Result", "Label")}
    for col in df.columns:
        if col.lower() in alias_keys:
            return col

    # Infer by content: all non-missing values must be known classes (blank
    # strings tolerated) and at least two distinct classes must be present.
    # Missing values are dropped first; otherwise astype(str) would turn them
    # into the literal "nan" and disqualify the column.
    known = {"Dropout", "Graduate", "Enrolled"}
    for col in df.columns:
        values = set(df[col].dropna().astype(str).str.strip().unique())
        if not (values - known - {""}) and len(values & known) >= 2:
            return col

    raise ValueError(
        f"Couldn't find the target column. Columns: {list(df.columns)[:15]} ... "
        "Pass --target CorrectName or open the CSV to check the exact header."
    )


def main(argv=None):
    """Train, evaluate and persist a Gaussian Naive Bayes student-outcome model.

    Steps: load the CSV, normalise its headers, locate the target column,
    make a stratified train/test split, fit a preprocessing + ``GaussianNB``
    pipeline, print a classification report, save a confusion-matrix image,
    write per-row predictions and class probabilities to CSV, and dump the
    fitted pipeline with joblib.

    Parameters
    ----------
    argv : sequence of str, optional
        Command-line style options (e.g. ``["--csv", "data.csv"]``). When
        omitted, no options are parsed and every default applies; ``sys.argv``
        is never read.

    Notes
    -----
    All default paths resolve against the current working directory:
    ``student_data.csv`` as input; ``naive_bayes_predictions.csv``,
    ``naive_bayes_model.joblib`` and ``naive_bayes_confusion_matrix.png`` as
    outputs. Rows whose target is missing are excluded from training and
    evaluation but still receive predictions in the output CSV.
    """
    # Defaults live next to the notebook so the files are easy to find.
    base_path = Path().resolve()

    parser = argparse.ArgumentParser(description="Naive Bayes student outcome model (single-file).")
    parser.add_argument("--csv", default=str(base_path / "student_data.csv"),
                        help="CSV yolu (default: %(default)s)")
    parser.add_argument("--target", default="Target", help="Hedef sütun adı (default: Target)")
    parser.add_argument("--sep", default=";", help="CSV ayraç (default: ;)")  # semicolon
    parser.add_argument("--test_size", type=float, default=0.20, help="Test oranı (default: 0.20)")
    parser.add_argument("--seed", type=int, default=42, help="Rastgele tohum (default: 42)")
    parser.add_argument("--save_model", default=str(base_path / "naive_bayes_model.joblib"),
                        help="Model kaydetme yolu")
    parser.add_argument("--pred_out", default=str(base_path / "naive_bayes_predictions.csv"),
                        help="Tahmin çıktısı CSV yolu")
    parser.add_argument("--cm_out", default=str(base_path / "naive_bayes_confusion_matrix.png"),
                        help="Karışıklık matrisi görsel yolu")
    # An explicit list is always passed so argparse never falls back to sys.argv
    # (presumably because, inside a notebook, sys.argv belongs to the kernel).
    args = parser.parse_args(args=[] if argv is None else list(argv))

    # 1) Load data ("utf-8-sig" strips a BOM) and normalise the headers
    csv_path = Path(args.csv)
    print("Loading CSV from:", csv_path)
    df = pd.read_csv(csv_path, sep=args.sep, engine="python", encoding="utf-8-sig")
    print(f"Loaded {len(df)} rows from {csv_path}")

    df.columns = [str(c).replace("\ufeff", "").strip().replace(" ", "_") for c in df.columns]
    target_col = _resolve_target_column(df, args.target)

    # 2) Split -- only rows that actually carry a label take part in training
    #    and evaluation; astype(str) would otherwise create a bogus "nan" class.
    labeled = df[target_col].notna()
    X = df.drop(columns=[target_col])
    y = df.loc[labeled, target_col].astype(str)
    X_train, X_test, y_train, y_test = train_test_split(
        X.loc[labeled], y, test_size=args.test_size, random_state=args.seed, stratify=y
    )

    # 3) Preprocess & model
    num_cols, cat_cols = infer_columns(df, target_col)
    pre = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ],
        remainder="drop"
    )
    # NOTE(review): GaussianNB receives the one-hot indicator columns as if they
    # were continuous features. The output saved with this notebook reports a
    # test accuracy of 0.240; this encoding may be a contributing factor --
    # worth validating against another encoder/classifier combination.
    clf = GaussianNB()
    pipe = Pipeline([("pre", pre), ("clf", clf)])

    # 4) Fit
    pipe.fit(X_train, y_train)

    # 5) Evaluate on the held-out split
    y_pred = pipe.predict(X_test)
    print("=== Classification Report (test set) ===")
    print(classification_report(y_test, y_pred, digits=3))

    labels_sorted = sorted(y.unique())
    cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
    plot_confusion_matrix(cm, labels_sorted, outpath=args.cm_out)
    print(f"🖼️ Saved confusion matrix to {args.cm_out}")

    # 6) Predict on the full CSV (this includes the rows used for training)
    proba = pipe.predict_proba(X)
    out = df.copy()
    out["prediction"] = pipe.predict(X)
    classes = list(pipe.named_steps["clf"].classes_)
    for i, c in enumerate(classes):
        out[f"p_{c}"] = proba[:, i]
    out.to_csv(args.pred_out, index=False)
    print(f"✅ Wrote predictions to {args.pred_out}")

    # 7) Save the fitted pipeline (pass --save_model "" to skip)
    if args.save_model:
        joblib.dump(pipe, args.save_model)
        print(f"💾 Saved model to {args.save_model}")

# Execute the full load / train / evaluate / export pipeline as soon as this cell runs.
main()


Loading CSV from: C:\Users\wwwut\practical-course\predictive_model\student_data.csv
Loaded 4424 rows from C:\Users\wwwut\practical-course\predictive_model\student_data.csv
=== Classification Report (test set) ===
              precision    recall  f1-score   support

     Dropout      0.765     0.092     0.164       284
    Enrolled      0.192     0.987     0.321       159
    Graduate      0.906     0.066     0.122       442

    accuracy                          0.240       885
   macro avg      0.621     0.382     0.202       885
weighted avg      0.732     0.240     0.171       885

üñºÔ∏è Saved confusion matrix to C:\Users\wwwut\practical-course\predictive_model\naive_bayes_confusion_matrix.png
‚úÖ Wrote predictions to C:\Users\wwwut\practical-course\predictive_model\naive_bayes_predictions.csv
üíæ Saved model to C:\Users\wwwut\practical-course\predictive_model\naive_bayes_model.joblib
