In [None]:
%pip install joblib pandas numpy matplotlib
%pip install scikit-learn


In [5]:
import os
import argparse
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# detect numeric and categorical columns in a DataFrame
def infer_columns(df: pd.DataFrame, target: str):


    # Creates a new DataFrame X that contains all columns except the target column.
    X = df.drop(columns=[target])
    # Finds all columns that are of type: These are automatically considered categorical features. Stores them in cat_cols.
    cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)


    # Treat low-cardinality integer columns as categorical (e.g., semester code)
    # *********
    for c in X.select_dtypes(include=["int64", "int32", "int16", "int8"]).columns:
        if X[c].nunique() <= 20:
            cat_cols.append(c)
    cat_cols = sorted(set(cat_cols))


    # Selecting numerical columns
    num_cols = [c for c in X.columns if c not in cat_cols]
    return num_cols, cat_cols

In [None]:
# # Defines a function named plot_confusion_matrix with four parameters:

# # cm: the confusion matrix ‚Äî expected as a 2D array-like (e.g. numpy.ndarray) of shape (n_classes, n_classes).

# # labels: list/sequence of label names (length must match cm dimensions) used for x/y tick labels.

# # outpath: optional filename (string) where the plotted image will be saved (default "confusion_matrix.png").

# # title: optional plot title (default string shown).

# def plot_confusion_matrix(cm, labels, outpath="confusion_matrix.png", title="Confusion Matrix ‚Äì Baseline"):
#     # Creates a new matplotlib Figure and a single Axes object. fig is the figure container; ax is the axes we draw on.
#     fig, ax = plt.subplots()
#     # Displays the confusion matrix cm as an image on the axes using imshow. Each cell of cm becomes a colored square. im is the AxesImage object (useful if you later want a colorbar or to query the colormap). 
#     # Default colormap and interpolation are used unless specified.
#     im = ax.imshow(cm)
#     # Sets the tick positions on the x and y axes to 0, 1, ..., n-1 where n = len(labels). This aligns tick positions with the matrix cell centers.
#     ax.set_xticks(range(len(labels))); ax.set_yticks(range(len(labels)))
#     # Sets the tick labels (text) using your labels list: 
#     # X tick labels are rotated 45¬∞ and horizontally aligned to the right (ha="right") so long labels don't overlap. Y tick labels are set with default rotation (vertical left side).
#     ax.set_xticklabels(labels, rotation=45, ha="right"); ax.set_yticklabels(labels)
#     # Nested loops that add the numeric value of each confusion-matrix cell on top of the corresponding square: i indexes rows (true class), j indexes columns (predicted class).
#     # ax.text(j, i, cm[i, j], ...) writes the cell value centered inside the square (ha="center", va="center"). This assumes cm is square and numeric; values are placed as-is (no formatting).
#     for i in range(len(cm)):
#         for j in range(len(cm)):
#             ax.text(j, i, cm[i, j], ha="center", va="center")
#     ax.set_xlabel("Predicted"); ax.set_ylabel("True"); ax.set_title(title)
#     plt.tight_layout()
#     fig.savefig(outpath); plt.close(fig)

In [10]:
def main():
    parser = argparse.ArgumentParser(description="Baseline student outcome model (single-file).")
    # >>> ekrandaki isimlere ve yerlere g√∂re varsayƒ±lanlarƒ± koydum
    parser.add_argument("--csv", default=os.path.join("predictive_model", "student_data.csv"),
                        help="CSV yolu (default: predictive_model/student_data.csv)")
    parser.add_argument("--target", default="Target", help="Hedef s√ºtun adƒ± (default: Target)")
    parser.add_argument("--sep", default=";", help="CSV ayra√ß (default: ;)")  # noktalƒ± virg√ºl
    parser.add_argument("--test_size", type=float, default=0.20, help="Test oranƒ± (default: 0.20)")
    parser.add_argument("--seed", type=int, default=42, help="Rastgele tohum (default: 42)")
    # √ßƒ±ktƒ± dosyalarƒ±nƒ± aynƒ± klas√∂re alalƒ±m ki kolay bul
    parser.add_argument("--save_model", default=os.path.join("predictive_model", "baseline_model.joblib"),
                        help="Model kaydetme yolu")
    parser.add_argument("--pred_out", default=os.path.join("predictive_model", "predictions.csv"),
                        help="Tahmin √ßƒ±ktƒ±sƒ± CSV yolu")
    parser.add_argument("--cm_out", default=os.path.join("predictive_model", "confusion_matrix.png"),
                        help="Karƒ±≈üƒ±klƒ±k matrisi g√∂rsel yolu")
    args = parser.parse_args()

    # 1) Veri
    # 1) Load data (handle BOM + weird spaces) and detect target column
    df = pd.read_csv(args.csv, sep=args.sep, engine="python", encoding="utf-8-sig")

    # normalize headers
    clean_cols = []
    for c in df.columns:
        c2 = str(c).replace("\ufeff", "").strip().replace(" ", "_")
        clean_cols.append(c2)
    df.columns = clean_cols

    # try the provided target, then common aliases, then infer by values
    cand_target = args.target.replace(" ", "_")
    if cand_target in df.columns:
        target_col = cand_target
    else:
        # common alternatives people use
        aliases = ["Target", "target", "Status", "Outcome", "Result", "Label"]
        aliases = [a.replace(" ", "_") for a in aliases]
        found = [c for c in df.columns if c in aliases or c.lower() in [a.lower() for a in aliases]]
        if found:
            target_col = found[0]
        else:
            # infer: column whose unique values are subset of the known classes
            KNOWN = {"Dropout", "Graduate", "Enrolled"}
            found = []
            for c in df.columns:
                vals = set(df[c].astype(str).str.strip().unique())
                # accept if mostly within known set (allow a few NaN/empty)
                if len(vals - KNOWN - {""}) <= 0 and len(vals & KNOWN) >= 2:
                    found.append(c)
            if not found:
                raise ValueError(
                    f"Couldn't find the target column. Columns: {list(df.columns)[:15]} ... "
                    "Pass --target CorrectName or open the CSV to check the exact header."
                )
            target_col = found[0]

    # 2) Split
    y = df[target_col].astype(str)
    X = df.drop(columns=[target_col])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed, stratify=y
    )

    # 3) Preprocess & Model
    num_cols, cat_cols = infer_columns(df, target_col)
    pre = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ],
        remainder="drop"
    )
    clf = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=args.seed)
    pipe = Pipeline([("pre", pre), ("clf", clf)])

    # 4) Fit
    pipe.fit(X_train, y_train)

    # 5) Evaluate
    y_pred = pipe.predict(X_test)
    print("=== Classification Report (test set) ===")
    print(classification_report(y_test, y_pred, digits=3))

    labels_sorted = sorted(y.unique())
    cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
    plot_confusion_matrix(cm, labels_sorted, outpath=args.cm_out)
    print(f"üñºÔ∏è Saved confusion matrix to {args.cm_out}")

    # 6) Full CSV √ºzerinde tahmin
    proba = pipe.predict_proba(X)
    out = df.copy()
    out["prediction"] = pipe.predict(X)
    classes = list(pipe.named_steps["clf"].classes_)
    for i, c in enumerate(classes):
        out[f"p_{c}"] = proba[:, i]
    out.to_csv(args.pred_out, index=False)
    print(f"‚úÖ Wrote predictions to {args.pred_out}")

    # 7) Modeli kaydet
    if args.save_model:
        joblib.dump(pipe, args.save_model)
        print(f"üíæ Saved model to {args.save_model}")

if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] [--csv CSV] [--target TARGET] [--sep SEP]
                             [--test_size TEST_SIZE] [--seed SEED]
                             [--save_model SAVE_MODEL] [--pred_out PRED_OUT]
                             [--cm_out CM_OUT]
ipykernel_launcher.py: error: unrecognized arguments: --f=c:\Users\beste\AppData\Roaming\jupyter\runtime\kernel-v31ad578f4aa68f4cf7dbc917bd6d288de31316dfd.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
