# IMPORT DATA DAN LIBRARY

In [43]:
import numpy as np
import pandas as pd

# split & CV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# base & transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek

# model & metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [10]:
df = pd.read_csv('DataFrame_processed/DataFrame_processed.csv')

In [13]:
X = df.drop(columns=["Attrition"])
y = df["Attrition"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# PIPELINE FULL

In [45]:
# pipeline transformasi
prep = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), selector(dtype_include=np.number)),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), selector(dtype_exclude=np.number)),
    ],
    remainder="drop"
)

# full preprocessing pipeline
pipe = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTETomek(random_state=42)),
    ("clf", RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1)) # bisa tambah ata ganti model lain 
])

> ## Untuk output score

In [47]:
# Silakan di copas 
pipe.fit(X_train, y_train)
y_pred  = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))


=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8856    0.9717    0.9266       247
           1     0.6957    0.3404    0.4571        47

    accuracy                         0.8707       294
   macro avg     0.7906    0.6560    0.6919       294
weighted avg     0.8552    0.8707    0.8516       294

Test ROC-AUC: 0.8154


---

---

# Model 1 - Logistic Regression

> ## HyperParameter Tuning

# Model 2 - Decision Tree Classifier

> ## HyperParameter Tuning

# Model 3 - Bagging Classifier

> ## HyperParameter Tuning

# Model 4 - Ada Boost Classifier 

> ## HyperParameter Tuning

# Model 5 - Ensemble Stacking

In [6]:
import numpy as np
import pandas as pd

# split & CV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV

# base & transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# model & metrics 
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, average_precision_score, confusion_matrix


# load data
df = pd.read_csv("ibm data.csv").copy()

# map target ke 0/1
if df["Attrition"].dtype == object:
    df["Attrition"] = df["Attrition"].map({"No": 0, "Yes": 1}).astype(int)


# drop kolom
DROP_COLS = [
    "EmployeeCount","StandardHours","Over18","PerformanceRating",
    "EmployeeNumber","Education","JobLevel","PercentSalaryHike","Gender",
    "YearsAtCompany","YearsWithCurrManager","NumCompaniesWorked",
    "YearsSinceLastPromotion","RelationshipSatisfaction"
]

df = df.drop(columns=[c for c in DROP_COLS if c in df.columns])


# feature engineering
def apply_fe(fe):
    fe = fe.copy()
    if {"YearsInCurrentRole","TotalWorkingYears"}.issubset(fe.columns):
        denom = fe["TotalWorkingYears"].replace(0, np.nan)
        fe["ExperienceRatio"] = (fe["YearsInCurrentRole"] / denom).fillna(0)

    if {"MonthlyIncome","TotalWorkingYears"}.issubset(fe.columns):
        fe["IncomePerYearExp"] = fe["MonthlyIncome"] / (fe["TotalWorkingYears"] + 1)

    if {"YearsInCurrentRole","JobSatisfaction"}.issubset(fe.columns):
        fe["TenureSatisfaction"] = fe["YearsInCurrentRole"] * fe["JobSatisfaction"]
    return fe

df_fe = apply_fe(df)

# split data
X = df_fe.drop(columns=["Attrition"])
y = df_fe["Attrition"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# encoding & scaling/standarisasi
prep = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), selector(dtype_include=np.number)),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), selector(dtype_exclude=np.number)),
    ],
    remainder="drop"
)

# pipeline
pipe = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTE(random_state=42)),
    ("clf", RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1)) # bisa tambah ata ganti model lain 
])


# ngetest doang
# cv di train 
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1  = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="f1")
cv_auc = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="roc_auc")
print("CV F1  :", np.round(cv_f1, 3),  "| mean =", cv_f1.mean().round(3))
print("CV AUC :", np.round(cv_auc, 3), "| mean =", cv_auc.mean().round(3))

# fit di train, predict di test
pipe.fit(X_train, y_train)
y_pred  = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))


CV F1  : [0.44  0.208 0.448 0.491 0.485] | mean = 0.415
CV AUC : [0.777 0.724 0.822 0.867 0.78 ] | mean = 0.794

=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8856    0.9717    0.9266       247
           1     0.6957    0.3404    0.4571        47

    accuracy                         0.8707       294
   macro avg     0.7906    0.6560    0.6919       294
weighted avg     0.8552    0.8707    0.8516       294

Test ROC-AUC: 0.816


> ## HyperParameter Tuning

In [4]:
def best_threshold_by_fbeta(proba, y_true, beta=1.0):
    p, r, th = precision_recall_curve(y_true, proba)
    if len(th) == 0:
        return 0.5
    beta2 = beta**2
    fbeta = (1+beta2) * (p*r) / (beta2*p + r + 1e-12)   
    idx = np.nanargmax(fbeta)
    return th[min(idx, len(th)-1)]

def evaluate_with_threshold(name, pipe, X_tr, y_tr, X_val, y_val, X_te, y_te, beta=1.0):
    pipe.fit(X_tr, y_tr)

    # pilih threshold di validation (bukan test)
    proba_val = pipe.predict_proba(X_val)[:,1]
    thr = best_threshold_by_fbeta(proba_val, y_val, beta=beta)

    # final test
    proba_te = pipe.predict_proba(X_te)[:,1]
    y_pred = (proba_te >= thr).astype(int)

    print(f"\n=== {name} (thr={thr:.3f}, beta={beta}) ===")
    print(classification_report(y_te, y_pred, digits=4))
    roc = roc_auc_score(y_te, proba_te)
    ap  = average_precision_score(y_te, proba_te)
    tn, fp, fn, tp = confusion_matrix(y_te, y_pred).ravel()
    print(f"ROC-AUC: {roc:.4f} | PR-AUC: {ap:.4f} | CM: tn={tn}, fp={fp}, fn={fn}, tp={tp}")
    return thr, proba_te

# ======================
# CV baseline (optional quick check)
# ======================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1  = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="f1")
cv_auc = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="roc_auc")
print("CV F1  :", np.round(cv_f1, 3),  "| mean =", cv_f1.mean().round(3))
print("CV AUC :", np.round(cv_auc, 3), "| mean =", cv_auc.mean().round(3))

# ======================
# GridSearchCV untuk Stacking (tuning RF, XGB, meta-LR, SMOTE)
# ======================
param_grid = {
    # RF (base)
    "clf__rf__n_estimators": [300, 500],
    "clf__rf__max_depth": [None, 10, 20],
    "clf__rf__min_samples_leaf": [1, 2],
    "clf__rf__max_features": ["sqrt", "log2"],
    # XGB (base)
    "clf__xgb__n_estimators": [300, 500],
    "clf__xgb__learning_rate": [0.05, 0.1],
    "clf__xgb__max_depth": [4, 6, 8],
    "clf__xgb__subsample": [0.8, 1.0],
    "clf__xgb__colsample_bytree": [0.8, 1.0],
    # Meta-learner (LR)
    "clf__final_estimator__C": [0.5, 1.0, 2.0],
    # SMOTE
    "smote__k_neighbors": [3, 5]
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1",          # bisa ganti ke make_scorer(fbeta_score, beta=2) kalau mau recall-heavy
    cv=cv,
    n_jobs=-1,
    verbose=1
)
gs.fit(X_train, y_train)
print("\nBest params:", gs.best_params_)
print("Best CV F1 :", round(gs.best_score_, 4))

best_pipe = gs.best_estimator_

# ======================
# Threshold tuning di validation split, lalu final test
# ======================
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
thr, _ = evaluate_with_threshold(
    name="STACK (tuned)",
    pipe=best_pipe,
    X_tr=X_tr, y_tr=y_tr,
    X_val=X_val, y_val=y_val,
    X_te=X_test, y_te=y_test,
    beta=1.0   # set 2.0 kalau pengen lebih menekan recall
)

# ======================
# (Optional) bandingin juga default threshold 0.5 langsung di test
# ======================
best_pipe.fit(X_train, y_train)
proba_test = best_pipe.predict_proba(X_test)[:,1]
y_pred_05  = (proba_test >= 0.5).astype(int)
print("\n=== STACK (tuned, default thr=0.5) ===")
print(classification_report(y_test, y_pred_05, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, proba_test).round(4))

CV F1  : [0.44  0.208 0.448 0.491 0.485] | mean = 0.415
CV AUC : [0.777 0.724 0.822 0.867 0.78 ] | mean = 0.794
Fitting 5 folds for each of 6912 candidates, totalling 34560 fits


ValueError: Invalid parameter 'final_estimator' for estimator RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=42). Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'monotonic_cst', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].