# IMPORT DATA DAN LIBRARY

In [43]:
import numpy as np
import pandas as pd

# split & CV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# base & transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek

# model & metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [10]:
# Load the modelling table (the file name suggests it is already preprocessed;
# the path is resolved relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='DataFrame_processed/DataFrame_processed.csv')

In [13]:
# Separate the target from the features, then hold out 20% of the rows for testing.
# Stratifying on y keeps the Attrition class ratio the same in both splits.
y = df["Attrition"]
X = df.drop(columns=["Attrition"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# PIPELINE FULL

In [45]:
# Preprocessing: standardize numeric columns, one-hot encode every other column.
numeric_columns = selector(dtype_include=np.number)
non_numeric_columns = selector(dtype_exclude=np.number)

prep = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), non_numeric_columns),
    ],
    remainder="drop",
)

# Full pipeline: preprocess -> resample -> classify.
# The resampling step is registered under the name "smote" but holds a SMOTETomek instance.
# The final classifier can be swapped for (or extended with) another model.
pipe = ImbPipeline(
    [
        ("prep", prep),
        ("smote", SMOTETomek(random_state=42)),
        ("clf", RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=42)),
    ]
)

> ## Untuk output score

In [47]:
# Reusable evaluation snippet (feel free to copy-paste it for other pipelines):
# fit on the training split, then report metrics on the held-out test split.
pipe.fit(X_train, y_train)

# Second predict_proba column = probability of the second class in classes_ order.
y_proba = pipe.predict_proba(X_test)[:, 1]
y_pred = pipe.predict(X_test)

print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", round(roc_auc_score(y_test, y_proba), 4))


=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8856    0.9717    0.9266       247
           1     0.6957    0.3404    0.4571        47

    accuracy                         0.8707       294
   macro avg     0.7906    0.6560    0.6919       294
weighted avg     0.8552    0.8707    0.8516       294

Test ROC-AUC: 0.8154


---

---

# Model 1 - Logistic Regression

> ## HyperParameter Tuning

# Model 2 - Decision Tree Classifier

> ## HyperParameter Tuning

# Model 3 - Bagging Classifier

> ## HyperParameter Tuning

# Model 4 - AdaBoost Classifier

> ## HyperParameter Tuning

# Model 5 - Ensemble Stacking

In [8]:
import numpy as np
import pandas as pd

# split & CV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV

# base & transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# model & metrics 
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, average_precision_score, confusion_matrix


# Load the raw dataset. read_csv already returns a fresh DataFrame, so no extra
# .copy() is needed.
df = pd.read_csv("ibm data.csv")

# Encode the target as 0/1 unless it is already numeric.
# Testing for "not numeric" (instead of `dtype == object`) also covers pandas'
# dedicated string dtypes, for which the object check is False and the mapping
# would be skipped, leaving "Yes"/"No" labels in the target.
if not pd.api.types.is_numeric_dtype(df["Attrition"]):
    attrition_codes = df["Attrition"].map({"No": 0, "Yes": 1})
    unmapped = attrition_codes.isna()
    if unmapped.any():
        # Fail with a readable message rather than an opaque NaN-to-int cast error.
        bad_labels = sorted(df.loc[unmapped, "Attrition"].astype(str).unique())
        raise ValueError(f"Unexpected Attrition labels: {bad_labels}")
    df["Attrition"] = attrition_codes.astype(int)


# Columns excluded from the feature set.
DROP_COLS = [
    "EmployeeCount","StandardHours","Over18","PerformanceRating",
    "EmployeeNumber","Education","JobLevel","PercentSalaryHike","Gender",
    "YearsAtCompany","YearsWithCurrManager","NumCompaniesWorked",
    "YearsSinceLastPromotion","RelationshipSatisfaction"
]

# errors="ignore" skips listed names that are not present in the frame.
df = df.drop(columns=DROP_COLS, errors="ignore")


# feature engineering
def apply_fe(fe):
    """Return a copy of ``fe`` extended with engineered features.

    A feature is only created when every column it is derived from is present:

    * ``ExperienceRatio``    = YearsInCurrentRole / TotalWorkingYears; rows with
      zero working years (or any other NaN result) are set to 0.
    * ``IncomePerYearExp``   = MonthlyIncome / (TotalWorkingYears + 1).
    * ``TenureSatisfaction`` = YearsInCurrentRole * JobSatisfaction.

    The input frame itself is left untouched.
    """
    out = fe.copy()
    available = set(out.columns)

    def has(*names):
        # True when all requested source columns exist in the frame.
        return available.issuperset(names)

    if has("YearsInCurrentRole", "TotalWorkingYears"):
        # Zero denominators become NaN so the division yields NaN instead of inf.
        working_years = out["TotalWorkingYears"].replace(0, np.nan)
        out["ExperienceRatio"] = out["YearsInCurrentRole"].div(working_years).fillna(0)

    if has("MonthlyIncome", "TotalWorkingYears"):
        out["IncomePerYearExp"] = out["MonthlyIncome"].div(out["TotalWorkingYears"].add(1))

    if has("YearsInCurrentRole", "JobSatisfaction"):
        out["TenureSatisfaction"] = out["YearsInCurrentRole"].mul(out["JobSatisfaction"])

    return out

# Add the engineered features to the cleaned frame.
df_fe = apply_fe(df)

# Features / target, then an 80/20 hold-out split stratified on the target
# so both splits keep the same Attrition ratio.
y = df_fe["Attrition"]
X = df_fe.drop(columns=["Attrition"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Encoding and scaling: standardize numeric columns, one-hot encode the rest.
numeric_columns = selector(dtype_include=np.number)
non_numeric_columns = selector(dtype_exclude=np.number)

prep = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), non_numeric_columns),
    ],
    remainder="drop",
)

# Baseline pipeline: preprocess -> SMOTE oversampling -> random forest.
# The final classifier can be swapped for (or extended with) another model.
pipe = ImbPipeline(
    [
        ("prep", prep),
        ("smote", SMOTE(random_state=42)),
        ("clf", RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=42)),
    ]
)


# Quick sanity check of the baseline pipeline.
# Cross-validation uses the training split only (5 stratified, shuffled folds).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1 = cross_val_score(pipe, X_train, y_train, scoring="f1", cv=cv)
cv_auc = cross_val_score(pipe, X_train, y_train, scoring="roc_auc", cv=cv)
for cv_label, cv_scores in (("CV F1  :", cv_f1), ("CV AUC :", cv_auc)):
    print(cv_label, np.round(cv_scores, 3), "| mean =", cv_scores.mean().round(3))

# Fit on the full training split, then score the held-out test split.
pipe.fit(X_train, y_train)
y_proba = pipe.predict_proba(X_test)[:, 1]
y_pred = pipe.predict(X_test)

print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", round(roc_auc_score(y_test, y_proba), 4))


CV F1  : [0.44  0.208 0.448 0.491 0.485] | mean = 0.415
CV AUC : [0.777 0.724 0.822 0.867 0.78 ] | mean = 0.794

=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8856    0.9717    0.9266       247
           1     0.6957    0.3404    0.4571        47

    accuracy                         0.8707       294
   macro avg     0.7906    0.6560    0.6919       294
weighted avg     0.8552    0.8707    0.8516       294

Test ROC-AUC: 0.816


> ## HyperParameter Tuning

In [9]:
# CV splitter read by tune(): 5 stratified, shuffled folds with a fixed seed.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

def _finite_grid_size(param_dist):
    """Return the number of distinct candidates in ``param_dist``.

    ``param_dist`` is a dict (or a list of dicts) mapping parameter names to
    candidate values. Returns ``None`` when any value is a distribution
    (exposes ``rvs``), i.e. when the search space is not a finite grid.
    """
    from math import prod

    grids = [param_dist] if isinstance(param_dist, dict) else list(param_dist)
    total = 0
    for grid in grids:
        candidates = list(grid.values())
        if any(hasattr(values, "rvs") for values in candidates):
            return None
        total += prod(len(values) for values in candidates)
    return total


def tune(pipe, param_dist, name, n_iter=30):
    """Run a randomized hyper-parameter search for ``pipe`` and return the best pipeline.

    The search optimizes F1 using the notebook-level ``cv`` splitter and is
    fitted on the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    pipe : estimator
        Pipeline to tune.
    param_dist : dict or list of dict
        Search space passed to ``RandomizedSearchCV``.
    name : str
        Label used in the printed summary.
    n_iter : int, default 30
        Maximum number of sampled candidates. When the search space is a finite
        grid smaller than ``n_iter``, the grid size is used instead, so the
        search does not request more candidates than exist.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    grid_size = _finite_grid_size(param_dist)
    # Only cap for a non-empty finite grid; otherwise keep the caller's budget.
    n_candidates = min(n_iter, grid_size) if grid_size else n_iter

    rsearch = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        n_iter=n_candidates, scoring="f1",
        cv=cv, n_jobs=-1, verbose=1, random_state=42
    )
    rsearch.fit(X_train, y_train)
    print(f"\n[{name}] best F1: {rsearch.best_score_:.4f}")
    print(f"[{name}] best params: {rsearch.best_params_}")
    return rsearch.best_estimator_

# ========== 1) RF PIPE ==========
# Search space: forest size/shape plus the SMOTE neighbourhood size.
rf_dist = dict(
    clf__n_estimators=[300, 500, 800],
    clf__max_depth=[None, 10, 20],
    clf__min_samples_leaf=[1, 2, 4],
    clf__max_features=["sqrt", "log2"],
    smote__k_neighbors=[3, 5],
)
# Preprocess -> SMOTE -> random forest.
rf_pipe = ImbPipeline(
    [
        ("prep", prep),
        ("smote", SMOTE(random_state=42)),
        ("clf", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)
rf_best = tune(rf_pipe, rf_dist, "RF")

# ========== 2) XGB PIPE ==========
# Search space: boosting rounds, learning rate, tree depth, row/column
# subsampling, plus the SMOTE neighbourhood size.
xgb_dist = dict(
    clf__n_estimators=[300, 500, 800],
    clf__learning_rate=[0.05, 0.1, 0.2],
    clf__max_depth=[4, 6, 8],
    clf__subsample=[0.8, 1.0],
    clf__colsample_bytree=[0.8, 1.0],
    smote__k_neighbors=[3, 5],
)
# Preprocess -> SMOTE -> gradient-boosted trees (histogram tree method).
xgb_pipe = ImbPipeline(
    [
        ("prep", prep),
        ("smote", SMOTE(random_state=42)),
        (
            "clf",
            XGBClassifier(
                tree_method="hist",
                eval_metric="logloss",
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)
xgb_best = tune(xgb_pipe, xgb_dist, "XGB")

# ========== 3) LR PIPE ==========
# Search space: regularization strength, class weighting and the SMOTE
# neighbourhood size (5 * 2 * 2 = 20 combinations in total).
lr_dist = dict(
    clf__C=[0.1, 0.5, 1.0, 2.0, 5.0],
    clf__class_weight=[None, "balanced"],
    smote__k_neighbors=[3, 5],
)
# Preprocess -> SMOTE -> logistic regression.
# The scaling done inside `prep` matters for logistic regression.
lr_pipe = ImbPipeline(
    [
        ("prep", prep),
        ("smote", SMOTE(random_state=42)),
        ("clf", LogisticRegression(solver="lbfgs", max_iter=2000)),
    ]
)
lr_best = tune(lr_pipe, lr_dist, "LR")

# ========== 4) EVAL CEPAT per model ==========
def quick_test(name, best_pipe, threshold=0.5):
    """Fit ``best_pipe`` on the training split and print its test-set metrics.

    Uses the notebook-level ``X_train`` / ``y_train`` for fitting and
    ``X_test`` / ``y_test`` for evaluation. The pipeline is (re)fitted in
    place, so unfitted pipelines can be passed as well.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    best_pipe : estimator
        Pipeline exposing ``fit`` and ``predict_proba``.
    threshold : float, default 0.5
        Probability cut-off: rows whose second ``predict_proba`` column is
        greater than or equal to this value are predicted as 1. The default
        reproduces the previous fixed 0.5 cut-off.
    """
    best_pipe.fit(X_train, y_train)
    proba = best_pipe.predict_proba(X_test)[:, 1]
    y_pred = (proba >= threshold).astype(int)
    print(f"\n=== {name} (thr={threshold}) ===")
    print(classification_report(y_test, y_pred, digits=4))
    # ROC-AUC is computed from the probabilities, so it does not depend on the threshold.
    print("ROC-AUC:", round(roc_auc_score(y_test, proba), 4))

# Report test-set metrics for each tuned pipeline, in the order RF, XGB, LR.
for model_label, tuned_pipe in (("RF", rf_best), ("XGB", xgb_best), ("LR", lr_best)):
    quick_test(model_label, tuned_pipe)

Fitting 5 folds for each of 30 candidates, totalling 150 fits



[RF] best F1: 0.4696
[RF] best params: {'smote__k_neighbors': 5, 'clf__n_estimators': 500, 'clf__min_samples_leaf': 4, 'clf__max_features': 'sqrt', 'clf__max_depth': None}
Fitting 5 folds for each of 30 candidates, totalling 150 fits

[XGB] best F1: 0.5003
[XGB] best params: {'smote__k_neighbors': 5, 'clf__subsample': 1.0, 'clf__n_estimators': 300, 'clf__max_depth': 6, 'clf__learning_rate': 0.1, 'clf__colsample_bytree': 1.0}
Fitting 5 folds for each of 20 candidates, totalling 100 fits





[LR] best F1: 0.4940
[LR] best params: {'smote__k_neighbors': 5, 'clf__class_weight': None, 'clf__C': 5.0}

=== RF (thr=0.5) ===
              precision    recall  f1-score   support

           0     0.8902    0.9514    0.9198       247
           1     0.6000    0.3830    0.4675        47

    accuracy                         0.8605       294
   macro avg     0.7451    0.6672    0.6936       294
weighted avg     0.8438    0.8605    0.8475       294

ROC-AUC: 0.8174

=== XGB (thr=0.5) ===
              precision    recall  f1-score   support

           0     0.8778    0.9595    0.9168       247
           1     0.5833    0.2979    0.3944        47

    accuracy                         0.8537       294
   macro avg     0.7306    0.6287    0.6556       294
weighted avg     0.8307    0.8537    0.8333       294

ROC-AUC: 0.7795

=== LR (thr=0.5) ===
              precision    recall  f1-score   support

           0     0.9303    0.7571    0.8348       247
           1     0.3548    0.7

In [10]:
# Take the core classifiers (without their prep/SMOTE steps) from the tuned pipelines.
rf_clf, xgb_clf, lr_clf = (
    tuned.named_steps["clf"] for tuned in (rf_best, xgb_best, lr_best)
)

# Stacking ensemble with a logistic-regression meta-learner.
# Base learners contribute their predict_proba outputs; passthrough=True is also
# set so the meta-learner receives the input features alongside them.
stack_clf = StackingClassifier(
    [("rf", rf_clf), ("xgb", xgb_clf), ("lr", lr_clf)],
    final_estimator=LogisticRegression(max_iter=2000),
    passthrough=True,
    stack_method="predict_proba",
    n_jobs=-1,
)

# Complete stacking pipeline: prep -> SMOTE -> stacking.
# NOTE(review): SMOTE runs before the stack, so the stack's internal CV presumably
# sees resampled rows in every fold — confirm this is intended.
stack_pipe = ImbPipeline(
    [
        ("prep", prep),
        ("smote", SMOTE(random_state=42)),
        ("clf", stack_clf),
    ]
)

# --- quick CV evaluation on the TRAIN split ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1 = cross_val_score(stack_pipe, X_train, y_train, scoring="f1", cv=cv, n_jobs=-1)
cv_auc = cross_val_score(stack_pipe, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=-1)
for cv_label, cv_scores in (
    ("STACK (no tuning) | CV F1 :", cv_f1),
    ("STACK (no tuning) | CV AUC:", cv_auc),
):
    print(cv_label, np.round(cv_scores, 3), "| mean =", cv_scores.mean().round(3))

# --- fit & test ---
# Train the stacking pipeline on the full training split, then score the test split.
stack_pipe.fit(X_train, y_train)
proba_test = stack_pipe.predict_proba(X_test)[:, 1]
y_pred     = (proba_test >= 0.5).astype(int)   # default 0.5 threshold
print("\n=== STACK (no tuning) — TEST ===")
print(classification_report(y_test, y_pred, digits=4))
# Each print must be its own statement; two calls on one line is a SyntaxError.
print("Test ROC-AUC:", roc_auc_score(y_test, proba_test).round(4))

STACK (no tuning) | CV F1 : [0.542 0.407 0.585 0.618 0.521] | mean = 0.534
STACK (no tuning) | CV AUC: [0.825 0.76  0.841 0.878 0.804] | mean = 0.821

=== STACK (no tuning) — TEST ===
              precision    recall  f1-score   support

           0     0.8859    0.9433    0.9137       247
           1     0.5484    0.3617    0.4359        47

    accuracy                         0.8503       294
   macro avg     0.7172    0.6525    0.6748       294
weighted avg     0.8320    0.8503    0.8373       294

Test ROC-AUC: 0.7922


In [11]:
# Search space for a light tuning pass over the stacking pipeline.
# Keys follow the pipeline path: step "clf" -> named sub-estimator -> parameter.
stack_dist = dict(
    # meta-learner (logistic regression)
    clf__final_estimator__C=[0.5, 1.0, 2.0, 5.0],
    # base learner: random forest
    clf__rf__n_estimators=[300, 500, 800],
    clf__rf__max_depth=[None, 10, 20],
    clf__rf__min_samples_leaf=[1, 2, 4],
    clf__rf__max_features=["sqrt", "log2"],
    # base learner: XGBoost
    clf__xgb__n_estimators=[300, 500, 800],
    clf__xgb__learning_rate=[0.05, 0.1, 0.2],
    clf__xgb__max_depth=[4, 6, 8],
    clf__xgb__subsample=[0.8, 1.0],
    clf__xgb__colsample_bytree=[0.8, 1.0],
    # SMOTE
    smote__k_neighbors=[3, 5],
)

# Randomized search over the stacking pipeline, fitted on the training split.
search_options = dict(
    n_iter=40,          # raise this to explore the space more widely
    scoring="f1",       # could be switched to F2 if recall is the priority
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
rsearch_stack = RandomizedSearchCV(stack_pipe, stack_dist, **search_options)
rsearch_stack.fit(X_train, y_train)

print("\n[STACK] Best params:", rsearch_stack.best_params_)
print("[STACK] Best CV F1 :", round(rsearch_stack.best_score_, 4))

# Best pipeline found by the search.
stack_best = rsearch_stack.best_estimator_

# --- test-set evaluation with the default 0.5 threshold ---
# stack_best is the search's best_estimator_, which RandomizedSearchCV (default
# refit=True) has already refitted on X_train / y_train, so it is used directly
# instead of repeating the most expensive fit in the notebook.
proba_test = stack_best.predict_proba(X_test)[:, 1]
y_pred     = (proba_test >= 0.5).astype(int)
print("\n=== STACK (tuned; thr=0.5) — TEST ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", round(roc_auc_score(y_test, proba_test), 4))


Fitting 5 folds for each of 40 candidates, totalling 200 fits

[STACK] Best params: {'smote__k_neighbors': 5, 'clf__xgb__subsample': 0.8, 'clf__xgb__n_estimators': 300, 'clf__xgb__max_depth': 6, 'clf__xgb__learning_rate': 0.05, 'clf__xgb__colsample_bytree': 1.0, 'clf__rf__n_estimators': 300, 'clf__rf__min_samples_leaf': 2, 'clf__rf__max_features': 'sqrt', 'clf__rf__max_depth': None, 'clf__final_estimator__C': 0.5}
[STACK] Best CV F1 : 0.5362

=== STACK (tuned; thr=0.5) — TEST ===
              precision    recall  f1-score   support

           0     0.8893    0.9433    0.9155       247
           1     0.5625    0.3830    0.4557        47

    accuracy                         0.8537       294
   macro avg     0.7259    0.6631    0.6856       294
weighted avg     0.8371    0.8537    0.8420       294

Test ROC-AUC: 0.7974
