# IMPORT DATA DAN LIBRARY

In [6]:
import numpy as np
import pandas as pd

# split & CV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# base & transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek

# model & metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

In [8]:
df = pd.read_csv('DataFrame_processed/DataFrame_processed.csv')

In [10]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'EducationField', 'EnvironmentSatisfaction',
       'HourlyRate', 'JobInvolvement', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'OverTime',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsInCurrentRole', 'ExperienceRatio',
       'IncomePerYearExp', 'TenureSatisfaction'],
      dtype='object')

In [12]:
X = df.drop(columns=["Attrition"])
y = df["Attrition"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# PIPELINE FULL

In [15]:
# pipeline transformasi
prep = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), selector(dtype_include=np.number)),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), selector(dtype_exclude=np.number)),
    ],
    remainder="drop"
)

# full preprocessing pipeline
pipe = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTETomek(random_state=42)),
    ("clf", RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1)) # bisa tambah ata ganti model lain 
])

> ## Untuk output score

In [18]:
from sklearn.metrics import make_scorer, fbeta_score

f2_scorer = make_scorer(fbeta_score, beta=2, average="binary")


In [20]:
# Silakan di copas 
pipe.fit(X_train, y_train)
y_pred  = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))




=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8856    0.9717    0.9266       247
           1     0.6957    0.3404    0.4571        47

    accuracy                         0.8707       294
   macro avg     0.7906    0.6560    0.6919       294
weighted avg     0.8552    0.8707    0.8516       294

Test ROC-AUC: 0.8154


---

---

# Model 1 - Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=42, max_iter=1000)

pipe_log_reg = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTETomek(random_state=42)),
    ("clf", logreg)
])

In [9]:
pipe_log_reg.fit(X_train, y_train)
y_pred  = pipe_log_reg.predict(X_test)
y_proba = pipe_log_reg.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))


=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.9317    0.7733    0.8451       247
           1     0.3708    0.7021    0.4853        47

    accuracy                         0.7619       294
   macro avg     0.6512    0.7377    0.6652       294
weighted avg     0.8420    0.7619    0.7876       294

Test ROC-AUC: 0.7941


> ## HyperParameter Tuning

In [10]:
# Hyperparameter ranges
C_range = np.logspace(-4, 2, 20)  
penalty_options = ['l1', 'l2', 'elasticnet']
solver_options = ['liblinear', 'saga', 'lbfgs']
l1_ratio_range = np.linspace(0, 1, 5)  

# hyperparameters_logreg = {
#     'clf__C': C_range,
#     'clf__penalty': penalty_options,
#     'clf__solver': solver_options,
#     'clf__class_weight': [None, 'balanced'],
#     'clf__l1_ratio': l1_ratio_range
# }

param_distributions_logreg = [
    # L2 penalty
    {
        'clf__penalty': ['l2'],
        'clf__solver': ['lbfgs', 'saga'],
        'clf__C': np.logspace(-3, 2, 10),
        'clf__max_iter': [500, 1000, 2000],
        'clf__class_weight': [None, 'balanced']
    },
    # L1 penalty
    {
        'clf__penalty': ['l1'],
        'clf__solver': ['liblinear', 'saga'],
        'clf__C': np.logspace(-3, 2, 10),
        'clf__max_iter': [500, 1000, 2000],
        'clf__class_weight': [None, 'balanced']
    },
    # ElasticNet penalty
    {
        'clf__penalty': ['elasticnet'],
        'clf__solver': ['saga'],
        'clf__C': np.logspace(-3, 2, 10),
        'clf__l1_ratio': np.linspace(0.1, 0.9, 5),
        'clf__max_iter': [500, 1000, 2000],
        'clf__class_weight': [None, 'balanced']
    }
]

rs_logreg = RandomizedSearchCV(pipe_log_reg, param_distributions=param_distributions_logreg , scoring=f2_scorer, random_state=42, cv=5, n_iter=50)
rs_logreg.fit(X_train, y_train)



In [11]:
print(f'score F2: {rs_logreg.best_score_}')
print(f'best param : {rs_logreg.best_params_}')

score F2: 0.6018068817726352
best param : {'clf__solver': 'saga', 'clf__penalty': 'l2', 'clf__max_iter': 500, 'clf__class_weight': 'balanced', 'clf__C': np.float64(0.1668100537200059)}


In [12]:
logreg_tuned = rs_logreg.best_estimator_
logreg_tuned.fit(X_train, y_train)
y_pred  = logreg_tuned.predict(X_test)
y_proba = logreg_tuned.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))
print("Test F2:", rs_logreg.best_score_.round(4))


=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.9363    0.7733    0.8470       247
           1     0.3778    0.7234    0.4964        47

    accuracy                         0.7653       294
   macro avg     0.6570    0.7483    0.6717       294
weighted avg     0.8470    0.7653    0.7909       294

Test ROC-AUC: 0.8002
Test F2: 0.6018


# Model 2 - Decision Tree Classifier

In [13]:
from sklearn.tree import DecisionTreeClassifier

dec_tree_clf = DecisionTreeClassifier(random_state=42)

pipe_tree = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTETomek(random_state=42)),
    ("clf", dec_tree_clf)
])

In [14]:
pipe_tree.fit(X_train, y_train)
y_pred  = pipe_tree.predict(X_test)
y_proba = pipe_tree.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))


=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8760    0.8583    0.8671       247
           1     0.3269    0.3617    0.3434        47

    accuracy                         0.7789       294
   macro avg     0.6015    0.6100    0.6053       294
weighted avg     0.7883    0.7789    0.7834       294

Test ROC-AUC: 0.61


> ## HyperParameter Tuning

In [15]:
max_depth_tree = [int(x) for x in np.linspace(5, 50, 20)]  
split_tree = [int(x) for x in np.linspace(2, 50, 20)]      
leaf_tree = [int(x) for x in np.linspace(1, 50, 20)]       
max_features_tree = [None, 'sqrt', 'log2']                 
criterion_tree = ["gini", "entropy"]

hyperparameters_tree = {
    'clf__max_depth': max_depth_tree,
    'clf__min_samples_split': split_tree,
    'clf__min_samples_leaf': leaf_tree,
    'clf__max_features': max_features_tree,
    'clf__criterion': criterion_tree,
    'clf__class_weight': [None, "balanced"]
}

rs_tree = RandomizedSearchCV(pipe_tree, hyperparameters_tree, scoring=f2_scorer, random_state=42, cv=5, n_iter=50)
rs_tree.fit(X_train, y_train)

In [16]:
print(f'score F2: {rs_tree.best_score_}')
print(f'best param : {rs_tree.best_params_}')

score F2: 0.5215315949554459
best param : {'clf__min_samples_split': 37, 'clf__min_samples_leaf': 50, 'clf__max_features': 'log2', 'clf__max_depth': 28, 'clf__criterion': 'entropy', 'clf__class_weight': None}


In [17]:
tree_tuned = rs_tree.best_estimator_
tree_tuned.fit(X_train, y_train)
y_pred  = tree_tuned.predict(X_test)
y_proba = tree_tuned.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))
print("Test F2:", rs_tree.best_score_.round(4))


=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.9009    0.7733    0.8322       247
           1     0.3171    0.5532    0.4031        47

    accuracy                         0.7381       294
   macro avg     0.6090    0.6632    0.6177       294
weighted avg     0.8076    0.7381    0.7636       294

Test ROC-AUC: 0.6898
Test F2: 0.5215


# Model 3 - Bagging Classifier

In [18]:
from sklearn.ensemble import BaggingClassifier

bagging_base = BaggingClassifier(random_state=42)

pipe_bagging = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTETomek(random_state=42)),
    ("clf", bagging_base)
])

In [19]:
pipe_bagging.fit(X_train, y_train)
y_pred  = pipe_bagging.predict(X_test)
y_proba = pipe_bagging.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))


=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8889    0.9393    0.9134       247
           1     0.5455    0.3830    0.4500        47

    accuracy                         0.8503       294
   macro avg     0.7172    0.6611    0.6817       294
weighted avg     0.8340    0.8503    0.8393       294

Test ROC-AUC: 0.7869


> ## HyperParameter Tuning

In [20]:
Hyperparameter_bag = {
    'clf__n_estimators': [50, 100, 200, 300, 400, 500],
    'clf__max_samples': np.linspace(0.5, 1.0, 6),      
    'clf__max_features': np.linspace(0.5, 1.0, 6),    
    'clf__bootstrap': [True, False],
    'clf__bootstrap_features': [True, False]
}

rs_bag = RandomizedSearchCV(pipe_bagging, Hyperparameter_bag, scoring=f2_scorer, random_state=42, cv=5, n_iter=50)
rs_bag.fit(X_train, y_train)

In [21]:
print(f'score :{rs_bag.best_score_}, best param : {rs_bag.best_params_}')

score :0.4224169415077207, best param : {'clf__n_estimators': 300, 'clf__max_samples': np.float64(0.5), 'clf__max_features': np.float64(0.7), 'clf__bootstrap_features': False, 'clf__bootstrap': True}


In [22]:
bagging_tuned = rs_bag.best_estimator_
bagging_tuned.fit(X_train, y_train)
y_pred  = bagging_tuned.predict(X_test)
y_proba = bagging_tuned.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))
print("Test F2:", rs_bag.best_score_.round(4))


=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8910    0.9595    0.9240       247
           1     0.6429    0.3830    0.4800        47

    accuracy                         0.8673       294
   macro avg     0.7669    0.6712    0.7020       294
weighted avg     0.8513    0.8673    0.8530       294

Test ROC-AUC: 0.8271
Test F2: 0.4224


# Model 4 - Ada Boost Classifier 

In [23]:
from sklearn.ensemble import AdaBoostClassifier

best_estimator = DecisionTreeClassifier(random_state=42)
boost_model = AdaBoostClassifier(estimator= best_estimator,
                                 algorithm='SAMME',
                                 random_state=42
                                )
pipe_boost = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTETomek(random_state=42)),
    ("clf", boost_model) 
])

In [24]:
pipe_boost.fit(X_train, y_train)
y_pred  = pipe_boost.predict(X_test)
y_proba = pipe_boost.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))


=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8724    0.8583    0.8653       247
           1     0.3137    0.3404    0.3265        47

    accuracy                         0.7755       294
   macro avg     0.5931    0.5994    0.5959       294
weighted avg     0.7831    0.7755    0.7792       294

Test ROC-AUC: 0.5994




> ## HyperParameter Tuning

In [25]:
n_estimator_boost = [200,300,400,500]
learning_rate_boost = [int(x) for x in np.linspace(1, 10, 10)]
max_depth_tree = [int(x) for x in np.linspace(20, 100, 40)]
split_tree = [int(x) for x in np.linspace(10, 100, 30)]
leaf_tree = [int(x) for x in np.linspace(10, 100, 30)]

hyperparameters = {
    'clf__n_estimators': n_estimator_boost,
    'clf__learning_rate': learning_rate_boost,
    'clf__estimator__max_depth': max_depth_tree,
    'clf__estimator__min_samples_split': split_tree,
    'clf__estimator__min_samples_leaf': leaf_tree,
    'clf__estimator__class_weight': ["balanced",None]
}

rs_boost = RandomizedSearchCV(pipe_boost, hyperparameters, scoring='average_precision', random_state=0, cv=5, n_iter=50)
rs_boost.fit(X_train, y_train)



In [26]:
print(f'score :{rs_boost.best_score_}, best param : {rs_boost.best_params_}')

score :0.5499306233564831, best param : {'clf__n_estimators': 500, 'clf__learning_rate': 7, 'clf__estimator__min_samples_split': 68, 'clf__estimator__min_samples_leaf': 28, 'clf__estimator__max_depth': 81, 'clf__estimator__class_weight': None}


In [27]:
boost_tuned = rs_boost.best_estimator_
boost_tuned.fit(X_train, y_train)
y_pred  = boost_tuned.predict(X_test)
y_proba = boost_tuned.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))
print("Test F2:", rs_boost.best_score_.round(4))




=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8633    0.9717    0.9143       247
           1     0.5625    0.1915    0.2857        47

    accuracy                         0.8469       294
   macro avg     0.7129    0.5816    0.6000       294
weighted avg     0.8152    0.8469    0.8138       294

Test ROC-AUC: 0.7924
Test F2: 0.5499


# Model 4 - XGBoost Classifier 

In [22]:
from xgboost import XGBClassifier

xg_model = XGBClassifier(random_state=42)

pipe_xg = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTETomek(random_state=42)),
    ("clf", xg_model) 
])

In [24]:
pipe_xg.fit(X_train, y_train)
y_pred  = pipe_xg.predict(X_test)
y_proba = pipe_xg.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))




=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.8769    0.9514    0.9126       247
           1     0.5385    0.2979    0.3836        47

    accuracy                         0.8469       294
   macro avg     0.7077    0.6246    0.6481       294
weighted avg     0.8228    0.8469    0.8280       294

Test ROC-AUC: 0.7822


> ## HyperParameter Tuning

In [32]:
n_estimator_xg = [200,400,600,800]
learning_rate_xg = [int(x) for x in np.linspace(0.01, 5, 10)]
max_depth_xg = [int(x) for x in np.linspace(2, 40, 40)]
child_weight = [int(x) for x in np.linspace(1, 20, 20)]
subsample = [int(x) for x in np.linspace(0.1, 1, 10)]
colsample = [int(x) for x in np.linspace(0.1, 1, 10)]
pos_weight = [int(x) for x in np.linspace(10, 50, 30)]

hyperparameters_xg = {
    # Jumlah pohon boosting (coba range sedang → besar)
    "clf__n_estimators": [200, 400, 600],

    # Step kontribusi tiap pohon (lebih kecil → lebih hati2)
    "clf__learning_rate": [0.01, 0.05, 0.1],

    # Kontrol kedalaman & kompleksitas pohon
    "clf__max_depth": [3, 5, 7],
    "clf__min_child_weight": [1, 3],

    # Subsampling (bantu generalisasi)
    "clf__subsample": [0.8, 1.0],
    "clf__colsample_bytree": [0.8, 1.0],

    # Imbalance handling (kalau tidak pakai SMOTE Tomek)
    "clf__scale_pos_weight": [1, 5, 10]
}

rs_xg = RandomizedSearchCV(pipe_xg, hyperparameters_xg, scoring=f2_scorer, random_state=42, cv=5, n_iter=50,n_jobs=4)
rs_xg.fit(X_train, y_train)



In [33]:
print(f'score :{rs_xg.best_score_}, best param : {rs_xg.best_params_}')

score :0.566496555049192, best param : {'clf__subsample': 0.8, 'clf__scale_pos_weight': 10, 'clf__n_estimators': 200, 'clf__min_child_weight': 1, 'clf__max_depth': 7, 'clf__learning_rate': 0.01, 'clf__colsample_bytree': 0.8}


In [34]:
xg_best = rs_xg.best_estimator_
xg_best.fit(X_train, y_train)
y_pred  = xg_best.predict(X_test)
y_proba = xg_best.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))
print("Test F2:", rs_xg.best_score_.round(4))




=== TEST REPORT ===
              precision    recall  f1-score   support

           0     0.9316    0.7166    0.8101       247
           1     0.3269    0.7234    0.4503        47

    accuracy                         0.7177       294
   macro avg     0.6293    0.7200    0.6302       294
weighted avg     0.8349    0.7177    0.7526       294

Test ROC-AUC: 0.8042
Test F2: 0.5665


# Model 5 - Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier

rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)

pipe_rf = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTETomek(random_state=42)),
    ("clf", rf_base)
])

In [34]:
pipe_rf.fit(X_train, y_train)
y_pred  = pipe_rf.predict(X_test)
y_proba = pipe_rf.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT (RF baseline) ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))


=== TEST REPORT (RF baseline) ===
              precision    recall  f1-score   support

           0     0.8815    0.9636    0.9207       247
           1     0.6250    0.3191    0.4225        47

    accuracy                         0.8605       294
   macro avg     0.7532    0.6414    0.6716       294
weighted avg     0.8405    0.8605    0.8411       294

Test ROC-AUC: 0.8094


> ## HyperParameter Tuning

In [35]:
rf_param_dist = {
    "clf__n_estimators": [200, 300, 400, 600, 800],
    "clf__max_depth": [None, 5, 8, 12, 16, 24],
    "clf__min_samples_split": [2, 5, 10, 20],
    "clf__min_samples_leaf": [1, 2, 4, 8],
    "clf__max_features": ["sqrt", "log2", None],
    "clf__bootstrap": [True, False],
    "clf__class_weight": [None, "balanced"]
}

rs_rf = RandomizedSearchCV(
    estimator=pipe_rf,
    param_distributions=rf_param_dist,
    n_iter=60,
    scoring=f2_scorer,     # konsisten sama yang lain (F2 fokus recall)
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
rs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [36]:
print(f'RF best F2 (CV): {rs_rf.best_score_:.4f}')
print(f'RF best params : {rs_rf.best_params_}')

RF best F2 (CV): 0.4844
RF best params : {'clf__n_estimators': 600, 'clf__min_samples_split': 20, 'clf__min_samples_leaf': 2, 'clf__max_features': 'sqrt', 'clf__max_depth': 5, 'clf__class_weight': 'balanced', 'clf__bootstrap': True}


In [37]:
rf_best = rs_rf.best_estimator_
rf_best.fit(X_train, y_train)
y_pred  = rf_best.predict(X_test)
y_proba = rf_best.predict_proba(X_test)[:, 1]
print("\n=== TEST REPORT (RF tuned) ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))
print("Test F2 (CV best):", rs_rf.best_score_.round(4))


=== TEST REPORT (RF tuned) ===
              precision    recall  f1-score   support

           0     0.8907    0.8907    0.8907       247
           1     0.4255    0.4255    0.4255        47

    accuracy                         0.8163       294
   macro avg     0.6581    0.6581    0.6581       294
weighted avg     0.8163    0.8163    0.8163       294

Test ROC-AUC: 0.8095
Test F2 (CV best): 0.4844


# Model 6 - Ensemble Stacking

In [38]:
from sklearn.ensemble import StackingClassifier

# base learners dari model tuned
estimators = [
    ("rf", rf_best.named_steps["clf"]),        # ambil clf dari pipeline tuned
    ("bagging", bagging_tuned.named_steps["clf"]),
    ("logreg", logreg_tuned.named_steps["clf"])
]

# meta-learner → bisa LogisticRegression atau RandomForest
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=42, max_iter=2000),
    passthrough=False,        # kalau True → feature original ikut dikasih ke meta-learner
    n_jobs=-1
)

# pipeline full (prep + smote + stack)
pipe_stack = ImbPipeline(steps=[
    ("prep", prep),
    ("smote", SMOTETomek(random_state=42)),
    ("clf", stack_model)
])

# baseline stack
pipe_stack.fit(X_train, y_train)
y_pred  = pipe_stack.predict(X_test)
y_proba = pipe_stack.predict_proba(X_test)[:, 1]

print("\n=== TEST REPORT (Stacking baseline) ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))


=== TEST REPORT (Stacking baseline) ===
              precision    recall  f1-score   support

           0     0.9027    0.9393    0.9206       247
           1     0.5946    0.4681    0.5238        47

    accuracy                         0.8639       294
   macro avg     0.7487    0.7037    0.7222       294
weighted avg     0.8535    0.8639    0.8572       294

Test ROC-AUC: 0.8303


> ## HyperParameter Tuning

In [39]:
param_stack = {
    "clf__final_estimator__C": np.logspace(-3, 2, 10),
    "clf__final_estimator__penalty": ["l2"],
    "clf__final_estimator__solver": ["lbfgs", "saga"]
}

rs_stack = RandomizedSearchCV(
    pipe_stack,
    param_distributions=param_stack,
    n_iter=20,
    cv=5,
    scoring=f2_scorer,   # konsisten F2
    random_state=42,
    n_jobs=-1,
    verbose=1
)

rs_stack.fit(X_train, y_train)

print(f"Stacking best F2 (CV): {rs_stack.best_score_:.4f}")
print(f"Stacking best params : {rs_stack.best_params_}")

stack_best = rs_stack.best_estimator_

# evaluate final
y_pred  = stack_best.predict(X_test)
y_proba = stack_best.predict_proba(X_test)[:, 1]

print("\n=== TEST REPORT (Stacking tuned) ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))
print("Test F2 (CV best):", rs_stack.best_score_.round(4))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Stacking best F2 (CV): 0.5256
Stacking best params : {'clf__final_estimator__solver': 'lbfgs', 'clf__final_estimator__penalty': 'l2', 'clf__final_estimator__C': np.float64(0.001)}

=== TEST REPORT (Stacking tuned) ===
              precision    recall  f1-score   support

           0     0.9073    0.9109    0.9091       247
           1     0.5217    0.5106    0.5161        47

    accuracy                         0.8469       294
   macro avg     0.7145    0.7108    0.7126       294
weighted avg     0.8456    0.8469    0.8463       294

Test ROC-AUC: 0.8293
Test F2 (CV best): 0.5256


## Save & Load Model

In [40]:
import joblib
import os

# bikin folder models kalau belum ada
os.makedirs("models", exist_ok=True)

# dictionary model yang sudah dituning
models = {
    "logreg": logreg_tuned,
    "tree": tree_tuned,
    "bagging": bagging_tuned,
    "boost": boost_tuned,
    "xgboost": xg_best,
    "rf": rf_best,
    "stack": stack_best
}

# save loop
for name, model in models.items():
    path = f"models/{name}_tuned.pkl"
    joblib.dump(model, path)
    print(f"✅ Saved {name} -> {path}")


✅ Saved logreg -> models/logreg_tuned.pkl
✅ Saved tree -> models/tree_tuned.pkl
✅ Saved bagging -> models/bagging_tuned.pkl
✅ Saved boost -> models/boost_tuned.pkl
✅ Saved xgboost -> models/xgboost_tuned.pkl
✅ Saved rf -> models/rf_tuned.pkl
✅ Saved stack -> models/stack_tuned.pkl


In [40]:
# MODEL XGBOOST AJA TANPA PIPELINE
xgb_step = xg_best.named_steps["clf"]
xgb_step.save_model("models/xgb_tuned_model_only.json")

Contoh

In [42]:
import joblib

# load model yang udah disave
rf_loaded = joblib.load("models/rf_tuned.pkl")

# prediksi di test set
y_pred  = rf_loaded.predict(X_test)
y_proba = rf_loaded.predict_proba(X_test)[:, 1]

# evaluasi
from sklearn.metrics import classification_report, roc_auc_score

print("\n=== TEST REPORT (RF loaded) ===")
print(classification_report(y_test, y_pred, digits=4))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba).round(4))



=== TEST REPORT (RF loaded) ===
              precision    recall  f1-score   support

           0     0.8907    0.8907    0.8907       247
           1     0.4255    0.4255    0.4255        47

    accuracy                         0.8163       294
   macro avg     0.6581    0.6581    0.6581       294
weighted avg     0.8163    0.8163    0.8163       294

Test ROC-AUC: 0.8095
