### 1. Data Loading

In [1]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split

In [23]:
def read_data(df_path: str, feature_path: str):
    df = pd.read_parquet(df_path)
    features = pd.read_parquet(feature_path)
    
    y = df['Attrition']
    selected_features = features[features['Final'] == 1].index
    X = df.loc[:,selected_features]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test)
    
    return (X_train, y_train), (X_test, y_test), (X_valid, y_valid)

In [24]:
# balance handled data
bdf_path = 'Preprocessing/balanced_df.parquet'
imbdf_path = 'Preprocessing/imbalanced_df.parquet'
feature_path = 'Preprocessing/selected_features.parquet'

btrain, btest, bvalid = read_data(bdf_path, feature_path)
imbtrain, imbtest, imbvalid = read_data(imbdf_path, feature_path)

### 2. Model Cross Validation

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

Ở đây bước preprocessing được tách riêng, do đó pipeline sẽ chỉ bao gồm bước estimate

In [12]:
models = {
            'RFC': RandomForestClassifier(),
            'GaussianNB': GaussianNB(),
            'XGB': XGBClassifier(),
            'XGBRF': XGBRFClassifier(),
            'CatBoost': CatBoostClassifier(),
            'LGBM': LGBMClassifier()
        }

In [27]:
X_train, y_train = btrain[0], btrain[1]
bmean, bstd, all_bscores = [], [], []

for model_name, model in models.items():
    print(model_name)
    
    pl = Pipeline(
        steps=[
            ('classifier', model)
        ]
    )
    
    metric = make_scorer(precision_score)
    
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    
    scores = cross_val_score(pl, X_train, y_train, scoring=metric, cv=cv)
    all_bscores.append(scores)
    
    print('-' * 30)
    
    bmean.append(np.mean(scores))
    bstd.append(np.std(scores))

RFC
------------------------------
GaussianNB
------------------------------
XGB
------------------------------
XGBRF
------------------------------
CatBoost
Learning rate set to 0.013159
0:	learn: 0.6863643	total: 2.48ms	remaining: 2.48s
1:	learn: 0.6785605	total: 4.34ms	remaining: 2.17s
2:	learn: 0.6714098	total: 6.17ms	remaining: 2.05s
3:	learn: 0.6644388	total: 9.03ms	remaining: 2.25s
4:	learn: 0.6561078	total: 10.9ms	remaining: 2.17s
5:	learn: 0.6486866	total: 12.7ms	remaining: 2.1s
6:	learn: 0.6412659	total: 14.7ms	remaining: 2.09s
7:	learn: 0.6360543	total: 16.5ms	remaining: 2.04s
8:	learn: 0.6299853	total: 18.3ms	remaining: 2.02s
9:	learn: 0.6230391	total: 21.1ms	remaining: 2.09s
10:	learn: 0.6165793	total: 23.1ms	remaining: 2.07s
11:	learn: 0.6106138	total: 25.1ms	remaining: 2.06s
12:	learn: 0.6053246	total: 27ms	remaining: 2.05s
13:	learn: 0.6005356	total: 30.1ms	remaining: 2.12s
14:	learn: 0.5947323	total: 31.8ms	remaining: 2.09s
15:	learn: 0.5891167	total: 33.9ms	remaining:

In [None]:
X_train, y_train = imbtrain[0], imbtrain[1]
imbmean, imbstd, all_imbscores = [], [], []

for model_name, model in models.items():
    print(model_name)
    
    pl = Pipeline(
        steps=[
            ('classifiy', model)
        ]
    )
    
    metric = make_scorer(precision_score)
    
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    
    scores = cross_val_score(pl, X_train, y_train, scoring=metric, cv=cv)
    all_imbscores.append(scores)
    
    print('-' * 30)
    
    imbmean.append(np.mean(scores))
    imbstd.append(np.std(scores))

RFC
------------------------------
GaussianNB
------------------------------
XGB
------------------------------
XGBRF
------------------------------
CatBoost
Learning rate set to 0.010553
0:	learn: 0.6862794	total: 1.06ms	remaining: 1.06s
1:	learn: 0.6788378	total: 1.72ms	remaining: 861ms
2:	learn: 0.6711361	total: 2.7ms	remaining: 896ms
3:	learn: 0.6625620	total: 3.78ms	remaining: 942ms
4:	learn: 0.6549449	total: 4.74ms	remaining: 943ms
5:	learn: 0.6480131	total: 5.63ms	remaining: 933ms
6:	learn: 0.6404200	total: 6.58ms	remaining: 934ms
7:	learn: 0.6332463	total: 7.47ms	remaining: 927ms
8:	learn: 0.6271394	total: 8.36ms	remaining: 921ms
9:	learn: 0.6208953	total: 9.38ms	remaining: 928ms
10:	learn: 0.6151938	total: 10.3ms	remaining: 924ms
11:	learn: 0.6096392	total: 11.3ms	remaining: 928ms
12:	learn: 0.6023379	total: 12.2ms	remaining: 927ms
13:	learn: 0.5960408	total: 13.8ms	remaining: 969ms
14:	learn: 0.5908108	total: 14.6ms	remaining: 956ms
15:	learn: 0.5850181	total: 16.8ms	remainin

In [29]:
scores_df = pd.DataFrame()
scores_df['Balanced Mean Precision'] = bmean
scores_df['Balanced Std'] = bstd
scores_df['Imbalanced Mean Precision'] = imbmean
scores_df['Imbalanced Std'] = imbstd
scores_df.index = models.keys()
scores_df

Unnamed: 0,Balanced Mean Precision,Balanced Std,Imbalanced Mean Precision,Imbalanced Std
RFC,0.953119,0.026624,0.681455,0.146768
GaussianNB,0.665232,0.029003,0.437485,0.111588
XGB,0.942206,0.023325,0.620188,0.14163
XGBRF,0.880048,0.02633,0.550606,0.118926
CatBoost,0.959653,0.019488,0.758201,0.169035
LGBM,0.951675,0.021848,0.634946,0.129869


Mô hình CatBoost cho kết quả tốt nhất, vì vậy ta sẽ lựa chọn mô hình này

### 3. Fine-tuning

#### 3.1. GridSearchCV

In [55]:
from sklearn.base import BaseEstimator
class ClassifierSwitcher(BaseEstimator):
  def __init__(
      self, 
      estimator = CatBoostClassifier(),
  ):
      """
      A Custom BaseEstimator that can switch between classifiers.
      :param estimator: sklearn object - The classifier
      """ 
      
      self.estimator = estimator


  def fit(self, X, y=None, **kwargs):
      self.estimator.fit(X, y)
      return self


  def predict(self, X, y=None):
      return self.estimator.predict(X)


  def predict_proba(self, X):
      return self.estimator.predict_proba(X)


  def score(self, X, y):
      return self.estimator.score(X, y)

In [56]:
pipeline = Pipeline(
    steps=[("clf", ClassifierSwitcher())]
)

In [57]:
grid_params = [
    {
        'clf__estimator': [CatBoostClassifier()],
        'clf__estimator__iterations': 100*np.arange(5, 21),
        'clf__estimator__learning_rate': np.exp(np.arange(-10, 0)),
        'clf__estimator__depth': np.arange(6,11),
        'clf__estimator__random_strength': np.arange(1,11),
        'clf__estimator__max_leaves': np.arange(20,40),
        'clf__estimator__min_data_in_leaf': np.arange(1,10),
        'clf__estimator__scale_pos_weight': 0.2*np.arange(5, 11),
        'clf__estimator__grow_policy': ["SymmetricTree", "Depthwise", "Lossguide"]
    },
]

In [59]:
gscv = GridSearchCV(pipeline, grid_params, cv=5, n_jobs=12, scoring=make_scorer(precision_score), return_train_score=True)

In [None]:
X_train, y_train = btrain[0], btrain[1]
gscv.fit(X_train, y_train)

#### 3.2. Optuna

In [66]:
import optuna

def objective(trial, X_train = btrain[0], y_train = btrain[1], X_valid = bvalid[0], y_valid = bvalid[1]):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'depth': trial.suggest_int('depth', 6, 10),
        'random_strength': trial.suggest_int('random_strength', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 20, 40),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 10),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 2.0),
        'grow_policy': trial.suggest_categorical('grow_policy', ["Lossguide"]),
        'verbose': 0
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return precision_score(y_valid, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2025-04-04 15:36:28,990] A new study created in memory with name: no-name-f2fa520f-a6cf-48e8-93ff-2e6a38c35f0b
[I 2025-04-04 15:36:32,626] Trial 0 finished with value: 0.9262295081967213 and parameters: {'iterations': 900, 'learning_rate': 0.02077944372181392, 'depth': 7, 'random_strength': 4, 'max_leaves': 31, 'min_data_in_leaf': 4, 'scale_pos_weight': 1.5235725844853305, 'grow_policy': 'Lossguide'}. Best is trial 0 with value: 0.9262295081967213.
[I 2025-04-04 15:36:36,784] Trial 1 finished with value: 0.9421487603305785 and parameters: {'iterations': 1400, 'learning_rate': 0.07617582338446005, 'depth': 8, 'random_strength': 9, 'max_leaves': 21, 'min_data_in_leaf': 10, 'scale_pos_weight': 1.2469419340434558, 'grow_policy': 'Lossguide'}. Best is trial 1 with value: 0.9421487603305785.
[I 2025-04-04 15:36:42,002] Trial 2 finished with value: 0.9583333333333334 and parameters: {'iterations': 1500, 'learning_rate': 0.03718637776405896, 'depth': 6, 'random_strength': 7, 'max_leaves': 3

In [67]:
optuna_trials = study.trials_dataframe()
optuna_trials

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_depth,params_grow_policy,params_iterations,params_learning_rate,params_max_leaves,params_min_data_in_leaf,params_random_strength,params_scale_pos_weight,state
0,0,0.926230,2025-04-04 15:36:28.992200,2025-04-04 15:36:32.625417,0 days 00:00:03.633217,7,Lossguide,900,0.020779,31,4,4,1.523573,COMPLETE
1,1,0.942149,2025-04-04 15:36:32.626418,2025-04-04 15:36:36.784369,0 days 00:00:04.157951,8,Lossguide,1400,0.076176,21,10,9,1.246942,COMPLETE
2,2,0.958333,2025-04-04 15:36:36.785477,2025-04-04 15:36:42.002208,0 days 00:00:05.216731,6,Lossguide,1500,0.037186,38,5,7,1.980818,COMPLETE
3,3,0.899160,2025-04-04 15:36:42.003205,2025-04-04 15:36:46.632383,0 days 00:00:04.629178,8,Lossguide,1500,0.003723,20,8,7,1.046707,COMPLETE
4,4,0.965812,2025-04-04 15:36:46.632383,2025-04-04 15:36:50.504673,0 days 00:00:03.872290,6,Lossguide,1500,0.157790,30,7,5,1.533144,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0.909091,2025-04-04 15:44:22.423563,2025-04-04 15:44:28.848991,0 days 00:00:06.425428,9,Lossguide,1500,0.004815,36,6,9,1.085842,COMPLETE
96,96,0.934959,2025-04-04 15:44:28.850099,2025-04-04 15:44:37.908434,0 days 00:00:09.058335,9,Lossguide,1900,0.023768,39,4,4,1.958083,COMPLETE
97,97,0.949580,2025-04-04 15:44:37.909424,2025-04-04 15:44:41.320045,0 days 00:00:03.410621,6,Lossguide,1200,0.081075,30,7,8,1.030437,COMPLETE
98,98,0.957627,2025-04-04 15:44:41.321046,2025-04-04 15:44:45.366245,0 days 00:00:04.045199,8,Lossguide,1000,0.034073,29,5,6,1.208878,COMPLETE


In [68]:
print("Best trial:")
print(f"  Value (Precision): {study.best_value}")
print(f"  Params: {study.best_trial.params}")

Best trial:
  Value (Precision): 0.9658119658119658
  Params: {'iterations': 1500, 'learning_rate': 0.15778952742071178, 'depth': 6, 'random_strength': 5, 'max_leaves': 30, 'min_data_in_leaf': 7, 'scale_pos_weight': 1.533144096921553, 'grow_policy': 'Lossguide'}


In [70]:
best_model = CatBoostClassifier(**study.best_trial.params)
best_model.fit(imbtrain[0], imbtrain[1])
y_pred = best_model.predict(imbvalid[0])
precision_score(imbvalid[1], y_pred)

0:	learn: 0.6201449	total: 2.15ms	remaining: 3.22s
1:	learn: 0.5776158	total: 4.16ms	remaining: 3.11s
2:	learn: 0.5373103	total: 5.84ms	remaining: 2.92s
3:	learn: 0.5018168	total: 7.72ms	remaining: 2.89s
4:	learn: 0.4736961	total: 9.95ms	remaining: 2.97s
5:	learn: 0.4581969	total: 12.1ms	remaining: 3.01s
6:	learn: 0.4421871	total: 13.9ms	remaining: 2.96s
7:	learn: 0.4295789	total: 16ms	remaining: 2.99s
8:	learn: 0.4209349	total: 18ms	remaining: 2.99s
9:	learn: 0.4145515	total: 19.6ms	remaining: 2.92s
10:	learn: 0.4049516	total: 21.5ms	remaining: 2.91s
11:	learn: 0.3987656	total: 23.2ms	remaining: 2.88s
12:	learn: 0.3905976	total: 24.8ms	remaining: 2.84s
13:	learn: 0.3842886	total: 26.5ms	remaining: 2.81s
14:	learn: 0.3787887	total: 28.2ms	remaining: 2.79s
15:	learn: 0.3752566	total: 32.8ms	remaining: 3.04s
16:	learn: 0.3718394	total: 34.8ms	remaining: 3.03s
17:	learn: 0.3668017	total: 36.7ms	remaining: 3.02s
18:	learn: 0.3619188	total: 38.3ms	remaining: 2.99s
19:	learn: 0.3574971	total

0.75

#### 3.3. RandomizedSearchCV