### 1. Data Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
def read_data(df_path: str, feature_path: str):
    df = pd.read_parquet(df_path)
    features = pd.read_parquet(feature_path)
    
    y = df['Attrition']
    selected_features = features[features['Final'] == 1].index
    X = df.loc[:,selected_features]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test)
    
    return (X_train, y_train), (X_test, y_test), (X_valid, y_valid)

In [4]:
# balance handled data
bdf_path = 'Preprocessing/balanced_df.parquet'
imbdf_path = 'Preprocessing/imbalanced_df.parquet'
feature_path = 'Preprocessing/selected_features.parquet'

btrain, btest, bvalid = read_data(bdf_path, feature_path)
imbtrain, imbtest, imbvalid = read_data(imbdf_path, feature_path)

### 2. Model Cross Validation

In [5]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

Ở đây bước preprocessing được tách riêng, do đó pipeline sẽ chỉ bao gồm bước estimate

In [7]:
models = {
            'RFC': RandomForestClassifier(),
            'GaussianNB': GaussianNB(),
            'XGB': XGBClassifier(),
            'XGBRF': XGBRFClassifier(),
            'CatBoost': CatBoostClassifier(),
            'LGBM': LGBMClassifier()
        }

In [8]:
X_train, y_train = btrain[0], btrain[1]
bmean, bstd, all_bscores = [], [], []

for model_name, model in models.items():
    print(model_name)
    
    pl = Pipeline(
        steps=[
            ('classifier', model)
        ]
    )
    
    metric = make_scorer(precision_score)
    
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    
    scores = cross_val_score(pl, X_train, y_train, scoring=metric, cv=cv)
    all_bscores.append(scores)
    
    print('-' * 30)
    
    bmean.append(np.mean(scores))
    bstd.append(np.std(scores))

RFC
------------------------------
GaussianNB
------------------------------
XGB
------------------------------
XGBRF
------------------------------
CatBoost
Learning rate set to 0.013159
0:	learn: 0.6859223	total: 146ms	remaining: 2m 25s
1:	learn: 0.6780627	total: 151ms	remaining: 1m 15s
2:	learn: 0.6707909	total: 157ms	remaining: 52s
3:	learn: 0.6624840	total: 161ms	remaining: 40.2s
4:	learn: 0.6548314	total: 165ms	remaining: 32.9s
5:	learn: 0.6472390	total: 170ms	remaining: 28.1s
6:	learn: 0.6417085	total: 173ms	remaining: 24.6s
7:	learn: 0.6340452	total: 176ms	remaining: 21.8s
8:	learn: 0.6280935	total: 178ms	remaining: 19.6s
9:	learn: 0.6207963	total: 181ms	remaining: 17.9s
10:	learn: 0.6144868	total: 183ms	remaining: 16.4s
11:	learn: 0.6091346	total: 184ms	remaining: 15.2s
12:	learn: 0.6023809	total: 186ms	remaining: 14.1s
13:	learn: 0.5963124	total: 188ms	remaining: 13.3s
14:	learn: 0.5909191	total: 190ms	remaining: 12.5s
15:	learn: 0.5850342	total: 191ms	remaining: 11.8s
16:	le

In [9]:
X_train, y_train = imbtrain[0], imbtrain[1]
imbmean, imbstd, all_imbscores = [], [], []

for model_name, model in models.items():
    print(model_name)
    
    pl = Pipeline(
        steps=[
            ('classifiy', model)
        ]
    )
    
    metric = make_scorer(precision_score)
    
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    
    scores = cross_val_score(pl, X_train, y_train, scoring=metric, cv=cv)
    all_imbscores.append(scores)
    
    print('-' * 30)
    
    imbmean.append(np.mean(scores))
    imbstd.append(np.std(scores))

RFC
------------------------------
GaussianNB
------------------------------
XGB
------------------------------
XGBRF
------------------------------
CatBoost
Learning rate set to 0.010553
0:	learn: 0.6860612	total: 932us	remaining: 932ms
1:	learn: 0.6767615	total: 1.85ms	remaining: 924ms
2:	learn: 0.6685302	total: 2.85ms	remaining: 948ms
3:	learn: 0.6599064	total: 3.8ms	remaining: 946ms
4:	learn: 0.6513806	total: 4.71ms	remaining: 938ms
5:	learn: 0.6439187	total: 5.55ms	remaining: 920ms
6:	learn: 0.6375695	total: 6.44ms	remaining: 914ms
7:	learn: 0.6314728	total: 7.81ms	remaining: 969ms
8:	learn: 0.6258149	total: 8.83ms	remaining: 972ms
9:	learn: 0.6195878	total: 9.72ms	remaining: 963ms
10:	learn: 0.6149330	total: 10.8ms	remaining: 967ms
11:	learn: 0.6085251	total: 11.7ms	remaining: 963ms
12:	learn: 0.6024714	total: 12.6ms	remaining: 957ms
13:	learn: 0.5968035	total: 13.5ms	remaining: 950ms
14:	learn: 0.5901099	total: 14.7ms	remaining: 963ms
15:	learn: 0.5848681	total: 15.6ms	remaining

In [10]:
scores_df = pd.DataFrame()
scores_df['Balanced Mean Precision'] = bmean
scores_df['Balanced Std'] = bstd
scores_df['Imbalanced Mean Precision'] = imbmean
scores_df['Imbalanced Std'] = imbstd
scores_df.index = models.keys()
scores_df

Unnamed: 0,Balanced Mean Precision,Balanced Std,Imbalanced Mean Precision,Imbalanced Std
RFC,0.956225,0.019644,0.742937,0.20183
GaussianNB,0.676219,0.035725,0.456636,0.066598
XGB,0.94733,0.022817,0.599336,0.127777
XGBRF,0.906868,0.027985,0.570162,0.129036
CatBoost,0.961702,0.021873,0.743304,0.150791
LGBM,0.952435,0.02221,0.634775,0.121001


Mô hình CatBoost cho kết quả tốt nhất, vì vậy ta sẽ lựa chọn mô hình này

### 3. Fine-tuning

#### 3.1. Optuna

In [15]:
import optuna

def objective(trial, X_train = btrain[0], y_train = btrain[1], X_valid = bvalid[0], y_valid = bvalid[1]):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'depth': trial.suggest_int('depth', 6, 10),
        'random_strength': trial.suggest_int('random_strength', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 20, 40),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 10),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 2.0),
        'grow_policy': trial.suggest_categorical('grow_policy', ["Lossguide"]),
        'verbose': 0
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return precision_score(y_valid, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2025-04-04 21:40:57,224] A new study created in memory with name: no-name-f77ffd0b-a9c6-4941-85b6-0f26368affef
[I 2025-04-04 21:41:02,953] Trial 0 finished with value: 0.8102189781021898 and parameters: {'iterations': 1800, 'learning_rate': 0.003336424812494266, 'depth': 6, 'random_strength': 6, 'max_leaves': 21, 'min_data_in_leaf': 4, 'scale_pos_weight': 1.610023784064942, 'grow_policy': 'Lossguide'}. Best is trial 0 with value: 0.8102189781021898.
[I 2025-04-04 21:41:05,340] Trial 1 finished with value: 0.923728813559322 and parameters: {'iterations': 500, 'learning_rate': 0.027406747293135317, 'depth': 8, 'random_strength': 9, 'max_leaves': 34, 'min_data_in_leaf': 3, 'scale_pos_weight': 1.7708663972232759, 'grow_policy': 'Lossguide'}. Best is trial 1 with value: 0.923728813559322.
[I 2025-04-04 21:41:08,011] Trial 2 finished with value: 0.9310344827586207 and parameters: {'iterations': 600, 'learning_rate': 0.28682602712777877, 'depth': 8, 'random_strength': 10, 'max_leaves': 39,

In [16]:
optuna_trials = study.trials_dataframe()
optuna_trials

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_depth,params_grow_policy,params_iterations,params_learning_rate,params_max_leaves,params_min_data_in_leaf,params_random_strength,params_scale_pos_weight,state
0,0,0.810219,2025-04-04 21:40:57.225347,2025-04-04 21:41:02.953862,0 days 00:00:05.728515,6,Lossguide,1800,0.003336,21,4,6,1.610024,COMPLETE
1,1,0.923729,2025-04-04 21:41:02.954847,2025-04-04 21:41:05.340594,0 days 00:00:02.385747,8,Lossguide,500,0.027407,34,3,9,1.770866,COMPLETE
2,2,0.931034,2025-04-04 21:41:05.341601,2025-04-04 21:41:08.010578,0 days 00:00:02.668977,8,Lossguide,600,0.286826,39,9,10,1.401628,COMPLETE
3,3,0.938596,2025-04-04 21:41:08.011577,2025-04-04 21:41:12.958677,0 days 00:00:04.947100,10,Lossguide,1100,0.060659,33,7,8,1.312311,COMPLETE
4,4,0.922414,2025-04-04 21:41:12.959590,2025-04-04 21:41:16.282164,0 days 00:00:03.322574,8,Lossguide,600,0.056366,35,1,6,1.078031,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0.947368,2025-04-04 21:50:25.261108,2025-04-04 21:50:31.761150,0 days 00:00:06.500042,10,Lossguide,2000,0.057447,21,5,8,1.044130,COMPLETE
96,96,0.955357,2025-04-04 21:50:31.762163,2025-04-04 21:50:38.007954,0 days 00:00:06.245791,10,Lossguide,1700,0.045737,24,4,6,1.118604,COMPLETE
97,97,0.921739,2025-04-04 21:50:38.008973,2025-04-04 21:50:44.151107,0 days 00:00:06.142134,10,Lossguide,1700,0.049920,24,4,5,1.187850,COMPLETE
98,98,0.937500,2025-04-04 21:50:44.151107,2025-04-04 21:50:50.571570,0 days 00:00:06.420463,7,Lossguide,1800,0.045182,26,3,6,1.118280,COMPLETE


In [17]:
print("Best trial:")
print(f"  Value (Precision): {study.best_value}")
print(f"  Params: {study.best_trial.params}")

Best trial:
  Value (Precision): 0.9636363636363636
  Params: {'iterations': 1900, 'learning_rate': 0.017977253264553336, 'depth': 9, 'random_strength': 8, 'max_leaves': 24, 'min_data_in_leaf': 5, 'scale_pos_weight': 1.033615987201441, 'grow_policy': 'Lossguide'}


In [18]:
best_model = CatBoostClassifier(**study.best_trial.params)
best_model.fit(imbtrain[0], imbtrain[1])
y_pred = best_model.predict(imbvalid[0])
print(f'Accuracy Score: {accuracy_score(imbvalid[1], y_pred)}, Precision score: {precision_score(imbvalid[1], y_pred)}')

0:	learn: 0.6831363	total: 2.61ms	remaining: 4.95s
1:	learn: 0.6729386	total: 4.11ms	remaining: 3.9s
2:	learn: 0.6616293	total: 6.53ms	remaining: 4.13s
3:	learn: 0.6527848	total: 8.63ms	remaining: 4.09s
4:	learn: 0.6434934	total: 11ms	remaining: 4.17s
5:	learn: 0.6340535	total: 13ms	remaining: 4.09s
6:	learn: 0.6260684	total: 15.4ms	remaining: 4.15s
7:	learn: 0.6168361	total: 17.3ms	remaining: 4.1s
8:	learn: 0.6080722	total: 19.1ms	remaining: 4s
9:	learn: 0.6002369	total: 21.2ms	remaining: 4.01s
10:	learn: 0.5935582	total: 22.7ms	remaining: 3.9s
11:	learn: 0.5866758	total: 24.5ms	remaining: 3.85s
12:	learn: 0.5783384	total: 26.1ms	remaining: 3.78s
13:	learn: 0.5722886	total: 27.8ms	remaining: 3.74s
14:	learn: 0.5652854	total: 29.5ms	remaining: 3.71s
15:	learn: 0.5593604	total: 31.6ms	remaining: 3.72s
16:	learn: 0.5538800	total: 33.6ms	remaining: 3.73s
17:	learn: 0.5486042	total: 35.2ms	remaining: 3.68s
18:	learn: 0.5437240	total: 36.7ms	remaining: 3.64s
19:	learn: 0.5391074	total: 38.2

In [20]:
import os
os.makedirs('Models', exist_ok=True)
np.savez_compressed('Models/CatBoost.npz',
                    params = study.best_trial.params,
                    model = best_model)

#### 3.2. GridSearchCV

In [None]:
# from sklearn.base import BaseEstimator
# class ClassifierSwitcher(BaseEstimator):
#   def __init__(
#       self, 
#       estimator = CatBoostClassifier(),
#   ):
#       """
#       A Custom BaseEstimator that can switch between classifiers.
#       :param estimator: sklearn object - The classifier
#       """ 
      
#       self.estimator = estimator


#   def fit(self, X, y=None, **kwargs):
#       self.estimator.fit(X, y)
#       return self


#   def predict(self, X, y=None):
#       return self.estimator.predict(X)


#   def predict_proba(self, X):
#       return self.estimator.predict_proba(X)


#   def score(self, X, y):
#       return self.estimator.score(X, y)

In [None]:
# pipeline = Pipeline(
#     steps=[("clf", ClassifierSwitcher())]
# )

# grid_params = [
#     {
#         'clf__estimator': [CatBoostClassifier()],
#         'clf__estimator__iterations': 100*np.arange(5, 21),
#         'clf__estimator__learning_rate': np.exp(np.arange(-10, 0)),
#         'clf__estimator__depth': np.arange(6,11),
#         'clf__estimator__random_strength': np.arange(1,11),
#         'clf__estimator__max_leaves': np.arange(20,40),
#         'clf__estimator__min_data_in_leaf': np.arange(1,10),
#         'clf__estimator__scale_pos_weight': 0.2*np.arange(5, 11),
#         'clf__estimator__grow_policy': ["SymmetricTree", "Depthwise", "Lossguide"]
#     },
# ]

# gscv = GridSearchCV(pipeline, grid_params, cv=5, n_jobs=12, scoring=make_scorer(precision_score), return_train_score=True)

In [None]:
# X_train, y_train = btrain[0], btrain[1]

# gscv.fit(X_train, y_train)