In [1]:
import numpy as np
import pandas as pd
import os 
# import data
current_dir = os.getcwd()
parent_dir  = os.path.dirname(current_dir)
file_path1  = os.path.join(parent_dir, 'data', 'processed_data', 'train.csv')
file_path2  = os.path.join(parent_dir, 'data', 'processed_data', 'test.csv')
train = pd.read_csv(file_path1)
test = pd.read_csv(file_path2)

In [2]:
# sort data
train = train.sort_values("SK_ID_CURR")
y     = train["TARGET"]
# exclude features
excluded_feats = ["SK_ID_CURR", "TARGET"]
features = [f for f in train.columns if f not in excluded_feats]
# check dimensions
print(train[features].shape)
print(test[features].shape)
X = train[features].copy()

(307511, 106)
(48744, 106)


In [3]:

from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

train['OWN_CAR_AGE'] = train['OWN_CAR_AGE'].fillna(0)
test['OWN_CAR_AGE'] = test['OWN_CAR_AGE'].fillna(0)

numeric_cols = [col for col in features if train[col].dtype != np.object_]
categorical_cols = [col for col in features if train[col].dtype == np.object_]

# 2. Xây dựng Preprocessor (Bộ xử lý)
# - Cột Số: 'passthrough' -> Giữ nguyên NaN để XGBoost tự xử lý (Nhanh & Hiệu quả)
# - Cột Chữ: Điền 'MISSING' -> OneHot
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_cols), 
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), categorical_cols)
    ],
    verbose_feature_names_out=False
)

# 3. Tính toán tỷ lệ mẫu lệch (Imbalance Ratio)
# Công thức: Số lượng mẫu âm (0) / Số lượng mẫu dương (1)
scale_pos_weight_value = (y == 0).sum() / (y == 1).sum()

# 4. Định nghĩa Pipeline chính
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        scale_pos_weight=scale_pos_weight_value, # CỰC KỲ QUAN TRỌNG cho Credit Scoring
        n_jobs=-1,
        random_state=42,
        tree_method='hist' # Dùng thuật toán Histogram (nhanh hơn cho dữ liệu lớn)
    ))
])

# 5. Thiết lập GridSearch
param_grid = {
    'clf__n_estimators': [100, 200, 300],    # Số lượng cây
    'clf__learning_rate': [0.01, 0.05, 0.1], # Tốc độ học
    'clf__max_depth': [3, 5, 7],             # Độ sâu (Thấp để tránh Overfit)
    'clf__subsample': [0.8],                 # Chống Overfit
    'clf__colsample_bytree': [0.8]           # Chống Overfit
}

grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,       # Cross Validation 3 lần
    verbose=1,
    n_jobs=-1
)

# 6. Chạy huấn luyện
print("Bắt đầu training...")
grid_search.fit(X_train, y_train)

# 7. Kết quả
print(f"Best AUC: {grid_search.best_score_:.4f}")
print("Best Params:", grid_search.best_params_)

Bắt đầu training...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best AUC: 0.7629
Best Params: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.1, 'clf__max_depth': 3, 'clf__n_estimators': 300, 'clf__subsample': 0.8}


In [6]:
best_model = grid_search.best_estimator_
best_model

In [7]:
from sklearn.metrics import roc_auc_score
y_hat_train = best_model.predict_proba(X_train)[:, 1]
y_hat_test = best_model.predict_proba(X_test)[:, 1]
print (f"AUC train: {roc_auc_score(y_train, y_hat_train)}")
print (f"AUC test: {roc_auc_score(y_test, y_hat_test)}")

AUC train: 0.7852811766977149
AUC test: 0.7687220569099549
