# UnderWriting Model Exploration

This notebook is used to explore the functionality of the `UnderWritingModel` class, which utilizes OptBinning for binning, SHAP for model explanation, MLflow for logging, and XGBoost as the training model.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
import shap
import os

# Work from the project root so the relative "data/..." paths used by the
# later cells resolve. The move is guarded: an unconditional chdir would climb
# one directory further every time this cell is re-run (or when the kernel was
# already started in the project root), silently breaking those paths.
if not os.path.isdir("data"):
    os.chdir(os.pardir)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the current working directory (changed by the os.chdir call in the first cell).
os.getcwd()

'/home/s48gb/Desktop/dhduc/vcb'

In [3]:
# Load the raw application tables; both paths are relative to the current
# working directory.
train_df, test_df = map(
    pd.read_csv,
    ("data/application_train.csv", "data/application_test.csv"),
)
# Preview the first rows of the training table.
train_df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Partition the columns of train_df by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_features = list(train_df.select_dtypes(include=['object']).columns)
numerical_features = [
    column
    for column in train_df.select_dtypes(include=['int64', 'float64']).columns
    # 'SK_ID_CURR' (identifier) and 'TARGET' (label) are not features.
    if column not in ('SK_ID_CURR', 'TARGET')
]

# Report the size of each group followed by the column names it contains.
for label, names in (
    ("Categorical features:", categorical_features),
    ("\nNumerical features:", numerical_features),
):
    print(label, len(names))
    print(names)

Categorical features: 16
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

Numerical features: 104
['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'

# Split in-time and out-of-time data

Because the data does not contain a date column, we assume that the train and test sets come from different time periods.

In [5]:
from src.utils import PreprocessFeatureSelector

# Version tag and output directories handed to the selector.
data_version = "v1"
save_train_data_path = "data/processed/train"
save_test_data_path = "data/processed/test"

# Label vector and feature frames: the label column is dropped from the
# training features, and the test frame is passed on as a copy.
y_train = train_df["TARGET"].values
X_train = train_df.drop(columns=["TARGET"])
X_test = test_df.copy()

# Build the PreprocessFeatureSelector wrapper, then run it.
pfs = PreprocessFeatureSelector(
    X_train,
    X_test,
    y_train,
    categorical_features,
    numerical_features,
    data_version=data_version,
    save_train_data_path=save_train_data_path,
    save_test_data_path=save_test_data_path,
)
(
    selected_train,
    selected_test,
    selected_features,
    excluded_features,
) = pfs.run()

print("Final Selected Features:", selected_features)
print("Total Excluded Features:", excluded_features)

2025-04-11 10:53:36,919 - INFO - Starting preprocessing and filtering...
2025-04-11 10:53:54,143 - INFO - Preprocessing complete.
2025-04-11 10:53:54,147 - INFO - Features before filtering: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_

Final Selected Features: ['CODE_GENDER', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_EMP_PHONE', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_3']
Total Excluded Features: ['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG',

In [6]:
# Class balance of the label in the selected training frame: absolute counts
# first, then the same distribution expressed in percent.
target_series = selected_train['TARGET']
print("Target value counts:", target_series.value_counts(), sep="\n")
print(
    "\nPercentage distribution:",
    target_series.value_counts(normalize=True) * 100,
    sep="\n",
)

Target value counts:
TARGET
0    282686
1     24825
Name: count, dtype: int64

Percentage distribution:
TARGET
0    91.927118
1     8.072882
Name: proportion, dtype: float64


In [7]:
# from imblearn.over_sampling import SMOTE
# from collections import Counter

# # Create SMOTE instance
# smote = SMOTE(sampling_strategy='minority',random_state=42)

# # Get features and target
# X = selected_train.drop('TARGET', axis=1)
# y = selected_train['TARGET']

# # Apply SMOTE
# X_resampled, y_resampled = smote.fit_resample(X, y)

# # Convert back to dataframe
# selected_train_balanced = pd.DataFrame(X_resampled, columns=X.columns)
# selected_train_balanced['TARGET'] = y_resampled

# print("\nAfter SMOTE:")
# print("New target value counts:")
# print(Counter(y_resampled))
# print("\nNew percentage distribution:")
# print(pd.Series(y_resampled).value_counts(normalize=True) * 100)

**TODO:** Handle the class imbalance (about 8% positives) later; the SMOTE cell above is left commented out for now.

# Model training

In [8]:
from src.underwriting_trainer import UnderWritingTrainer

# Paths to the processed train/test CSV files passed to the trainer.
processed_train = "data/processed/train/processed_train_v1.csv"
processed_test = "data/processed/test/processed_test_v1.csv"

# Train the "xgb" underwriting model and keep whatever train_model returns.
# The MLflow experiment name now matches the model being trained: the earlier
# value ("LightGBM_Classifier") filed these XGBoost runs under a LightGBM
# experiment, which made the tracking UI misleading.
# NOTE(review): runs logged so far remain under the old experiment name.
trained_model = UnderWritingTrainer.train_model(
    model_name="xgb",
    processed_train=processed_train,
    processed_test=processed_test,
    version="v3",
    experiment_name="XGBoost_Classifier",
)


2025-04-11 10:58:33,502 - INFO - MLflow tracking URI: http://localhost:5002
2025-04-11 10:58:33,535 - INFO - MLflow experiment set to: LightGBM_Classifier
2025-04-11 10:58:33,536 - INFO - Start training underwriting model xgb with version v3
2025-04-11 10:58:34,153 - INFO - No categorical features provided for schema mapping.
[I 2025-04-11 10:58:34,187] A new study created in memory with name: no-name-c0538675-264f-44fb-bf4d-6202b1fe5505


[I 2025-04-11 10:58:34,468] Trial 0 finished with value: 0.9195323805342829 and parameters: {'max_depth': 5, 'learning_rate': 0.2688415636715583, 'n_estimators': 374, 'subsample': 0.7234149476529428, 'colsample_bytree': 0.7621781066368358}. Best is trial 0 with value: 0.9195323805342829.
[I 2025-04-11 10:58:34,721] Trial 1 finished with value: 0.9195323805342829 and parameters: {'max_depth': 5, 'learning_rate': 0.003020197762974337, 'n_estimators': 356, 'subsample': 0.8042368710737102, 'colsample_bytree': 0.5298048708158646}. Best is trial 0 with value: 0.9195323805342829.
[I 2025-04-11 10:58:35,235] Trial 2 finished with value: 0.9195323805342829 and parameters: {'max_depth': 6, 'learning_rate': 0.004240660418941263, 'n_estimators': 178, 'subsample': 0.7527579681187748, 'colsample_bytree': 0.8845429733627161}. Best is trial 0 with value: 0.9195323805342829.
[I 2025-04-11 10:58:35,870] Trial 3 finished with value: 0.9195323805342829 and parameters: {'max_depth': 8, 'learning_rate': 0.0