In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
import shap
import warnings

warnings.filterwarnings('ignore')

In [2]:
import io
import os

import boto3

# S3-compatible (MinIO) client used by the data-loading cells below.
#
# Credentials are intentionally NOT written in the notebook: boto3 resolves
# them through its standard credential chain (the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables, the shared credentials file,
# ...). Export them in the kernel's environment before running this cell.
#
# The endpoint can be overridden with S3_ENDPOINT_URL; the default is the
# in-cluster MinIO service this notebook was originally written against.
s3 = boto3.client(
    's3',
    endpoint_url=os.environ.get(
        'S3_ENDPOINT_URL',
        'http://minio.minio.svc.cluster.local:9000',
    ),
)

In [3]:
# Download the test-set CSV from the 'sample-data' bucket and parse it.
obj_test = s3.get_object(Bucket='sample-data', Key='data/application_test.csv')
with io.BytesIO(obj_test['Body'].read()) as test_buffer:
    # The whole payload is read into memory first, then handed to pandas.
    test_df = pd.read_csv(test_buffer)

In [4]:
# Download the training-set CSV from the 'sample-data' bucket and parse it.
obj_train = s3.get_object(Bucket='sample-data', Key='data/application_train.csv')
with io.BytesIO(obj_train['Body'].read()) as train_buffer:
    # The whole payload is read into memory first, then handed to pandas.
    train_df = pd.read_csv(train_buffer)

In [5]:
# Partition the training columns by dtype:
#   object          -> categorical_features
#   int64 / float64 -> numerical_features
categorical_features = list(train_df.select_dtypes(include=['object']).columns)
numerical_features = list(train_df.select_dtypes(include=['int64', 'float64']).columns)

# 'SK_ID_CURR' (identifier) and 'TARGET' (label) are numeric but must not be
# treated as model inputs, so drop them from the numerical list if present.
for non_feature in ('SK_ID_CURR', 'TARGET'):
    if non_feature in numerical_features:
        numerical_features.remove(non_feature)

# Report how many columns landed in each group, followed by the names.
print("Categorical features:", len(categorical_features))
print(categorical_features)
print("\nNumerical features:", len(numerical_features))
print(numerical_features)

Categorical features: 16
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

Numerical features: 104
['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'

In [6]:
from src.utils import PreprocessFeatureSelector

# Split the raw frames into the label vector and the model inputs.
# The test frame is copied so the selector never touches `test_df` itself.
y_train = train_df["TARGET"].values
X_train = train_df.drop(columns=["TARGET"])
X_test = test_df.copy()

# Version tag and output locations handed to the selector.
# NOTE(review): presumably the processed CSVs are written under these
# directories and tagged with `data_version` — confirm in src.utils.
data_version = "v1"
save_train_data_path = "data/processed/train"
save_test_data_path = "data/processed/test"

# Build the preprocessing / feature-selection wrapper and execute it.
pfs = PreprocessFeatureSelector(
    X_train,
    X_test,
    y_train,
    categorical_features,
    numerical_features,
    data_version=data_version,
    save_train_data_path=save_train_data_path,
    save_test_data_path=save_test_data_path,
)
selected_train, selected_test, selected_features, excluded_features = pfs.run()

# Summarise which features survived and which were dropped.
print("Final Selected Features:", selected_features)
print("Total Excluded Features:", excluded_features)

(CVXPY) May 05 09:02:31 AM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Version of ortools (9.5.2237) is too old. Expected >= 9.7.0.')


[32m2025-05-05 09:02:31.847[0m | [1mINFO    [0m | [36msrc.utils[0m:[36mrun[0m:[36m119[0m - [1m🚀 Starting preprocessing and filtering...[0m
[32m2025-05-05 09:03:03.562[0m | [1mINFO    [0m | [36msrc.utils[0m:[36mrun[0m:[36m123[0m - [1m✅ Preprocessing complete.[0m
[32m2025-05-05 09:03:03.566[0m | [1mINFO    [0m | [36msrc.utils[0m:[36mrun[0m:[36m125[0m - [1m🧮 Features before filtering: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHON

Final Selected Features: ['CODE_GENDER', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'ORGANIZATION_TYPE', 'AMT_CREDIT', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_3']
Total Excluded Features: ['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'CNT_CHILDREN', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMI

In [7]:
# Inspect the class balance of the label column in the selected training frame.
target_column = selected_train['TARGET']

print("Target value counts:")
print(target_column.value_counts())
print("\nPercentage distribution:")
# normalize=True yields fractions; scale to percentages for readability.
print(target_column.value_counts(normalize=True) * 100)

Target value counts:
TARGET
0    282686
1     24825
Name: count, dtype: int64

Percentage distribution:
TARGET
0    91.927118
1     8.072882
Name: proportion, dtype: float64


In [8]:
from src.trainer import UnderWritingTrainer   

# Locations of the processed train/test CSVs that the trainer reads.
processed_train = "data/processed/train/processed_train_v1.csv"
processed_test = "data/processed/test/processed_test_v1.csv"

# Arguments for the LightGBM training run, gathered in one place.
# NOTE(review): the Optuna trial values in this cell's logged output (~0.9193)
# equal the majority-class share of TARGET, which suggests the tuning
# objective is plain accuracy on an imbalanced label — confirm in src.trainer.
training_kwargs = dict(
    model_name="lgbm",
    processed_train=processed_train,
    processed_test=processed_test,
    version="v2",
    experiment_name="LGBM",
    categorical_features=None,
)

trained_model = UnderWritingTrainer.train_model(**training_kwargs)


[32m2025-05-05 09:06:26.009[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mtrain_model[0m:[36m191[0m - [1mMLflow experiment = LGBM[0m
[32m2025-05-05 09:06:26.739[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mtrain_model[0m:[36m83[0m - [1mRunning Optuna search (10 trials)…[0m
[I 2025-05-05 09:06:26,741] A new study created in memory with name: no-name-e1bed0ab-d7aa-4e64-a57d-9cac8b127e85


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:28,969] Trial 0 finished with value: 0.9192722306228964 and parameters: {'max_depth': 4, 'learning_rate': 0.0036479850578234104, 'n_estimators': 478, 'subsample': 0.7764231446406293, 'colsample_bytree': 0.99252028648306}. Best is trial 0 with value: 0.9192722306228964.


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004898 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:31,576] Trial 1 finished with value: 0.9192722306228964 and parameters: {'max_depth': 5, 'learning_rate': 0.004990631038335131, 'n_estimators': 424, 'subsample': 0.9437106074112057, 'colsample_bytree': 0.5873458216615974}. Best is trial 0 with value: 0.9192722306228964.


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:32,308] Trial 2 finished with value: 0.9192722306228964 and parameters: {'max_depth': 2, 'learning_rate': 0.04272701548036857, 'n_estimators': 209, 'subsample': 0.5860541205623473, 'colsample_bytree': 0.7903603775567252}. Best is trial 0 with value: 0.9192722306228964.


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:34,891] Trial 3 finished with value: 0.9192559712534348 and parameters: {'max_depth': 6, 'learning_rate': 0.11525038993759926, 'n_estimators': 490, 'subsample': 0.9882446749779161, 'colsample_bytree': 0.6683537235804131}. Best is trial 0 with value: 0.9192722306228964.


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:35,518] Trial 4 finished with value: 0.9192722306228964 and parameters: {'max_depth': 2, 'learning_rate': 0.0307542656184166, 'n_estimators': 183, 'subsample': 0.9965893129836452, 'colsample_bytree': 0.88156558612614}. Best is trial 0 with value: 0.9192722306228964.


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:37,278] Trial 5 finished with value: 0.9192234525145115 and parameters: {'max_depth': 3, 'learning_rate': 0.01636662581568947, 'n_estimators': 500, 'subsample': 0.6409872563257485, 'colsample_bytree': 0.705888538475}. Best is trial 0 with value: 0.9192722306228964.


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:38,147] Trial 6 finished with value: 0.9192722306228964 and parameters: {'max_depth': 6, 'learning_rate': 0.013461819807808476, 'n_estimators': 113, 'subsample': 0.6396792074392962, 'colsample_bytree': 0.5232012637194403}. Best is trial 0 with value: 0.9192722306228964.


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006331 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:40,468] Trial 7 finished with value: 0.9192071931450498 and parameters: {'max_depth': 5, 'learning_rate': 0.050167965302449075, 'n_estimators': 437, 'subsample': 0.7620543569934298, 'colsample_bytree': 0.7765337213133219}. Best is trial 0 with value: 0.9192722306228964.


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:42,171] Trial 8 finished with value: 0.9192722306228964 and parameters: {'max_depth': 8, 'learning_rate': 0.0035844768656730698, 'n_estimators': 267, 'subsample': 0.6276452464951567, 'colsample_bytree': 0.955600573559492}. Best is trial 0 with value: 0.9192722306228964.


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


[I 2025-05-05 09:06:43,968] Trial 9 finished with value: 0.9192397118839731 and parameters: {'max_depth': 8, 'learning_rate': 0.021255678167405395, 'n_estimators': 276, 'subsample': 0.8666242214263773, 'colsample_bytree': 0.6862753290439687}. Best is trial 0 with value: 0.9192722306228964.
[32m2025-05-05 09:06:43.971[0m | [32m[1mSUCCESS [0m | [36msrc.trainer[0m:[36mtrain_model[0m:[36m88[0m - [32m[1mBest params: {'max_depth': 4, 'learning_rate': 0.0036479850578234104, 'n_estimators': 478, 'subsample': 0.7764231446406293, 'colsample_bytree': 0.99252028648306}[0m


[LightGBM] [Info] Number of positive: 24825, number of negative: 282686
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 307511, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486


Successfully registered model 'v2_LightGBM'.
2025/05/05 09:09:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: v2_LightGBM, version 1
Created version '1' of model 'v2_LightGBM'.
[32m2025-05-05 09:09:00.687[0m | [32m[1mSUCCESS [0m | [36msrc.trainer[0m:[36mlog_model[0m:[36m175[0m - [32m[1mModel logged & registered as ‘v2_LightGBM’[0m
