In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

In [36]:
from lightgbm import LGBMClassifier, log_evaluation, early_stopping

In [5]:
from sklearn.utils import resample

In [6]:
import gc

In [7]:
from pipe_utils import full_path

## Data loading

In [8]:
df = pd.read_parquet(full_path('data/res_11.pq'))
df

Unnamed: 0_level_0,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,pclose_flag,fclose_flag,...,enc_loans_credit_type_4,enc_loans_credit_type_5,enc_loans_credit_type_6,enc_loans_credit_type_7,rn_max,credit_history_length_max,num_loan_months_max,frac_loan_months_max,has_loans,target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.900000,1.000000,1.000000,1.000000,1.000000,0.600000,0.900000,0.900000,0.100000,0.200000,...,0.700000,0.000000,0.0,0.000000,10,24.0,1,0.600000,0,0
1,0.857143,0.714286,0.857143,0.857143,0.785714,0.714286,0.857143,0.785714,0.071429,0.142857,...,0.571429,0.000000,0.0,0.000000,14,24.0,2,1.000000,0,0
2,1.000000,0.666667,0.666667,0.666667,1.000000,0.333333,1.000000,0.666667,0.666667,0.666667,...,0.333333,0.000000,0.0,0.000000,3,24.0,4,1.000000,0,0
3,1.000000,1.000000,1.000000,1.000000,1.000000,0.533333,0.933333,0.933333,0.333333,0.400000,...,0.600000,0.066667,0.0,0.000000,15,24.0,1,0.750000,0,0
4,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.000000,0.000000,0.0,0.000000,1,24.0,1,1.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,0.818182,0.454545,1.000000,1.000000,1.000000,0.727273,1.000000,0.636364,0.181818,0.454545,...,0.545455,0.000000,0.0,0.090909,11,24.0,5,0.571429,0,0
2999996,0.923077,1.000000,0.923077,1.000000,1.000000,0.692308,0.846154,0.846154,0.384615,0.307692,...,0.615385,0.000000,0.0,0.076923,13,24.0,2,0.750000,0,0
2999997,0.900000,0.700000,1.000000,1.000000,1.000000,0.600000,1.000000,0.900000,0.100000,0.100000,...,0.700000,0.000000,0.0,0.000000,10,24.0,6,0.750000,0,0
2999998,1.000000,1.000000,0.800000,0.800000,0.800000,0.200000,0.800000,0.800000,0.400000,0.200000,...,0.400000,0.000000,0.0,0.000000,5,24.0,1,1.000000,0,0


## Train-test split

In [9]:
# train, test = train_test_split(df, stratify=df.target, test_size=0.2, random_state=44)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target'), df['target'], stratify=df.target, test_size=0.2, random_state=44)

In [11]:
X_train.shape

(2400000, 385)

In [12]:
y_test.shape

(600000,)

## LightGBM

In [13]:
lgbm = LGBMClassifier(class_weight='balanced', random_state=44)
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 85154, number of negative: 2314846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.163920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40647
[LightGBM] [Info] Number of data points in the train set: 2400000, number of used features: 371
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [14]:
confusion_matrix(y_test, lgbm.predict(X_test))

array([[385870, 192842],
       [  6067,  15221]], dtype=int64)

In [15]:
roc_auc_score(y_train, lgbm.predict_proba(X_train)[:,1])

0.7687644562873315

In [16]:
roc_auc_score(y_test, lgbm.predict_proba(X_test)[:,1])

0.7574396315506281

In [17]:
temp = pd.DataFrame(lgbm.feature_importances_, index=X_train.columns, columns=['imp'])

In [18]:
for k, v in temp.sort_index().imp.items():
    print(f"{k}: {'-' if v > 2 else v}")

credit_history_length_max: 0
enc_loans_account_holder_type_0: 0
enc_loans_account_holder_type_1: -
enc_loans_account_holder_type_2: 0
enc_loans_account_holder_type_3: -
enc_loans_account_holder_type_4: -
enc_loans_account_holder_type_5: 0
enc_loans_account_holder_type_6: -
enc_loans_credit_status_0: 0
enc_loans_credit_status_1: -
enc_loans_credit_status_2: -
enc_loans_credit_status_3: -
enc_loans_credit_status_4: -
enc_loans_credit_status_5: -
enc_loans_credit_status_6: 0
enc_loans_credit_type_0: -
enc_loans_credit_type_1: -
enc_loans_credit_type_2: -
enc_loans_credit_type_3: -
enc_loans_credit_type_4: -
enc_loans_credit_type_5: -
enc_loans_credit_type_6: 1
enc_loans_credit_type_7: 0
enc_paym_0_0: -
enc_paym_0_1: -
enc_paym_0_2: 1
enc_paym_0_3: -
enc_paym_10_0: -
enc_paym_10_1: 0
enc_paym_10_2: 0
enc_paym_10_3: -
enc_paym_11_0: 0
enc_paym_11_1: 1
enc_paym_11_2: 0
enc_paym_11_3: 2
enc_paym_12_0: -
enc_paym_12_1: 0
enc_paym_12_2: 0
enc_paym_12_3: 1
enc_paym_13_0: -
enc_paym_13_1: 0
enc_p

In [19]:
temp.sort_values(by='imp', ascending=False).head(20)

Unnamed: 0,imp
pre_till_pclose_10,80
enc_loans_credit_type_0,73
pre_util_3,69
pre_util_6,65
pre_loans_credit_cost_rate_11,62
is_zero_loans3060,59
is_zero_loans530,58
pre_loans_outstanding_1,57
pre_util_16,57
pre_loans_credit_cost_rate_9,49


## LightGBM tuning

In [20]:
gc.collect()

1374

In [21]:
lgbm = LGBMClassifier(
    class_weight='balanced',
    random_state=44,
    n_estimators=300,
)

In [32]:
gs = GridSearchCV(
    lgbm,
    {
        'learning_rate': [0.05],  # [0.01 * 5**i for i in range(4)],  # 0.05
        'reg_lambda': [0.0, 0.1, 1.0, 10],  # 10
        'max_depth': [5], #  [-1, 5, 10, 20],
        # 'num_leaves': [31],  #[10, 30, 50, 70]
        
    },
    scoring='roc_auc',
    cv=4,
    verbose=4,
    n_jobs=2,
    pre_dispatch='n_jobs'
)

In [33]:
%%time

gs.fit(X_train, y_train, callbacks=[log_evaluation(1)])

Fitting 4 folds for each of 4 candidates, totalling 16 fits
[LightGBM] [Info] Number of positive: 85154, number of negative: 2314846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.351687 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40647
[LightGBM] [Info] Number of data points in the train set: 2400000, number of used features: 371
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
CPU times: total: 12min 41s
Wall time: 54min 37s


In [34]:
res = pd.concat((res, pd.DataFrame(gs.cv_results_)), ignore_index=True)
# res = pd.DataFrame(gs.cv_results_)

In [35]:
res.sort_values(by='mean_test_score', ascending=False)[['mean_fit_time', 'params', 'mean_test_score', 'std_test_score']]

Unnamed: 0,mean_fit_time,params,mean_test_score,std_test_score
15,341.201774,"{'learning_rate': 0.05, 'reg_lambda': 10}",0.759868,0.001771
14,356.898133,"{'learning_rate': 0.05, 'reg_lambda': 1.0}",0.759591,0.001974
13,327.185993,"{'learning_rate': 0.05, 'reg_lambda': 0.1}",0.759508,0.002087
12,357.577686,"{'learning_rate': 0.05, 'reg_lambda': 0.0}",0.759454,0.002072
1,250.685808,{'max_depth': 5},0.759043,0.001802
6,257.7291,"{'max_depth': 5, 'num_leaves': 50}",0.758992,0.00209
7,304.608891,"{'max_depth': 5, 'num_leaves': 70}",0.758992,0.00209
10,257.7291,"{'max_depth': 5, 'num_leaves': 50}",0.758992,0.00209
11,304.608891,"{'max_depth': 5, 'num_leaves': 70}",0.758992,0.00209
3,267.214544,{'max_depth': 20},0.758962,0.002048


In [38]:
lgbm_tu = LGBMClassifier(
    class_weight='balanced',
    random_state=44,
    n_estimators=2000,
    learning_rate=0.05,
    reg_lambda=10,
    max_depth=5,
    num_leaves=33
)
lgbm_tu.fit(
    X_train, y_train, 
    eval_set=(X_test, y_test), 
    eval_metric='auc',
    callbacks=[
        early_stopping(10, first_metric_only=True, verbose=True, min_delta=0.000001),
        log_evaluation(50)
    ]
)

[LightGBM] [Info] Number of positive: 85154, number of negative: 2314846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.953262 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40647
[LightGBM] [Info] Number of data points in the train set: 2400000, number of used features: 371
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.737146	valid_0's binary_logloss: 0.609456
[100]	valid_0's auc: 0.747077	valid_0's binary_logloss: 0.595432
[150]	valid_0's auc: 0.751814	valid_0's binary_logloss: 0.588749
[200]	valid_0's auc: 0.754468	valid_0's binary_logloss: 0.584519
[250]	valid_0's auc: 0.75614	valid_0's binary_logloss: 0.58139
[300]	valid_0's auc: 0.757398	valid_0's binary_logloss: 0.578638
[350]	valid_0's auc: 0.758306	valid_0's binary_loglo

In [43]:
confusion_matrix(y_test, lgbm_tu.predict(X_test))

array([[404195, 174517],
       [  6612,  14676]], dtype=int64)

In [40]:
roc_auc_score(y_train, lgbm_tu.predict_proba(X_train)[:,1])

0.8000156293342382

In [39]:
roc_auc_score(y_test, lgbm_tu.predict_proba(X_test)[:,1])

0.7624549529813069

## Hist Gradient Boosting

In [41]:
gc.collect()

2520

In [42]:
hgb = HistGradientBoostingClassifier(class_weight='balanced', random_state=44)
hgb.fit(X_train, y_train)

In [31]:
confusion_matrix(y_test, hgb.predict(X_test))

array([[385329, 193383],
       [  6052,  15236]], dtype=int64)

In [44]:
roc_auc_score(y_train, hgb.predict_proba(X_train)[:,1])

0.7686523550328745

In [45]:
roc_auc_score(y_test, hgb.predict_proba(X_test)[:,1])

0.7570793887736931

## HGB tuning

In [53]:
hgb = HistGradientBoostingClassifier(
    class_weight='balanced', 
    random_state=44, 
    scoring='roc_auc', 
    n_iter_no_change=50, 
    tol=1e-5)

In [81]:
gs = GridSearchCV(
    hgb,
    {
        # 'learning_rate': (0.1, 0.5, 1.0), # 0.1
        'l2_regularization': (20.0, 30.0), # 10
        # 'max_iter': (20, 100, 200), # 100
        # 'max_leaf_nodes': (10, 30, 100) # 30
        
    },
    scoring='roc_auc',
    cv=5,
    verbose=2,
    n_jobs=4,
    pre_dispatch='n_jobs'
)

In [82]:
gs.fit(data2, target[data2.index])

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [83]:
res = pd.concat((res, pd.DataFrame(gs.cv_results_)), ignore_index=True)

In [84]:
res.sort_values(by='mean_test_score', ascending=False)[['mean_fit_time', 'params', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score']]

Unnamed: 0,mean_fit_time,params,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
18,182.809031,{'l2_regularization': 30.0},0.772052,0.766935,0.77176,0.761958,0.767454,0.003962
17,207.52016,{'l2_regularization': 20.0},0.771374,0.766976,0.773341,0.761402,0.767331,0.004516
16,184.316712,{'l2_regularization': 10.0},0.771287,0.766424,0.773452,0.761136,0.767029,0.004732
14,177.814388,{'l2_regularization': 3.0},0.770827,0.766197,0.770276,0.76097,0.766127,0.004001
15,214.158009,{'l2_regularization': 4.0},0.769373,0.765858,0.77217,0.760659,0.766009,0.004338
3,228.111624,{'l2_regularization': 1.0},0.77084,0.764593,0.771379,0.761264,0.76599,0.004332
13,207.803556,{'l2_regularization': 2.0},0.769764,0.765534,0.771879,0.760148,0.765913,0.004411
11,208.284054,{'max_leaf_nodes': 30},0.769656,0.765166,0.771262,0.760364,0.765826,0.004108
0,228.670399,{'l2_regularization': 0.0},0.76953,0.765899,0.77234,0.759486,0.765815,0.004741
8,201.737245,{'max_iter': 100},0.76953,0.765899,0.77234,0.759486,0.765815,0.004741


In [47]:
hgb_tu = HistGradientBoostingClassifier(
    l2_regularization=10,
    max_iter=2000,
    class_weight='balanced', 
    scoring='roc_auc', 
    n_iter_no_change=10, 
    tol=1e-6,
    random_state=44, 
)

In [48]:
hgb_tu.fit(X_train, y_train)

In [52]:
confusion_matrix(y_test, hgb_tu.predict(X_test))

array([[393933, 184779],
       [  6307,  14981]], dtype=int64)

In [53]:
roc_auc_score(y_train, hgb_tu.predict_proba(X_train)[:,1])

0.7849445223852966

In [54]:
roc_auc_score(y_test, hgb_tu.predict_proba(X_test)[:,1])

0.7597672178351133

## Best model CV with the entire data

In [62]:
gc.collect()

611

In [63]:
cv = cross_validate(
    lgbm_tu,
    df.drop(columns='target'),
    df['target'],
    cv=5,
    scoring='roc_auc',
    verbose=2,
    return_estimator=True,
    n_jobs=1,
    pre_dispatch='n_jobs'
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LightGBM] [Info] Number of positive: 85154, number of negative: 2314846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 3.964732 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40901
[LightGBM] [Info] Number of data points in the train set: 2400000, number of used features: 374
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[CV] END .................................................... total time=14.7min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 14.7min remaining:    0.0s


[LightGBM] [Info] Number of positive: 85154, number of negative: 2314846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.213454 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40650
[LightGBM] [Info] Number of data points in the train set: 2400000, number of used features: 374
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[CV] END .................................................... total time=15.1min
[LightGBM] [Info] Number of positive: 85154, number of negative: 2314846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.085022 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40466
[LightGBM] [Info] Num

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 75.5min finished


In [64]:
cv

{'fit_time': array([847.12470984, 870.87092495, 870.21495032, 889.37571216,
        883.28773761]),
 'score_time': array([33.00100279, 33.09684062, 33.77157831, 32.62229371, 34.98822045]),
 'estimator': [LGBMClassifier(class_weight='balanced', learning_rate=0.05, max_depth=5,
                 n_estimators=2000, num_leaves=33, random_state=44,
                 reg_lambda=10),
  LGBMClassifier(class_weight='balanced', learning_rate=0.05, max_depth=5,
                 n_estimators=2000, num_leaves=33, random_state=44,
                 reg_lambda=10),
  LGBMClassifier(class_weight='balanced', learning_rate=0.05, max_depth=5,
                 n_estimators=2000, num_leaves=33, random_state=44,
                 reg_lambda=10),
  LGBMClassifier(class_weight='balanced', learning_rate=0.05, max_depth=5,
                 n_estimators=2000, num_leaves=33, random_state=44,
                 reg_lambda=10),
  LGBMClassifier(class_weight='balanced', learning_rate=0.05, max_depth=5,
                 n_

In [65]:
cv['test_score'].mean()

0.754668769907204