In [1]:
import numpy as np
import pandas as pd

In [2]:
import time

In [3]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
import os
os.environ['SKLEARN_USE_CUDA'] = 'true'

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import SGDClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.neural_network import MLPClassifier

In [6]:
from sklearn.svm import SVC

In [7]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
RS = 42

In [10]:
def view_importance(coef, features):
    ft_weights = pd.DataFrame(data=coef, index=features, columns=['value'])
    ft_weights['abs_value'] = ft_weights['value'].map(np.abs)
    ft_weights.sort_values(by='abs_value', ascending=False, inplace=True)
    return ft_weights

## LASSO feature selection

In [8]:
train = pd.read_pickle('train-5.pkl').drop(columns='id')
test = pd.read_pickle('test-5.pkl').drop(columns='id')

In [9]:
target = 'flag'
features = train.drop(columns=target).columns.tolist()

In [10]:
X = train[features]
y = train[target]

In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2100000 entries, 0 to 2099999
Columns: 974 entries, pre_loans5_sum to fclose_flag_1
dtypes: bool(306), int64(1), int8(667)
memory usage: 1.9 GB


In [12]:
class_weights = {1: 1.0, 0: 0.03678585326438938}

In [11]:
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion = 'gini',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    ccp_alpha=1.34963506e-06,
    n_jobs=-1, 
    class_weight=class_weights, 
    warm_start=True
)

In [12]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.7273
CPU times: total: 8h 36min 12s
Wall time: 56min 31s


In [13]:
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion = 'log_loss',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    # ccp_alpha=1.34963506e-06,
    n_jobs=-1, 
    class_weight='balanced', 
    warm_start=True
)

In [14]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

  warn(


roc-auc: 0.7089
CPU times: total: 3h 59min 2s
Wall time: 26min 13s


In [15]:
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion = 'entropy',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    # ccp_alpha=1.34963506e-06,
    n_jobs=-1, 
    class_weight='balanced', 
    warm_start=False
)

In [16]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.7089
CPU times: total: 3h 58min 34s
Wall time: 24min 51s


In [17]:
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion = 'gini',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    # ccp_alpha=1.34963506e-06,
    n_jobs=-1, 
    class_weight='balanced', 
    warm_start=False
)

In [18]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.7054
CPU times: total: 4h 7min 45s
Wall time: 30min 44s


In [21]:
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion = 'log_loss',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    ccp_alpha=1.34963506e-06,
    n_jobs=-1, 
    class_weight='balanced', 
    warm_start=False
)

In [None]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

In [23]:
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.7134


In [24]:
linreg = LinearRegression()

In [25]:
linreg.fit(X, y)
lin_pred = linreg.predict(test[features])
roc_auc_score(test[target], lin_pred)

0.7541480814627602

In [13]:
lss = Lasso(alpha=0.0001, warm_start=False, random_state=RS)

In [14]:
%%time
lss.fit(X, y)
lss_pred = lss.predict(test[features])
roc_auc_score(test[target], lss_pred)

CPU times: total: 14min 1s
Wall time: 2h 22min 43s


0.7527214868010316

In [15]:
lss_pred.max()

0.4261354962712952

In [16]:
lss_pred.min()

-0.09654992713458349

In [18]:
importance = view_importance(lss.coef_, lss.feature_names_in_)

In [20]:
importance[:50]

Unnamed: 0,value,abs_value
is_zero_loans3060_1,-0.017205,0.017205
pre_util_6_sum,0.012002,0.012002
is_zero_loans90_1,-0.009766,0.009766
pre_loans_credit_cost_rate_6,0.009078,0.009078
pre_util_17_sum,0.008625,0.008625
pre_util_3_sum,0.008021,0.008021
enc_loans_credit_status_5_sum,0.007813,0.007813
enc_paym_0_1,0.006625,0.006625
enc_loans_credit_type_3,0.00658,0.00658
pre_since_opened_12,-0.005995,0.005995


In [21]:
importance[50:100]

Unnamed: 0,value,abs_value
pre_util_17_mean,0.001248,0.001248
enc_loans_credit_type_4_sum,0.00118,0.00118
pre_loans_credit_cost_rate_13_sum,-0.001149,0.001149
pre_loans_credit_limit_18_sum,-0.001131,0.001131
pre_loans_credit_cost_rate_3,0.001131,0.001131
pre_loans_credit_cost_rate_5,-0.00113,0.00113
pre_fterm_7_sum,0.00108,0.00108
enc_paym_0_3_sum,-0.001069,0.001069
pre_loans_credit_cost_rate_6_mean,0.00106,0.00106
enc_paym_21_1_sum,-0.000991,0.000991


In [22]:
importance[100:150]

Unnamed: 0,value,abs_value
pre_over2limit_15_mean,0.00044,0.00044
pre_loans_credit_limit_15,0.000436,0.000436
enc_paym_12_3,-0.000431,0.000431
pre_since_opened_16_sum,-0.00043,0.00043
is_zero_loans90_1_mean,-0.000424,0.000424
enc_loans_credit_status_4_mean,0.000412,0.000412
pre_till_fclose_11_sum,-0.000403,0.000403
pre_since_opened_19,-0.000401,0.000401
pre_util_13_sum,-0.000391,0.000391
pre_pterm_9_sum,-0.000382,0.000382


In [23]:
importance[150:200]

Unnamed: 0,value,abs_value
pre_util_13_mean,-0.000227,0.000227
pre_since_opened_7,-0.00022,0.00022
pre_maxover2limit_17_mean,-0.000218,0.000218
pre_since_confirmed_7_sum,0.000217,0.000217
pre_loans530_13_mean,0.000212,0.000212
enc_paym_3_2_mean,-0.000208,0.000208
enc_paym_13_2_mean,-0.000199,0.000199
enc_paym_24_3_mean,-0.000199,0.000199
is_zero_loans6090_1_mean,-0.000198,0.000198
is_zero_maxover2limit_1_mean,0.000195,0.000195


In [24]:
importance[200:250]

Unnamed: 0,value,abs_value
pre_loans5_mean,-0.000129,0.000129
pre_loans_credit_cost_rate_11,-0.000128,0.000128
pre_loans_next_pay_summ_2_sum,-0.000128,0.000128
pre_till_pclose_8_sum,0.000127,0.000127
pre_util_12_sum,-0.000126,0.000126
enc_paym_22_3_sum,0.000124,0.000124
pre_loans_next_pay_summ_5_mean,-0.000123,0.000123
pre_since_opened_3_mean,0.000123,0.000123
pre_till_fclose_5_mean,0.000122,0.000122
is_zero_loans5_1_mean,-0.000122,0.000122


In [86]:
zero_cols = importance[importance['value']==0].index.tolist()

In [87]:
zero_cols

['pre_since_confirmed_3',
 'pre_till_fclose_2',
 'pclose_flag_1',
 'pre_since_opened_16',
 'pre_since_confirmed_9',
 'pre_till_fclose_9',
 'pre_till_pclose_13',
 'enc_loans_account_cur_3',
 'pre_since_confirmed_2',
 'pre_since_confirmed_1',
 'pre_till_fclose_8',
 'pre_since_opened_18',
 'pre_till_pclose_16',
 'pre_since_opened_17',
 'pre_since_confirmed_6',
 'pre_since_confirmed_7',
 'pre_till_fclose_1',
 'pre_till_pclose_15',
 'pre_since_confirmed_5',
 'pre_till_fclose_3',
 'pre_since_confirmed_4',
 'pre_till_fclose_7',
 'pre_till_pclose_14',
 'pre_till_fclose_5',
 'pre_since_confirmed_8',
 'pre_till_fclose_6',
 'pre_since_opened_9',
 'pre_till_fclose_4',
 'enc_paym_24_4',
 'enc_loans_credit_type_4',
 'pre_till_pclose_11',
 'enc_loans_account_holder_type_6',
 'pre_pterm_14',
 'pre_pterm_15',
 'pre_till_fclose_10',
 'enc_loans_credit_status_3',
 'pre_since_opened_11',
 'enc_loans_credit_status_1',
 'pre_pterm_17',
 'pre_since_opened_15',
 'pre_fterm_1',
 'pre_fterm_2',
 'pre_pterm_4',


In [36]:
importance['col'] = importance.index

In [84]:
importance[importance['col'].apply(lambda x: x.find('fclose_flag') > -1)]

Unnamed: 0,value,abs_value,col
fclose_flag_1_mean,0.000482,0.000482,fclose_flag_1_mean
fclose_flag_1_sum,0.0,0.0,fclose_flag_1_sum
fclose_flag_1,0.0,0.0,fclose_flag_1


In [85]:
importance.to_pickle('importance.pkl')

In [8]:
# train = pd.read_pickle('train-5.pkl').drop(columns='id')
# test = pd.read_pickle('test-5.pkl').drop(columns='id')

In [88]:
train = train.drop(columns=zero_cols)
test = test.drop(columns=zero_cols)

In [92]:
train.to_pickle('clean-train-5.pkl')
test.to_pickle('clean-test-5.pkl')


## Seleted features

In [11]:
train = pd.read_pickle('clean-train-5.pkl')
test = pd.read_pickle('clean-test-5.pkl')

In [12]:
for col in train.columns:
    if train[col].dtype == bool:
        train[col] = train[col].astype(np.int8)
        test[col] = test[col].astype(np.int8)

In [13]:
target = 'flag'
features = train.drop(columns=target).columns.tolist()

In [14]:
X = train[features]
y = train[target]

In [162]:
X

Unnamed: 0,pre_loans5_mean,pre_loans3060_sum,pre_loans3060_mean,pre_loans_max_overdue_sum_1_sum,pre_loans_max_overdue_sum_1_mean,pre_loans_max_overdue_sum_3_sum,pre_loans_max_overdue_sum_3_mean,pre_loans530_0_mean,pre_loans530_2_mean,pre_loans530_3_mean,...,pre_util_16,is_zero_util_1,is_zero_maxover2limit_1,enc_paym_0_1,enc_paym_0_3,enc_paym_1_1,enc_paym_5_3,enc_paym_12_3,enc_loans_credit_status_2,enc_loans_credit_type_3
0,100,8,100,0,0,3,37,0,0,0,...,1,1,1,0,1,0,1,1,1,1
1,100,2,100,2,100,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,100,1,100,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,1,1,0
3,100,10,100,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,1,1,1
4,100,1,100,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2099995,100,10,100,0,0,0,0,0,0,0,...,1,1,1,0,0,0,1,1,0,0
2099996,100,7,100,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
2099997,100,2,100,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,1,0,0
2099998,100,6,100,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,1,1


In [13]:
lss = Lasso(alpha=0.00001, warm_start=False, random_state=RS)

In [14]:
%%time
lss.fit(X, y)
lss_pred = lss.predict(test[features])
roc_auc_score(test[target], lss_pred)

CPU times: total: 2min 14s
Wall time: 15min 20s


0.7534148642697582

In [17]:
importance2 = view_importance(lss.coef_, lss.feature_names_in_)

In [18]:
importance2[importance2['value']==0].index.tolist()

['enc_paym_20_2_mean', 'enc_paym_12_3']

In [19]:
lgr = LogisticRegression(class_weight='balanced', random_state=RS, C=1, l1_ratio=0.5, n_jobs=-1, verbose=True)

In [20]:
%%time
lgr.fit(X, y)
lgr_pred = lgr.predict(test[features])
roc_auc_score(test[target], lgr_pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.4min finished


CPU times: total: 13min 1s
Wall time: 17min 18s


0.680195754392056

In [17]:
class_weights = {1: 1.0, 0: 0.03678585326438938}

In [22]:
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion = 'gini',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    # ccp_alpha=1.34963506e-06,
    n_jobs=-1, 
    class_weight=class_weights, 
    warm_start=True
)

In [23]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.7065
CPU times: total: 3h 22min 52s
Wall time: 19min 34s


In [24]:
X.shape

(2100000, 411)

In [15]:

nn = MLPClassifier(
    hidden_layer_sizes=(411, 12, 6),
    activation='relu',
    solver='adam',
    # alpha=0.0001,
    learning_rate='adaptive',  # {‘constant’, ‘invscaling’, ‘adaptive’}, default=’constant’
    learning_rate_init=0.0001,  # default=0.001
    random_state=RS,
    verbose=True,
    # warm_start=True,
    # n_iter_no_change=30,
    max_iter=7  # 50
)

In [16]:
%%time
nn.fit(X, y)
nn_pred = nn.predict_proba(test[features])[:, 1]
print(roc_auc_score(test[target], nn_pred))

Iteration 1, loss = 0.14407212
Iteration 2, loss = 0.13914631
Iteration 3, loss = 0.13801521
Iteration 4, loss = 0.13735581
Iteration 5, loss = 0.13669856
Iteration 6, loss = 0.13615537
Iteration 7, loss = 0.13558492




0.7573436523390867
CPU times: total: 1h 49min 28s
Wall time: 19min 44s


In [29]:
nn = MLPClassifier(
    hidden_layer_sizes=(411, 32, 16),
    activation='relu',
    solver='sgd',
    # alpha=0.0001,
    learning_rate='adaptive',  # {‘constant’, ‘invscaling’, ‘adaptive’}, default=’constant’
    learning_rate_init=0.5,  # default=0.001
    random_state=RS,
    verbose=True,
    warm_start=False,
    n_iter_no_change=100,
    max_iter=100  # 50
)

In [30]:
%%time
nn.fit(X, y)
nn_pred = nn.predict_proba(test[features])[:, 1]
print(roc_auc_score(test[target], nn_pred))

Iteration 1, loss = 89590632449667759102208277618966432815039370957242148948360900378624.00000000
Iteration 2, loss = 85172779590298538681351763789310279095391841598057731461559418355712.00000000
Iteration 3, loss = 80816468724836889008130164129194274722334712539437261520337721360384.00000000
Iteration 4, loss = 76682969002180584809079296778284719520133378837548517699898306461696.00000000
Iteration 5, loss = 72760884356511080712748040201880869192579471562730115732652657999872.00000000
Iteration 6, loss = 69039401593735931869928622566108300347537504940571264438215039254528.00000000
Iteration 7, loss = 65508260579499593732787051460278591634257637749317053154336377405440.00000000
Iteration 8, loss = 62157725951972773650259501498522222254506889935786227309090267201536.00000000




0.5
CPU times: total: 1h 49min 23s
Wall time: 19min 39s


In [31]:
nn = MLPClassifier(
    hidden_layer_sizes=(411, 32, 16),
    activation='relu',
    solver='sgd',
    # alpha=0.0001,
    learning_rate='adaptive',  # {‘constant’, ‘invscaling’, ‘adaptive’}, default=’constant’
    learning_rate_init=1000,  # default=0.001
    random_state=RS,
    verbose=True,
    warm_start=False,
    n_iter_no_change=100,
    max_iter=100  # 50
)

In [32]:
%%time
nn.fit(X, y)
nn_pred = nn.predict_proba(test[features])[:, 1]
print(roc_auc_score(test[target], nn_pred))

Iteration 1, loss = 416434300341379338010370225978629060104267069976787147466407510065690221477888.00000000
Iteration 2, loss = 858031708112901526155306205184.00000000
Iteration 3, loss = 2.37866926
Iteration 4, loss = 2.37946823
Iteration 5, loss = 2.36865822
Iteration 6, loss = 2.39455933
Iteration 7, loss = 2.38479724
Iteration 8, loss = 2.41103094
Iteration 9, loss = 2.39502548
Iteration 10, loss = 2.40341370
Iteration 11, loss = 2.37660368
Iteration 12, loss = 2.37619832
Iteration 13, loss = 2.38371212
Iteration 14, loss = 2.38169173
Iteration 15, loss = 2.38385637
Iteration 16, loss = 2.38236933
Iteration 17, loss = 2.40001052
Iteration 18, loss = 2.40821889
Iteration 19, loss = 2.39263408
Iteration 20, loss = 2.39185684
Iteration 21, loss = 2.37194457
Iteration 22, loss = 2.37362796
Iteration 23, loss = 2.40900594
Iteration 24, loss = 2.37251549
Iteration 25, loss = 2.39477419
Iteration 26, loss = 2.40709191
Iteration 27, loss = 2.38713131
Iteration 28, loss = 2.39562406
Iterati



0.5
CPU times: total: 3d 15h 21min 5s
Wall time: 14h 48min 39s


In [33]:
nn = MLPClassifier(
    hidden_layer_sizes=(3, 2, 1),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    learning_rate='adaptive',  # {‘constant’, ‘invscaling’, ‘adaptive’}, default=’constant’
    learning_rate_init=0.01,  # default=0.001
    random_state=RS,
    verbose=True,
    # warm_start=True,
    n_iter_no_change=30,
    # max_iter=10  # 50
    batch_size=200
)

In [34]:
%%time
nn.fit(X, y)
nn_pred = nn.predict_proba(test[features])[:, 1]
print(roc_auc_score(test[target], nn_pred))

Iteration 1, loss = 0.14386074
Iteration 2, loss = 0.14200508
Iteration 3, loss = 0.14233267
Iteration 4, loss = 0.14210792
Iteration 5, loss = 0.14306831
Iteration 6, loss = 0.14256147
Iteration 7, loss = 0.14340926
Iteration 8, loss = 0.14471600
Iteration 9, loss = 0.14427953
Iteration 10, loss = 0.14387825
Iteration 11, loss = 0.14295457
Iteration 12, loss = 0.14286020
Iteration 13, loss = 0.14471186
Iteration 14, loss = 0.14409413
Iteration 15, loss = 0.14350903
Iteration 16, loss = 0.14334563
Iteration 17, loss = 0.14334490
Iteration 18, loss = 0.14371569
Iteration 19, loss = 0.14423727
Iteration 20, loss = 0.14408387
Iteration 21, loss = 0.14413670




0.7236160930629782
CPU times: total: 2h 1min 45s
Wall time: 22min 21s


In [39]:
nn = MLPClassifier(
    hidden_layer_sizes=(3, 2, 1),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    # learning_rate='adaptive',  # {‘constant’, ‘invscaling’, ‘adaptive’}, default=’constant’
    learning_rate_init=0.5,  # default=0.001
    random_state=RS,
    verbose=True,
    # warm_start=True,
    n_iter_no_change=30,
    # max_iter=10  # 50
    batch_size=200
)

In [40]:
%%time
nn.fit(X, y)
nn_pred = nn.predict_proba(test[features])[:, 1]
print(roc_auc_score(test[target], nn_pred))

Iteration 1, loss = 0.15545775
Iteration 2, loss = 0.15464743
Iteration 3, loss = 0.15464447
Iteration 4, loss = 0.15470893
Iteration 5, loss = 0.15474708
Iteration 6, loss = 0.15469132
Iteration 7, loss = 0.15465303
Iteration 8, loss = 0.15458095
Iteration 9, loss = 0.15465916
Iteration 10, loss = 0.15467299
Iteration 11, loss = 0.15466821
Iteration 12, loss = 0.15477733
Iteration 13, loss = 0.15456469
Iteration 14, loss = 0.15465932
Iteration 15, loss = 0.15465764
Iteration 16, loss = 0.15466439
Iteration 17, loss = 0.15460310
Iteration 18, loss = 0.15469291
Iteration 19, loss = 0.15473235
Iteration 20, loss = 0.15472253
Iteration 21, loss = 0.15467721
Iteration 22, loss = 0.15466486
Iteration 23, loss = 0.15460343
Iteration 24, loss = 0.15466086
Iteration 25, loss = 0.15470853
Iteration 26, loss = 0.15465502
Iteration 27, loss = 0.15468496
Iteration 28, loss = 0.15470304




0.5
CPU times: total: 2h 43min 14s
Wall time: 28min 35s


In [None]:
GridSearchCV

In [14]:
class_weights = {1: 1.0, 0: 0.03678585326438938}

In [15]:
tree = DecisionTreeClassifier(class_weight=class_weights, random_state=RS)

In [16]:
%%time
path = tree.cost_complexity_pruning_path(X, y)

CPU times: total: 17min 19s
Wall time: 17min 20s


In [19]:
with open('ccp_alphas.npy', 'wb') as f:
    np.save(f, path['ccp_alphas'])
    

In [20]:
path['ccp_alphas'].shape

(88008,)

In [30]:
path['ccp_alphas'][::12000].shape

(8,)

In [23]:
tree = DecisionTreeClassifier(ccp_alpha=5.27588200e-06 ,class_weight=class_weights, random_state=RS)

In [24]:
%%time
tree.fit(X, y)

CPU times: total: 18min 14s
Wall time: 18min 15s


In [25]:
tree_pred = tree.predict_proba(test[features])[:, 1]
print(roc_auc_score(test[target], tree_pred))

0.5871090895686067


In [31]:
selected_alphas = path['ccp_alphas'][::12000]

In [32]:
# for alpha in path['ccp_alphas']:ccp_alpha=alpha, 

tree = DecisionTreeClassifier(class_weight=class_weights, random_state=RS)
params = {
    'ccp_alpha': selected_alphas
}

gs = GridSearchCV(
    estimator=tree,
    param_grid= params,
    cv=4, 
    n_jobs=4, 
    verbose=True)
    

In [33]:
%%time
gs.fit(X, y)

Fitting 4 folds for each of 8 candidates, totalling 32 fits
CPU times: total: 43min 57s
Wall time: 3h 58min 24s


In [34]:
gs.best_score_

0.927545238095238

In [35]:
gs.best_params_

{'ccp_alpha': 0.0}

In [36]:
gs.cv_results_

{'mean_fit_time': array([1439.85245776, 1282.11252815, 1232.36799741, 1991.4360711 ,
        1175.5975517 , 1205.94436306, 1511.82247818, 1029.93331343]),
 'std_fit_time': array([ 28.33907589,  56.32790921, 125.00300204,  60.83684271,
         25.17593474, 210.14300556, 128.16851503, 100.25444954]),
 'mean_score_time': array([22.7804386 , 15.58362651, 40.35017037, 22.20688343, 17.15403283,
        13.02568996, 18.62048322,  9.77639067]),
 'std_score_time': array([13.73807943,  5.79697198, 27.0645031 ,  5.12263629,  5.11856656,
        10.19547289,  8.68770277,  6.59098661]),
 'param_ccp_alpha': masked_array(data=[0.0, 3.573787425017714e-20, 1.0737806082223471e-19,
                    6.199315303213248e-19, 9.119198107203774e-07,
                    2.263024861125098e-06, 5.275882000915201e-06,
                    1.3400447179657328e-05],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'ccp_alp

In [37]:
pred = gs.best_estimator_.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.5237


In [38]:
path['ccp_alphas'][::12000]

array([0.00000000e+00, 3.57378743e-20, 1.07378061e-19, 6.19931530e-19,
       9.11919811e-07, 2.26302486e-06, 5.27588200e-06, 1.34004472e-05])

In [44]:
tree = DecisionTreeClassifier(class_weight=class_weights, random_state=RS)
params = {
    'ccp_alpha': selected_alphas
}

gs = GridSearchCV(
    estimator=tree,
    scoring='roc_auc',
    param_grid= params,
    cv=4, 
    n_jobs=4, 
    verbose=True)

In [45]:
%%time
gs.fit(X, y)

Fitting 4 folds for each of 8 candidates, totalling 32 fits
CPU times: total: 45min 28s
Wall time: 3h 58min 14s


In [46]:
gs.best_score_

0.6060916374123267

In [47]:
gs.best_params_

{'ccp_alpha': 1.3400447179657328e-05}

In [48]:
gs.cv_results_

{'mean_fit_time': array([2112.758223  , 1751.63642246, 1251.90868026, 1235.47472894,
        1135.98749715, 1141.2889483 , 1190.64058214, 1036.31251091]),
 'std_fit_time': array([166.77096777,  85.38159498,  39.28805593,  21.90902363,
         16.95444197,  32.8386313 ,  35.45302395,  41.39634352]),
 'mean_score_time': array([27.03667438, 23.23392558, 22.12680548, 19.96889257, 15.23392165,
        11.78902233, 17.77537459,  9.72700405]),
 'std_score_time': array([15.02936525, 11.11829408,  8.97004905,  5.04450421,  3.43835446,
         1.17685542,  7.27252803,  3.94565775]),
 'param_ccp_alpha': masked_array(data=[0.0, 3.573787425017714e-20, 1.0737806082223471e-19,
                    6.199315303213248e-19, 9.119198107203774e-07,
                    2.263024861125098e-06, 5.275882000915201e-06,
                    1.3400447179657328e-05],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'ccp_alp

In [49]:
pred = gs.best_estimator_.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.6489


In [50]:
model = gs.best_estimator_

In [51]:
model.fit(X, y)

In [52]:
pred = model.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.6489


In [53]:
rf_clf = RandomForestClassifier(
    n_estimators=2000,
    max_depth=5,
    max_leaf_nodes=128,
    criterion = 'gini',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    ccp_alpha=1.3400447179657328e-05,
    n_jobs=-1, 
    class_weight=class_weights, 
    warm_start=True
)

In [54]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.7151
CPU times: total: 17h 40min 35s
Wall time: 1h 46min 11s


In [56]:
rf_clf = RandomForestClassifier(
    n_estimators=2000,
    minBinSize=64,
    verbose=True,
    max_depth=5,
    # max_leaf_nodes=128,
    criterion = 'entropy',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    # ccp_alpha=1.3400447179657328e-05,
    n_jobs=-1, 
    # class_weight=class_weights, 
    warm_start=True
)

In [57]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 20.9min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 39.7min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 62.5min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 93.7min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 107.3min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    1.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    5.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   13.6s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   25.7s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:   39.7s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:   57.9s
[Parallel(n_jo

roc-auc: 0.7165
CPU times: total: 17h 57min 33s
Wall time: 2h 6min 28s


In [58]:
rf_clf = RandomForestClassifier(
    n_estimators=200,
    minBinSize=64,
    verbose=True,
    max_depth=5,
    # max_leaf_nodes=128,
    criterion = 'entropy',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    # ccp_alpha=1.3400447179657328e-05,
    n_jobs=-1, 
    # class_weight=class_weights, 
    warm_start=True,
    oob_score=roc_auc_score
)

In [59]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 10.3min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.5s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    4.0s finished


roc-auc: 0.7158
CPU times: total: 1h 51min 47s
Wall time: 20min 19s


In [61]:
!pip install catboost

Collecting catboost
  Obtaining dependency information for catboost from https://files.pythonhosted.org/packages/e8/37/3afd3c02798734efcd7840bfa872d3efc06f5d5c92f9613fea3ff5b4311f/catboost-1.2.3-cp311-cp311-win_amd64.whl.metadata
  Downloading catboost-1.2.3-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Obtaining dependency information for graphviz from https://files.pythonhosted.org/packages/de/5e/fcbb22c68208d39edff467809d06c9d81d7d27426460ebc598e55130c1aa/graphviz-0.20.1-py3-none-any.whl.metadata
  Downloading graphviz-0.20.1-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.3-cp311-cp311-win_amd64.whl (101.1 MB)
   ---------------------------------------- 0.0/101.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.1 MB 325.1 kB/s eta 0:05:11
   -----------------------------------

In [63]:
!pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/e1/4c/4685ccfae9806f561de716e32549190c1f533dde5bcadaf83bdf23972cf0/lightgbm-4.3.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.3.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.3.0-py3-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
    --------------------------------------- 0.0/1.3 MB 325.1 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/1.3 MB 491.5 kB/s eta 0:00:03
   ---- ----------------------------------- 0.2/1.3 MB 833.5 kB/s eta 0:00:02
   ------- -------------------------------- 0.2/1.3 MB 1.1 MB/s eta 0:00:02
   ---------- ----------------------------- 0.3/1.3 MB 1.3 MB/s eta 0:00:01
   ----------------- ---------------------- 0.6/1.3 MB 1.9 MB/s eta 0:00:01
   -------------------------- ------------- 0

In [64]:
import catboost as cb
import lightgbm as lgb

In [65]:
tree_params = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.05,
    "max_depth": 5,
    "reg_lambda": 1,
    "num_leaves": 64,
    "n_jobs": 5,
    "n_estimators": 2000
}

In [66]:
lgb_model = lgb.LGBMClassifier(**tree_params)

In [69]:
lgb_model.fit(X, y)

[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.395787 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16735
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 411
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035480 -> initscore=-3.302647
[LightGBM] [Info] Start training from score -3.302647


In [None]:
lgb_model.predict_proba(val[feature_cols])[:, 1]

In [70]:
pred = lgb_model.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

roc-auc: 0.7659


In [71]:
rf_clf = RandomForestClassifier(
    n_estimators=500,
    verbose=True,
    max_depth=6,
    max_leaf_nodes=64,
    criterion = 'log_loss',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
    random_state=RS, 
    # ccp_alpha=1.3400447179657328e-05,
    n_jobs=-1, 
    # class_weight=class_weights, 
    warm_start=True,
    oob_score=roc_auc_score
)

In [72]:
%%time
rf_clf.fit(X, y)
pred = rf_clf.predict_proba(test[features])
print(f'roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 27.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 31.0min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   10.2s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.9s finished


roc-auc: 0.7219
CPU times: total: 5h 32min 26s
Wall time: 48min 43s


In [105]:
alphas = path['ccp_alphas'][::8710]
alphas.shape

(11,)

In [106]:
alphas

array([0.00000000e+00, 3.57378743e-20, 5.44290017e-20, 1.42951497e-19,
       4.70841012e-19, 4.87543768e-07, 1.32267485e-06, 2.42665378e-06,
       4.47846068e-06, 8.68328402e-06, 2.24503787e-05])

In [107]:
for alpha in alphas:
    rf_clf = RandomForestClassifier(
        n_estimators=500,
        verbose=True,
        max_depth=6,
        max_leaf_nodes=64,
        criterion = 'log_loss',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
        random_state=RS, 
        ccp_alpha=alpha,
        n_jobs=-1, 
        # class_weight=class_weights, 
        warm_start=False,
        oob_score=roc_auc_score
    )
    rf_clf.fit(X, y)
    pred = rf_clf.predict_proba(test[features])
    print(f'alpha: {alpha}   \t roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.3s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   10.6s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   12.3s finished


alpha: 0.0   	 roc-auc: 0.7219


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.2s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.9s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.5s finished


alpha: 3.573787425017714e-20   	 roc-auc: 0.7219


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.3min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.9s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.6s finished


alpha: 5.442900173378205e-20   	 roc-auc: 0.7219


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.8s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.5s finished


alpha: 1.4295149700070856e-19   	 roc-auc: 0.7219


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.3min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.9s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.6s finished


alpha: 4.708410119770272e-19   	 roc-auc: 0.7219


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.8s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.5s finished


alpha: 4.875437678650105e-07   	 roc-auc: 0.7219


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.3min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.8s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.4s finished


alpha: 1.3226748507708517e-06   	 roc-auc: 0.7219


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.3min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.7s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.4s finished


alpha: 2.4266537755322888e-06   	 roc-auc: 0.7219


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.5min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   10.0s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.6s finished


alpha: 4.478460678099516e-06   	 roc-auc: 0.7219


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.6s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   11.2s finished


alpha: 8.683284022896746e-06   	 roc-auc: 0.7218


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.3min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.7s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.2s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   10.8s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7210


In [108]:
for e in [3, 5, 15, 20, 25, 50, 100, 150, 200, 250, 500, 1000, 2000]:
    rf_clf = RandomForestClassifier(
        n_estimators=e,
        verbose=True,
        max_depth=5,
        # max_leaf_nodes=64,
        criterion = 'log_loss',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
        random_state=RS, 
        # ccp_alpha=alpha,
        n_jobs=-1, 
        # class_weight=class_weights, 
        warm_start=False,
        oob_score=roc_auc_score
    )
    rf_clf.fit(X, y)
    pred = rf_clf.predict_proba(test[features])
    print(f'alpha: {alpha}   \t roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   10.9s finished
  warn(
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.0s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.6836


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.2s finished
  warn(
[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.0s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.6954


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:   34.4s remaining:   30.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   44.6s finished
  warn(
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 out of  15 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  15 out of  15 | elapsed:    0.2s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7088


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:   55.2s remaining:    6.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   55.4s finished
  warn(
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  18 out of  20 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=12)]: Done  20 out of  20 | elapsed:    0.3s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7120


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.3min finished
  warn(
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  25 out of  25 | elapsed:    0.4s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7116


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.9s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7131


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.6min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    1.8s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7157


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  6.9min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    3.0s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7157


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  9.2min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.4s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    3.8s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7158


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 11.5min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.6s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    5.1s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7165


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 23.0min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.5s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    8.5s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:   10.0s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7163


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 36.0min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 46.1min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.7s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    9.1s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   16.6s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:   21.3s finished


alpha: 2.2450378730397823e-05   	 roc-auc: 0.7164


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 35.9min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 56.7min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 82.2min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 92.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.5s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    8.7s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   15.9s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:   25.3s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:   36.7s
[Parallel(n_job

alpha: 2.2450378730397823e-05   	 roc-auc: 0.7165


In [117]:
alphas = path['ccp_alphas'][-19::3]
alphas

array([0.00053534, 0.00088528, 0.00119024, 0.00127703, 0.00203289,
       0.00465139, 0.01292502])

In [118]:
for alpha in alphas:
    rf_clf = RandomForestClassifier(
        n_estimators=250,
        verbose=True,
        max_depth=6,
        max_leaf_nodes=64,
        criterion = 'log_loss',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
        random_state=RS, 
        ccp_alpha=alpha,
        n_jobs=-1, 
        # class_weight=class_weights, 
        warm_start=False,
        oob_score=roc_auc_score
    )
    rf_clf.fit(X, y)
    pred = rf_clf.predict_proba(test[features])
    print(f'alpha: {alpha}   \t roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 13.9min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    2.8s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    3.9s finished


alpha: 0.0005353389032965589   	 roc-auc: 0.6983


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 14.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.0s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    4.3s finished


alpha: 0.0008852799535060199   	 roc-auc: 0.6908


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 14.0min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    2.7s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    3.8s finished


alpha: 0.0011902398634732608   	 roc-auc: 0.6822


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 13.9min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    2.7s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    3.9s finished


alpha: 0.0012770321549587585   	 roc-auc: 0.6791


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 13.8min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    2.7s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    3.8s finished


alpha: 0.0020328877864630446   	 roc-auc: 0.6445


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 13.8min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    2.7s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    3.7s finished


alpha: 0.0046513872986087   	 roc-auc: 0.5000


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 13.9min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    2.7s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    3.8s finished


alpha: 0.012925016406968415   	 roc-auc: 0.5000


In [119]:
for alpha in alphas:
    rf_clf = RandomForestClassifier(
        n_estimators=2000,
        verbose=True,
        max_depth=6,
        max_leaf_nodes=64,
        criterion = 'log_loss',  # {“gini”, “entropy”, “log_loss”}, default=”gini”
        random_state=RS, 
        ccp_alpha=alpha,
        n_jobs=-1, 
        # class_weight=class_weights, 
        warm_start=False,
        oob_score=roc_auc_score
    )
    rf_clf.fit(X, y)
    pred = rf_clf.predict_proba(test[features])
    print(f'alpha: {alpha}   \t roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 23.9min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 43.2min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 68.0min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 101.3min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 114.6min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    7.2s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   13.2s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:   22.6s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:   32.9s
[Parallel(n_j

alpha: 0.0005353389032965589   	 roc-auc: 0.6966


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 25.8min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 49.7min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 76.1min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 106.5min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 118.7min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    7.0s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   12.6s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:   20.0s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:   28.9s
[Parallel(n_j

alpha: 0.0008852799535060199   	 roc-auc: 0.6888


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 24.2min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 43.8min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 68.8min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 99.5min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 111.8min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    2.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    6.9s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   12.5s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:   19.7s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:   28.5s
[Parallel(n_jo

alpha: 0.0011902398634732608   	 roc-auc: 0.6794


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 68.6min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 99.3min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 111.6min finished


KeyboardInterrupt: 

In [131]:
y.value_counts(dropna=False)

flag
0    2025491
1      74509
Name: count, dtype: int64

In [163]:

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2100000 entries, 0 to 2099999
Columns: 411 entries, pre_loans5_mean to enc_loans_credit_type_3
dtypes: int8(411)
memory usage: 831.1 MB


In [18]:
svc = SVC(
    C=0.9, 
    probability=True,
    class_weight=class_weights,
    random_state=RS,
    max_iter=10
)


In [None]:
svc.fit(X, y)


In [None]:
pred = svc.predict_proba(test[features])
print(f'alpha: {alpha}   \t roc-auc: {roc_auc_score(test[target], pred[:,1]):.4f}')

In [13]:
sgd = SGDClassifier(
    penalty='elasticnet',
    alpha=0.001, 
    l1_ratio=0.5,
    n_jobs=-1,
    random_state=RS,
    class_weight=class_weights,
    warm_start=False
)

# (loss='hinge', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, 
# max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, 
# random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, 
# validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False)

In [15]:
%%time
n_parts = 10
timer = time.time()
pred_n = 5

epoches = 100
# train = pd.read_pickle(f'{i}_part_v5.pkl')

part_size = int(X.shape[0] / n_parts)

X_i = X[:part_size]
y_i = y[:part_size]

sgd.fit(X_i, y_i)
        
for j in range(epoches):
    print('epoch', j)
    ets = time.time()
    for i in range(n_parts-1):
        print('part #', i)
        start = time.time()
        X_i = X[i*part_size: (i+1)*part_size]
        y_i = y[i*part_size: (i+1)*part_size]

        sgd.partial_fit(X_i, y_i)
        print(f'total time: {(time.time() - timer)/60:.2f}min  part time: {(time.time() - start):.1f}sec')
    print('part #', i+1)
    start = time.time()
    X_i = X[(i+1)*part_size:]
    y_i = y[(i+1)*part_size:]
    sgd.partial_fit(X_i, y_i)
    print(f'total time: {(time.time() - timer)/60:.2f}min  part time: {(time.time() - start):.1f}sec')
        
    if (j % pred_n == 0) or (j==epoches-1):
        pred = sgd.predict(test[features])
        print(f'total time: {(time.time() - timer)/60:.2f}min  epoch time: {(time.time() - ets):.1f}sec  roc-auc: {roc_auc_score(test[target], pred):.4f}')
    else:
        print(f'total time: {(time.time() - timer)/60:.2f}min  epoch time: {(time.time() - ets):.1f}sec')

epoch 0
part # 0
total time: 1.74min  part time: 3.3sec
part # 1
total time: 1.79min  part time: 3.2sec
part # 2
total time: 1.84min  part time: 3.2sec
part # 3
total time: 1.89min  part time: 3.1sec
part # 4
total time: 1.95min  part time: 3.2sec
part # 5
total time: 2.00min  part time: 3.2sec
part # 6
total time: 2.07min  part time: 4.3sec
part # 7
total time: 2.12min  part time: 3.1sec
part # 8
total time: 2.18min  part time: 3.2sec
part # 9
total time: 2.23min  part time: 3.2sec
total time: 4.50min  epoch time: 169.1sec  roc-auc: 0.5227
epoch 1
part # 0
total time: 5.14min  part time: 31.0sec
part # 1
total time: 5.39min  part time: 14.9sec
part # 2
total time: 5.60min  part time: 12.7sec
part # 3
total time: 5.82min  part time: 13.2sec
part # 4
total time: 6.22min  part time: 24.0sec
part # 5
total time: 6.48min  part time: 15.4sec
part # 6
total time: 6.72min  part time: 14.2sec
part # 7
total time: 7.07min  part time: 21.0sec
part # 8
total time: 7.35min  part time: 16.9sec
part