In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

df = pd.read_csv('diabetes_cleaned_12-15-20.csv')

In [78]:
df.readmit_30d.value_counts()

False    63693
True      6277
Name: readmit_30d, dtype: int64

In [2]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='readmit_30d')
y = df.readmit_30d
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

roc_scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=5,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(verbose=1)

major_indices = y_train[y_train==0].index
minor_indices = y_train[y_train==1].index
sampled_indices = []
for i in range(42,47):
    sampled_indices.append(resample(y_train[major_indices],replace=False,n_samples=5022,random_state=i).index)
    
#scaler_X = StandardScaler().fit(X_train)
#scaler_y = StandardScaler().fit(y_train.to_numpy().reshape(-1,1))
#scaled_X = scaler_X.transform(X_train)
#scaled_y = scaler_y.transform(y_train.to_numpy().reshape(-1,1))
#scaled_X = pd.DataFrame(scaled_X,index=X_train.index)
#scaled_y = pd.Series(scaled_y.flatten(),index=y_train.index)
                       
params = {'n_estimators':[100,200],'max_depth':[2,3],'learning_rate':[0.06,0.1],'random_state':[5]}

results = []
predicts = []
for i in sampled_indices:
    X_balanced = pd.concat([X_train.loc[i],X_train.loc[minor_indices]])
    y_balanced = pd.concat([y_train[i],y_train[minor_indices]])
    selector = GridSearchCV(estimator=estimator, param_grid=params,cv=kf, scoring=roc_scorer, verbose=1,n_jobs=-2)
    selector = selector.fit(X_balanced, y_balanced)
    results.append(selector.cv_results_)
    tmp_pred=selector.best_estimator_.predict(X_test)
    predicts.append(roc_auc_score(y_test,tmp_pred))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:   27.7s finished


      Iter       Train Loss   Remaining Time 
         1           1.3787            2.94s
         2           1.3723            2.99s
         3           1.3668            2.85s
         4           1.3622            2.88s
         5           1.3583            2.90s
         6           1.3548            2.90s
         7           1.3518            2.98s
         8           1.3482            2.89s
         9           1.3455            2.88s
        10           1.3426            2.82s
        20           1.3242            2.98s
        30           1.3129            2.60s
        40           1.3047            2.39s
        50           1.2971            2.15s
        60           1.2915            2.00s
        70           1.2859            1.80s
        80           1.2814            1.63s
        90           1.2770            1.46s
       100           1.2725            1.33s
       200           1.2418            0.00s
Fitting 5 folds for each of 8 candidates, totalling 40

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:   28.2s finished


      Iter       Train Loss   Remaining Time 
         1           1.3817            2.65s
         2           1.3777            2.70s
         3           1.3740            2.65s
         4           1.3709            2.71s
         5           1.3681            2.78s
         6           1.3653            2.77s
         7           1.3631            2.78s
         8           1.3608            2.76s
         9           1.3589            2.77s
        10           1.3569            2.80s
        20           1.3419            3.16s
        30           1.3314            2.69s
        40           1.3233            2.57s
        50           1.3169            2.30s
        60           1.3115            2.10s
        70           1.3067            1.94s
        80           1.3028            1.78s
        90           1.2997            1.57s
       100           1.2964            1.43s
       200           1.2721            0.00s
Fitting 5 folds for each of 8 candidates, totalling 40

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:   27.5s finished


      Iter       Train Loss   Remaining Time 
         1           1.3810            1.61s
         2           1.3765            1.63s
         3           1.3725            1.66s
         4           1.3691            1.62s
         5           1.3657            1.60s
         6           1.3628            1.65s
         7           1.3597            1.66s
         8           1.3573            1.69s
         9           1.3549            1.65s
        10           1.3526            1.66s
        20           1.3377            1.65s
        30           1.3283            1.56s
        40           1.3217            1.38s
        50           1.3167            1.25s
        60           1.3124            1.15s
        70           1.3090            1.07s
        80           1.3062            0.96s
        90           1.3038            0.90s
       100           1.3014            0.82s
       200           1.2850            0.00s
Fitting 5 folds for each of 8 candidates, totalling 40

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:   26.5s finished


      Iter       Train Loss   Remaining Time 
         1           1.3814            2.60s
         2           1.3771            2.68s
         3           1.3731            2.65s
         4           1.3695            2.62s
         5           1.3662            2.68s
         6           1.3632            2.74s
         7           1.3604            2.77s
         8           1.3580            2.84s
         9           1.3558            2.89s
        10           1.3537            2.94s
        20           1.3363            3.14s
        30           1.3249            2.94s
        40           1.3164            2.66s
        50           1.3099            2.36s
        60           1.3039            2.19s
        70           1.2991            1.96s
        80           1.2948            1.76s
        90           1.2909            1.60s
       100           1.2877            1.44s
       200           1.2624            0.00s
Fitting 5 folds for each of 8 candidates, totalling 40

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:   30.4s finished


      Iter       Train Loss   Remaining Time 
         1           1.3780            2.69s
         2           1.3713            2.71s
         3           1.3659            2.78s
         4           1.3614            2.84s
         5           1.3573            2.79s
         6           1.3539            2.89s
         7           1.3507            2.87s
         8           1.3476            2.90s
         9           1.3449            2.86s
        10           1.3425            2.87s
        20           1.3238            3.22s
        30           1.3111            2.73s
        40           1.3016            2.40s
        50           1.2943            2.15s
        60           1.2883            1.93s
        70           1.2818            1.83s
        80           1.2771            1.67s
        90           1.2724            1.51s
       100           1.2682            1.39s
       200           1.2366            0.00s


In [5]:
results[0]

{'mean_fit_time': array([0.95865564, 1.77474775, 1.56909051, 2.99077072, 0.93060613,
        1.76766701, 1.5329    , 2.72031603]),
 'std_fit_time': array([0.01343719, 0.02185997, 0.03249041, 0.0647935 , 0.01075394,
        0.03408077, 0.02748662, 0.17182758]),
 'mean_score_time': array([0.01651139, 0.01898394, 0.01587763, 0.02094231, 0.01719446,
        0.01648521, 0.01630344, 0.0184762 ]),
 'std_score_time': array([0.00492329, 0.00234579, 0.0025288 , 0.00179306, 0.00114723,
        0.00164882, 0.0019511 , 0.00271446]),
 'param_learning_rate': masked_array(data=[0.06, 0.06, 0.06, 0.06, 0.1, 0.1, 0.1, 0.1],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[2, 2, 3, 3, 2, 2, 3, 3],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[100, 200, 100, 200, 10

In [10]:
tmp = results[0]['mean_test_score']
for i in range(1,len(results)):
    tmp = np.concatenate([tmp,results[i]['mean_test_score']])
tmp2 = pd.DataFrame(tmp.reshape(5,8)).apply(lambda x: x.mean(),axis=0)
tmp2

0    0.599373
1    0.602871
2    0.602128
3    0.605729
4    0.601542
5    0.605469
6    0.604035
7    0.605168
dtype: float64

In [11]:
for i in range(len(results)):
    print(results[i]['rank_test_score'])

[8 4 5 2 6 3 7 1]
[8 5 2 1 4 3 7 6]
[8 5 6 2 7 1 4 3]
[8 5 6 1 7 3 2 4]
[5 6 8 4 7 2 3 1]


In [8]:
predicts

[0.6014760332248211,
 0.6021014302160226,
 0.6038293485919733,
 0.6035465641945914,
 0.6100624583853143]

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

roc_scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=5,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(verbose=1)

major_indices = y_train[y_train==0].index
minor_indices = y_train[y_train==1].index
sampled_indices = []
for i in range(42,52):
    sampled_indices.append(resample(y_train[major_indices],replace=False,n_samples=5022,random_state=i).index)
    
#scaler_X = StandardScaler().fit(X_train)
#scaler_y = StandardScaler().fit(y_train.to_numpy().reshape(-1,1))
#scaled_X = scaler_X.transform(X_train)
#scaled_y = scaler_y.transform(y_train.to_numpy().reshape(-1,1))
#scaled_X = pd.DataFrame(scaled_X,index=X_train.index)
#scaled_y = pd.Series(scaled_y.flatten(),index=y_train.index)
                       
params = {'n_estimators':[200,300,500],'max_depth':[2,3,4],'learning_rate':[0.1],'random_state':[5]}

results = []
predicts = []
for i in sampled_indices:
    X_balanced = pd.concat([X_train.loc[i],X_train.loc[minor_indices]])
    y_balanced = pd.concat([y_train[i],y_train[minor_indices]])
    selector = GridSearchCV(estimator=estimator, param_grid=params,cv=kf, scoring=roc_scorer, verbose=1,n_jobs=-2)
    selector = selector.fit(X_balanced, y_balanced)
    results.append(selector.cv_results_)
    tmp_pred=selector.best_estimator_.predict(X_test)
    predicts.append(roc_auc_score(y_test,tmp_pred))

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.3min finished


      Iter       Train Loss   Remaining Time 
         1           1.3768            4.18s
         2           1.3689            4.24s
         3           1.3623            4.31s
         4           1.3564            4.47s
         5           1.3513            4.54s
         6           1.3466            4.46s
         7           1.3427            4.58s
         8           1.3389            4.67s
         9           1.3358            5.20s
        10           1.3325            5.47s
        20           1.3082            4.89s
        30           1.2921            4.25s
        40           1.2808            3.81s
        50           1.2700            3.41s
        60           1.2608            3.08s
        70           1.2524            2.83s
        80           1.2452            2.54s
        90           1.2389            2.33s
       100           1.2322            2.08s
       200           1.1784            0.00s
Fitting 5 folds for each of 9 candidates, totalling 45

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.4min finished


      Iter       Train Loss   Remaining Time 
         1           1.3812            2.29s
         2           1.3770            2.44s
         3           1.3733            2.47s
         4           1.3699            2.42s
         5           1.3670            2.49s
         6           1.3646            2.51s
         7           1.3621            2.45s
         8           1.3599            2.43s
         9           1.3579            2.36s
        10           1.3561            2.35s
        20           1.3429            2.18s
        30           1.3339            2.45s
        40           1.3273            2.27s
        50           1.3221            2.09s
        60           1.3182            1.95s
        70           1.3146            1.88s
        80           1.3118            1.78s
        90           1.3094            1.69s
       100           1.3072            1.59s
       200           1.2907            0.75s
       300           1.2808            0.00s
Fitting 5

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.3min finished


      Iter       Train Loss   Remaining Time 
         1           1.3810            1.62s
         2           1.3765            1.63s
         3           1.3725            1.68s
         4           1.3691            1.68s
         5           1.3657            1.64s
         6           1.3628            1.70s
         7           1.3597            1.70s
         8           1.3573            1.72s
         9           1.3549            1.68s
        10           1.3526            1.69s
        20           1.3377            1.50s
        30           1.3283            1.66s
        40           1.3217            1.45s
        50           1.3167            1.29s
        60           1.3124            1.16s
        70           1.3090            1.08s
        80           1.3062            0.98s
        90           1.3038            0.89s
       100           1.3014            0.81s
       200           1.2850            0.00s
Fitting 5 folds for each of 9 candidates, totalling 45

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.3min finished


      Iter       Train Loss   Remaining Time 
         1           1.3805            2.41s
         2           1.3759            2.31s
         3           1.3718            2.59s
         4           1.3686            2.73s
         5           1.3653            2.77s
         6           1.3622            2.66s
         7           1.3594            2.64s
         8           1.3569            2.65s
         9           1.3544            2.61s
        10           1.3522            2.58s
        20           1.3369            2.37s
        30           1.3273            2.60s
        40           1.3204            2.34s
        50           1.3152            2.18s
        60           1.3107            2.12s
        70           1.3071            2.14s
        80           1.3040            2.04s
        90           1.3014            1.94s
       100           1.2992            1.83s
       200           1.2830            0.82s
       300           1.2723            0.00s
Fitting 5

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.3min finished


      Iter       Train Loss   Remaining Time 
         1           1.3780            2.78s
         2           1.3713            2.80s
         3           1.3659            2.86s
         4           1.3614            2.89s
         5           1.3573            2.83s
         6           1.3539            2.92s
         7           1.3507            2.93s
         8           1.3476            2.96s
         9           1.3449            2.91s
        10           1.3425            2.98s
        20           1.3238            2.94s
        30           1.3111            2.55s
        40           1.3016            2.28s
        50           1.2943            2.06s
        60           1.2883            1.86s
        70           1.2818            1.73s
        80           1.2771            1.57s
        90           1.2724            1.42s
       100           1.2682            1.28s
       200           1.2366            0.00s
Fitting 5 folds for each of 9 candidates, totalling 45

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.3min finished


      Iter       Train Loss   Remaining Time 
         1           1.3802            2.39s
         2           1.3753            2.39s
         3           1.3712            2.48s
         4           1.3677            2.62s
         5           1.3641            2.58s
         6           1.3611            2.54s
         7           1.3586            2.60s
         8           1.3560            2.55s
         9           1.3536            2.53s
        10           1.3515            2.51s
        20           1.3367            2.24s
        30           1.3274            2.42s
        40           1.3210            2.18s
        50           1.3158            2.01s
        60           1.3116            1.89s
        70           1.3083            1.79s
        80           1.3051            1.67s
        90           1.3022            1.61s
       100           1.2995            1.52s
       200           1.2831            0.73s
       300           1.2720            0.00s
Fitting 5

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.4min finished


      Iter       Train Loss   Remaining Time 
         1           1.3809            2.24s
         2           1.3765            2.40s
         3           1.3726            2.37s
         4           1.3693            2.38s
         5           1.3661            2.37s
         6           1.3634            2.34s
         7           1.3607            2.40s
         8           1.3586            2.43s
         9           1.3565            2.42s
        10           1.3544            2.39s
        20           1.3403            2.36s
        30           1.3312            2.24s
        40           1.3241            2.08s
        50           1.3185            1.94s
        60           1.3141            1.83s
        70           1.3103            1.77s
        80           1.3070            1.66s
        90           1.3042            1.57s
       100           1.3018            1.48s
       200           1.2843            0.71s
       300           1.2734            0.00s
Fitting 5

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.5min finished


      Iter       Train Loss   Remaining Time 
         1           1.3777            2.66s
         2           1.3707            2.78s
         3           1.3650            2.77s
         4           1.3601            2.83s
         5           1.3562            3.00s
         6           1.3526            3.03s
         7           1.3491            3.06s
         8           1.3455            2.99s
         9           1.3428            3.05s
        10           1.3401            3.02s
        20           1.3200            2.74s
        30           1.3074            2.41s
        40           1.2982            2.17s
        50           1.2908            2.02s
        60           1.2851            1.83s
        70           1.2793            1.68s
        80           1.2747            1.53s
        90           1.2702            1.39s
       100           1.2664            1.27s
       200           1.2360            0.00s
Fitting 5 folds for each of 9 candidates, totalling 45

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.4min finished


      Iter       Train Loss   Remaining Time 
         1           1.3807            1.60s
         2           1.3762            1.59s
         3           1.3723            1.65s
         4           1.3690            1.61s
         5           1.3658            1.57s
         6           1.3630            1.63s
         7           1.3607            1.65s
         8           1.3583            1.63s
         9           1.3561            1.60s
        10           1.3542            1.60s
        20           1.3409            1.55s
        30           1.3321            1.42s
        40           1.3259            1.28s
        50           1.3212            1.17s
        60           1.3173            1.07s
        70           1.3140            0.98s
        80           1.3110            0.89s
        90           1.3085            0.81s
       100           1.3064            0.75s
       200           1.2899            0.00s
Fitting 5 folds for each of 9 candidates, totalling 45

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.3min finished


      Iter       Train Loss   Remaining Time 
         1           1.3806            4.10s
         2           1.3760            3.99s
         3           1.3719            4.17s
         4           1.3680            4.08s
         5           1.3647            4.15s
         6           1.3618            4.02s
         7           1.3592            4.13s
         8           1.3567            4.15s
         9           1.3545            4.11s
        10           1.3524            4.15s
        20           1.3373            3.66s
        30           1.3270            3.69s
        40           1.3196            3.55s
        50           1.3138            3.38s
        60           1.3092            3.25s
        70           1.3054            3.23s
        80           1.3022            3.10s
        90           1.2989            2.98s
       100           1.2965            2.89s
       200           1.2772            2.10s
       300           1.2652            1.42s
       40

In [14]:
results[0]

{'mean_fit_time': array([ 1.76781197,  2.65475812,  4.40110993,  2.93333268,  4.40269351,
         7.38985353,  4.45246377,  6.69774776, 10.46639209]),
 'std_fit_time': array([0.02314706, 0.03195532, 0.03986365, 0.04959141, 0.06951998,
        0.09920113, 0.07182956, 0.0952597 , 0.81362855]),
 'mean_score_time': array([0.0169158 , 0.02099452, 0.02572408, 0.01841917, 0.02392616,
        0.03089762, 0.023666  , 0.02647815, 0.03249011]),
 'std_score_time': array([0.00280748, 0.00140407, 0.00151877, 0.00142648, 0.00150013,
        0.00178764, 0.00158311, 0.00200923, 0.00508161]),
 'param_learning_rate': masked_array(data=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[2, 2, 2, 3, 3, 3, 4, 4, 4],
              mask=[False, False, False, False, False, False, False, False,
                    False],
 

In [16]:
tmp = results[0]['mean_test_score']
for i in range(1,len(results)):
    tmp = np.concatenate([tmp,results[i]['mean_test_score']])
tmp2 = pd.DataFrame(tmp.reshape(10,9)).apply(lambda x: x.mean(),axis=0)
tmp2

0    0.606481
1    0.607139
2    0.606601
3    0.606255
4    0.604266
5    0.600423
6    0.603097
7    0.599046
8    0.592736
dtype: float64

In [17]:
for i in range(len(results)):
    print(results[i]['rank_test_score'])

[8 2 6 3 4 7 1 5 9]
[3 1 2 4 5 7 6 8 9]
[1 5 3 4 2 7 6 8 9]
[2 1 3 4 6 7 5 8 9]
[5 2 3 1 4 7 6 8 9]
[3 1 4 2 5 7 6 8 9]
[3 1 2 4 6 7 5 8 9]
[6 3 2 1 4 8 5 7 9]
[1 3 4 2 5 6 7 8 9]
[2 3 1 4 5 7 6 8 9]


In [18]:
predicts

[0.5935358026251225,
 0.6028277814247368,
 0.6038293485919733,
 0.6014270885685611,
 0.6100624583853143,
 0.60121310816081,
 0.6005690402687859,
 0.5988399334602872,
 0.6055499487253905,
 0.6003732616437462]

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

roc_scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=5,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(verbose=1)

major_indices = y_train[y_train==0].index
minor_indices = y_train[y_train==1].index
sampled_indices = []
for i in range(42,47):
    sampled_indices.append(resample(y_train[major_indices],replace=False,n_samples=5022,random_state=i).index)
    
#scaler_X = StandardScaler().fit(X_train)
#scaler_y = StandardScaler().fit(y_train.to_numpy().reshape(-1,1))
#scaled_X = scaler_X.transform(X_train)
#scaled_y = scaler_y.transform(y_train.to_numpy().reshape(-1,1))
#scaled_X = pd.DataFrame(scaled_X,index=X_train.index)
#scaled_y = pd.Series(scaled_y.flatten(),index=y_train.index)
                       
params = {'n_estimators':[200,300,500,800],'max_depth':[2,3,4,5],'learning_rate':[0.1],'random_state':[5]}

results = []
predicts = []
for i in sampled_indices:
    X_balanced = pd.concat([X_train.loc[i],X_train.loc[minor_indices]])
    y_balanced = pd.concat([y_train[i],y_train[minor_indices]])
    selector = GridSearchCV(estimator=estimator, param_grid=params,cv=kf, scoring=roc_scorer, verbose=1,n_jobs=-1)
    selector = selector.fit(X_balanced, y_balanced)
    results.append(selector.cv_results_)
    tmp_pred=selector.best_estimator_.predict(X_test)
    predicts.append(roc_auc_score(y_test,tmp_pred))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  4.0min finished


      Iter       Train Loss   Remaining Time 
         1           1.3768            5.76s
         2           1.3689            5.45s
         3           1.3623            5.10s
         4           1.3564            5.20s
         5           1.3513            5.22s
         6           1.3466            5.22s
         7           1.3427            5.65s
         8           1.3389            5.74s
         9           1.3358            5.74s
        10           1.3325            5.70s
        20           1.3082            4.91s
        30           1.2921            4.47s
        40           1.2808            3.99s
        50           1.2700            3.60s
        60           1.2608            3.30s
        70           1.2524            2.95s
        80           1.2452            2.69s
        90           1.2389            2.41s
       100           1.2322            2.18s
       200           1.1784            0.00s
Fitting 5 folds for each of 16 candidates, totalling 8

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  4.2min finished


      Iter       Train Loss   Remaining Time 
         1           1.3812            2.41s
         2           1.3770            2.47s
         3           1.3733            2.43s
         4           1.3699            2.54s
         5           1.3670            2.57s
         6           1.3646            2.65s
         7           1.3621            2.55s
         8           1.3599            2.53s
         9           1.3579            2.48s
        10           1.3561            2.46s
        20           1.3429            2.36s
        30           1.3339            2.17s
        40           1.3273            2.04s
        50           1.3221            1.99s
        60           1.3182            1.89s
        70           1.3146            1.82s
        80           1.3118            1.71s
        90           1.3094            1.62s
       100           1.3072            1.55s
       200           1.2907            0.75s
       300           1.2808            0.00s
Fitting 5

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  4.4min finished


      Iter       Train Loss   Remaining Time 
         1           1.3810            1.73s
         2           1.3765            1.71s
         3           1.3725            1.73s
         4           1.3691            1.71s
         5           1.3657            1.70s
         6           1.3628            1.83s
         7           1.3597            1.85s
         8           1.3573            1.85s
         9           1.3549            1.81s
        10           1.3526            1.81s
        20           1.3377            1.74s
        30           1.3283            1.51s
        40           1.3217            1.35s
        50           1.3167            1.22s
        60           1.3124            1.16s
        70           1.3090            1.38s
        80           1.3062            1.60s
        90           1.3038            1.42s
       100           1.3014            1.23s
       200           1.2850            0.00s
Fitting 5 folds for each of 16 candidates, totalling 8

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  4.6min finished


      Iter       Train Loss   Remaining Time 
         1           1.3805            2.46s
         2           1.3759            2.42s
         3           1.3718            2.48s
         4           1.3686            2.65s
         5           1.3653            2.71s
         6           1.3622            2.64s
         7           1.3594            2.61s
         8           1.3569            2.64s
         9           1.3544            2.59s
        10           1.3522            2.56s
        20           1.3369            2.35s
        30           1.3273            2.59s
        40           1.3204            2.34s
        50           1.3152            2.17s
        60           1.3107            2.12s
        70           1.3071            1.99s
        80           1.3040            1.86s
        90           1.3014            1.75s
       100           1.2992            1.67s
       200           1.2830            0.80s
       300           1.2723            0.00s
Fitting 5

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  5.3min finished


      Iter       Train Loss   Remaining Time 
         1           1.3780            3.05s
         2           1.3713            2.89s
         3           1.3659            2.97s
         4           1.3614            3.19s
         5           1.3573            3.05s
         6           1.3539            3.13s
         7           1.3507            3.08s
         8           1.3476            3.36s
         9           1.3449            3.26s
        10           1.3425            3.27s
        20           1.3238            2.88s
        30           1.3111            2.58s
        40           1.3016            2.90s
        50           1.2943            3.01s
        60           1.2883            2.75s
        70           1.2818            2.43s
        80           1.2771            2.13s
        90           1.2724            1.90s
       100           1.2682            1.68s
       200           1.2366            0.00s
Fitting 5 folds for each of 16 candidates, totalling 8

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min


KeyboardInterrupt: 

In [21]:
results[0]

{'mean_fit_time': array([ 2.22316227,  3.33777184,  5.54706073,  9.05935159,  3.68700123,
         5.54333968,  9.35164804, 15.57568393,  6.24702578,  8.36181769,
        14.09797626, 23.29615364,  8.21782179, 12.41125832, 23.18390331,
        30.97603264]),
 'std_fit_time': array([0.03157141, 0.05595879, 0.09130734, 0.11961558, 0.04544104,
        0.10350554, 0.12620764, 0.92542195, 0.69399796, 0.07396962,
        0.3517465 , 0.40883588, 0.13843126, 0.17565375, 1.0242505 ,
        5.69656783]),
 'mean_score_time': array([0.02311401, 0.02899847, 0.03702121, 0.0493    , 0.02639742,
        0.03267388, 0.03766437, 0.06841841, 0.02555647, 0.0335197 ,
        0.04673743, 0.0604423 , 0.02890534, 0.06572309, 0.05606146,
        0.05836949]),
 'std_score_time': array([0.00254638, 0.00204148, 0.00358288, 0.00690155, 0.0061351 ,
        0.00459833, 0.00221083, 0.03510089, 0.00126534, 0.00171306,
        0.00982932, 0.00136063, 0.00198358, 0.04395552, 0.00571313,
        0.01091526]),
 'param_le

In [23]:
tmp = results[0]['mean_test_score']
for i in range(1,len(results)):
    tmp = np.concatenate([tmp,results[i]['mean_test_score']])
tmp2 = pd.DataFrame(tmp.reshape(5,16)).apply(lambda x: x.mean(),axis=0)
tmp2

0     0.605469
1     0.605936
2     0.605375
3     0.602998
4     0.605168
5     0.604083
6     0.600492
7     0.595170
8     0.603150
9     0.599687
10    0.591830
11    0.585973
12    0.599582
13    0.594200
14    0.585192
15    0.578207
dtype: float64

In [24]:
for i in range(len(results)):
    print(results[i]['rank_test_score'])

[10  2  6  7  3  4  8 11  1  5 12 14  9 13 15 16]
[ 3  1  2  7  4  5  9 11  8 10 13 15  6 12 14 16]
[ 1  5  3  6  4  2  8 11  7 10 13 14  9 12 15 16]
[ 2  1  3  7  4  6  8 12  5  9 13 14 10 11 15 16]
[ 6  2  3  5  1  4  8 10  7  9 12 14 11 13 15 16]


In [25]:
predicts

[0.5935358026251225,
 0.6028277814247368,
 0.6038293485919733,
 0.6014270885685611,
 0.6100624583853143]

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

roc_scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=5,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(verbose=1)

major_indices = y_train[y_train==0].index
minor_indices = y_train[y_train==1].index
sampled_indices = []
for i in range(42,47):
    sampled_indices.append(resample(y_train[major_indices],replace=False,n_samples=5022,random_state=i).index)
    
#scaler_X = StandardScaler().fit(X_train)
#scaler_y = StandardScaler().fit(y_train.to_numpy().reshape(-1,1))
#scaled_X = scaler_X.transform(X_train)
#scaled_y = scaler_y.transform(y_train.to_numpy().reshape(-1,1))
#scaled_X = pd.DataFrame(scaled_X,index=X_train.index)
#scaled_y = pd.Series(scaled_y.flatten(),index=y_train.index)
                       
params = {'n_estimators':[100,200,300],'max_depth':[2,3],'learning_rate':[0.05,0.1,0.15],'random_state':[5]}

results = []
predicts = []
for i in sampled_indices:
    X_balanced = pd.concat([X_train.loc[i],X_train.loc[minor_indices]])
    y_balanced = pd.concat([y_train[i],y_train[minor_indices]])
    selector = GridSearchCV(estimator=estimator, param_grid=params,cv=kf, scoring=roc_scorer, verbose=1,n_jobs=-1)
    selector = selector.fit(X_balanced, y_balanced)
    results.append(selector.cv_results_)
    tmp_pred=selector.best_estimator_.predict(X_test)
    predicts.append(roc_auc_score(y_test,tmp_pred))

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.3min finished


      Iter       Train Loss   Remaining Time 
         1           1.3824            8.99s
         2           1.3788            6.51s
         3           1.3755            5.72s
         4           1.3725            5.45s
         5           1.3698            5.36s
         6           1.3673            5.18s
         7           1.3649            5.11s
         8           1.3628            5.07s
         9           1.3608            5.02s
        10           1.3587            4.96s
        20           1.3434            5.18s
        30           1.3326            4.79s
        40           1.3246            4.48s
        50           1.3185            4.14s
        60           1.3131            3.89s
        70           1.3085            3.65s
        80           1.3046            3.43s
        90           1.3010            3.17s
       100           1.2980            3.05s
       200           1.2733            1.40s
       300           1.2571            0.00s
Fitting 5

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.3min finished


      Iter       Train Loss   Remaining Time 
         1           1.3754            1.47s
         2           1.3673            1.40s
         3           1.3612            1.43s
         4           1.3563            1.45s
         5           1.3520            1.44s
         6           1.3476            1.41s
         7           1.3441            1.40s
         8           1.3410            1.38s
         9           1.3379            1.34s
        10           1.3350            1.34s
        20           1.3159            1.28s
        30           1.3041            1.03s
        40           1.2950            0.85s
        50           1.2876            0.69s
        60           1.2819            0.56s
        70           1.2760            0.40s
        80           1.2711            0.27s
        90           1.2664            0.13s
       100           1.2618            0.00s
Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.2min finished


      Iter       Train Loss   Remaining Time 
         1           1.3785            2.51s
         2           1.3723            2.51s
         3           1.3672            2.72s
         4           1.3626            2.54s
         5           1.3581            2.60s
         6           1.3546            2.62s
         7           1.3515            2.56s
         8           1.3487            2.50s
         9           1.3462            2.44s
        10           1.3440            2.43s
        20           1.3275            2.24s
        30           1.3182            2.35s
        40           1.3117            2.17s
        50           1.3065            2.10s
        60           1.3028            1.97s
        70           1.2997            1.86s
        80           1.2965            1.74s
        90           1.2934            1.64s
       100           1.2906            1.57s
       200           1.2734            0.76s
       300           1.2615            0.00s
Fitting 5

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.2min finished


      Iter       Train Loss   Remaining Time 
         1           1.3783            1.34s
         2           1.3717            1.33s
         3           1.3658            1.41s
         4           1.3611            1.43s
         5           1.3569            1.42s
         6           1.3534            1.44s
         7           1.3495            1.40s
         8           1.3463            1.43s
         9           1.3434            1.43s
        10           1.3404            1.39s
        20           1.3216            1.42s
        30           1.3095            1.14s
        40           1.3003            0.94s
        50           1.2931            0.75s
        60           1.2869            0.60s
        70           1.2818            0.43s
        80           1.2774            0.29s
        90           1.2730            0.14s
       100           1.2692            0.00s
Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.2min finished


      Iter       Train Loss   Remaining Time 
         1           1.3783            2.86s
         2           1.3719            2.66s
         3           1.3671            2.95s
         4           1.3624            2.67s
         5           1.3586            2.63s
         6           1.3553            2.68s
         7           1.3522            2.76s
         8           1.3498            2.74s
         9           1.3473            2.72s
        10           1.3450            2.62s
        20           1.3288            2.33s
        30           1.3185            2.38s
        40           1.3108            2.17s
        50           1.3051            2.02s
        60           1.3006            1.88s
        70           1.2964            1.77s
        80           1.2927            1.70s
        90           1.2897            1.61s
       100           1.2868            1.52s
       200           1.2691            0.73s
       300           1.2561            0.00s


In [27]:
results[0]

{'mean_fit_time': array([1.32035837, 2.3340385 , 3.43992248, 3.00150218, 4.61292272,
        6.77831645, 1.35634556, 2.44299197, 3.90355749, 2.02686377,
        3.70049667, 5.66928902, 1.17115173, 2.29018064, 3.44702582,
        1.92741737, 3.73252692, 5.06327376]),
 'std_fit_time': array([0.02134474, 0.05646736, 0.0655548 , 0.67418663, 0.50455132,
        0.25344309, 0.31399973, 0.19245855, 0.12589031, 0.07820718,
        0.03514635, 0.14294571, 0.01711732, 0.03705716, 0.01701634,
        0.03580455, 0.0646196 , 0.86479237]),
 'mean_score_time': array([0.02041411, 0.0233304 , 0.02929387, 0.03708501, 0.02453141,
        0.03549538, 0.01892858, 0.02811861, 0.02784219, 0.02070518,
        0.0264236 , 0.03375506, 0.01918569, 0.02354398, 0.02920213,
        0.02091656, 0.02783952, 0.02320409]),
 'std_score_time': array([0.00197639, 0.00429698, 0.00755434, 0.02012042, 0.00154617,
        0.0070841 , 0.00200284, 0.00757797, 0.00335078, 0.00153875,
        0.00772171, 0.00360992, 0.0008161 , 

In [29]:
tmp = results[0]['mean_test_score']
for i in range(1,len(results)):
    tmp = np.concatenate([tmp,results[i]['mean_test_score']])
tmp2 = pd.DataFrame(tmp.reshape(5,18)).apply(lambda x: x.mean(),axis=0)
tmp2

0     0.598527
1     0.601875
2     0.604072
3     0.601464
4     0.605183
5     0.606141
6     0.601542
7     0.605469
8     0.605936
9     0.604035
10    0.605168
11    0.604083
12    0.604709
13    0.605361
14    0.605494
15    0.605472
16    0.603805
17    0.601601
dtype: float64

In [30]:
for i in range(len(results)):
    print(results[i]['rank_test_score'])

[18 15 13 14 12  1 16 11  3 17  5  8  7  6  4  2  9 10]
[18 11 13 12  9  5  8  6  2 14 10 17  4  3 15  1  7 16]
[18 15 12 14  2  7 16  3  9 11  8  5 10  6  1  4 13 17]
[18 13  7 15  3  2 16  5  4  1  9 12  6  8 11 10 14 17]
[17 16 11 18 12  4 15  7  3  8  2  5 13 10  1 14  6  9]


In [31]:
predicts

[0.6040116478899537,
 0.6045370288998648,
 0.6016372847568827,
 0.5988323650214277,
 0.6029303931929084]

In [33]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

roc_scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=5,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(verbose=1)

major_indices = y_train[y_train==0].index
minor_indices = y_train[y_train==1].index
sampled_indices = []
for i in range(42,47):
    sampled_indices.append(resample(y_train[major_indices],replace=False,n_samples=5022,random_state=i).index)
    
#scaler_X = StandardScaler().fit(X_train)
#scaler_y = StandardScaler().fit(y_train.to_numpy().reshape(-1,1))
#scaled_X = scaler_X.transform(X_train)
#scaled_y = scaler_y.transform(y_train.to_numpy().reshape(-1,1))
#scaled_X = pd.DataFrame(scaled_X,index=X_train.index)
#scaled_y = pd.Series(scaled_y.flatten(),index=y_train.index)
                       
params = {'n_estimators':[100,200],'max_depth':[2,3],'learning_rate':[0.1,0.15],'random_state':[5],
         'loss':['deviance','exponential'],'subsample':[0.8,1.0],'min_weight_fraction_leaf':[0,0.2]},

results = []
predicts = []
for i in sampled_indices:
    X_balanced = pd.concat([X_train.loc[i],X_train.loc[minor_indices]])
    y_balanced = pd.concat([y_train[i],y_train[minor_indices]])
    selector = GridSearchCV(estimator=estimator, param_grid=params,cv=kf, scoring=roc_scorer, verbose=1,n_jobs=-1)
    selector = selector.fit(X_balanced, y_balanced)
    results.append(selector.cv_results_)
    tmp_pred=selector.best_estimator_.predict(X_test)
    predicts.append(roc_auc_score(y_test,tmp_pred))

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  2.7min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3775           0.0069            2.06s
         2           1.3712           0.0049            1.98s
         3           1.3658           0.0045            1.93s
         4           1.3613           0.0025            1.95s
         5           1.3581           0.0027            1.97s
         6           1.3559           0.0038            1.93s
         7           1.3490           0.0016            1.91s
         8           1.3497           0.0018            1.89s
         9           1.3462           0.0016            1.88s
        10           1.3421           0.0016            1.85s
        20           1.3285           0.0006            1.74s
        30           1.3198          -0.0000            1.76s
        40           1.3136           0.0002            1.65s
        50           1.3064          -0.0003            1.50s
        60           1.3016          -0.0004            1.41s
       

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  2.7min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.9974           0.0022            1.90s
         2           0.9953           0.0020            1.82s
         3           0.9933           0.0011            1.89s
         4           0.9914           0.0011            1.86s
         5           0.9900           0.0013            1.82s
         6           0.9892           0.0008            1.84s
         7           0.9878           0.0005            1.84s
         8           0.9873           0.0011            1.80s
         9           0.9852           0.0004            1.76s
        10           0.9840           0.0004            1.76s
        20           0.9782           0.0004            1.62s
        30           0.9731           0.0000            1.64s
        40           0.9688           0.0001            1.52s
        50           0.9665          -0.0001            1.37s
        60           0.9615          -0.0004            1.29s
       

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  2.7min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3780           0.0058            2.39s
         2           1.3717           0.0052            2.26s
         3           1.3657           0.0036            2.19s
         4           1.3606           0.0045            2.08s
         5           1.3571           0.0030            2.06s
         6           1.3536           0.0035            2.02s
         7           1.3482           0.0023            2.02s
         8           1.3474           0.0019            1.98s
         9           1.3442           0.0011            1.94s
        10           1.3402           0.0005            1.92s
        20           1.3259           0.0008            1.82s
        30           1.3166          -0.0001            1.83s
        40           1.3126          -0.0000            1.64s
        50           1.3053          -0.0006            1.48s
        60           1.2981          -0.0005            1.37s
       

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  2.8min finished


      Iter       Train Loss   Remaining Time 
         1           0.9960            2.55s
         2           0.9926            1.86s
         3           0.9896            1.70s
         4           0.9872            1.62s
         5           0.9850            1.55s
         6           0.9832            1.53s
         7           0.9811            1.45s
         8           0.9795            1.45s
         9           0.9780            1.43s
        10           0.9764            1.38s
        20           0.9666            1.27s
        30           0.9601            1.01s
        40           0.9553            0.81s
        50           0.9514            0.67s
        60           0.9483            0.52s
        70           0.9454            0.38s
        80           0.9429            0.25s
        90           0.9408            0.13s
       100           0.9386            0.00s
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  2.9min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.9956           0.0036            2.85s
         2           0.9924           0.0033            2.83s
         3           0.9892           0.0017            2.88s
         4           0.9871           0.0015            2.89s
         5           0.9842           0.0015            2.79s
         6           0.9831           0.0011            3.11s
         7           0.9808           0.0012            3.29s
         8           0.9804           0.0015            3.27s
         9           0.9782           0.0008            3.18s
        10           0.9764           0.0007            3.19s
        20           0.9670           0.0003            3.02s
        30           0.9604          -0.0001            2.60s
        40           0.9565           0.0001            2.41s
        50           0.9502           0.0001            2.21s
        60           0.9456          -0.0003            2.02s
       

In [34]:
results[0]

{'mean_fit_time': array([1.31302562, 1.06218581, 2.50530987, 2.02587552, 1.23940501,
        1.00041666, 2.46563458, 1.99831405, 2.03173413, 1.74383149,
        3.94359117, 3.3611619 , 1.41365318, 1.25211639, 2.77538071,
        2.47233381, 1.28459315, 1.01908698, 2.334271  , 1.90830164,
        1.1483151 , 0.95669098, 2.23869209, 1.88822718, 1.89043975,
        1.67832894, 3.70824857, 3.18177862, 1.37369456, 1.29381099,
        2.61493034, 2.37219176, 1.30536127, 1.05698209, 2.53625364,
        2.04889851, 1.23801136, 1.04745216, 2.49862595, 2.07968645,
        1.98651094, 1.7295702 , 3.93230314, 3.37459974, 1.42091122,
        1.23176832, 2.81678247, 2.4126245 , 1.21807485, 0.99428253,
        2.36879444, 1.94821458, 1.17482562, 0.9875031 , 2.30104151,
        1.9106606 , 1.91239066, 1.6815762 , 3.70818248, 3.2265862 ,
        1.34055781, 1.19248815, 2.62211747, 2.08185821]),
 'std_fit_time': array([0.01888516, 0.01281381, 0.0247114 , 0.02890522, 0.01107384,
        0.0243119 , 0.033

In [36]:
tmp = results[0]['mean_test_score']
for i in range(1,len(results)):
    tmp = np.concatenate([tmp,results[i]['mean_test_score']])
tmp2 = pd.DataFrame(tmp.reshape(5,64)).apply(lambda x: x.mean(),axis=0)
tmp2

0     0.604189
1     0.601542
2     0.606826
3     0.605469
4     0.561639
5     0.561981
6     0.561176
7     0.560710
8     0.606808
9     0.604035
10    0.605241
11    0.605168
12    0.562012
13    0.562330
14    0.562124
15    0.561450
16    0.604919
17    0.602260
18    0.606833
19    0.605039
20    0.561634
21    0.562153
22    0.561277
23    0.560325
24    0.606630
25    0.604978
26    0.604217
27    0.605147
28    0.562866
29    0.562347
30    0.561420
31    0.562191
32    0.605534
33    0.604709
34    0.606759
35    0.605361
36    0.560372
37    0.560926
38    0.560065
39    0.560551
40    0.603533
41    0.605472
42    0.599939
43    0.603805
44    0.562424
45    0.562340
46    0.559838
47    0.560218
48    0.606105
49    0.604015
50    0.605900
51    0.604670
52    0.561087
53    0.560462
54    0.560202
55    0.560819
56    0.602953
57    0.604648
58    0.598869
59    0.604556
60    0.562154
61    0.562211
62    0.560756
63    0.560103
dtype: float64

In [38]:
np.bincount(y_test)

array([12739,  1255])

In [40]:
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(class_weight='balanced')
logit.fit(X_test,y_test)
logit



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
dir(logit)

['C',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_predict_proba_lr',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'intercept_scaling',
 'l1_ratio',
 'max_iter',
 'multi_class',
 'n_iter_',
 'n_jobs',
 'penalty',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'random_state',
 'score',
 'set_params',
 'solver',
 'sparsify',
 'tol',
 'verbose',
 'warm_start']

In [48]:
logit.decision_function(X_test)

array([-1.2226637 , -0.38281727, -1.47794306, ..., -0.85384802,
       -0.38739217, -0.33515728])

In [51]:
len(y_test)/(2*np.bincount(y_test))

array([0.54925818, 5.5752988 ])

In [55]:
weight_params = len(y_train)/(2*np.bincount(y_train))
y_train.apply(lambda x: weight_params[1] if x==True else weight_params[0])

11696    0.549280
16809    0.549280
66493    0.549280
30590    0.549280
19336    0.549280
36777    0.549280
24276    0.549280
65173    0.549280
16453    0.549280
49869    0.549280
50384    5.573078
2411     0.549280
14144    0.549280
985      0.549280
8509     0.549280
45421    0.549280
51631    0.549280
6633     0.549280
20021    0.549280
54108    0.549280
43468    0.549280
29226    0.549280
54868    0.549280
19548    0.549280
27459    0.549280
49143    0.549280
69572    0.549280
41058    0.549280
19606    0.549280
12513    0.549280
19623    0.549280
8414     0.549280
18182    0.549280
62678    5.573078
43521    0.549280
62755    0.549280
9778     0.549280
56108    0.549280
33376    0.549280
61117    0.549280
133      0.549280
12232    0.549280
63810    0.549280
61009    5.573078
4734     0.549280
16420    0.549280
17581    0.549280
44049    0.549280
28487    0.549280
1176     0.549280
11753    0.549280
55979    0.549280
32495    0.549280
16903    0.549280
2098     5.573078
68192    0

In [56]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

weight_params = len(y_train)/(2*np.bincount(y_train))
train_weights = y_train.apply(lambda x: weight_params[1] if x==True else weight_params[0])
roc_scorer = make_scorer(roc_auc_score, sample_weight = train_weights)
kf = KFold(n_splits=5,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(verbose=1)
                       
params = {'n_estimators':[200,300],'max_depth':[2,3],'learning_rate':[0.1],'random_state':[5]}

results = []
predicts = []

selector = GridSearchCV(estimator=estimator, param_grid=params,cv=5, scoring=roc_scorer, verbose=1,n_jobs=-2)
selector = selector.fit(X_balanced, y_balanced)
results.append(selector.cv_results_)
tmp_pred=selector.best_estimator_.predict(X_test)
predicts.append(roc_auc_score(y_test,tmp_pred))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.


ValueError: Found input variables with inconsistent numbers of samples: [2010, 2010, 55976]

In [60]:
from sklearn.ensemble import GradientBoostingClassifier

weight_params = len(y_train)/(2*np.bincount(y_train))
weight_params2 = len(y_test)/(2*np.bincount(y_test))
train_weights = y_train.apply(lambda x: weight_params[1] if x==True else weight_params[0])
test_weights = y_test.apply(lambda x: weight_params2[1] if x==True else weight_params2[0])
estimator = GradientBoostingClassifier(verbose=1,random_state=5,n_estimators=200,max_depth=3)
estimator.fit(X_train,y_train)
pred = estimator.predict(X_test)
print(roc_auc_score(y_test,pred))#,sample_weight = train_weights)
print(roc_auc_score(y_test,pred,sample_weight = test_weights))

      Iter       Train Loss   Remaining Time 
         1           0.5999           15.66s
         2           0.5970           15.27s
         3           0.5949           14.59s
         4           0.5933           16.22s
         5           0.5916           15.70s
         6           0.5903           15.80s
         7           0.5893           15.78s
         8           0.5881           15.30s
         9           0.5873           16.02s
        10           0.5865           16.22s
        20           0.5810           15.86s
        30           0.5776           14.50s
        40           0.5752           13.55s
        50           0.5735           12.40s
        60           0.5719           11.39s
        70           0.5707           10.43s
        80           0.5696            9.58s
        90           0.5686            8.84s
       100           0.5678            8.52s
       200           0.5615            0.00s
0.5035915682587181
0.5035915682587181


In [62]:
#Upsampling
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

roc_scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=5,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(random_state=5,verbose=1)

major_indices = y_train[y_train==0].index
minor_indices = y_train[y_train==1].index
sampled_indices = resample(y_train[minor_indices],replace=True,n_samples=50954,random_state=5).index
X_balanced = pd.concat([X_train.loc[major_indices],X_train.loc[sampled_indices]])
y_balanced = pd.concat([y_train[major_indices],y_train[sampled_indices]])

#scaler_X = StandardScaler().fit(X_balanced)
#scaler_y = StandardScaler().fit(y_train.to_numpy().reshape(-1,1))
#scaled_X = scaler_X.transform(X_balanced)
#scaled_y = scaler_y.transform(y_train.to_numpy().reshape(-1,1))
#scaled_X = pd.DataFrame(scaled_X,index=X_balanced.index)
#scaled_y = pd.Series(scaled_y.flatten(),index=y_train.index)
                       
params = {'n_estimators':[100,200],'max_depth':[2,3],'learning_rate':[0.05,0.1,.15],'random_state':[5]}

selector = GridSearchCV(estimator=estimator, param_grid=params,cv=kf,scoring=roc_scorer,verbose=1,n_jobs=-2)
selector = selector.fit(X_balanced, y_balanced)
print(selector.cv_results_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-2)]: Done  60 out of  60 | elapsed:  9.4min finished


      Iter       Train Loss   Remaining Time 
         1           1.3746           34.22s
         2           1.3660           34.45s
         3           1.3594           34.41s
         4           1.3541           34.89s
         5           1.3490           33.44s
         6           1.3449           33.42s
         7           1.3417           34.13s
         8           1.3384           33.48s
         9           1.3357           33.42s
        10           1.3333           33.12s
        20           1.3167           29.34s
        30           1.3066           26.85s
        40           1.2997           24.46s
        50           1.2944           22.69s
        60           1.2899           20.93s
        70           1.2857           19.26s
        80           1.2820           17.67s
        90           1.2789           16.03s
       100           1.2755           14.48s
       200           1.2509            0.00s
{'mean_fit_time': array([16.88805151, 28.8481288 , 24.

In [63]:
selector.cv_results_

{'mean_fit_time': array([16.88805151, 28.8481288 , 24.08425064, 45.74603457, 14.10005593,
        27.19663515, 22.96450138, 44.11152868, 14.74170828, 27.32953281,
        22.64455099, 41.23692169]),
 'std_fit_time': array([2.53033535, 2.07357295, 0.56391379, 0.48233138, 0.13119788,
        0.16984214, 0.338443  , 0.53965683, 0.53884372, 0.25723922,
        0.57343008, 3.4394969 ]),
 'mean_score_time': array([0.1477304 , 0.22450256, 0.15768986, 0.22977037, 0.16392779,
        0.23651643, 0.15524755, 0.22709546, 0.21694398, 0.22645979,
        0.15794473, 0.1942986 ]),
 'std_score_time': array([0.0052707 , 0.01734455, 0.00667508, 0.00901907, 0.01530233,
        0.02635993, 0.01078534, 0.01105692, 0.11009702, 0.01158782,
        0.01693187, 0.0482902 ]),
 'param_learning_rate': masked_array(data=[0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.15, 0.15,
                    0.15, 0.15],
              mask=[False, False, False, False, False, False, False, False,
                    False, Fal

In [64]:
selector.cv_results_['mean_test_score']

array([0.60194565, 0.6106151 , 0.61228555, 0.62213368, 0.61106838,
       0.61753588, 0.62171493, 0.63363766, 0.61561103, 0.6203123 ,
       0.62858367, 0.6406843 ])

In [65]:
preds = selector.best_estimator_.predict(X_test)
preds

array([False, False, False, ..., False, False, False])

In [66]:
roc_auc_score(y_test,preds)

0.6062758308159935

In [67]:
#Upsampling
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

roc_scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=10,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(random_state=5,verbose=1)

major_indices = y_train[y_train==0].index
minor_indices = y_train[y_train==1].index
sampled_indices = resample(y_train[minor_indices],replace=True,n_samples=50954,random_state=5).index
X_balanced = pd.concat([X_train.loc[major_indices],X_train.loc[sampled_indices]])
y_balanced = pd.concat([y_train[major_indices],y_train[sampled_indices]])

#scaler_X = StandardScaler().fit(X_balanced)
#scaler_y = StandardScaler().fit(y_train.to_numpy().reshape(-1,1))
#scaled_X = scaler_X.transform(X_balanced)
#scaled_y = scaler_y.transform(y_train.to_numpy().reshape(-1,1))
#scaled_X = pd.DataFrame(scaled_X,index=X_balanced.index)
#scaled_y = pd.Series(scaled_y.flatten(),index=y_train.index)
                       
params = {'n_estimators':[100,200,300],'max_depth':[2,3],'learning_rate':[0.05,0.1,.15],'random_state':[5],'subsample':[0.8,1.0]}

selector = GridSearchCV(estimator=estimator, param_grid=params,cv=kf,scoring=roc_scorer,verbose=1,n_jobs=-2)
selector = selector.fit(X_balanced, y_balanced)
print(selector.cv_results_)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed: 55.1min
[Parallel(n_jobs=-2)]: Done 360 out of 360 | elapsed: 129.6min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3748           0.0116           57.45s
         2           1.3660           0.0085            1.04m
         3           1.3594           0.0068           58.11s
         4           1.3538           0.0053            1.00m
         5           1.3492           0.0044           59.52s
         6           1.3467           0.0032            1.02m
         7           1.3422           0.0035            1.01m
         8           1.3388           0.0038           59.78s
         9           1.3362           0.0020            1.01m
        10           1.3332           0.0021           59.11s
        20           1.3169           0.0012           53.17s
        30           1.3072           0.0006           49.27s
        40           1.2992           0.0003           46.51s
        50           1.2929           0.0001           43.84s
        60           1.2888           0.0005           41.89s
       

In [68]:
preds = selector.best_estimator_.predict(X_test)
preds

array([False, False, False, ..., False, False, False])

In [69]:
roc_auc_score(y_test,preds)

0.603233756238098

In [70]:
selector.cv_results_['params']

[{'learning_rate': 0.05,
  'max_depth': 2,
  'n_estimators': 100,
  'random_state': 5,
  'subsample': 0.8},
 {'learning_rate': 0.05,
  'max_depth': 2,
  'n_estimators': 100,
  'random_state': 5,
  'subsample': 1.0},
 {'learning_rate': 0.05,
  'max_depth': 2,
  'n_estimators': 200,
  'random_state': 5,
  'subsample': 0.8},
 {'learning_rate': 0.05,
  'max_depth': 2,
  'n_estimators': 200,
  'random_state': 5,
  'subsample': 1.0},
 {'learning_rate': 0.05,
  'max_depth': 2,
  'n_estimators': 300,
  'random_state': 5,
  'subsample': 0.8},
 {'learning_rate': 0.05,
  'max_depth': 2,
  'n_estimators': 300,
  'random_state': 5,
  'subsample': 1.0},
 {'learning_rate': 0.05,
  'max_depth': 3,
  'n_estimators': 100,
  'random_state': 5,
  'subsample': 0.8},
 {'learning_rate': 0.05,
  'max_depth': 3,
  'n_estimators': 100,
  'random_state': 5,
  'subsample': 1.0},
 {'learning_rate': 0.05,
  'max_depth': 3,
  'n_estimators': 200,
  'random_state': 5,
  'subsample': 0.8},
 {'learning_rate': 0.05,
  '

In [71]:
selector.cv_results_['mean_test_score']

array([0.60210581, 0.60102323, 0.61126489, 0.61011625, 0.61587089,
       0.61530202, 0.61274145, 0.61151038, 0.62467179, 0.62295545,
       0.63034729, 0.62889433, 0.6106577 , 0.61079757, 0.61758231,
       0.61773592, 0.62316876, 0.62130587, 0.62489911, 0.62300624,
       0.63588361, 0.63333206, 0.64470801, 0.64123531, 0.61630082,
       0.61580358, 0.62286756, 0.62231463, 0.62764302, 0.62694161,
       0.62914403, 0.62953378, 0.64433754, 0.64273127, 0.6550963 ,
       0.65058064])

In [72]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

roc_scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=10,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(random_state=5,verbose=1)
major_indices = y_train[y_train==0].index
minor_indices = y_train[y_train==1].index
sampled_indices = resample(y_train[minor_indices],replace=True,n_samples=50954,random_state=5).index
X_balanced = pd.concat([X_train.loc[major_indices],X_train.loc[sampled_indices]])
y_balanced = pd.concat([y_train[major_indices],y_train[sampled_indices]])
params = {'n_estimators':[300],'max_depth':[2],'learning_rate':[.15],'random_state':[5],'subsample':[0.7]}

selector = GridSearchCV(estimator=estimator, param_grid=params,cv=kf,scoring=roc_scorer,verbose=1,n_jobs=-2)
selector = selector.fit(X_balanced, y_balanced)
print(selector.cv_results_)


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:  2.1min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3778           0.0082           21.40s
         2           1.3716           0.0062           22.69s
         3           1.3665           0.0052           23.66s
         4           1.3624           0.0041           24.37s
         5           1.3585           0.0034           24.55s
         6           1.3560           0.0034           24.27s
         7           1.3519           0.0032           23.97s
         8           1.3494           0.0023           24.26s
         9           1.3472           0.0022           24.30s
        10           1.3447           0.0022           23.75s
        20           1.3308           0.0009           21.26s
        30           1.3222           0.0005           19.67s
        40           1.3156           0.0002           18.33s
        50           1.3106           0.0002           17.16s
        60           1.3081           0.0005           15.71s
       

In [73]:
preds = selector.best_estimator_.predict(X_test)
preds

array([False, False, False, ..., False, False, False])

In [74]:
roc_auc_score(y_test,preds)

0.6081855793718134

In [75]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

roc_scorer = make_scorer(roc_auc_score)
kf = KFold(n_splits=10,shuffle=True,random_state=5)
estimator = GradientBoostingClassifier(random_state=5,verbose=1)
major_indices = y_train[y_train==0].index
minor_indices = y_train[y_train==1].index
sampled_indices = resample(y_train[minor_indices],replace=True,n_samples=50954,random_state=5).index
X_balanced = pd.concat([X_train.loc[major_indices],X_train.loc[sampled_indices]])
y_balanced = pd.concat([y_train[major_indices],y_train[sampled_indices]])
params = {'n_estimators':[300],'max_depth':[2],'learning_rate':[.15],'random_state':[5],'subsample':[0.7]}

selector = GridSearchCV(estimator=estimator, param_grid=params,cv=kf,scoring=roc_scorer,verbose=1,n_jobs=-2)
selector = selector.fit(X_balanced, y_balanced)
print(selector.cv_results_)


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  10 out of  10 | elapsed:  3.2min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3778           0.0083           35.90s
         2           1.3717           0.0062           36.62s
         3           1.3667           0.0052           36.76s
         4           1.3622           0.0040           36.61s
         5           1.3580           0.0037           37.50s
         6           1.3554           0.0034           36.41s
         7           1.3519           0.0029           36.59s
         8           1.3497           0.0023           35.93s
         9           1.3472           0.0021           36.44s
        10           1.3448           0.0023           37.11s
        20           1.3318           0.0010           33.45s
        30           1.3219           0.0007           31.61s
        40           1.3161           0.0002           30.11s
        50           1.3108           0.0003           28.65s
        60           1.3095           0.0001           27.24s
       