In [26]:
import xgboost as xgb
from catboost import CatBoostClassifier, cv ,Pool
from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedShuffleSplit, GridSearchCV
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
import hyperopt
import optuna

# Download data

In [2]:
data = pd.read_csv('./data/orange_small_churn_train_data.csv')

In [3]:
data.head()

Unnamed: 0,ID,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,labels
0,0,,,,,,3052.0,,,,...,vr93T2a,LM8l689qOp,,,fKCe,02N6s8f,xwM2aC7IdeMC0,,,-1.0
1,1,,,,,,1813.0,7.0,,,...,6hQ9lNX,LM8l689qOp,,ELof,xb3V,RAYp,55YFVY9,mj86,,-1.0
2,2,,,,,,1953.0,7.0,,,...,catzS2D,LM8l689qOp,,,FSa2,ZI9m,ib5G6X1eUxUn6,mj86,,-1.0
3,3,,,,,,1533.0,7.0,,,...,e4lqvY0,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,,1.0
4,4,,,,,,686.0,7.0,,,...,MAz3HNj,LM8l689qOp,,,WqMG,RAYp,F2FyR07IdsN7I,,,-1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18299 entries, 0 to 18298
Columns: 232 entries, ID to labels
dtypes: float64(192), int64(2), object(38)
memory usage: 32.4+ MB


In [5]:
# check disbalance in classes
print(data['labels'].value_counts())
print(data['labels'].unique())

-1.0    16921
 1.0     1377
Name: labels, dtype: int64
[-1.  1. nan]


In [6]:
data['labels'].replace(-1, 0, inplace=True)
data.dropna(subset=['labels'], inplace=True)
labels = data['labels']
data = data.iloc[:,:-1]

In [7]:
cat_boost_selecting_feature = ['Var126', 'Var113', 'Var199', 'Var57', 'Var202', 'Var74', 'Var218', 'Var73', 'Var205', 'Var81',
 'Var207', 'Var133', 'Var38', 'Var192', 'Var216', 'Var13', 'Var193', 'Var222', 'Var134', 'Var153', 'Var123', 'Var226', 'Var149',
 'Var210', 'Var206', 'Var212', 'Var219', 'Var204', 'Var125', 'Var28']

In [8]:
numeric_col = data.columns[1:190]
cat_col = data.columns[190:]
new_numeric = np.intersect1d(numeric_col, cat_boost_selecting_feature)
new_cat = np.intersect1d(cat_col, cat_boost_selecting_feature)

In [11]:
data_train = data[cat_boost_selecting_feature]

hyper opt cat boost

In [12]:
train_data_boost, test_data_boost, train_label_boost, test_label_boost = train_test_split(data_train, labels,
                                                                                         test_size=0.3,random_state=42)

In [13]:
train_data_boost.fillna('NaN',inplace=True)
test_data_boost.fillna('NaN',inplace=True)

In [14]:
train_pool = Pool(train_data_boost, train_label_boost, cat_features=list(new_cat))
test_pool = Pool(test_data_boost, test_label_boost, cat_features=list(new_cat))

Find optimal parameters model

In [17]:
def hyperopt_obj(params):
    model = CatBoostClassifier(
                                l2_leaf_reg=int(params['l2_leaf_reg']),
                                learning_rate=params['learning_rate'],
                                depth = int(params['depth']),
                                iterations=200,
                                eval_metric='F1',
                                loss_function='Logloss',
                                random_seed=42,
                                verbose=False
                              )
    cv_calc = cv(train_pool,
                 model.get_params(),
                 verbose=False)
    best_metrics = np.max(cv_calc['test-F1-mean'])
    return 1 - best_metrics

In [18]:
params_opt = {
                'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 2, 5, 1),
                'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-1, 5e-1),
                'depth': hyperopt.hp.quniform('depth', 1, 10,2)
                }

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_obj,
    space=params_opt,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    #rstate=np.random.RandomState(123)
)


  0%|                                                                           | 0/50 [00:00<?, ?trial/s, best loss=?]Training on fold [0/3]

bestTest = 0.0421686747
bestIteration = 70

Training on fold [1/3]

bestTest = 0.07492795389
bestIteration = 195

Training on fold [2/3]

bestTest = 0.04819277108
bestIteration = 193

  2%|▉                                                | 1/50 [00:40<33:04, 40.49s/trial, best loss: 0.9488587482407141]Training on fold [0/3]

bestTest = 0.01246105919
bestIteration = 166

Training on fold [1/3]

bestTest = 0.006269592476
bestIteration = 80

Training on fold [2/3]

bestTest = 0.03692307692
bestIteration = 198

  4%|█▉                                               | 2/50 [01:27<35:23, 44.24s/trial, best loss: 0.9488587482407141]Training on fold [0/3]

bestTest = 0.05357142857
bestIteration = 196

Training on fold [1/3]

bestTest = 0.07514450867
bestIteration = 182

Training on fold [2/3]

bestTest = 0.05405405405
bestIteration = 135

  6%|██▉       

 50%|████████████████████████▌                        | 25/50 [12:55<12:47, 30.69s/trial, best loss: 0.917475251387779]Training on fold [0/3]

bestTest = 0.05865102639
bestIteration = 119

Training on fold [1/3]

bestTest = 0.0523255814
bestIteration = 154

Training on fold [2/3]

bestTest = 0.05373134328
bestIteration = 117

 52%|█████████████████████████▍                       | 26/50 [14:19<18:42, 46.77s/trial, best loss: 0.917475251387779]Training on fold [0/3]

bestTest = 0.07079646018
bestIteration = 167

Training on fold [1/3]

bestTest = 0.09550561798
bestIteration = 109

Training on fold [2/3]

bestTest = 0.06976744186
bestIteration = 136

 54%|██████████████████████████▍                      | 27/50 [14:51<16:12, 42.30s/trial, best loss: 0.917475251387779]Training on fold [0/3]

bestTest = 0.08139534884
bestIteration = 159

Training on fold [1/3]

bestTest = 0.04651162791
bestIteration = 185

Training on fold [2/3]

bestTest = 0.05970149254
bestIteration = 120

 56%|█████████

100%|████████████████████████████████████████████████| 50/50 [27:30<00:00, 33.00s/trial, best loss: 0.9108221335927121]


In [None]:
# old best params
#{'depth': 4.0, 'l2_leaf_reg': 2.0, 'learning_rate': 0.28614075781169124}
# new best params
#{'depth': 6.0, 'l2_leaf_reg': 14.0, 'learning_rate': 0.4702911427230305}

In [19]:
best

{'depth': 6.0, 'l2_leaf_reg': 14.0, 'learning_rate': 0.4702911427230305}

In [20]:
best_model_hyper = CatBoostClassifier(
                                l2_leaf_reg=int(best['l2_leaf_reg']),
                                learning_rate=best['learning_rate'],
                                depth = int(best['depth']),
                                iterations=100,
                                eval_metric='AUC',
                                loss_function='Logloss',
                                random_seed=42,
                                verbose=False,
                                use_best_model=True
                              )

In [21]:
best_model_hyper.fit(train_pool,
               eval_set =test_pool,
               verbose=False,
               plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1cb39207940>

In [22]:
best_model_hyper.eval_metrics(test_pool, ['AUC','F1'], plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

{'AUC': [0.5573381831522022,
  0.5892415865295086,
  0.5892415865295086,
  0.6692054037646462,
  0.679777642664005,
  0.6905319127144336,
  0.7020294378356635,
  0.7108158161327086,
  0.7153679915679074,
  0.7171878375257916,
  0.7167844360106965,
  0.7137057352382513,
  0.7096032838233433,
  0.7115115848162743,
  0.7121663848520288,
  0.7126314721152485,
  0.713454336717592,
  0.7152334468040731,
  0.7136961914055227,
  0.7136961914055227,
  0.7149278113803454,
  0.7143002461843291,
  0.7198356691669956,
  0.7172762925608385,
  0.7164243309074928,
  0.7166180008789638,
  0.720425291808505,
  0.7207902852162772],
 'F1': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.004694835680751173,
  0.01405152224824356,
  0.018604651162790697,
  0.018561484918793503,
  0.018561484918793503,
  0.018604651162790697,
  0.02777777777777778,
  0.023201856148491882,
  0.02777777777777778,
  0.02777777777777778,
  0.03225806451612903,
  0.03218390804597701,
  0.03218390804597701,
  0.032110091743119

optuna

In [29]:
def objective(trial):
    params = {
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 2, 5),
        'learning_rate': trial.suggest_float('learning_rate', 1e-1, 5e-1),
        'depth': trial.suggest_int('depth', 1, 10)
    }
    
    model = CatBoostClassifier(
                                l2_leaf_reg=int(params['l2_leaf_reg']),
                                learning_rate=params['learning_rate'],
                                depth = int(params['depth']),
                                iterations=200,
                                eval_metric='F1',
                                loss_function='Logloss',
                                random_seed=42,
                                verbose=False
                              )
    cv_data = cv(train_pool, model.get_params(), verbose=False)
    best_metric = np.max(cv_data['test-F1-mean'])
    return best_metric

In [30]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=50)

[32m[I 2023-04-20 12:45:37,271][0m A new study created in memory with name: no-name-2ba0c3cc-462b-4b2c-83b9-8eaf24bd99ec[0m


Training on fold [0/3]

bestTest = 0.03039513678
bestIteration = 74

Training on fold [1/3]

bestTest = 0.03658536585
bestIteration = 69

Training on fold [2/3]

bestTest = 0.03669724771
bestIteration = 135



[32m[I 2023-04-20 12:46:53,160][0m Trial 0 finished with value: 0.028530520382950602 and parameters: {'l2_leaf_reg': 5, 'learning_rate': 0.2045598368336966, 'depth': 10}. Best is trial 0 with value: 0.028530520382950602.[0m


Training on fold [0/3]

bestTest = 0.03048780488
bestIteration = 171

Training on fold [1/3]

bestTest = 0.02469135802
bestIteration = 130

Training on fold [2/3]

bestTest = 0.04229607251
bestIteration = 188



[32m[I 2023-04-20 12:47:19,655][0m Trial 1 finished with value: 0.03038918355472585 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.10188519874235885, 'depth': 5}. Best is trial 1 with value: 0.03038918355472585.[0m


Training on fold [0/3]

bestTest = 0.09577464789
bestIteration = 67

Training on fold [1/3]

bestTest = 0.0790960452
bestIteration = 173

Training on fold [2/3]

bestTest = 0.06303724928
bestIteration = 98



[32m[I 2023-04-20 12:47:50,078][0m Trial 2 finished with value: 0.07485517834006206 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.42373548587855636, 'depth': 6}. Best is trial 2 with value: 0.07485517834006206.[0m


Training on fold [0/3]

bestTest = 0.05325443787
bestIteration = 176

Training on fold [1/3]

bestTest = 0.06837606838
bestIteration = 181

Training on fold [2/3]

bestTest = 0.08308605341
bestIteration = 169



[32m[I 2023-04-20 12:48:09,710][0m Trial 3 finished with value: 0.06620171656693 and parameters: {'l2_leaf_reg': 4, 'learning_rate': 0.31758968381496444, 'depth': 4}. Best is trial 2 with value: 0.07485517834006206.[0m


Training on fold [0/3]

bestTest = 0.08163265306
bestIteration = 85

Training on fold [1/3]

bestTest = 0.06340057637
bestIteration = 33

Training on fold [2/3]


[32m[I 2023-04-20 12:49:46,067][0m Trial 4 finished with value: 0.05628311188958379 and parameters: {'l2_leaf_reg': 4, 'learning_rate': 0.45103816739063096, 'depth': 10}. Best is trial 2 with value: 0.07485517834006206.[0m



bestTest = 0.04747774481
bestIteration = 31

Training on fold [0/3]

bestTest = 0.04179104478
bestIteration = 161

Training on fold [1/3]

bestTest = 0.05373134328
bestIteration = 180

Training on fold [2/3]

bestTest = 0.04242424242
bestIteration = 162



[32m[I 2023-04-20 12:49:59,161][0m Trial 5 finished with value: 0.04582324169006144 and parameters: {'l2_leaf_reg': 5, 'learning_rate': 0.26381027648313315, 'depth': 2}. Best is trial 2 with value: 0.07485517834006206.[0m


Training on fold [0/3]

bestTest = 0.08022922636
bestIteration = 137

Training on fold [1/3]

bestTest = 0.07799442897
bestIteration = 156

Training on fold [2/3]


[32m[I 2023-04-20 12:50:25,976][0m Trial 6 finished with value: 0.07398103845472266 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.29911069933310597, 'depth': 5}. Best is trial 2 with value: 0.07485517834006206.[0m



bestTest = 0.07558139535
bestIteration = 194

Training on fold [0/3]

bestTest = 0.04255319149
bestIteration = 155

Training on fold [1/3]

bestTest = 0.04776119403
bestIteration = 166

Training on fold [2/3]

bestTest = 0.03692307692
bestIteration = 106



[32m[I 2023-04-20 12:51:17,040][0m Trial 7 finished with value: 0.03641685532372244 and parameters: {'l2_leaf_reg': 4, 'learning_rate': 0.1108536497859487, 'depth': 8}. Best is trial 2 with value: 0.07485517834006206.[0m


Training on fold [0/3]

bestTest = 0.05917159763
bestIteration = 147

Training on fold [1/3]

bestTest = 0.07580174927
bestIteration = 186

Training on fold [2/3]

bestTest = 0.06042296073
bestIteration = 54



[32m[I 2023-04-20 12:51:54,436][0m Trial 8 finished with value: 0.058641650377103866 and parameters: {'l2_leaf_reg': 5, 'learning_rate': 0.2633541285251853, 'depth': 7}. Best is trial 2 with value: 0.07485517834006206.[0m


Training on fold [0/3]

bestTest = 0.06896551724
bestIteration = 168

Training on fold [1/3]

bestTest = 0.08670520231
bestIteration = 88

Training on fold [2/3]

bestTest = 0.04747774481
bestIteration = 114



[32m[I 2023-04-20 12:52:10,286][0m Trial 9 finished with value: 0.06393618396592515 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.37766683961704717, 'depth': 3}. Best is trial 2 with value: 0.07485517834006206.[0m


Training on fold [0/3]

bestTest = 0.05421686747
bestIteration = 145

Training on fold [1/3]

bestTest = 0.02476780186
bestIteration = 107

Training on fold [2/3]

bestTest = 0.02476780186
bestIteration = 191



[32m[I 2023-04-20 12:52:18,303][0m Trial 10 finished with value: 0.034504404645443514 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.487117425081978, 'depth': 1}. Best is trial 2 with value: 0.07485517834006206.[0m


Training on fold [0/3]

bestTest = 0.128342246
bestIteration = 175

Training on fold [1/3]

bestTest = 0.06896551724
bestIteration = 52

Training on fold [2/3]

bestTest = 0.05988023952
bestIteration = 78



[32m[I 2023-04-20 12:52:49,977][0m Trial 11 finished with value: 0.07398610429564646 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.391950281552241, 'depth': 6}. Best is trial 2 with value: 0.07485517834006206.[0m


Training on fold [0/3]

bestTest = 0.06358381503
bestIteration = 78

Training on fold [1/3]

bestTest = 0.07242339833
bestIteration = 94

Training on fold [2/3]

bestTest = 0.07386363636
bestIteration = 115



[32m[I 2023-04-20 12:53:31,703][0m Trial 12 finished with value: 0.06638040842404495 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.39338480182881114, 'depth': 7}. Best is trial 2 with value: 0.07485517834006206.[0m


Training on fold [0/3]

bestTest = 0.1005586592
bestIteration = 109

Training on fold [1/3]

bestTest = 0.08743169399
bestIteration = 128

Training on fold [2/3]

bestTest = 0.06936416185
bestIteration = 66



[32m[I 2023-04-20 12:54:12,566][0m Trial 13 finished with value: 0.07723958085231385 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.4278584574619844, 'depth': 7}. Best is trial 13 with value: 0.07723958085231385.[0m


Training on fold [0/3]

bestTest = 0.1005586592
bestIteration = 119

Training on fold [1/3]

bestTest = 0.07123287671
bestIteration = 89

Training on fold [2/3]

bestTest = 0.06916426513
bestIteration = 32



[32m[I 2023-04-20 12:55:06,578][0m Trial 14 finished with value: 0.06824880629758678 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.49600027739821595, 'depth': 8}. Best is trial 13 with value: 0.07723958085231385.[0m


Training on fold [0/3]

bestTest = 0.09917355372
bestIteration = 98

Training on fold [1/3]

bestTest = 0.06666666667
bestIteration = 77

Training on fold [2/3]

bestTest = 0.05797101449
bestIteration = 156



[32m[I 2023-04-20 12:55:56,422][0m Trial 15 finished with value: 0.0653827648588798 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.44078666404515165, 'depth': 8}. Best is trial 13 with value: 0.07723958085231385.[0m


Training on fold [0/3]

bestTest = 0.108401084
bestIteration = 159

Training on fold [1/3]

bestTest = 0.08376963351
bestIteration = 129

Training on fold [2/3]

bestTest = 0.1169916435
bestIteration = 116



[32m[I 2023-04-20 12:56:30,115][0m Trial 16 finished with value: 0.08694479141413804 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.4425917761309252, 'depth': 6}. Best is trial 16 with value: 0.08694479141413804.[0m


Training on fold [0/3]

bestTest = 0.08356545961
bestIteration = 171

Training on fold [1/3]

bestTest = 0.09039548023
bestIteration = 108

Training on fold [2/3]


[32m[I 2023-04-20 12:56:50,152][0m Trial 17 finished with value: 0.07916551296034767 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.3679792295719691, 'depth': 4}. Best is trial 16 with value: 0.08694479141413804.[0m



bestTest = 0.08595988539
bestIteration = 186

Training on fold [0/3]

bestTest = 0.05747126437
bestIteration = 172

Training on fold [1/3]

bestTest = 0.06267806268
bestIteration = 137

Training on fold [2/3]


[32m[I 2023-04-20 12:57:10,463][0m Trial 18 finished with value: 0.063355159117932 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.34527346454985847, 'depth': 4}. Best is trial 16 with value: 0.08694479141413804.[0m



bestTest = 0.07079646018
bestIteration = 172

Training on fold [0/3]

bestTest = 0.07492795389
bestIteration = 162

Training on fold [1/3]

bestTest = 0.09418282548
bestIteration = 188

Training on fold [2/3]


[32m[I 2023-04-20 12:57:25,969][0m Trial 19 finished with value: 0.08368488009254217 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.3566484803136365, 'depth': 3}. Best is trial 16 with value: 0.08694479141413804.[0m



bestTest = 0.09248554913
bestIteration = 196

Training on fold [0/3]

bestTest = 0.04863221884
bestIteration = 140

Training on fold [1/3]

bestTest = 0.02476780186
bestIteration = 145

Training on fold [2/3]


[32m[I 2023-04-20 12:57:32,492][0m Trial 20 finished with value: 0.028571236510821597 and parameters: {'l2_leaf_reg': 4, 'learning_rate': 0.3465568705172243, 'depth': 1}. Best is trial 16 with value: 0.08694479141413804.[0m



bestTest = 0.01246105919
bestIteration = 144

Training on fold [0/3]

bestTest = 0.1079545455
bestIteration = 194

Training on fold [1/3]

bestTest = 0.08815426997
bestIteration = 195

Training on fold [2/3]

bestTest = 0.08022922636
bestIteration = 136



[32m[I 2023-04-20 12:57:48,748][0m Trial 21 finished with value: 0.08825617080876524 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.36963736239328615, 'depth': 3}. Best is trial 21 with value: 0.08825617080876524.[0m


Training on fold [0/3]

bestTest = 0.07471264368
bestIteration = 162

Training on fold [1/3]

bestTest = 0.06285714286
bestIteration = 168

Training on fold [2/3]

bestTest = 0.07514450867
bestIteration = 191



[32m[I 2023-04-20 12:58:02,295][0m Trial 22 finished with value: 0.06686898057316 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.40586757629170855, 'depth': 3}. Best is trial 21 with value: 0.08825617080876524.[0m


Training on fold [0/3]

bestTest = 0.05847953216
bestIteration = 163

Training on fold [1/3]

bestTest = 0.08287292818
bestIteration = 181

Training on fold [2/3]

bestTest = 0.06413994169
bestIteration = 179



[32m[I 2023-04-20 12:58:13,193][0m Trial 23 finished with value: 0.06481268260046757 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.46275667282155053, 'depth': 2}. Best is trial 21 with value: 0.08825617080876524.[0m


Training on fold [0/3]

bestTest = 0.09169054441
bestIteration = 157

Training on fold [1/3]

bestTest = 0.09470752089
bestIteration = 181

Training on fold [2/3]


[32m[I 2023-04-20 12:58:28,366][0m Trial 24 finished with value: 0.07960424528537619 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.421359699723758, 'depth': 3}. Best is trial 21 with value: 0.08825617080876524.[0m



bestTest = 0.05847953216
bestIteration = 146

Training on fold [0/3]

bestTest = 0.07954545455
bestIteration = 199

Training on fold [1/3]

bestTest = 0.0632183908
bestIteration = 181

Training on fold [2/3]

bestTest = 0.06528189911
bestIteration = 104



[32m[I 2023-04-20 12:58:41,870][0m Trial 25 finished with value: 0.06335348287874794 and parameters: {'l2_leaf_reg': 4, 'learning_rate': 0.46535782225058175, 'depth': 2}. Best is trial 21 with value: 0.08825617080876524.[0m


Training on fold [0/3]

bestTest = 0.04610951009
bestIteration = 194

Training on fold [1/3]

bestTest = 0.07303370787
bestIteration = 182

Training on fold [2/3]

bestTest = 0.05309734513
bestIteration = 199



[32m[I 2023-04-20 12:59:02,392][0m Trial 26 finished with value: 0.0570321724306054 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.36721962033896455, 'depth': 4}. Best is trial 21 with value: 0.08825617080876524.[0m


Training on fold [0/3]

bestTest = 0.08533333333
bestIteration = 199

Training on fold [1/3]

bestTest = 0.09523809524
bestIteration = 178

Training on fold [2/3]

bestTest = 0.09142857143
bestIteration = 106



[32m[I 2023-04-20 12:59:27,163][0m Trial 27 finished with value: 0.08251276287282738 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.41275330520553427, 'depth': 5}. Best is trial 21 with value: 0.08825617080876524.[0m


Training on fold [0/3]

bestTest = 0.09577464789
bestIteration = 167

Training on fold [1/3]

bestTest = 0.07558139535
bestIteration = 57

Training on fold [2/3]

bestTest = 0.07386363636
bestIteration = 198



[32m[I 2023-04-20 12:59:44,324][0m Trial 28 finished with value: 0.07699796806703614 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.4469691393205937, 'depth': 3}. Best is trial 21 with value: 0.08825617080876524.[0m


Training on fold [0/3]

bestTest = 0.05294117647
bestIteration = 75

Training on fold [1/3]

bestTest = 0.04747774481
bestIteration = 52

Training on fold [2/3]


[32m[I 2023-04-20 13:01:09,208][0m Trial 29 finished with value: 0.04003496767588396 and parameters: {'l2_leaf_reg': 4, 'learning_rate': 0.33968525124429894, 'depth': 10}. Best is trial 21 with value: 0.08825617080876524.[0m



bestTest = 0.04255319149
bestIteration = 49

Training on fold [0/3]

bestTest = 0.1111111111
bestIteration = 120

Training on fold [1/3]

bestTest = 0.08888888889
bestIteration = 112

Training on fold [2/3]

bestTest = 0.0701754386
bestIteration = 178



[32m[I 2023-04-20 13:01:43,226][0m Trial 30 finished with value: 0.08102799681467852 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.38924537666904785, 'depth': 6}. Best is trial 21 with value: 0.08825617080876524.[0m


Training on fold [0/3]

bestTest = 0.08403361345
bestIteration = 142

Training on fold [1/3]

bestTest = 0.09315068493
bestIteration = 69

Training on fold [2/3]

bestTest = 0.08069164265
bestIteration = 107



[32m[I 2023-04-20 13:02:07,946][0m Trial 31 finished with value: 0.07907655710605808 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.4074846605586023, 'depth': 5}. Best is trial 21 with value: 0.08825617080876524.[0m


Training on fold [0/3]

bestTest = 0.1215469613
bestIteration = 181

Training on fold [1/3]

bestTest = 0.07065217391
bestIteration = 193

Training on fold [2/3]


[32m[I 2023-04-20 13:02:32,825][0m Trial 32 finished with value: 0.09015630566316518 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.41443475608125596, 'depth': 5}. Best is trial 32 with value: 0.09015630566316518.[0m



bestTest = 0.08
bestIteration = 164

Training on fold [0/3]

bestTest = 0.1154855643
bestIteration = 198

Training on fold [1/3]

bestTest = 0.1043956044
bestIteration = 192

Training on fold [2/3]


[32m[I 2023-04-20 13:02:59,872][0m Trial 33 finished with value: 0.10069098317399472 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.43178482890682496, 'depth': 5}. Best is trial 33 with value: 0.10069098317399472.[0m



bestTest = 0.08310249307
bestIteration = 177

Training on fold [0/3]

bestTest = 0.1075268817
bestIteration = 126

Training on fold [1/3]

bestTest = 0.1221374046
bestIteration = 167

Training on fold [2/3]


[32m[I 2023-04-20 13:03:25,241][0m Trial 34 finished with value: 0.0963198777125225 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.47404144450478236, 'depth': 5}. Best is trial 33 with value: 0.10069098317399472.[0m



bestTest = 0.09039548023
bestIteration = 181

Training on fold [0/3]

bestTest = 0.1041666667
bestIteration = 168

Training on fold [1/3]

bestTest = 0.1215189873
bestIteration = 122

Training on fold [2/3]

bestTest = 0.0790960452
bestIteration = 139



[32m[I 2023-04-20 13:03:50,261][0m Trial 35 finished with value: 0.09540062310434694 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.49454699455513884, 'depth': 5}. Best is trial 33 with value: 0.10069098317399472.[0m


Training on fold [0/3]

bestTest = 0.1347150259
bestIteration = 100

Training on fold [1/3]

bestTest = 0.1231527094
bestIteration = 151

Training on fold [2/3]

bestTest = 0.1098901099
bestIteration = 176



[32m[I 2023-04-20 13:04:14,170][0m Trial 36 finished with value: 0.11648470824955459 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.47666867088363574, 'depth': 5}. Best is trial 36 with value: 0.11648470824955459.[0m


Training on fold [0/3]

bestTest = 0.1413612565
bestIteration = 189

Training on fold [1/3]

bestTest = 0.07368421053
bestIteration = 192

Training on fold [2/3]


[32m[I 2023-04-20 13:04:34,055][0m Trial 37 finished with value: 0.09401297268580966 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.49734340605839017, 'depth': 4}. Best is trial 36 with value: 0.11648470824955459.[0m



bestTest = 0.07303370787
bestIteration = 199

Training on fold [0/3]

bestTest = 0.1350649351
bestIteration = 165

Training on fold [1/3]

bestTest = 0.09498680739
bestIteration = 163

Training on fold [2/3]

bestTest = 0.104109589
bestIteration = 134



[32m[I 2023-04-20 13:05:00,223][0m Trial 38 finished with value: 0.10545300428387237 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.4750964933141514, 'depth': 5}. Best is trial 36 with value: 0.11648470824955459.[0m


Training on fold [0/3]

bestTest = 0.1189189189
bestIteration = 155

Training on fold [1/3]

bestTest = 0.07282913165
bestIteration = 61

Training on fold [2/3]

bestTest = 0.09497206704
bestIteration = 148



[32m[I 2023-04-20 13:05:31,376][0m Trial 39 finished with value: 0.08717380482086363 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.46603939166668645, 'depth': 6}. Best is trial 36 with value: 0.11648470824955459.[0m


Training on fold [0/3]

bestTest = 0.09604519774
bestIteration = 192

Training on fold [1/3]

bestTest = 0.1126005362
bestIteration = 182

Training on fold [2/3]


[32m[I 2023-04-20 13:05:51,529][0m Trial 40 finished with value: 0.11034884630061904 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.4782154944918049, 'depth': 4}. Best is trial 36 with value: 0.11648470824955459.[0m



bestTest = 0.123943662
bestIteration = 194

Training on fold [0/3]

bestTest = 0.1239892183
bestIteration = 106

Training on fold [1/3]

bestTest = 0.08988764045
bestIteration = 89

Training on fold [2/3]

bestTest = 0.1085714286
bestIteration = 138



[32m[I 2023-04-20 13:06:12,005][0m Trial 41 finished with value: 0.10137309415901792 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.4708505774276767, 'depth': 4}. Best is trial 36 with value: 0.11648470824955459.[0m


Training on fold [0/3]

bestTest = 0.1302083333
bestIteration = 199

Training on fold [1/3]

bestTest = 0.1010638298
bestIteration = 165

Training on fold [2/3]

bestTest = 0.1104972376
bestIteration = 147



[32m[I 2023-04-20 13:06:32,182][0m Trial 42 finished with value: 0.10614984065251444 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.47672053267732545, 'depth': 4}. Best is trial 36 with value: 0.11648470824955459.[0m


Training on fold [0/3]

bestTest = 0.1044386423
bestIteration = 172

Training on fold [1/3]

bestTest = 0.08042895442
bestIteration = 157

Training on fold [2/3]

bestTest = 0.1066666667
bestIteration = 199



[32m[I 2023-04-20 13:06:52,874][0m Trial 43 finished with value: 0.09248669719878617 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.47348088543850236, 'depth': 4}. Best is trial 36 with value: 0.11648470824955459.[0m


Training on fold [0/3]

bestTest = 0.1126760563
bestIteration = 158

Training on fold [1/3]

bestTest = 0.09142857143
bestIteration = 58

Training on fold [2/3]

bestTest = 0.06936416185
bestIteration = 156



[32m[I 2023-04-20 13:07:13,011][0m Trial 44 finished with value: 0.08408098488510933 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.45740039071112965, 'depth': 4}. Best is trial 36 with value: 0.11648470824955459.[0m


Training on fold [0/3]

bestTest = 0.09169054441
bestIteration = 198

Training on fold [1/3]

bestTest = 0.08219178082
bestIteration = 197

Training on fold [2/3]

bestTest = 0.05952380952
bestIteration = 155



[32m[I 2023-04-20 13:07:26,513][0m Trial 45 finished with value: 0.07366501187795681 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.47799375147187806, 'depth': 2}. Best is trial 36 with value: 0.11648470824955459.[0m


Training on fold [0/3]

bestTest = 0.1138211382
bestIteration = 131

Training on fold [1/3]

bestTest = 0.08310249307
bestIteration = 164

Training on fold [2/3]


[32m[I 2023-04-20 13:07:48,580][0m Trial 46 finished with value: 0.09335375854311412 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.4504756157545442, 'depth': 4}. Best is trial 36 with value: 0.11648470824955459.[0m



bestTest = 0.09523809524
bestIteration = 189

Training on fold [0/3]

bestTest = 0.1114058355
bestIteration = 184

Training on fold [1/3]

bestTest = 0.1005291005
bestIteration = 108

Training on fold [2/3]


[32m[I 2023-04-20 13:08:20,136][0m Trial 47 finished with value: 0.09314094354523374 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.4762508043250892, 'depth': 6}. Best is trial 36 with value: 0.11648470824955459.[0m



bestTest = 0.09782608696
bestIteration = 188

Training on fold [0/3]

bestTest = 0.08877284595
bestIteration = 131

Training on fold [1/3]

bestTest = 0.1139896373
bestIteration = 119

Training on fold [2/3]

bestTest = 0.08426966292
bestIteration = 167



[32m[I 2023-04-20 13:08:48,003][0m Trial 48 finished with value: 0.08995832485684546 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.49989548358088215, 'depth': 5}. Best is trial 36 with value: 0.11648470824955459.[0m


Training on fold [0/3]

bestTest = 0.08767123288
bestIteration = 134

Training on fold [1/3]

bestTest = 0.09836065574
bestIteration = 193

Training on fold [2/3]

bestTest = 0.07038123167
bestIteration = 45



[32m[I 2023-04-20 13:09:25,817][0m Trial 49 finished with value: 0.07171365127518342 and parameters: {'l2_leaf_reg': 5, 'learning_rate': 0.43740249607791404, 'depth': 7}. Best is trial 36 with value: 0.11648470824955459.[0m


In [35]:
best_optun = study.best_params
best_optun

{'l2_leaf_reg': 2, 'learning_rate': 0.47666867088363574, 'depth': 5}

In [36]:
best_model_optun = CatBoostClassifier(
                                l2_leaf_reg=int(best_optun['l2_leaf_reg']),
                                learning_rate=best_optun['learning_rate'],
                                depth = int(best_optun['depth']),
                                iterations=100,
                                eval_metric='AUC',
                                loss_function='Logloss',
                                random_seed=42,
                                verbose=False,
                                use_best_model=True
                              )

In [37]:
best_model_optun.fit(train_pool,
               eval_set =test_pool,
               verbose=False,
               plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1cb4a0a78b0>

test competition data

In [38]:
data_comp = pd.read_csv('./data/orange_small_churn_test_data.csv')
data_comp = data_comp[cat_boost_selecting_feature]
data_comp.fillna('NaN',inplace=True)

In [39]:
predictions_probs_hyper = best_model_hyper.predict_proba(data_comp)
predictions_probs_hyper[10:]

array([[0.95585889, 0.04414111],
       [0.95128467, 0.04871533],
       [0.85955112, 0.14044888],
       ...,
       [0.98376488, 0.01623512],
       [0.97686822, 0.02313178],
       [0.94712067, 0.05287933]])

In [41]:
predictions_probs_opt = best_model_optun.predict_proba(data_comp)
predictions_probs_opt[10:]

array([[0.95096802, 0.04903198],
       [0.94006118, 0.05993882],
       [0.86669957, 0.13330043],
       ...,
       [0.96553755, 0.03446245],
       [0.98267033, 0.01732967],
       [0.9448628 , 0.0551372 ]])

In [42]:
out_df_h = pd.DataFrame(enumerate(predictions_probs_hyper[:,1]), columns=['Id', 'result'])
out_df_h.to_csv('output_df_h.csv', sep=',', index=False)

In [43]:
out_df_o = pd.DataFrame(enumerate(predictions_probs_opt[:,1]), columns=['Id', 'result'])
out_df_o.to_csv('output_df_o.csv', sep=',', index=False)

This method give auc about 0.7, but f1 is so little we need to choice right treshold