In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
train = pd.read_csv('./data/train_ohe_te_fillna.csv')
test = pd.read_csv('./data/Test.zip')

useful_features = [
'MONTANT',
'FREQUENCE_RECH',
'REVENUE',
'ARPU_SEGMENT',
'FREQUENCE',
'DATA_VOLUME',
'ON_NET',
'ORANGE',
'TIGO',
'ZONE1',
'ZONE2',
# 'MRG',
'REGULARITY',
# 'TOP_PACK',
'FREQ_TOP_PACK',
# 'CHURN',
'TENURE_CHURN_mean',
'REGION_CHURN_mean',
'REGION_0',
'REGION_1',
'REGION_2',
'REGION_3',
'REGION_4',
'REGION_5',
'REGION_6',
'REGION_7',
'REGION_8',
'REGION_9',
'REGION_10',
'REGION_11',
'REGION_12',
'REGION_13',
'TENURE_0',
'TENURE_1',
'TENURE_2',
'TENURE_3',
'TENURE_4',
'TENURE_5',
'TENURE_6',
'TENURE_7',
]

useful_cols = [
    'REGION',
    'TENURE',
    # 'MRG',  # constant
    # 'TOP_PACK',  # wtf column
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
    'TENURE_CHURN_mean',
    'REGION_CHURN_mean'
]

cat_cols = [
    # 'user_id',
    'REGION',
    'TENURE',
    # 'MRG',  # constant
    # 'TOP_PACK',  # wtf column
]

final_predictions = []
scores = []

# for cat_col in cat_cols:
#     encoder = OneHotEncoder(handle_unknown='ignore')
#     unique_values = train[cat_col].unique()

#     one_hot_encoded_cols = [f'{cat_col}_{i}' for i in range(len(unique_values))]
    
#     ohe_df = pd.DataFrame(encoder.fit_transform(train[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
#     ohe_df.index = train.index
#     train = train.drop(cat_col, axis=1)
#     train = pd.concat([train, ohe_df], axis=1)        
#     print(f'[{cat_col}] xtrain transformed')

# #     ohe_df = pd.DataFrame(encoder.transform(xvalid[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
# #     ohe_df.index = xvalid.index
# #     xvalid = xvalid.drop(cat_col, axis=1)
# #     xvalid = pd.concat([xvalid, ohe_df], axis=1)        
# #     print(f'[{cat_col}] xvalid transformed')

#     ohe_df = pd.DataFrame(encoder.transform(test[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
#     ohe_df.index = test.index
#     test = test.drop(cat_col, axis=1)
#     test = pd.concat([test, ohe_df], axis=1)
#     print(f'[{cat_col}] xtest transformed')
    
#     useful_cols += one_hot_encoded_cols
#     useful_cols.remove(cat_col)
    
# scaler = StandardScaler()
# train[num_cols] = scaler.fit_transform(train[num_cols])
# # valid[num_cols] = scaler.transform(xvalid[num_cols])
# test[num_cols] = scaler.transform(test[num_cols])

def run(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain['CHURN']
    yvalid = xvalid['CHURN']

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

#     ordinal_encoder = preprocessing.OrdinalEncoder()
#     xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
#     xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    model = XGBClassifier(
        random_state=42,
#         tree_method="gpu_hist",
#         gpu_id=1,
#         predictor="gpu_predictor",
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    score = roc_auc_score(yvalid, preds_valid)
    return score

In [3]:
study = optuna.create_study(direction="maximize")
study.optimize(run, n_trials=10)

[32m[I 2021-09-11 00:04:46,879][0m A new study created in memory with name: no-name-02df9e7f-3459-4c0a-a59d-6c9e445f6f67[0m


[0]	validation_0-logloss:0.62342
[1000]	validation_0-logloss:0.27654
[2000]	validation_0-logloss:0.27646
[3000]	validation_0-logloss:0.27646
[3089]	validation_0-logloss:0.27645


[32m[I 2021-09-11 00:23:03,419][0m Trial 0 finished with value: 0.7759328225331883 and parameters: {'learning_rate': 0.1302145083250643, 'reg_lambda': 0.205739318814619, 'reg_alpha': 8.46452430120468e-06, 'subsample': 0.8302593096897046, 'colsample_bytree': 0.8773860562118906, 'max_depth': 1}. Best is trial 0 with value: 0.7759328225331883.[0m


[0]	validation_0-logloss:0.63075
[1000]	validation_0-logloss:0.27655
[2000]	validation_0-logloss:0.27647
[3000]	validation_0-logloss:0.27644
[3050]	validation_0-logloss:0.27644


[32m[I 2021-09-11 00:41:04,638][0m Trial 1 finished with value: 0.7759825613108775 and parameters: {'learning_rate': 0.11632489309638475, 'reg_lambda': 57.4788801616397, 'reg_alpha': 1.0954231148515935e-06, 'subsample': 0.5766475848638271, 'colsample_bytree': 0.7829097547205851, 'max_depth': 1}. Best is trial 1 with value: 0.7759825613108775.[0m


[0]	validation_0-logloss:0.67687
[1000]	validation_0-logloss:0.27334
[1824]	validation_0-logloss:0.27327


[32m[I 2021-09-11 01:07:57,492][0m Trial 2 finished with value: 0.7709533117069707 and parameters: {'learning_rate': 0.025545094373984806, 'reg_lambda': 0.00392506077359536, 'reg_alpha': 1.6437041610700283e-05, 'subsample': 0.23399423770908634, 'colsample_bytree': 0.792689496529303, 'max_depth': 5}. Best is trial 1 with value: 0.7759825613108775.[0m


[0]	validation_0-logloss:0.68555
[1000]	validation_0-logloss:0.27519
[2000]	validation_0-logloss:0.27415
[3000]	validation_0-logloss:0.27378
[4000]	validation_0-logloss:0.27358
[5000]	validation_0-logloss:0.27344
[6000]	validation_0-logloss:0.27338
[6999]	validation_0-logloss:0.27333


[32m[I 2021-09-11 02:33:46,064][0m Trial 3 finished with value: 0.7703909861130649 and parameters: {'learning_rate': 0.012421749406777961, 'reg_lambda': 0.0012413974450945805, 'reg_alpha': 0.5490274799079778, 'subsample': 0.8110519565653027, 'colsample_bytree': 0.5443559272525499, 'max_depth': 3}. Best is trial 1 with value: 0.7759825613108775.[0m


[0]	validation_0-logloss:0.68341
[1000]	validation_0-logloss:0.27764
[2000]	validation_0-logloss:0.27696
[3000]	validation_0-logloss:0.27677
[4000]	validation_0-logloss:0.27669
[5000]	validation_0-logloss:0.27664
[6000]	validation_0-logloss:0.27660
[6999]	validation_0-logloss:0.27657


[32m[I 2021-09-11 03:26:38,728][0m Trial 4 finished with value: 0.7757556150917341 and parameters: {'learning_rate': 0.01734890480750199, 'reg_lambda': 1.3785089699350713e-06, 'reg_alpha': 4.79019264298999e-07, 'subsample': 0.8174462123544131, 'colsample_bytree': 0.38838151696518997, 'max_depth': 1}. Best is trial 1 with value: 0.7759825613108775.[0m


[0]	validation_0-logloss:0.64177
[1000]	validation_0-logloss:0.27425
[2000]	validation_0-logloss:0.27388
[3000]	validation_0-logloss:0.27378
[3197]	validation_0-logloss:0.27378


[32m[I 2021-09-11 04:12:31,365][0m Trial 5 finished with value: 0.7712250359273162 and parameters: {'learning_rate': 0.08805937615945421, 'reg_lambda': 0.026131397105435338, 'reg_alpha': 0.007075114159118722, 'subsample': 0.4785000181816955, 'colsample_bytree': 0.9736793022789237, 'max_depth': 2}. Best is trial 1 with value: 0.7759825613108775.[0m


[0]	validation_0-logloss:0.66356
[1000]	validation_0-logloss:0.27486
[2000]	validation_0-logloss:0.27405
[3000]	validation_0-logloss:0.27376
[4000]	validation_0-logloss:0.27363
[5000]	validation_0-logloss:0.27352
[6000]	validation_0-logloss:0.27347
[6999]	validation_0-logloss:0.27342


[32m[I 2021-09-11 05:28:47,262][0m Trial 6 finished with value: 0.7709215690667116 and parameters: {'learning_rate': 0.05058644883536961, 'reg_lambda': 2.7608482563433188, 'reg_alpha': 13.164424037898252, 'subsample': 0.5402986509616882, 'colsample_bytree': 0.5276011029442162, 'max_depth': 2}. Best is trial 1 with value: 0.7759825613108775.[0m


[0]	validation_0-logloss:0.67599
[1000]	validation_0-logloss:0.27291
[2000]	validation_0-logloss:0.27281
[2681]	validation_0-logloss:0.27281


[32m[I 2021-09-11 06:58:08,124][0m Trial 7 finished with value: 0.7682681864484733 and parameters: {'learning_rate': 0.027028495657373555, 'reg_lambda': 4.597515744795734e-08, 'reg_alpha': 52.59713092321981, 'subsample': 0.8839297607586222, 'colsample_bytree': 0.7887698621986795, 'max_depth': 7}. Best is trial 1 with value: 0.7759825613108775.[0m


[0]	validation_0-logloss:0.60053
[949]	validation_0-logloss:0.27351


[32m[I 2021-09-11 07:16:49,250][0m Trial 8 finished with value: 0.7705554749034385 and parameters: {'learning_rate': 0.1719677506475023, 'reg_lambda': 47.53387611552142, 'reg_alpha': 0.06230963793919973, 'subsample': 0.5894950140004344, 'colsample_bytree': 0.19358949999759273, 'max_depth': 7}. Best is trial 1 with value: 0.7759825613108775.[0m


[0]	validation_0-logloss:0.65579
[1000]	validation_0-logloss:0.27463
[2000]	validation_0-logloss:0.27405
[3000]	validation_0-logloss:0.27383
[4000]	validation_0-logloss:0.27372
[5000]	validation_0-logloss:0.27367
[6000]	validation_0-logloss:0.27364
[6999]	validation_0-logloss:0.27360


[32m[I 2021-09-11 08:19:12,336][0m Trial 9 finished with value: 0.7710875492546604 and parameters: {'learning_rate': 0.07550040904399719, 'reg_lambda': 0.10199510603017523, 'reg_alpha': 3.1032792386572865, 'subsample': 0.6316311785349856, 'colsample_bytree': 0.23103482954276922, 'max_depth': 2}. Best is trial 1 with value: 0.7759825613108775.[0m


In [5]:
study.best_params

{'learning_rate': 0.11632489309638475,
 'reg_lambda': 57.4788801616397,
 'reg_alpha': 1.0954231148515935e-06,
 'subsample': 0.5766475848638271,
 'colsample_bytree': 0.7829097547205851,
 'max_depth': 1}

In [24]:
train.columns

Index(['user_id', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT',
       'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1',
       'ZONE2', 'MRG', 'REGULARITY', 'TOP_PACK', 'FREQ_TOP_PACK', 'CHURN',
       'kfold', 'TENURE_CHURN_mean', 'REGION_CHURN_mean', 'REGION_0',
       'REGION_1', 'REGION_2', 'REGION_3', 'REGION_4', 'REGION_5', 'REGION_6',
       'REGION_7', 'REGION_8', 'REGION_9', 'REGION_10', 'REGION_11',
       'REGION_12', 'REGION_13', 'TENURE_0', 'TENURE_1', 'TENURE_2',
       'TENURE_3', 'TENURE_4', 'TENURE_5', 'TENURE_6', 'TENURE_7'],
      dtype='object')