In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from lightgbm import LGBMClassifier
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

tqdm.pandas()

In [2]:
train = pd.read_csv('./data/train_ohe_te_fillna.csv')
test = pd.read_csv('./data/Test.zip')

useful_features = [
'MONTANT',
'FREQUENCE_RECH',
'REVENUE',
'ARPU_SEGMENT',
'FREQUENCE',
'DATA_VOLUME',
'ON_NET',
'ORANGE',
'TIGO',
'ZONE1',
'ZONE2',
'REGULARITY',
'FREQ_TOP_PACK',
'TENURE_CHURN_mean',
'REGION_CHURN_mean',
'REGION_0',
'REGION_1',
'REGION_2',
'REGION_3',
'REGION_4',
'REGION_5',
'REGION_6',
'REGION_7',
'REGION_8',
'REGION_9',
'REGION_10',
'REGION_11',
'REGION_12',
'REGION_13',
'TENURE_0',
'TENURE_1',
'TENURE_2',
'TENURE_3',
'TENURE_4',
'TENURE_5',
'TENURE_6',
'TENURE_7',
]

useful_cols = [
    'REGION',
    'TENURE',
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
    'TENURE_CHURN_mean',
    'REGION_CHURN_mean'
]

cat_cols = [
    'REGION',
    'TENURE',
]

final_predictions = []
scores = []

def run(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain['CHURN']
    yvalid = xvalid['CHURN']

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = LGBMClassifier(
        random_state=42,
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    score = roc_auc_score(yvalid, preds_valid)
    return score

In [3]:
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=10)

[32m[I 2021-09-08 18:40:38,422][0m A new study created in memory with name: no-name-72ac63d9-5d4f-4419-b8e7-c183272c1362[0m


Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[72]	valid_0's binary_logloss: 0.27382


[32m[I 2021-09-08 18:41:03,236][0m Trial 0 finished with value: 0.7709722003791525 and parameters: {'learning_rate': 0.2190781226698892, 'reg_lambda': 0.0003284071493458288, 'reg_alpha': 0.004901298783146639, 'subsample': 0.4754336210716843, 'colsample_bytree': 0.34126993422447205, 'max_depth': 6}. Best is trial 0 with value: 0.7709722003791525.[0m


Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[588]	valid_0's binary_logloss: 0.273255


[32m[I 2021-09-08 18:41:59,445][0m Trial 1 finished with value: 0.7698041272707171 and parameters: {'learning_rate': 0.047477032585942915, 'reg_lambda': 3.0075761133630468, 'reg_alpha': 70.10750174715302, 'subsample': 0.13429150325394096, 'colsample_bytree': 0.5454341528843681, 'max_depth': 7}. Best is trial 1 with value: 0.7698041272707171.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.278089
[2000]	valid_0's binary_logloss: 0.277153
[3000]	valid_0's binary_logloss: 0.276933
[4000]	valid_0's binary_logloss: 0.27681
[5000]	valid_0's binary_logloss: 0.276733
[6000]	valid_0's binary_logloss: 0.276682
[7000]	valid_0's binary_logloss: 0.276645
Did not meet early stopping. Best iteration is:
[7000]	valid_0's binary_logloss: 0.276645


[32m[I 2021-09-08 18:45:29,987][0m Trial 2 finished with value: 0.7765809065217564 and parameters: {'learning_rate': 0.013342556496161052, 'reg_lambda': 0.016799149740895716, 'reg_alpha': 4.33701796668025e-08, 'subsample': 0.6162924116128982, 'colsample_bytree': 0.6646104063986547, 'max_depth': 1}. Best is trial 1 with value: 0.7698041272707171.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.273227
[2000]	valid_0's binary_logloss: 0.273058
[3000]	valid_0's binary_logloss: 0.273029
Early stopping, best iteration is:
[2867]	valid_0's binary_logloss: 0.273026


[32m[I 2021-09-08 18:48:02,323][0m Trial 3 finished with value: 0.7703309241355893 and parameters: {'learning_rate': 0.016074688123189133, 'reg_lambda': 0.0667986113413545, 'reg_alpha': 1.6760541046940095e-05, 'subsample': 0.41288740845854044, 'colsample_bytree': 0.7514054255999928, 'max_depth': 7}. Best is trial 1 with value: 0.7698041272707171.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.276259
[2000]	valid_0's binary_logloss: 0.274383
[3000]	valid_0's binary_logloss: 0.273926
[4000]	valid_0's binary_logloss: 0.273812
[5000]	valid_0's binary_logloss: 0.273751
Early stopping, best iteration is:
[4899]	valid_0's binary_logloss: 0.273746


[32m[I 2021-09-08 18:53:06,946][0m Trial 4 finished with value: 0.7731128882062901 and parameters: {'learning_rate': 0.02621437536048763, 'reg_lambda': 0.27290614070589714, 'reg_alpha': 0.00034058783628698475, 'subsample': 0.9527162265977895, 'colsample_bytree': 0.15509492073404493, 'max_depth': 4}. Best is trial 1 with value: 0.7698041272707171.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.273385
[2000]	valid_0's binary_logloss: 0.273205
Early stopping, best iteration is:
[2205]	valid_0's binary_logloss: 0.273192


[32m[I 2021-09-08 18:55:21,522][0m Trial 5 finished with value: 0.7703880421789945 and parameters: {'learning_rate': 0.032990229558852134, 'reg_lambda': 0.6445314501601072, 'reg_alpha': 0.09405400311381441, 'subsample': 0.751479557329244, 'colsample_bytree': 0.5057329427802791, 'max_depth': 4}. Best is trial 1 with value: 0.7698041272707171.[0m


Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[361]	valid_0's binary_logloss: 0.27325


[32m[I 2021-09-08 18:55:56,431][0m Trial 6 finished with value: 0.7704964753346135 and parameters: {'learning_rate': 0.058882440778224014, 'reg_lambda': 3.2839277104927824e-07, 'reg_alpha': 2.1558444594584183e-07, 'subsample': 0.7557141901261383, 'colsample_bytree': 0.8562889693824638, 'max_depth': 6}. Best is trial 1 with value: 0.7698041272707171.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.275768
[2000]	valid_0's binary_logloss: 0.274101
[3000]	valid_0's binary_logloss: 0.273806
[4000]	valid_0's binary_logloss: 0.273655
[5000]	valid_0's binary_logloss: 0.273559
[6000]	valid_0's binary_logloss: 0.273505
[7000]	valid_0's binary_logloss: 0.273471
Did not meet early stopping. Best iteration is:
[6933]	valid_0's binary_logloss: 0.27347


[32m[I 2021-09-08 19:02:48,249][0m Trial 7 finished with value: 0.7721984812817556 and parameters: {'learning_rate': 0.014495578147654307, 'reg_lambda': 0.09349013879849498, 'reg_alpha': 0.05678457818591098, 'subsample': 0.3647739815175761, 'colsample_bytree': 0.20000814011947332, 'max_depth': 4}. Best is trial 1 with value: 0.7698041272707171.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.273151
Early stopping, best iteration is:
[828]	valid_0's binary_logloss: 0.273137


[32m[I 2021-09-08 19:03:55,489][0m Trial 8 finished with value: 0.7697970531958969 and parameters: {'learning_rate': 0.057192504267984716, 'reg_lambda': 0.2020682138073545, 'reg_alpha': 9.291468887449219e-07, 'subsample': 0.8824763845186918, 'colsample_bytree': 0.7382762785360262, 'max_depth': 6}. Best is trial 8 with value: 0.7697970531958969.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.273558
[2000]	valid_0's binary_logloss: 0.273397
Early stopping, best iteration is:
[2348]	valid_0's binary_logloss: 0.273373


[32m[I 2021-09-08 19:06:09,894][0m Trial 9 finished with value: 0.7703323651531172 and parameters: {'learning_rate': 0.06372188126520278, 'reg_lambda': 1.698531773498989e-06, 'reg_alpha': 0.0057516873596360025, 'subsample': 0.4940807978973274, 'colsample_bytree': 0.4269292377321222, 'max_depth': 3}. Best is trial 8 with value: 0.7697970531958969.[0m


In [4]:
study.best_params

{'learning_rate': 0.057192504267984716,
 'reg_lambda': 0.2020682138073545,
 'reg_alpha': 9.291468887449219e-07,
 'subsample': 0.8824763845186918,
 'colsample_bytree': 0.7382762785360262,
 'max_depth': 6}