## Import libraries

In [1]:
import gc
import pickle
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Model, load_model
from tensorflow_addons.optimizers import AdamW, Lookahead

## Load autoencoder model

In [2]:
# Restore the saved autoencoder and build a sub-model that stops at its
# 'Embedding' layer, so predict() returns the learned embedding for each row.
autoencoder = load_model('../input/customer-churn-rate-dae/DAE_model.h5')
embedding_output = autoencoder.get_layer('Embedding').output
feature_model = Model(inputs=autoencoder.input, outputs=embedding_output)
feature_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder-Input (InputLayer)      [(None, 142)]        0                                            
__________________________________________________________________________________________________
Encoder-Dense-1 (Dense)         (None, 128)          18304       Encoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-BatchNorm-1 (BatchNorma (None, 128)          512         Encoder-Dense-1[0][0]            
__________________________________________________________________________________________________
Encoder-Swish-1 (Activation)    (None, 128)          0           Encoder-BatchNorm-1[0][0]        
______________________________________________________________________________________________

## Prepare data for model training

In [3]:
# Load the preprocessed train/test frames from the pickled bundle.
# Unpickling straight from the file handle avoids keeping a second, raw-bytes
# copy of the whole dataset alive in the kernel for the rest of the session.
# NOTE(review): pickle can execute arbitrary code while loading; only point
# this at files produced by the trusted preprocessing notebook.
with open("../input/customer-churn-rate-preprocess-data/Churn_Risk_Rate_Dataset.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# The bundle dict is no longer needed once the two frames are extracted.
del processed_data
gc.collect()

226

In [4]:
# Categorical columns that are expanded into one-hot indicators before being
# fed to the autoencoder.
cat_cols = ['region_category','membership_category','preferred_offer_types','internet_option',
            'complaint_status','feedback','age_category','joining_dt_year','time_category',
            'complaint_category','feedback_category','last_login_category']

# get_dummies(columns=...) keeps the untouched columns first (original order)
# and appends the indicator columns for each entry of cat_cols in order, which
# is the same layout as dropping/concatenating column by column. Each indicator
# is prefixed with its source column name so the generated names are unique
# (a fixed 'col' prefix made every categorical produce clashing 'col_<value>' names).
train_df1 = pd.get_dummies(train_df, columns=cat_cols)
print("train_df1: {}".format(train_df1.shape))

test_df1 = pd.get_dummies(test_df, columns=cat_cols)
# Train and test are encoded independently, so a category level absent from one
# split would shift every later column. Align test to the train feature layout
# (everything except the target); indicator columns missing in test become 0.
test_df1 = test_df1.reindex(columns=train_df1.columns.drop('churn_risk_score'),
                            fill_value=0)
print("test_df1: {}".format(test_df1.shape))

train_df1: (36987, 143)
test_df1: (19919, 142)


In [5]:
# Run the encoder on the one-hot frames (target column excluded for train) and
# wrap the resulting embeddings in frames that share the source row index.
is_feature = ~train_df1.columns.isin(['churn_risk_score'])
Xtrain_embed = feature_model.predict(train_df1.loc[:, is_feature].values)
Xtest_embed = feature_model.predict(test_df1.values)

Xtrain_embed_df = pd.DataFrame(Xtrain_embed, index=train_df1.index)
Xtest_embed_df = pd.DataFrame(Xtest_embed, index=test_df1.index)
print("Xtrain_embed_df: {} \nXtest_embed_df: {}".format(Xtrain_embed_df.shape, Xtest_embed_df.shape))

# Append the embedding columns to the original (label-encoded) frames,
# matching rows on customer_id.
train_df = pd.merge(train_df, Xtrain_embed_df, on='customer_id', sort=False)
test_df = pd.merge(test_df, Xtest_embed_df, on='customer_id', sort=False)
print("train_df: {} \ntest_df: {}".format(train_df.shape, test_df.shape))

# Release the intermediates; only the merged frames are used from here on.
del is_feature
del train_df1, test_df1
del Xtrain_embed, Xtest_embed
del Xtrain_embed_df, Xtest_embed_df
gc.collect()

train_df.head()

Xtrain_embed_df: (36987, 64) 
Xtest_embed_df: (19919, 64)
train_df: (36987, 170) 
test_df: (19919, 169)


Unnamed: 0_level_0,age,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,internet_option,days_since_last_login,avg_time_spent,avg_transaction_value,...,54,55,56,57,58,59,60,61,62,63
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fffe4300490044003600300030003800,-1.023003,0.0,2.0,4.0,0.0,2.0,2.0,0.766391,0.393377,1.453128,...,-0.210364,-0.135798,0.847136,-0.257103,-0.186361,0.517744,-0.186296,0.219591,0.613333,1.880162
fffe43004900440032003100300035003700,-0.23938,0.0,0.0,5.0,1.0,2.0,1.0,0.602943,0.41002,-0.748954,...,-0.257802,0.250188,0.379453,-0.233873,-0.169573,0.528861,-0.141722,0.307829,-0.266601,-0.277287
fffe4300490044003100390032003600,0.318969,0.0,1.0,0.0,1.0,2.0,2.0,0.256198,0.861089,-0.308575,...,-0.031088,-0.17507,0.196698,-0.150829,0.50234,0.120173,0.368638,0.110437,-0.262967,1.640988
fffe43004900440036003000330031003600,-0.006273,1.0,0.0,0.0,1.0,2.0,1.0,-0.350817,-0.905214,-0.108067,...,-0.109337,1.044451,0.008078,-0.078969,0.503134,0.407935,0.506297,0.133632,-0.265814,1.485688
fffe43004900440031003900350030003600,-0.286135,0.0,0.0,0.0,0.0,1.0,1.0,1.239352,-0.38768,-0.147029,...,-0.259821,-0.043998,0.312016,-0.237858,-0.206758,0.514125,1.489388,0.136695,-0.268102,-0.25922


In [6]:
def _remap_score(score):
    """Map label 5 to 0 and label -1 to 1; every other label is returned unchanged."""
    if score == 5:
        return 0
    if score == -1:
        return 1
    return score


# Re-code the target, then show how many rows fall in each resulting class.
train_df['churn_risk_score'] = train_df['churn_risk_score'].apply(_remap_score)
train_df.groupby(['churn_risk_score']).size().reset_index(name='count')

Unnamed: 0,churn_risk_score,count
0,0,9825
1,1,3815
2,2,2741
3,3,10422
4,4,10184


In [7]:
# Columns LightGBM should treat as categorical (binary flags plus the
# label-encoded categoricals).
cat_cols = ['gender','joined_through_referral','used_special_discount','offer_application_preference',
            'past_complaint','joining_dt_day_weekend','referral_id_present','has_desktop','has_smartphone',
            'region_category','membership_category','preferred_offer_types','internet_option','complaint_status',
            'feedback','age_category','joining_dt_year','time_category','complaint_category','feedback_category',
            'last_login_category']

# LightGBM expects integer codes for categorical features.
train_df[cat_cols] = train_df[cat_cols].astype(int)
test_df[cat_cols] = test_df[cat_cols].astype(int)

# The positions must refer to the feature matrix, which is train_df WITHOUT the
# target column. Looking them up in train_df.columns directly would be off by
# one for every categorical column located after 'churn_risk_score'.
feature_cols = train_df.columns.drop('churn_risk_score')
cat_cols_indices = [feature_cols.get_loc(col) for col in cat_cols]
print(cat_cols_indices)

[1, 4, 12, 13, 14, 19, 32, 37, 38, 2, 3, 5, 6, 15, 16, 17, 18, 33, 34, 35, 36]


In [8]:
# Feature layout = every train column except the target. Selecting the test
# columns by these same labels (instead of taking test_df.values as-is)
# guarantees both matrices share one column order, and raises a KeyError if a
# feature is missing from the test frame rather than silently misaligning.
feature_cols = train_df.columns.drop('churn_risk_score')

Xtrain = train_df[feature_cols].values
Ytrain = train_df['churn_risk_score'].values
Xtest = test_df[feature_cols].values

print("Xtrain: {} \nYtrain: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, Xtest.shape))

Xtrain: (36987, 169) 
Ytrain: (36987,) 
Xtest: (19919, 169)


## Hyperparameters search using Optuna

In [9]:
# Disabled Optuna objective, kept for reference only: the whole definition is
# wrapped in a string literal, so running this cell defines nothing and merely
# echoes the text.
# If re-enabled, objective(trial) samples LightGBM hyperparameters from the
# trial, trains a multiclass model on each split of a 5-fold stratified CV with
# early stopping, and returns the mean macro-F1 (scaled by 100) across folds.
# NOTE(review): unlike the training loop further down, this version does not
# pass categorical_feature to LightGBM - confirm that is intended before reuse.
'''
def objective(trial):

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    counter = 0
    oof_score = 0

    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "device_type": "gpu",
        "num_class": 5,
        "is_unbalance": True,
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-4, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 45, 2000),
        "max_depth": trial.suggest_int("max_depth", 7, 16),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 3, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 15)
    }

    for train, val in kfold.split(Xtrain, Ytrain):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]

        lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
        lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

        model = lgb.train(params, lgtrain, valid_sets=[lgvalidation], 
                          num_boost_round=5000, early_stopping_rounds=200, 
                          verbose_eval=False)

        y_pred = model.predict(val_x, num_iteration=model.best_iteration)
        y_pred = np.array([np.argmax(y_pred, axis=1)]).T
        oof_score += 100 * f1_score(val_y, y_pred, average='macro')
    
    oof_score /= float(counter)
    return oof_score
'''

'\ndef objective(trial):\n\n    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)\n    counter = 0\n    oof_score = 0\n\n    params = {\n        "objective": "multiclass",\n        "metric": "multi_logloss",\n        "verbosity": -1,\n        "boosting_type": "gbdt",\n        "device_type": "gpu",\n        "num_class": 5,\n        "is_unbalance": True,\n        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),\n        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-4, 1.0),\n        "num_leaves": trial.suggest_int("num_leaves", 45, 2000),\n        "max_depth": trial.suggest_int("max_depth", 7, 16),\n        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.5, 1.0),\n        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.5, 1.0),\n        "bagging_freq": trial.suggest_int("bagging_freq", 3, 15),\n        "min_child_samples": trial.suggest_int("min_child_samples", 3, 15)\n    }\n\n    for train, val in kfold.spli

In [10]:
# Disabled (string literal, nothing is executed): would create an Optuna study
# that maximizes the value returned by `objective` and run it for 500 trials.
'''
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)
'''

"\nstudy = optuna.create_study(direction='maximize')\nstudy.optimize(objective, n_trials=500)\n"

In [11]:
# Disabled (string literal, nothing is executed): would print the number of
# finished trials, then the best trial's value and each of its parameters.
# It reads `study`, which only exists if the Optuna search is re-enabled.
'''
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
'''

'\nprint("Number of finished trials: {}".format(len(study.trials)))\n\nprint("Best trial:")\ntrial = study.best_trial\n\nprint("Value: {}".format(trial.value))\n\nprint("Params: ")\nfor key, value in trial.params.items():\n    print(" {}: {}".format(key, value))\n'

## Build and validate the model

In [12]:
# LightGBM configuration used for every fold of the final model.
params = {
    # Task definition: 5-class classification scored with multi-class log loss.
    "objective": 'multiclass',
    'metric': 'multi_logloss',
    'boosting': 'gbdt',
    "device_type": 'gpu',
    'num_class': 5,
    # NOTE(review): LightGBM documents is_unbalance for binary/multiclassova
    # objectives - confirm it has any effect with 'multiclass'.
    'is_unbalance': True,
    # Tree shape, regularisation and row/column subsampling.
    "learning_rate": 0.05,
    "lambda_l2": 0.001,
    "num_leaves": 225,
    "max_depth": 8,
    "feature_fraction": 0.758,
    "bagging_fraction": 0.856,
    "bagging_freq": 7,
    "bagging_seed": 10,
    "min_data_in_leaf": 3,
    "verbosity": -1,
}

# Upper bound on boosting rounds passed to lgb.train.
num_rounds = 5000

In [13]:
# Repeated stratified K-fold training.
# For each of NUM_SEED shuffles the data is split into FOLD folds; one LightGBM
# model is trained per fold with early stopping on the held-out part. Across
# all models we accumulate:
#   * y_pred_meta_lgb  - out-of-fold class probabilities for the train rows
#   * y_pred_final_lgb - class probabilities for the test rows
#   * oof_score        - macro-F1 (x100) on each held-out fold
FOLD = 7
NUM_SEED = 3

# Fixed RNG seed so the list of split seeds is reproducible.
np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_lgb = np.zeros((Ytrain.shape[0], 5))
y_pred_final_lgb = np.zeros((Xtest.shape[0], 5))
counter = 0  # total number of models trained (seeds x folds)


for seed in seeds:
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]

        # Declare the categorical columns on the training Dataset itself rather
        # than through lgb.train(); the latter overrides the Dataset setting
        # and emits a "New categorical_feature is ..." warning for every fold.
        lgtrain = lgb.Dataset(train_x, label=train_y.ravel(),
                              categorical_feature=cat_cols_indices)
        lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

        model = lgb.train(params, lgtrain, num_rounds,
                          valid_sets=[lgtrain, lgvalidation],
                          early_stopping_rounds=200, verbose_eval=100)

        # Predict the held-out fold once and reuse the probabilities for both
        # the meta features and the hard labels used for scoring.
        val_proba = model.predict(val_x, num_iteration=model.best_iteration)
        y_pred = np.argmax(val_proba, axis=1)
        y_pred_meta_lgb[val] += val_proba
        y_pred_final_lgb += model.predict(Xtest, num_iteration=model.best_iteration)

        score = 100 * f1_score(val_y, y_pred, average='macro')
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Every train row is held out exactly once per seed, so OOF probabilities are
# averaged over seeds; test probabilities and the score are averaged over all
# trained models.
y_pred_meta_lgb = y_pred_meta_lgb / float(NUM_SEED)
y_pred_final_lgb = y_pred_final_lgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.254827	valid_1's multi_logloss: 0.490971
[200]	training's multi_logloss: 0.136596	valid_1's multi_logloss: 0.505856
[300]	training's multi_logloss: 0.0650251	valid_1's multi_logloss: 0.537042
Early stopping, best iteration is:
[114]	training's multi_logloss: 0.234411	valid_1's multi_logloss: 0.49089
Seed-24 | Fold-0 | OOF Score: 72.75309380002307


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.259152	valid_1's multi_logloss: 0.497978
[200]	training's multi_logloss: 0.142623	valid_1's multi_logloss: 0.51175
[300]	training's multi_logloss: 0.0682251	valid_1's multi_logloss: 0.539503
Early stopping, best iteration is:
[106]	training's multi_logloss: 0.250254	valid_1's multi_logloss: 0.496914
Seed-24 | Fold-1 | OOF Score: 72.98621828022915


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.256554	valid_1's multi_logloss: 0.488986
[200]	training's multi_logloss: 0.133604	valid_1's multi_logloss: 0.504431
[300]	training's multi_logloss: 0.0631808	valid_1's multi_logloss: 0.53379
Early stopping, best iteration is:
[105]	training's multi_logloss: 0.250287	valid_1's multi_logloss: 0.48876
Seed-24 | Fold-2 | OOF Score: 72.88013478562536


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.250621	valid_1's multi_logloss: 0.490005
[200]	training's multi_logloss: 0.133523	valid_1's multi_logloss: 0.505979
[300]	training's multi_logloss: 0.0634938	valid_1's multi_logloss: 0.535856
Early stopping, best iteration is:
[107]	training's multi_logloss: 0.239692	valid_1's multi_logloss: 0.489607
Seed-24 | Fold-3 | OOF Score: 73.16546828421994


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.259883	valid_1's multi_logloss: 0.495004
[200]	training's multi_logloss: 0.139591	valid_1's multi_logloss: 0.512387
[300]	training's multi_logloss: 0.0654688	valid_1's multi_logloss: 0.545563
Early stopping, best iteration is:
[111]	training's multi_logloss: 0.245098	valid_1's multi_logloss: 0.494873
Seed-24 | Fold-4 | OOF Score: 72.5703680743426


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.254458	valid_1's multi_logloss: 0.483532
[200]	training's multi_logloss: 0.135342	valid_1's multi_logloss: 0.493058
[300]	training's multi_logloss: 0.0637028	valid_1's multi_logloss: 0.521439
Early stopping, best iteration is:
[132]	training's multi_logloss: 0.213463	valid_1's multi_logloss: 0.481718
Seed-24 | Fold-5 | OOF Score: 73.71447451851108


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.253979	valid_1's multi_logloss: 0.497415
[200]	training's multi_logloss: 0.138759	valid_1's multi_logloss: 0.511176
[300]	training's multi_logloss: 0.0665445	valid_1's multi_logloss: 0.539595
Early stopping, best iteration is:
[106]	training's multi_logloss: 0.24618	valid_1's multi_logloss: 0.497208
Seed-24 | Fold-6 | OOF Score: 71.91519611080807

Seed: 24 | Aggregate OOF Score: 72.8549934076799




New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.255161	valid_1's multi_logloss: 0.476579
[200]	training's multi_logloss: 0.138684	valid_1's multi_logloss: 0.48814
[300]	training's multi_logloss: 0.06472	valid_1's multi_logloss: 0.51646
Early stopping, best iteration is:
[114]	training's multi_logloss: 0.235389	valid_1's multi_logloss: 0.476176
Seed-3 | Fold-0 | OOF Score: 74.02647534952196


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.256675	valid_1's multi_logloss: 0.473569
[200]	training's multi_logloss: 0.134564	valid_1's multi_logloss: 0.483111
[300]	training's multi_logloss: 0.062188	valid_1's multi_logloss: 0.509103
Early stopping, best iteration is:
[113]	training's multi_logloss: 0.237807	valid_1's multi_logloss: 0.473192
Seed-3 | Fold-1 | OOF Score: 74.18149613294906


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.251848	valid_1's multi_logloss: 0.505871
[200]	training's multi_logloss: 0.134861	valid_1's multi_logloss: 0.522285
[300]	training's multi_logloss: 0.0655412	valid_1's multi_logloss: 0.554065
Early stopping, best iteration is:
[100]	training's multi_logloss: 0.251848	valid_1's multi_logloss: 0.505871
Seed-3 | Fold-2 | OOF Score: 72.27554856956138


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.258179	valid_1's multi_logloss: 0.487955
[200]	training's multi_logloss: 0.138351	valid_1's multi_logloss: 0.502045
[300]	training's multi_logloss: 0.0645365	valid_1's multi_logloss: 0.532074
Early stopping, best iteration is:
[112]	training's multi_logloss: 0.242382	valid_1's multi_logloss: 0.487421
Seed-3 | Fold-3 | OOF Score: 72.45914795597422


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.252197	valid_1's multi_logloss: 0.503165
[200]	training's multi_logloss: 0.134301	valid_1's multi_logloss: 0.51561
[300]	training's multi_logloss: 0.0644815	valid_1's multi_logloss: 0.543646
Early stopping, best iteration is:
[114]	training's multi_logloss: 0.232583	valid_1's multi_logloss: 0.502228
Seed-3 | Fold-4 | OOF Score: 72.811461217035


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.256416	valid_1's multi_logloss: 0.491848
[200]	training's multi_logloss: 0.135176	valid_1's multi_logloss: 0.506954
[300]	training's multi_logloss: 0.0633759	valid_1's multi_logloss: 0.535953
Early stopping, best iteration is:
[101]	training's multi_logloss: 0.254807	valid_1's multi_logloss: 0.491837
Seed-3 | Fold-5 | OOF Score: 73.16940493868594


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.251539	valid_1's multi_logloss: 0.497777
[200]	training's multi_logloss: 0.139272	valid_1's multi_logloss: 0.512603
[300]	training's multi_logloss: 0.0648664	valid_1's multi_logloss: 0.542571
Early stopping, best iteration is:
[102]	training's multi_logloss: 0.248751	valid_1's multi_logloss: 0.497753
Seed-3 | Fold-6 | OOF Score: 72.6634601302364

Seed: 3 | Aggregate OOF Score: 73.08385632770913




New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.251916	valid_1's multi_logloss: 0.492731
[200]	training's multi_logloss: 0.135249	valid_1's multi_logloss: 0.505783
[300]	training's multi_logloss: 0.0635475	valid_1's multi_logloss: 0.537612
Early stopping, best iteration is:
[112]	training's multi_logloss: 0.234009	valid_1's multi_logloss: 0.492049
Seed-56 | Fold-0 | OOF Score: 73.44618780604624


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.252189	valid_1's multi_logloss: 0.496783
[200]	training's multi_logloss: 0.134013	valid_1's multi_logloss: 0.509427
[300]	training's multi_logloss: 0.0635461	valid_1's multi_logloss: 0.538726
Early stopping, best iteration is:
[116]	training's multi_logloss: 0.230062	valid_1's multi_logloss: 0.495927
Seed-56 | Fold-1 | OOF Score: 72.24385404295953


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.255201	valid_1's multi_logloss: 0.495359
[200]	training's multi_logloss: 0.132597	valid_1's multi_logloss: 0.507623
[300]	training's multi_logloss: 0.0645114	valid_1's multi_logloss: 0.53533
Early stopping, best iteration is:
[104]	training's multi_logloss: 0.249727	valid_1's multi_logloss: 0.495031
Seed-56 | Fold-2 | OOF Score: 73.77515963639227


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.254463	valid_1's multi_logloss: 0.497141
[200]	training's multi_logloss: 0.138184	valid_1's multi_logloss: 0.511389
[300]	training's multi_logloss: 0.0651732	valid_1's multi_logloss: 0.539833
Early stopping, best iteration is:
[110]	training's multi_logloss: 0.241621	valid_1's multi_logloss: 0.496344
Seed-56 | Fold-3 | OOF Score: 72.78758621380798


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.255113	valid_1's multi_logloss: 0.490547
[200]	training's multi_logloss: 0.139873	valid_1's multi_logloss: 0.502686
[300]	training's multi_logloss: 0.0665022	valid_1's multi_logloss: 0.532037
Early stopping, best iteration is:
[103]	training's multi_logloss: 0.250527	valid_1's multi_logloss: 0.490382
Seed-56 | Fold-4 | OOF Score: 73.34550547456286


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.258104	valid_1's multi_logloss: 0.476033
[200]	training's multi_logloss: 0.136855	valid_1's multi_logloss: 0.487304
[300]	training's multi_logloss: 0.0660169	valid_1's multi_logloss: 0.512899
Early stopping, best iteration is:
[108]	training's multi_logloss: 0.24638	valid_1's multi_logloss: 0.475578
Seed-56 | Fold-5 | OOF Score: 73.29290472097085


New categorical_feature is [1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 0.256358	valid_1's multi_logloss: 0.48427
[200]	training's multi_logloss: 0.13961	valid_1's multi_logloss: 0.498571
[300]	training's multi_logloss: 0.0657096	valid_1's multi_logloss: 0.527054
Early stopping, best iteration is:
[106]	training's multi_logloss: 0.248382	valid_1's multi_logloss: 0.483984
Seed-56 | Fold-6 | OOF Score: 72.80442590101825

Seed: 56 | Aggregate OOF Score: 73.09937482796542


Aggregate OOF Score: 73.01274152111817


In [14]:
# Persist the out-of-fold probabilities, the aggregate CV score and the
# averaged test probabilities in one compressed .npz archive.
meta_features = {
    'y_pred_meta_lgb': y_pred_meta_lgb,
    'oof_score': oof_score,
    'y_pred_final_lgb': y_pred_final_lgb,
}
np.savez_compressed('./LGB_Meta_Features.npz', **meta_features)

## Create submission file

In [15]:
# Build the submission from the processed test frame instead of re-reading the
# raw CSV: Xtest was produced from test_df, so its customer_id index is, by
# construction, in the same row order as y_pred_final_lgb. Pairing predictions
# with a freshly loaded file relies on that file having identical row order and
# also overwrote the processed test_df.
# NOTE(review): assumes test_df is still indexed by customer_id (it is merged
# on that key when the embeddings are attached) - confirm if preprocessing changes.
predicted_class = np.argmax(y_pred_final_lgb, axis=1)

submit_df = pd.DataFrame({
    'customer_id': test_df.index.values,
    # Class 0 is the re-coded label 5 used during training; map it back.
    'churn_risk_score': np.where(predicted_class == 0, 5, predicted_class),
})
assert len(submit_df) == y_pred_final_lgb.shape[0]
submit_df.head()

Unnamed: 0,customer_id,churn_risk_score
0,fffe43004900440031003700300030003400,3
1,fffe43004900440031003900370037003300,3
2,fffe43004900440034003800360037003000,3
3,fffe43004900440036003200370033003400,3
4,fffe43004900440035003000370031003900,5


In [16]:
# Write the submission without the row index so the file has only the
# customer_id and churn_risk_score columns.
submit_df.to_csv(path_or_buf="./LGB_submission.csv", index=False)