## Import libraries

In [1]:
import gc
import pickle
import optuna
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Model, load_model
from tensorflow_addons.optimizers import AdamW, Lookahead

## Load autoencoder model

In [2]:
autoencoder = load_model('../input/customer-churn-rate-dae/DAE_model.h5')
feature_model = Model(inputs=autoencoder.input,
                      outputs=autoencoder.get_layer('Embedding').output)
feature_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder-Input (InputLayer)      [(None, 142)]        0                                            
__________________________________________________________________________________________________
Encoder-Dense-1 (Dense)         (None, 128)          18304       Encoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-BatchNorm-1 (BatchNorma (None, 128)          512         Encoder-Dense-1[0][0]            
__________________________________________________________________________________________________
Encoder-Swish-1 (Activation)    (None, 128)          0           Encoder-BatchNorm-1[0][0]        
______________________________________________________________________________________________

## Prepare data for model training

In [3]:
with open("../input/customer-churn-rate-preprocess-data/Churn_Risk_Rate_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
train_df = processed_data['train_df']
test_df = processed_data['test_df']

del processed_data
gc.collect()

221

In [4]:
cat_cols = ['region_category','membership_category','preferred_offer_types','internet_option',
            'complaint_status','feedback','age_category','joining_dt_year','time_category',
            'complaint_category','feedback_category','last_login_category']

for col in cat_cols:
    dummy_val = pd.get_dummies(train_df[col], prefix='col')
    train_df = pd.concat([train_df, dummy_val], axis=1)
    train_df.drop([col], inplace=True, axis=1)

print("train_df: {}".format(train_df.shape))

for col in cat_cols:
    dummy_val = pd.get_dummies(test_df[col], prefix='col')
    test_df = pd.concat([test_df, dummy_val], axis=1)
    test_df.drop([col], inplace=True, axis=1)

print("test_df: {}".format(test_df.shape))

train_df: (36987, 143)
test_df: (19919, 142)


In [5]:
Xtrain_embed = feature_model.predict(train_df.loc[:, train_df.columns != 'churn_risk_score'].values)
Xtest_embed = feature_model.predict(test_df.values)
Xtrain_embed_df = pd.DataFrame(Xtrain_embed, index=train_df.index)
Xtest_embed_df = pd.DataFrame(Xtest_embed, index=test_df.index)
print("Xtrain_embed_df: {} \nXtest_embed_df: {}".format(Xtrain_embed_df.shape, Xtest_embed_df.shape))

train_df = pd.merge(train_df, Xtrain_embed_df, on='customer_id', sort=False)
test_df = pd.merge(test_df, Xtest_embed_df, on='customer_id', sort=False)
print("train_df: {} \ntest_df: {}".format(train_df.shape, test_df.shape))

del Xtrain_embed
del Xtest_embed
del Xtrain_embed_df
del Xtest_embed_df
gc.collect()

Xtrain_embed_df: (36987, 64) 
Xtest_embed_df: (19919, 64)
train_df: (36987, 207) 
test_df: (19919, 206)


1393

In [6]:
train_df['churn_risk_score'] = train_df['churn_risk_score'].apply(lambda x: 0 if x==5 else 1 if x==-1 else x)
train_df.groupby(['churn_risk_score']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,churn_risk_score,count
0,0,9825
1,1,3815
2,2,2741
3,3,10422
4,4,10184


In [7]:
Xtrain = train_df.loc[:, train_df.columns != 'churn_risk_score'].values
Ytrain = train_df['churn_risk_score'].values
Xtest = test_df.values

print("Xtrain: {} \nYtrain: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, Xtest.shape))

Xtrain: (36987, 206) 
Ytrain: (36987,) 
Xtest: (19919, 206)


## Hyperparameters search using Optuna

In [8]:
'''
def objective(trial):

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    counter = 0
    oof_score = 0

    for train, val in kfold.split(Xtrain, Ytrain):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]

        model = XGBClassifier(
            objective='multi:softmax',
            eval_metric='mlogloss',
            booster='gbtree',
            sample_type='uniform',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            use_label_encoder=False,
            num_round=5000,
            num_class=5,
            max_depth=trial.suggest_int("max_depth", 7, 15), 
            max_leaves=trial.suggest_int("max_leaves", 45, 2000),
            learning_rate=0.05,
            subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 15),
            reg_lambda=trial.suggest_loguniform("reg_lambda", 1e-4, 1.0),
            verbosity=0
        )

        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
                  early_stopping_rounds=200, verbose=False)
        y_pred = model.predict(val_x, ntree_limit=model.best_ntree_limit)
        oof_score += 100 * f1_score(val_y, y_pred, average='macro')
        
        del model
        del y_pred
        gc.collect()
    
    oof_score /= float(counter)
    return oof_score
'''

'\ndef objective(trial):\n\n    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)\n    counter = 0\n    oof_score = 0\n\n    for train, val in kfold.split(Xtrain, Ytrain):\n        counter += 1\n\n        train_x, train_y = Xtrain[train], Ytrain[train]\n        val_x, val_y = Xtrain[val], Ytrain[val]\n\n        model = XGBClassifier(\n            objective=\'multi:softmax\',\n            eval_metric=\'mlogloss\',\n            booster=\'gbtree\',\n            sample_type=\'uniform\',\n            tree_method=\'gpu_hist\',\n            grow_policy=\'lossguide\',\n            use_label_encoder=False,\n            num_round=5000,\n            num_class=5,\n            max_depth=trial.suggest_int("max_depth", 7, 15), \n            max_leaves=trial.suggest_int("max_leaves", 45, 2000),\n            learning_rate=0.05,\n            subsample=trial.suggest_uniform("subsample", 0.5, 1.0),\n            colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),\n        

In [9]:
'''
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
'''

"\nstudy = optuna.create_study(direction='maximize')\nstudy.optimize(objective, n_trials=100)\n"

In [10]:
'''
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
'''

'\nprint("Number of finished trials: {}".format(len(study.trials)))\n\nprint("Best trial:")\ntrial = study.best_trial\n\nprint("Value: {}".format(trial.value))\n\nprint("Params: ")\nfor key, value in trial.params.items():\n    print(" {}: {}".format(key, value))\n'

## Build and validate the model

In [11]:
def BalancedSampleWeights(y_train, class_weight_coef):
    classes = np.unique(y_train, axis=0)
    classes.sort()
    class_samples = np.bincount(y_train)
    total_samples = class_samples.sum()
    n_classes = len(class_samples)
    weights = total_samples / (n_classes * class_samples * 1.0)
    class_weight_dict = {key : value for (key, value) in zip(classes, weights)}
    class_weight_dict[classes[1]] = class_weight_dict[classes[1]] * class_weight_coef
    sample_weights = [class_weight_dict[i] for i in y_train]
    return sample_weights

In [12]:
FOLD = 7
NUM_SEED = 5

np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_xgb = np.zeros((Ytrain.shape[0], 5))
y_pred_final_xgb = np.zeros((Xtest.shape[0], 5))
counter = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]
        
        largest_class_weight_coef = max(np.bincount(train_y))/train_y.shape[0]
        weight = BalancedSampleWeights(train_y, largest_class_weight_coef)

        model = XGBClassifier(
            objective='multi:softmax',
            eval_metric='mlogloss',
            booster='gbtree',
            sample_type='uniform',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            use_label_encoder=False,
            num_round=5000,
            num_class=5,
            max_depth=15, 
            max_leaves=512,
            learning_rate=0.05,
            subsample=0.75,
            colsample_bytree=0.95,
            min_child_weight=3,
            reg_lambda=0.01,
            verbosity=0
        )

        model.fit(train_x, train_y, sample_weight=weight,
                  eval_set=[(train_x, train_y), (val_x, val_y)], 
                  early_stopping_rounds=200, verbose=50)

        y_pred = model.predict(val_x, ntree_limit=model.best_ntree_limit)
        y_pred_meta_xgb[val] += model.predict_proba(val_x, ntree_limit=model.best_ntree_limit)
        y_pred_final_xgb += model.predict_proba(Xtest, ntree_limit=model.best_ntree_limit)
        
        score = 100 * f1_score(val_y, y_pred, average='macro')
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_meta_xgb = y_pred_meta_xgb / float(NUM_SEED)
y_pred_final_xgb = y_pred_final_xgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

[0]	validation_0-mlogloss:1.52290	validation_1-mlogloss:1.53101
[50]	validation_0-mlogloss:0.36678	validation_1-mlogloss:0.58002
[99]	validation_0-mlogloss:0.19711	validation_1-mlogloss:0.51944
Seed-24 | Fold-0 | OOF Score: 70.55235857625337
[0]	validation_0-mlogloss:1.52378	validation_1-mlogloss:1.53100
[50]	validation_0-mlogloss:0.36753	validation_1-mlogloss:0.58102
[99]	validation_0-mlogloss:0.19726	validation_1-mlogloss:0.52532
Seed-24 | Fold-1 | OOF Score: 72.08929182433923
[0]	validation_0-mlogloss:1.52382	validation_1-mlogloss:1.53041
[50]	validation_0-mlogloss:0.36698	validation_1-mlogloss:0.56842
[99]	validation_0-mlogloss:0.19736	validation_1-mlogloss:0.50812
Seed-24 | Fold-2 | OOF Score: 71.73089398583204
[0]	validation_0-mlogloss:1.52379	validation_1-mlogloss:1.53007
[50]	validation_0-mlogloss:0.36937	validation_1-mlogloss:0.57217
[99]	validation_0-mlogloss:0.19907	validation_1-mlogloss:0.51155
Seed-24 | Fold-3 | OOF Score: 72.07481541249514
[0]	validation_0-mlogloss:1.5234

In [13]:
np.savez_compressed('./XGB_Meta_Features.npz',
                    y_pred_meta_xgb=y_pred_meta_xgb, 
                    oof_score=oof_score,
                    y_pred_final_xgb=y_pred_final_xgb)

## Create submission file

In [14]:
test_df = pd.read_csv("../input/customer-churn-rate-prediction/dataset/test.csv")
submit_df = pd.DataFrame()
submit_df['customer_id'] = test_df['customer_id']
submit_df['churn_risk_score'] = np.array([np.argmax(y_pred_final_xgb, axis=1)]).T
submit_df['churn_risk_score'] = submit_df['churn_risk_score'].apply(lambda x: 5 if x==0 else x)
submit_df.head()

Unnamed: 0,customer_id,churn_risk_score
0,fffe43004900440031003700300030003400,3
1,fffe43004900440031003900370037003300,3
2,fffe43004900440034003800360037003000,4
3,fffe43004900440036003200370033003400,3
4,fffe43004900440035003000370031003900,5


In [15]:
submit_df.to_csv("./XGB_submission.csv", index=False)