In [79]:
import xgboost as xgb
from catboost import CatBoostClassifier, cv ,Pool
from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedShuffleSplit, GridSearchCV
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer
import hyperopt

# Download data

In [2]:
data = pd.read_csv('./data/orange_small_churn_train_data.csv')

In [3]:
data.head()

Unnamed: 0,ID,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,labels
0,0,,,,,,3052.0,,,,...,vr93T2a,LM8l689qOp,,,fKCe,02N6s8f,xwM2aC7IdeMC0,,,-1.0
1,1,,,,,,1813.0,7.0,,,...,6hQ9lNX,LM8l689qOp,,ELof,xb3V,RAYp,55YFVY9,mj86,,-1.0
2,2,,,,,,1953.0,7.0,,,...,catzS2D,LM8l689qOp,,,FSa2,ZI9m,ib5G6X1eUxUn6,mj86,,-1.0
3,3,,,,,,1533.0,7.0,,,...,e4lqvY0,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,,1.0
4,4,,,,,,686.0,7.0,,,...,MAz3HNj,LM8l689qOp,,,WqMG,RAYp,F2FyR07IdsN7I,,,-1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18299 entries, 0 to 18298
Columns: 232 entries, ID to labels
dtypes: float64(192), int64(2), object(38)
memory usage: 32.4+ MB


In [5]:
# check null in features
dict_nan = {name: np.sum(data[name].isnull()) for name in data.columns}
dict_nan

{'ID': 0,
 'Var1': 18038,
 'Var2': 17831,
 'Var3': 17832,
 'Var4': 17736,
 'Var5': 17799,
 'Var6': 1991,
 'Var7': 1976,
 'Var8': 18299,
 'Var9': 18038,
 'Var10': 17799,
 'Var11': 17832,
 'Var12': 18085,
 'Var13': 1976,
 'Var14': 17832,
 'Var15': 18299,
 'Var16': 17799,
 'Var17': 17736,
 'Var18': 17736,
 'Var19': 17736,
 'Var20': 18299,
 'Var21': 1991,
 'Var22': 1792,
 'Var23': 17799,
 'Var24': 2616,
 'Var25': 1792,
 'Var26': 17799,
 'Var27': 17799,
 'Var28': 1793,
 'Var29': 18038,
 'Var30': 18038,
 'Var31': 18299,
 'Var32': 18299,
 'Var33': 18010,
 'Var34': 17831,
 'Var35': 1792,
 'Var36': 17831,
 'Var37': 17736,
 'Var38': 1792,
 'Var39': 18299,
 'Var40': 17831,
 'Var41': 18038,
 'Var42': 18299,
 'Var43': 17831,
 'Var44': 1792,
 'Var45': 18171,
 'Var46': 17831,
 'Var47': 18038,
 'Var48': 18299,
 'Var49': 17831,
 'Var50': 18038,
 'Var51': 16888,
 'Var52': 18299,
 'Var53': 18038,
 'Var54': 17831,
 'Var55': 18299,
 'Var56': 18055,
 'Var57': 0,
 'Var58': 18038,
 'Var59': 17990,
 'Var60': 1

In [6]:
# check disbalance in classes
print(data['labels'].value_counts())
print(data['labels'].unique())

-1.0    16921
 1.0     1377
Name: labels, dtype: int64
[-1.  1. nan]


In [7]:
data['labels'].replace(-1, 0, inplace=True)
data.dropna(subset=['labels'], inplace=True)
labels = data['labels']
data = data.iloc[:,:-1]

In [8]:
# # balance class by undersampling
# count_add = int((len(labels[labels == 0]) - len(labels[labels == 1]))*0.7)
# index_for_balance = labels[labels == 1].index
# add_index = np.random.choice(index_for_balance, size = count_add)
# data_add = data.iloc[add_index]
# labels_add = labels.iloc[add_index]

# data = pd.concat([data, data_add])
# labels = pd.concat([labels, labels_add])

In [9]:
# # balanced
# count_add = int((len(labels[labels == 0]) - len(labels[labels == 1]))*0.7)
# index_for_balance = labels[labels == 0].index
# drop_index = np.random.choice(index_for_balance, size = count_add, replace=False)

# data.drop(index=drop_index, inplace=True)
# labels.drop(index=drop_index, inplace=True)

In [10]:
labels.value_counts()

0.0    16921
1.0     1377
Name: labels, dtype: int64

# Simple preprocessing feature

In [11]:
# drop feture with null more half of dataset
def check_feat_without(data, columns, threshold = 0):
    flag_nan_col = [np.sum(data[name].isnull()) < threshold for name in columns]
    return columns[flag_nan_col]
    

In [12]:
numeric_col = data.columns[1:190]
cat_col = data.columns[190:]
print(len(numeric_col))
print(len(cat_col))

189
41


In [13]:
right_numeric_column = check_feat_without(data, numeric_col, threshold = len(data)//2)
right_cat_column = check_feat_without(data, cat_col, threshold = len(data)//2)
print(len(right_numeric_column))
print(len(right_cat_column))

41
28


In [16]:
data = data[np.union1d(right_numeric_column, right_cat_column)]

In [17]:
# check unique in cat feat
for cur_cat in right_cat_column:
    count_uniq = len(data[cur_cat].unique())
    print(f'{cur_cat}: {count_uniq}')

Var192: 326
Var193: 44
Var195: 21
Var196: 4
Var197: 207
Var198: 2721
Var199: 2639
Var202: 4673
Var203: 4
Var204: 100
Var205: 4
Var206: 22
Var207: 12
Var208: 3
Var210: 6
Var211: 2
Var212: 71
Var216: 1247
Var217: 7944
Var218: 3
Var219: 20
Var220: 2721
Var221: 7
Var222: 2721
Var223: 5
Var226: 23
Var227: 7
Var228: 29


In [18]:
# check unique in num feat
for cur_num in right_numeric_column:
    count_uniq = len(data[cur_num].unique())
    print(f'{cur_num}: {count_uniq}')

Var6: 1039
Var7: 7
Var13: 1877
Var21: 498
Var22: 498
Var24: 63
Var25: 190
Var28: 2513
Var35: 10
Var38: 11908
Var44: 6
Var57: 14013
Var65: 13
Var72: 9
Var73: 129
Var74: 284
Var76: 11377
Var78: 13
Var81: 15952
Var83: 130
Var85: 109
Var94: 8571
Var109: 149
Var112: 158
Var113: 17900
Var119: 1003
Var123: 191
Var125: 6217
Var126: 52
Var132: 18
Var133: 14165
Var134: 12639
Var140: 1859
Var143: 5
Var144: 11
Var149: 7524
Var153: 14567
Var160: 273
Var163: 9049
Var173: 4
Var181: 7


In [19]:
data['Var173'].unique()

array([ 0., nan,  2.,  4.])

Cat boost

In [57]:
train_data_boost, test_data_boost, train_label_boost, test_label_boost = train_test_split(data, labels,
                                                                                         test_size=0.3,random_state=42)

In [58]:
train_data_boost.fillna('NaN',inplace=True)
test_data_boost.fillna('NaN',inplace=True)

In [59]:
model = CatBoostClassifier(iterations= 200, random_seed=0, eval_metric='F1', loss_function='Logloss')

In [60]:
model.fit(train_data_boost, train_label_boost,
          cat_features=list(right_cat_column),
          eval_set =(test_data_boost, test_label_boost),
          verbose=False,
          plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2658e054e80>

In [61]:
cv_params = model.get_params()
cv_data = cv(
    Pool(train_data_boost, train_label_boost, cat_features=list(right_cat_column)),
    cv_params,
    plot=True,
    verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 0.006269592476
bestIteration = 133

Training on fold [1/3]

bestTest = 0
bestIteration = 0

Training on fold [2/3]

bestTest = 0.006269592476
bestIteration = 190



In [62]:
train_pool = Pool(train_data_boost, train_label_boost, cat_features=list(right_cat_column))
test_pool = Pool(test_data_boost, test_label_boost, cat_features=list(right_cat_column))

Early stop

In [75]:
early_stop_params = model.get_params().copy()
early_stop_params.update({
    'od_type': 'Iter',
    'od_wait': 50,
    'iterations': 500
})

In [76]:
early_stop_model = CatBoostClassifier(**early_stop_params)
early_stop_model.fit(train_pool,
          eval_set =test_pool,
          verbose=False,
          plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x265bc9ec940>

In [77]:
feature_importances = early_stop_model.get_feature_importance(train_pool)
feature_names = train_data_boost.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

Var126: 29.912216159079684
Var218: 6.111748930487022
Var199: 4.963949023551802
Var205: 4.670981630285442
Var74: 4.082971510804079
Var113: 3.577766056353002
Var212: 3.088485609988301
Var210: 3.0473900750053375
Var73: 2.9284793659259303
Var216: 2.7197968536519004
Var81: 2.5555615936107365
Var28: 2.1318593572360722
Var125: 1.8833603734579378
Var207: 1.7742199355183077
Var206: 1.710340682269077
Var228: 1.6810619443240038
Var222: 1.5649881615789072
Var134: 1.5479331797457034
Var132: 1.4677953362201985
Var193: 1.4205533859013337
Var13: 1.3382571132390741
Var195: 1.1366948234548448
Var197: 1.1070104365243614
Var7: 0.9514054350136616
Var227: 0.8184792407687856
Var211: 0.7484499585608848
Var226: 0.7355563694954477
Var44: 0.7347058792195502
Var204: 0.7267037261588836
Var203: 0.668658969241591
Var85: 0.5660668112792898
Var219: 0.5424888923407611
Var119: 0.541649392201261
Var192: 0.5171015081083691
Var6: 0.4911723148986432
Var25: 0.42781279445965215
Var24: 0.35414938425790055
Var160: 0.34393689393

Find optimal parameters model

In [96]:
def hyperopt_obj(params):
    model = CatBoostClassifier(
                                l2_leaf_reg=int(params['l2_leaf_reg']),
                                learning_rate=params['learning_rate'],
                                depth = int(params['depth']),
                                iterations=500,
                                eval_metric='F1',
                                loss_function='Logloss',
                                random_seed=42,
                                verbose=False
                              )
    cv_calc = cv( train_pool,
                 model.get_params(),
                 verbose=False)
    best_metrics = np.max(cv_data['test-F1-mean'])
    return 1 - best_metrics

In [99]:
params_opt = {
                'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
                'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
                'depth': hyperopt.hp.quniform('depth', 1, 10,2)
                }

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_obj,
    space=params_opt,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    #rstate=np.random.RandomState(123)
)

print(best)

  0%|                                                                           | 0/50 [00:00<?, ?trial/s, best loss=?]Training on fold [0/3]

bestTest = 0.1126760563
bestIteration = 378

Training on fold [1/3]

bestTest = 0.07329842932
bestIteration = 466

Training on fold [2/3]

bestTest = 0.07669616519
bestIteration = 105

  2%|▉                                                | 1/50 [00:49<40:13, 49.27s/trial, best loss: 0.9958202716823407]Training on fold [0/3]

bestTest = 0.1270718232
bestIteration = 226

Training on fold [1/3]

bestTest = 0.08287292818
bestIteration = 414

Training on fold [2/3]

bestTest = 0.07492795389
bestIteration = 353

  4%|█▉                                               | 2/50 [01:38<39:30, 49.39s/trial, best loss: 0.9958202716823407]Training on fold [0/3]

bestTest = 0.06976744186
bestIteration = 264

Training on fold [1/3]

bestTest = 0.04093567251
bestIteration = 389

Training on fold [2/3]

bestTest = 0.0650887574
bestIteration = 362

  6%|██▉        

 50%|███████████████████████▌                       | 25/50 [34:39<42:47, 102.71s/trial, best loss: 0.9958202716823407]Training on fold [0/3]

bestTest = 0.04705882353
bestIteration = 28

Training on fold [1/3]

bestTest = 0.08398950131
bestIteration = 124

Training on fold [2/3]

bestTest = 0.1160220994
bestIteration = 101

 52%|████████████████████████▍                      | 26/50 [37:09<46:43, 116.83s/trial, best loss: 0.9958202716823407]Training on fold [0/3]

bestTest = 0.03067484663
bestIteration = 135

Training on fold [1/3]

bestTest = 0.03076923077
bestIteration = 68

Training on fold [2/3]

bestTest = 0.02469135802
bestIteration = 76

 54%|████████████████████████▎                    | 27/50 [41:45<1:03:10, 164.80s/trial, best loss: 0.9958202716823407]Training on fold [0/3]

bestTest = 0.03067484663
bestIteration = 463

Training on fold [1/3]

bestTest = 0.01863354037
bestIteration = 424

Training on fold [2/3]

bestTest = 0.01246105919
bestIteration = 384

 56%|████████████

100%|█████████████████████████████████████████████| 50/50 [1:23:32<00:00, 100.24s/trial, best loss: 0.9958202716823407]
{'depth': 4.0, 'l2_leaf_reg': 2.0, 'learning_rate': 0.28614075781169124}


In [100]:
best

{'depth': 4.0, 'l2_leaf_reg': 2.0, 'learning_rate': 0.28614075781169124}

In [110]:
best_model = CatBoostClassifier(
                                l2_leaf_reg=int(best['l2_leaf_reg']),
                                learning_rate=best['learning_rate'],
                                depth = int(best['depth']),
                                iterations=100,
                                eval_metric='AUC',
                                loss_function='Logloss',
                                random_seed=42,
                                verbose=False,
                                use_best_model=True
                              )

In [111]:
best_model.fit(train_pool,
               eval_set =test_pool,
               verbose=False,
               plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x26580366490>

In [112]:
best_model.eval_metrics(test_pool, ['AUC','F1'], plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

{'AUC': [0.5574368803492018,
  0.5642532719051911,
  0.5752251878971166,
  0.6091900591438297,
  0.6344525843767923,
  0.6551503642485232,
  0.7052834192433464,
  0.7049012003813808,
  0.7097885738441254,
  0.7138689115002719,
  0.7116950126257924,
  0.7120865425440785,
  0.713909414595267,
  0.7171422133498201,
  0.721579397239458,
  0.7264884189081483,
  0.7285035642723596,
  0.7318045665144619,
  0.7375983713100284,
  0.7375983713100284,
  0.7375622909667856,
  0.7382948383228181,
  0.7393567642961959,
  0.7393567642961959,
  0.7394417276851224,
  0.7404370796058071,
  0.7418076670962167,
  0.7418104604131129,
  0.7421507795216351,
  0.7418816899939664,
  0.742472942070332,
  0.7430800229424428,
  0.7424217312605681,
  0.7450050838367511,
  0.7460444304985512,
  0.7462602142287837],
 'F1': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.004705882352941177,
  0.004705882352941177,
  0.009389671361502346,
  0.009389671361502346,
  0.009389671361502346,


test competition data

In [115]:
data_comp = pd.read_csv('./data/orange_small_churn_test_data.csv')
data_comp = data_comp[np.union1d(right_numeric_column, right_cat_column)]
data_comp.fillna('NaN',inplace=True)

In [118]:
predictions_probs = best_model.predict_proba(data_comp)
predictions_probs[10:]

array([[0.95424737, 0.04575263],
       [0.93413736, 0.06586264],
       [0.8547482 , 0.1452518 ],
       ...,
       [0.95957516, 0.04042484],
       [0.98051273, 0.01948727],
       [0.95504454, 0.04495546]])

In [119]:
out_df = pd.DataFrame(enumerate(predictions_probs[:,1]), columns=['Id', 'result'])
out_df.to_csv('output_df.csv', sep=',', index=False)

This method give auc about 0.7, but f1 is so little we need to choice right treshold