In [57]:
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedShuffleSplit, GridSearchCV
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer

# Download data

In [58]:
data = pd.read_csv('./data/orange_small_churn_train_data.csv')

In [59]:
data.head()

Unnamed: 0,ID,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,labels
0,0,,,,,,3052.0,,,,...,vr93T2a,LM8l689qOp,,,fKCe,02N6s8f,xwM2aC7IdeMC0,,,-1.0
1,1,,,,,,1813.0,7.0,,,...,6hQ9lNX,LM8l689qOp,,ELof,xb3V,RAYp,55YFVY9,mj86,,-1.0
2,2,,,,,,1953.0,7.0,,,...,catzS2D,LM8l689qOp,,,FSa2,ZI9m,ib5G6X1eUxUn6,mj86,,-1.0
3,3,,,,,,1533.0,7.0,,,...,e4lqvY0,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,,1.0
4,4,,,,,,686.0,7.0,,,...,MAz3HNj,LM8l689qOp,,,WqMG,RAYp,F2FyR07IdsN7I,,,-1.0


In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18299 entries, 0 to 18298
Columns: 232 entries, ID to labels
dtypes: float64(192), int64(2), object(38)
memory usage: 32.4+ MB


In [61]:
# check null in features
dict_nan = {name: np.sum(data[name].isnull()) for name in data.columns}
dict_nan

{'ID': 0,
 'Var1': 18038,
 'Var2': 17831,
 'Var3': 17832,
 'Var4': 17736,
 'Var5': 17799,
 'Var6': 1991,
 'Var7': 1976,
 'Var8': 18299,
 'Var9': 18038,
 'Var10': 17799,
 'Var11': 17832,
 'Var12': 18085,
 'Var13': 1976,
 'Var14': 17832,
 'Var15': 18299,
 'Var16': 17799,
 'Var17': 17736,
 'Var18': 17736,
 'Var19': 17736,
 'Var20': 18299,
 'Var21': 1991,
 'Var22': 1792,
 'Var23': 17799,
 'Var24': 2616,
 'Var25': 1792,
 'Var26': 17799,
 'Var27': 17799,
 'Var28': 1793,
 'Var29': 18038,
 'Var30': 18038,
 'Var31': 18299,
 'Var32': 18299,
 'Var33': 18010,
 'Var34': 17831,
 'Var35': 1792,
 'Var36': 17831,
 'Var37': 17736,
 'Var38': 1792,
 'Var39': 18299,
 'Var40': 17831,
 'Var41': 18038,
 'Var42': 18299,
 'Var43': 17831,
 'Var44': 1792,
 'Var45': 18171,
 'Var46': 17831,
 'Var47': 18038,
 'Var48': 18299,
 'Var49': 17831,
 'Var50': 18038,
 'Var51': 16888,
 'Var52': 18299,
 'Var53': 18038,
 'Var54': 17831,
 'Var55': 18299,
 'Var56': 18055,
 'Var57': 0,
 'Var58': 18038,
 'Var59': 17990,
 'Var60': 1

In [62]:
# check disbalance in classes
print(data['labels'].value_counts())
print(data['labels'].unique())

-1.0    16921
 1.0     1377
Name: labels, dtype: int64
[-1.  1. nan]


In [63]:
data['labels'].replace(-1, 0, inplace=True)
data.dropna(subset=['labels'], inplace=True)
labels = data['labels']
data = data.iloc[:,:-1]

In [64]:
# balance class by undersampling
count_add = int((len(labels[labels == 0]) - len(labels[labels == 1]))*0.7)
index_for_balance = labels[labels == 1].index
add_index = np.random.choice(index_for_balance, size = count_add)
data_add = data.iloc[add_index]
labels_add = labels.iloc[add_index]

data = pd.concat([data, data_add])
labels = pd.concat([labels, labels_add])

In [65]:
# # balanced
# count_add = int((len(labels[labels == 0]) - len(labels[labels == 1]))*0.7)
# index_for_balance = labels[labels == 0].index
# drop_index = np.random.choice(index_for_balance, size = count_add, replace=False)

# data.drop(index=drop_index, inplace=True)
# labels.drop(index=drop_index, inplace=True)

In [66]:
labels.value_counts()

0.0    16921
1.0    12257
Name: labels, dtype: int64

# Simple preprocessing feature

In [67]:
# drop feture with null more half of dataset
def check_feat_without(data, columns, threshold = 0):
    flag_nan_col = [np.sum(data[name].isnull()) < threshold for name in columns]
    return columns[flag_nan_col]
    

In [68]:
numeric_col = data.columns[1:190]
cat_col = data.columns[190:]
print(len(numeric_col))
print(len(cat_col))

189
41


In [69]:
right_numeric_column = check_feat_without(data, numeric_col, threshold = len(data)//2)
right_cat_column = check_feat_without(data, cat_col, threshold = len(data)//2)
print(len(right_numeric_column))
print(len(right_cat_column))

41
28


In [70]:
# check unique in cat feat
for cur_cat in right_cat_column:
    count_uniq = len(data[cur_cat].unique())
    print(f'{cur_cat}: {count_uniq}')

Var192: 326
Var193: 44
Var195: 21
Var196: 4
Var197: 207
Var198: 2721
Var199: 2639
Var202: 4673
Var203: 4
Var204: 100
Var205: 4
Var206: 22
Var207: 12
Var208: 3
Var210: 6
Var211: 2
Var212: 71
Var216: 1247
Var217: 7944
Var218: 3
Var219: 20
Var220: 2721
Var221: 7
Var222: 2721
Var223: 5
Var226: 23
Var227: 7
Var228: 29


In [71]:
# check unique in num feat
for cur_num in right_numeric_column:
    count_uniq = len(data[cur_num].unique())
    print(f'{cur_num}: {count_uniq}')

Var6: 1039
Var7: 7
Var13: 1877
Var21: 498
Var22: 498
Var24: 63
Var25: 190
Var28: 2513
Var35: 10
Var38: 11908
Var44: 6
Var57: 14013
Var65: 13
Var72: 9
Var73: 129
Var74: 284
Var76: 11377
Var78: 13
Var81: 15952
Var83: 130
Var85: 109
Var94: 8571
Var109: 149
Var112: 158
Var113: 17900
Var119: 1003
Var123: 191
Var125: 6217
Var126: 52
Var132: 18
Var133: 14165
Var134: 12639
Var140: 1859
Var143: 5
Var144: 11
Var149: 7524
Var153: 14567
Var160: 273
Var163: 9049
Var173: 4
Var181: 7


In [72]:
data['Var173'].unique()

array([ 0., nan,  2.,  4.])

# FILL NAN

We will use different method to fill on nan. The quality will be checked on validation roc auc xgboost 

In [73]:
model_classifier = xgb.XGBClassifier(eval_metric='mlogloss')

# simple statistic

In [74]:
# class encoder cat
class LabelTransformer():
#     def __init__(self):
#         return self

    def fit(self, X, y=None):
        self.df = pd.DataFrame(X)
        self.columns_trans = self.df.columns.to_numpy()
        self.encoder_columns = dict()
        self.unique_fitted_value = dict()
        for cur_col in self.columns_trans:
            unique_val = np.append(self.df[cur_col].unique(), 'unk_val')
            self.unique_fitted_value[cur_col] = unique_val
            self.encoder_columns[cur_col] = LabelEncoder().fit(unique_val)
        return self

    def transform(self, X, y=None):
        self.df = pd.DataFrame(X)
        self.columns_trans = self.df.columns.to_numpy()
        for cur_col in self.columns_trans:
            
            unique_val_trans = self.df[cur_col].unique()
            unique_val_trans = unique_val_trans[~pd.isna(unique_val_trans)]
            
            unique_val_train = self.unique_fitted_value[cur_col]
            unique_val_train = unique_val_train[~pd.isna(unique_val_train)]
            
            dissapear_val = np.setdiff1d(unique_val_trans, unique_val_train)
            self.df.loc[self.df[cur_col].isin(dissapear_val), cur_col] = 'unk_val'
            self.df[cur_col] = self.encoder_columns[cur_col].transform(self.df[cur_col])
            
        return self.df


In [75]:
pipeline_numeric = Pipeline(steps =[
    ('selecting_num', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_numeric_column])),
    ('fill_nan_num', SimpleImputer(strategy='mean')),
    ('scaler_num', StandardScaler())
])

pipeline_cat = Pipeline(steps = [
    ('selecting_cat', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_cat_column])),
    ('fill_nan_cat', SimpleImputer(strategy='most_frequent')),
    ('encoder_cat', LabelTransformer())
    
])

estimator = Pipeline(steps=[
    ('feature_preprocessing', FeatureUnion(transformer_list=[
        ('numeric_variables_processing', pipeline_numeric),
        ('category_variables_processing', pipeline_cat)
    ])),
    ('model',model_classifier)
])

In [76]:
train_data, test_data, train_target, test_target = train_test_split(data, labels, random_state = 0, test_size=0.3, shuffle=True,
                                                                    stratify=labels)

In [77]:
cv_strategy = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

In [78]:
cv_xgb = cross_val_score(estimator, train_data, train_target, cv=cv_strategy, scoring='f1')

In [79]:
cv_xgb 

array([0.9428194 , 0.94157557, 0.94046722, 0.9356417 , 0.93572779])

check different statistics

In [80]:
grid_params = {
    'feature_preprocessing__numeric_variables_processing__fill_nan_num__strategy': ['mean', 'median', 'most_frequent']
}

In [81]:
grid_search = GridSearchCV(estimator, grid_params, scoring='f1', cv=cv_strategy)

In [82]:
grid_search.fit(train_data, train_target)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.3,
            train_size=None),
             estimator=Pipeline(steps=[('feature_preprocessing',
                                        FeatureUnion(transformer_list=[('numeric_variables_processing',
                                                                        Pipeline(steps=[('selecting_num',
                                                                                         FunctionTransformer(func=<function <lambda> at 0x000001F40AD6C700>)),
                                                                                        ('fill_nan_num',
                                                                                         SimpleImputer()),
                                                                                        ('scale...
                                                      max_cat_to_onehot=None,
                                                      max_delta_

In [83]:
grid_search.best_score_

0.9403928782200122

In [84]:
grid_search.best_params_

{'feature_preprocessing__numeric_variables_processing__fill_nan_num__strategy': 'median'}

It daoesn't matter wht statistic we chooose. We use other methods wich in sklearn from class imputer  

SimpleImputer, KNNImputer, IterativeImputer with default params

In [85]:
mean_metric_cv = []
imputers_list = [SimpleImputer(),
                 IterativeImputer(n_nearest_features = 10),
                 KNNImputer()]

for  imputer_num in imputers_list:
    pipeline_numeric = Pipeline(steps =[
    ('selecting_num', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_numeric_column])),
    ('fill_nan_num', imputer_num),
    ('scaler_num', StandardScaler())
])

    pipeline_cat = Pipeline(steps = [
    ('selecting_cat', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_cat_column])),
    ('fill_nan_cat', SimpleImputer(strategy='most_frequent')),
    ('encoder_cat', LabelTransformer())
])

    estimator = Pipeline(steps=[
    ('feature_preprocessing', FeatureUnion(transformer_list=[
        ('numeric_variables_processing', pipeline_numeric),
        ('category_variables_processing', pipeline_cat)
    ])),
    ('model',model_classifier)
])
    cv_xgb = cross_val_score(estimator, train_data, train_target, cv=cv_strategy, scoring='f1')
    mean_metric_cv.append(cv_xgb.mean())
    print(f'{repr(imputer_num)} with f1 ={list(map(lambda x: np.round(x, 3) ,cv_xgb))}')

SimpleImputer() with f1 =[0.943, 0.942, 0.94, 0.936, 0.936]
IterativeImputer(n_nearest_features=10) with f1 =[0.919, 0.938, 0.932, 0.934, 0.92]
KNNImputer() with f1 =[0.946, 0.949, 0.946, 0.939, 0.932]


In [86]:
mean_metric_cv

[0.9392463366891732, 0.9286239091282475, 0.9425163573505084]

# Use vanila catboost with preprocessing nan

In [87]:
train_data_for_boost = train_data.loc[:, np.union1d(right_numeric_column, right_cat_column)]

In [88]:
train_data_for_boost.fillna('NaN',inplace=True)

In [89]:
train_data_for_boost.head()

Unnamed: 0,Var109,Var112,Var113,Var119,Var123,Var125,Var126,Var13,Var132,Var133,...,Var7,Var72,Var73,Var74,Var76,Var78,Var81,Var83,Var85,Var94
12948,64.0,96.0,-29756.16,500.0,0.0,93384.0,,116.0,0.0,83060.0,...,7.0,,122,14.0,270680.0,0.0,4423.56,0.0,2.0,
2183,32.0,16.0,369028.8,495.0,42.0,23004.0,,848.0,0.0,3024000.0,...,7.0,,70,7.0,947816.0,0.0,139213.5,10.0,8.0,
12627,32.0,24.0,139156.0,610.0,42.0,738.0,4.0,308.0,8.0,140615.0,...,7.0,3.0,30,7.0,361912.0,3.0,128677.8,10.0,6.0,708.0
14964,24.0,32.0,442564.0,1595.0,66.0,60183.0,8.0,328.0,8.0,2872460.0,...,14.0,3.0,34,231.0,2764800.0,0.0,35546.1,15.0,0.0,5769.0
12618,56.0,32.0,1248020.0,605.0,66.0,19791.0,-18.0,2024.0,0.0,1947115.0,...,7.0,,112,42.0,1498024.0,0.0,191157.6,10.0,10.0,


In [90]:
model_catboost = CatBoostClassifier(iterations=100,  nan_mode='Max', cat_features=list(right_cat_column), random_seed=42)

In [91]:
cv_cat_boost = cross_val_score(model_catboost, train_data_for_boost, train_target, cv=cv_strategy, scoring='f1')

Learning rate set to 0.264978
0:	learn: 0.5645630	total: 34.2ms	remaining: 3.39s
1:	learn: 0.5052475	total: 66.5ms	remaining: 3.26s
2:	learn: 0.4586105	total: 101ms	remaining: 3.28s
3:	learn: 0.4139958	total: 134ms	remaining: 3.21s
4:	learn: 0.3769736	total: 166ms	remaining: 3.15s
5:	learn: 0.3569829	total: 199ms	remaining: 3.11s
6:	learn: 0.3448757	total: 232ms	remaining: 3.09s
7:	learn: 0.3368593	total: 266ms	remaining: 3.05s
8:	learn: 0.3316283	total: 300ms	remaining: 3.03s
9:	learn: 0.3243720	total: 339ms	remaining: 3.05s
10:	learn: 0.3202858	total: 405ms	remaining: 3.28s
11:	learn: 0.3172151	total: 471ms	remaining: 3.45s
12:	learn: 0.3145071	total: 511ms	remaining: 3.42s
13:	learn: 0.3092626	total: 549ms	remaining: 3.37s
14:	learn: 0.3060092	total: 586ms	remaining: 3.32s
15:	learn: 0.3039569	total: 626ms	remaining: 3.28s
16:	learn: 0.2998098	total: 662ms	remaining: 3.23s
17:	learn: 0.2980274	total: 704ms	remaining: 3.21s
18:	learn: 0.2965743	total: 740ms	remaining: 3.15s
19:	learn

64:	learn: 0.2455108	total: 2.41s	remaining: 1.3s
65:	learn: 0.2446718	total: 2.46s	remaining: 1.27s
66:	learn: 0.2432598	total: 2.5s	remaining: 1.23s
67:	learn: 0.2422272	total: 2.54s	remaining: 1.19s
68:	learn: 0.2419092	total: 2.58s	remaining: 1.16s
69:	learn: 0.2408291	total: 2.65s	remaining: 1.14s
70:	learn: 0.2397937	total: 2.74s	remaining: 1.12s
71:	learn: 0.2392793	total: 2.81s	remaining: 1.09s
72:	learn: 0.2385109	total: 2.84s	remaining: 1.05s
73:	learn: 0.2373197	total: 2.88s	remaining: 1.01s
74:	learn: 0.2360527	total: 2.91s	remaining: 971ms
75:	learn: 0.2348952	total: 2.95s	remaining: 933ms
76:	learn: 0.2344724	total: 2.99s	remaining: 894ms
77:	learn: 0.2332009	total: 3.03s	remaining: 856ms
78:	learn: 0.2328638	total: 3.07s	remaining: 817ms
79:	learn: 0.2328177	total: 3.11s	remaining: 777ms
80:	learn: 0.2318357	total: 3.14s	remaining: 738ms
81:	learn: 0.2313306	total: 3.18s	remaining: 698ms
82:	learn: 0.2293822	total: 3.21s	remaining: 658ms
83:	learn: 0.2280092	total: 3.25s

27:	learn: 0.2786555	total: 1.09s	remaining: 2.81s
28:	learn: 0.2776098	total: 1.13s	remaining: 2.77s
29:	learn: 0.2765274	total: 1.18s	remaining: 2.74s
30:	learn: 0.2749606	total: 1.22s	remaining: 2.72s
31:	learn: 0.2739219	total: 1.26s	remaining: 2.69s
32:	learn: 0.2719872	total: 1.31s	remaining: 2.65s
33:	learn: 0.2715249	total: 1.35s	remaining: 2.62s
34:	learn: 0.2708026	total: 1.39s	remaining: 2.58s
35:	learn: 0.2698820	total: 1.43s	remaining: 2.54s
36:	learn: 0.2680557	total: 1.47s	remaining: 2.5s
37:	learn: 0.2665374	total: 1.5s	remaining: 2.45s
38:	learn: 0.2639852	total: 1.53s	remaining: 2.4s
39:	learn: 0.2625827	total: 1.57s	remaining: 2.35s
40:	learn: 0.2618245	total: 1.6s	remaining: 2.31s
41:	learn: 0.2599783	total: 1.64s	remaining: 2.27s
42:	learn: 0.2590013	total: 1.68s	remaining: 2.23s
43:	learn: 0.2578725	total: 1.73s	remaining: 2.2s
44:	learn: 0.2563528	total: 1.78s	remaining: 2.18s
45:	learn: 0.2548886	total: 1.83s	remaining: 2.15s
46:	learn: 0.2530593	total: 1.88s	re

88:	learn: 0.2122998	total: 3.52s	remaining: 435ms
89:	learn: 0.2122797	total: 3.55s	remaining: 394ms
90:	learn: 0.2118316	total: 3.58s	remaining: 354ms
91:	learn: 0.2109014	total: 3.62s	remaining: 315ms
92:	learn: 0.2107601	total: 3.65s	remaining: 275ms
93:	learn: 0.2107497	total: 3.68s	remaining: 235ms
94:	learn: 0.2094722	total: 3.72s	remaining: 196ms
95:	learn: 0.2087882	total: 3.76s	remaining: 157ms
96:	learn: 0.2081829	total: 3.79s	remaining: 117ms
97:	learn: 0.2061826	total: 3.83s	remaining: 78.1ms
98:	learn: 0.2048690	total: 3.86s	remaining: 39ms
99:	learn: 0.2041072	total: 3.89s	remaining: 0us


In [92]:
cv_cat_boost.mean()

0.9390415768376676

Vanila catboost works like other more simple method for fill nans so it doesn't matter wthat to use in the future.I chose method is based on catboost because it doesn't need extra act. 