In [1]:
import xgboost as xgb
from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedShuffleSplit, GridSearchCV
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer

# Download data

In [2]:
data = pd.read_csv('./data/orange_small_churn_train_data.csv')

In [3]:
data.head()

Unnamed: 0,ID,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,labels
0,0,,,,,,3052.0,,,,...,vr93T2a,LM8l689qOp,,,fKCe,02N6s8f,xwM2aC7IdeMC0,,,-1.0
1,1,,,,,,1813.0,7.0,,,...,6hQ9lNX,LM8l689qOp,,ELof,xb3V,RAYp,55YFVY9,mj86,,-1.0
2,2,,,,,,1953.0,7.0,,,...,catzS2D,LM8l689qOp,,,FSa2,ZI9m,ib5G6X1eUxUn6,mj86,,-1.0
3,3,,,,,,1533.0,7.0,,,...,e4lqvY0,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,,1.0
4,4,,,,,,686.0,7.0,,,...,MAz3HNj,LM8l689qOp,,,WqMG,RAYp,F2FyR07IdsN7I,,,-1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18299 entries, 0 to 18298
Columns: 232 entries, ID to labels
dtypes: float64(192), int64(2), object(38)
memory usage: 32.4+ MB


In [5]:
# check null in features
dict_nan = {name: np.sum(data[name].isnull()) for name in data.columns}
dict_nan

{'ID': 0,
 'Var1': 18038,
 'Var2': 17831,
 'Var3': 17832,
 'Var4': 17736,
 'Var5': 17799,
 'Var6': 1991,
 'Var7': 1976,
 'Var8': 18299,
 'Var9': 18038,
 'Var10': 17799,
 'Var11': 17832,
 'Var12': 18085,
 'Var13': 1976,
 'Var14': 17832,
 'Var15': 18299,
 'Var16': 17799,
 'Var17': 17736,
 'Var18': 17736,
 'Var19': 17736,
 'Var20': 18299,
 'Var21': 1991,
 'Var22': 1792,
 'Var23': 17799,
 'Var24': 2616,
 'Var25': 1792,
 'Var26': 17799,
 'Var27': 17799,
 'Var28': 1793,
 'Var29': 18038,
 'Var30': 18038,
 'Var31': 18299,
 'Var32': 18299,
 'Var33': 18010,
 'Var34': 17831,
 'Var35': 1792,
 'Var36': 17831,
 'Var37': 17736,
 'Var38': 1792,
 'Var39': 18299,
 'Var40': 17831,
 'Var41': 18038,
 'Var42': 18299,
 'Var43': 17831,
 'Var44': 1792,
 'Var45': 18171,
 'Var46': 17831,
 'Var47': 18038,
 'Var48': 18299,
 'Var49': 17831,
 'Var50': 18038,
 'Var51': 16888,
 'Var52': 18299,
 'Var53': 18038,
 'Var54': 17831,
 'Var55': 18299,
 'Var56': 18055,
 'Var57': 0,
 'Var58': 18038,
 'Var59': 17990,
 'Var60': 1

In [6]:
# check disbalance in classes
print(data['labels'].value_counts())
print(data['labels'].unique())

-1.0    16921
 1.0     1377
Name: labels, dtype: int64
[-1.  1. nan]


In [7]:
data['labels'].replace(-1, 0, inplace=True)
data.dropna(subset=['labels'], inplace=True)
labels = data['labels']
data = data.iloc[:,:-1]

In [8]:
labels.value_counts()

0.0    16921
1.0     1377
Name: labels, dtype: int64

# Simple preprocessing feature

In [9]:
# drop feture with null more half of dataset
def check_feat_without(data, columns, threshold = 0):
    flag_nan_col = [np.sum(data[name].isnull()) < threshold for name in columns]
    return columns[flag_nan_col]
    

In [10]:
numeric_col = data.columns[1:190]
cat_col = data.columns[190:]
print(len(numeric_col))
print(len(cat_col))

189
41


In [11]:
right_numeric_column = check_feat_without(data, numeric_col, threshold = len(data)//2)
right_cat_column = check_feat_without(data, cat_col, threshold = len(data)//2)
print(len(right_numeric_column))
print(len(right_cat_column))

41
28


In [12]:
# check unique in cat feat
for cur_cat in right_cat_column:
    count_uniq = len(data[cur_cat].unique())
    print(f'{cur_cat}: {count_uniq}')

Var192: 326
Var193: 44
Var195: 21
Var196: 4
Var197: 207
Var198: 2721
Var199: 2639
Var202: 4673
Var203: 4
Var204: 100
Var205: 4
Var206: 22
Var207: 12
Var208: 3
Var210: 6
Var211: 2
Var212: 71
Var216: 1247
Var217: 7944
Var218: 3
Var219: 20
Var220: 2721
Var221: 7
Var222: 2721
Var223: 5
Var226: 23
Var227: 7
Var228: 29


In [13]:
# check unique in num feat
for cur_num in right_numeric_column:
    count_uniq = len(data[cur_num].unique())
    print(f'{cur_num}: {count_uniq}')

Var6: 1039
Var7: 7
Var13: 1877
Var21: 498
Var22: 498
Var24: 63
Var25: 190
Var28: 2513
Var35: 10
Var38: 11908
Var44: 6
Var57: 14013
Var65: 13
Var72: 9
Var73: 129
Var74: 284
Var76: 11377
Var78: 13
Var81: 15952
Var83: 130
Var85: 109
Var94: 8571
Var109: 149
Var112: 158
Var113: 17900
Var119: 1003
Var123: 191
Var125: 6217
Var126: 52
Var132: 18
Var133: 14165
Var134: 12639
Var140: 1859
Var143: 5
Var144: 11
Var149: 7524
Var153: 14567
Var160: 273
Var163: 9049
Var173: 4
Var181: 7


In [14]:
data['Var173'].unique()

array([ 0., nan,  2.,  4.])

# FILL NAN

We will use different method to fill on nan. The quality will be checked on validation roc auc xgboost 

In [15]:
# статистика среднее* ...
# ближайшие соседи*
# матричное разложение
# как прогноз по другим фичам*
# метод в деревьях

In [50]:
model_classifier = xgb.XGBClassifier(eval_metric='mlogloss')

# simple statistic

In [18]:
# class encoder cat
class LabelTransformer():
#     def __init__(self):
#         return self

    def fit(self, X, y=None):
        self.df = pd.DataFrame(X)
        self.columns_trans = self.df.columns.to_numpy()
        self.encoder_columns = dict()
        self.unique_fitted_value = dict()
        for cur_col in self.columns_trans:
            unique_val = np.append(self.df[cur_col].unique(), 'unk_val')
            self.unique_fitted_value[cur_col] = unique_val
            self.encoder_columns[cur_col] = LabelEncoder().fit(unique_val)
        return self

    def transform(self, X, y=None):
        self.df = pd.DataFrame(X)
        self.columns_trans = self.df.columns.to_numpy()
        for cur_col in self.columns_trans:
            
            unique_val_trans = self.df[cur_col].unique()
            unique_val_trans = unique_val_trans[~pd.isna(unique_val_trans)]
            
            unique_val_train = self.unique_fitted_value[cur_col]
            unique_val_train = unique_val_train[~pd.isna(unique_val_train)]
            
            dissapear_val = np.setdiff1d(unique_val_trans, unique_val_train)
            self.df.loc[self.df[cur_col].isin(dissapear_val), cur_col] = 'unk_val'
            self.df[cur_col] = self.encoder_columns[cur_col].transform(self.df[cur_col])
            
        return self.df


In [19]:
pipeline_numeric = Pipeline(steps =[
    ('selecting_num', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_numeric_column])),
    ('fill_nan_num', SimpleImputer(strategy='mean')),
    ('scaler_num', StandardScaler())
])

pipeline_cat = Pipeline(steps = [
    ('selecting_cat', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_cat_column])),
    ('fill_nan_cat', SimpleImputer(strategy='most_frequent')),
    ('encoder_cat', LabelTransformer())
    
])

estimator = Pipeline(steps=[
    ('feature_preprocessing', FeatureUnion(transformer_list=[
        ('numeric_variables_processing', pipeline_numeric),
        ('category_variables_processing', pipeline_cat)
    ])),
    ('model',model_classifier)
])

NameError: name 'model_classifier' is not defined

In [20]:
train_data, test_data, train_target, test_target = train_test_split(data, labels, random_state = 0, test_size=0.3, shuffle=True,
                                                                    stratify=labels)

In [21]:
cv_strategy = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

In [86]:
cv_xgb = cross_val_score(estimator, train_data, train_target, cv=cv_strategy, scoring='roc_auc')

In [87]:
cv_xgb 

array([0.65465006, 0.65944313, 0.6508929 , 0.64181691, 0.63815127])

check different statistics

In [58]:
grid_params = {
    'feature_preprocessing__numeric_variables_processing__fill_nan_num__strategy': ['mean', 'median', 'most_frequent']
}

In [59]:
grid_search = GridSearchCV(estimator, grid_params, scoring='roc_auc', cv=cv_strategy)

In [60]:
grid_search.fit(train_data, train_target)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.3,
            train_size=None),
             estimator=Pipeline(steps=[('feature_preprocessing',
                                        FeatureUnion(transformer_list=[('numeric_variables_processing',
                                                                        Pipeline(steps=[('selecting_num',
                                                                                         FunctionTransformer(func=<function <lambda> at 0x0000025926EB8EE0>)),
                                                                                        ('fill_nan_num',
                                                                                         SimpleImputer()),
                                                                                        ('scale...
                                                      n_estimators=100,
                                                      n_jobs=None,
   

In [61]:
grid_search.best_score_

0.6524099752119061

In [62]:
grid_search.best_params_

{'feature_preprocessing__numeric_variables_processing__fill_nan_num__strategy': 'most_frequent'}

It daoesn't matter wht statistic we chooose. We use other methods wich in sklearn from class imputer  

SimpleImputer, KNNImputer, IterativeImputer with default params

In [134]:
mean_metric_cv = []
imputers_list = [SimpleImputer(),
                 IterativeImputer(n_nearest_features = 10),
                 KNNImputer()]

for  imputer_num in imputers_list:
    pipeline_numeric = Pipeline(steps =[
    ('selecting_num', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_numeric_column])),
    ('fill_nan_num', imputer_num),
    ('scaler_num', StandardScaler())
])

    pipeline_cat = Pipeline(steps = [
    ('selecting_cat', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_cat_column])),
    ('fill_nan_cat', SimpleImputer(strategy='most_frequent')),
    ('encoder_cat', LabelTransformer())
])

    estimator = Pipeline(steps=[
    ('feature_preprocessing', FeatureUnion(transformer_list=[
        ('numeric_variables_processing', pipeline_numeric),
        ('category_variables_processing', pipeline_cat)
    ])),
    ('model',model_classifier)
])
    cv_xgb = cross_val_score(estimator, train_data, train_target, cv=cv_strategy, scoring='roc_auc')
    mean_metric_cv.append(cv_xgb.mean())
    print(f'{repr(imputer_num)} with roc_auc={list(map(lambda x: np.round(x, 3) ,cv_xgb))}')

SimpleImputer() with roc_auc=[0.655, 0.659, 0.651, 0.642, 0.638]
IterativeImputer(n_nearest_features=10) with roc_auc=[0.646, 0.594, 0.616, 0.65, 0.669]
KNNImputer() with roc_auc=[0.647, 0.652, 0.632, 0.637, 0.643]


In [135]:
mean_metric_cv

[0.6489908539138123, 0.6350222859179091, 0.6421510535426723]

# Use xgboost with preprocessing nan

In [52]:
train_data_for_xgb = train_data.loc[:, np.union1d(right_numeric_column, right_cat_column)]

In [53]:
train_data_for_xgb.head()

Unnamed: 0,Var109,Var112,Var113,Var119,Var123,Var125,Var126,Var13,Var132,Var133,...,Var7,Var72,Var73,Var74,Var76,Var78,Var81,Var83,Var85,Var94
10715,8.0,8.0,146743.2,380.0,48.0,3744.0,36.0,432.0,0.0,559890.0,...,7.0,3.0,60,49.0,1482600.0,0.0,203490.0,15.0,0.0,350409.0
4415,40.0,48.0,359235.2,520.0,36.0,16353.0,,5352.0,0.0,7389550.0,...,14.0,3.0,88,126.0,4147200.0,0.0,219513.6,15.0,0.0,348.0
17876,232.0,312.0,931.284,1285.0,126.0,810.0,,760.0,0.0,131115.0,...,7.0,3.0,54,378.0,102800.0,0.0,3734.97,70.0,54.0,594.0
4227,,0.0,108100.8,305.0,0.0,145836.0,40.0,1276.0,0.0,0.0,...,7.0,,142,14.0,0.0,0.0,4476.09,0.0,0.0,
6471,40.0,48.0,125829.2,535.0,30.0,0.0,4.0,0.0,0.0,5973500.0,...,0.0,3.0,16,0.0,4147200.0,0.0,180290.4,20.0,8.0,245631.0


In [54]:
#for name_cat in right_cat_column:
train_data_for_xgb[right_cat_column] = train_data_for_xgb[right_cat_column].astype("category")

In [56]:
model_classifier_new = xgb.XGBClassifier(eval_metric='mlogloss', max_cat_to_onehot=2,
                                          enable_categorical=True)

In [57]:
cv_xgb_new = cross_val_score(model_classifier_new, train_data_for_xgb, train_target, cv=cv_strategy, scoring='roc_auc')

In [58]:
cv_xgb_new

array([nan, nan, nan, nan, nan])

In [60]:
model_classifier_new.fit(train_data_for_xgb, train_target)

ValueError: Experimental support for categorical data is not implemented for current tree method yet.