In [1]:
import xgboost as xgb
from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedShuffleSplit
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer

# Download data

In [2]:
data = pd.read_csv('./data/orange_small_churn_train_data.csv')

In [3]:
data.head()

Unnamed: 0,ID,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,labels
0,0,,,,,,3052.0,,,,...,vr93T2a,LM8l689qOp,,,fKCe,02N6s8f,xwM2aC7IdeMC0,,,-1.0
1,1,,,,,,1813.0,7.0,,,...,6hQ9lNX,LM8l689qOp,,ELof,xb3V,RAYp,55YFVY9,mj86,,-1.0
2,2,,,,,,1953.0,7.0,,,...,catzS2D,LM8l689qOp,,,FSa2,ZI9m,ib5G6X1eUxUn6,mj86,,-1.0
3,3,,,,,,1533.0,7.0,,,...,e4lqvY0,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,,1.0
4,4,,,,,,686.0,7.0,,,...,MAz3HNj,LM8l689qOp,,,WqMG,RAYp,F2FyR07IdsN7I,,,-1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18299 entries, 0 to 18298
Columns: 232 entries, ID to labels
dtypes: float64(192), int64(2), object(38)
memory usage: 32.4+ MB


In [5]:
# check null in features
dict_nan = {name: np.sum(data[name].isnull()) for name in data.columns}
dict_nan

{'ID': 0,
 'Var1': 18038,
 'Var2': 17831,
 'Var3': 17832,
 'Var4': 17736,
 'Var5': 17799,
 'Var6': 1991,
 'Var7': 1976,
 'Var8': 18299,
 'Var9': 18038,
 'Var10': 17799,
 'Var11': 17832,
 'Var12': 18085,
 'Var13': 1976,
 'Var14': 17832,
 'Var15': 18299,
 'Var16': 17799,
 'Var17': 17736,
 'Var18': 17736,
 'Var19': 17736,
 'Var20': 18299,
 'Var21': 1991,
 'Var22': 1792,
 'Var23': 17799,
 'Var24': 2616,
 'Var25': 1792,
 'Var26': 17799,
 'Var27': 17799,
 'Var28': 1793,
 'Var29': 18038,
 'Var30': 18038,
 'Var31': 18299,
 'Var32': 18299,
 'Var33': 18010,
 'Var34': 17831,
 'Var35': 1792,
 'Var36': 17831,
 'Var37': 17736,
 'Var38': 1792,
 'Var39': 18299,
 'Var40': 17831,
 'Var41': 18038,
 'Var42': 18299,
 'Var43': 17831,
 'Var44': 1792,
 'Var45': 18171,
 'Var46': 17831,
 'Var47': 18038,
 'Var48': 18299,
 'Var49': 17831,
 'Var50': 18038,
 'Var51': 16888,
 'Var52': 18299,
 'Var53': 18038,
 'Var54': 17831,
 'Var55': 18299,
 'Var56': 18055,
 'Var57': 0,
 'Var58': 18038,
 'Var59': 17990,
 'Var60': 1

In [6]:
# check disbalance in classes
print(data['labels'].value_counts())
print(data['labels'].unique())

-1.0    16921
 1.0     1377
Name: labels, dtype: int64
[-1.  1. nan]


In [7]:
data['labels'].replace(-1, 0, inplace=True)
data.dropna(subset=['labels'], inplace=True)
labels = data['labels']
data = data.iloc[:,:-1]

In [8]:
labels.value_counts()

0.0    16921
1.0     1377
Name: labels, dtype: int64

# Simple preprocessing feature

In [9]:
# drop feture with null more half of dataset
def check_feat_without(data, columns, threshold = 0):
    flag_nan_col = [np.sum(data[name].isnull()) < threshold for name in columns]
    return columns[flag_nan_col]
    

In [10]:
numeric_col = data.columns[1:190]
cat_col = data.columns[190:]
print(len(numeric_col))
print(len(cat_col))

189
41


In [11]:
right_numeric_column = check_feat_without(data, numeric_col, threshold = len(data)//2)
right_cat_column = check_feat_without(data, cat_col, threshold = len(data)//2)
print(len(right_numeric_column))
print(len(right_cat_column))

41
28


In [12]:
# check unique in cat feat
for cur_cat in right_cat_column:
    count_uniq = len(data[cur_cat].unique())
    print(f'{cur_cat}: {count_uniq}')

Var192: 326
Var193: 44
Var195: 21
Var196: 4
Var197: 207
Var198: 2721
Var199: 2639
Var202: 4673
Var203: 4
Var204: 100
Var205: 4
Var206: 22
Var207: 12
Var208: 3
Var210: 6
Var211: 2
Var212: 71
Var216: 1247
Var217: 7944
Var218: 3
Var219: 20
Var220: 2721
Var221: 7
Var222: 2721
Var223: 5
Var226: 23
Var227: 7
Var228: 29


In [13]:
# check unique in num feat
for cur_num in right_numeric_column:
    count_uniq = len(data[cur_num].unique())
    print(f'{cur_num}: {count_uniq}')

Var6: 1039
Var7: 7
Var13: 1877
Var21: 498
Var22: 498
Var24: 63
Var25: 190
Var28: 2513
Var35: 10
Var38: 11908
Var44: 6
Var57: 14013
Var65: 13
Var72: 9
Var73: 129
Var74: 284
Var76: 11377
Var78: 13
Var81: 15952
Var83: 130
Var85: 109
Var94: 8571
Var109: 149
Var112: 158
Var113: 17900
Var119: 1003
Var123: 191
Var125: 6217
Var126: 52
Var132: 18
Var133: 14165
Var134: 12639
Var140: 1859
Var143: 5
Var144: 11
Var149: 7524
Var153: 14567
Var160: 273
Var163: 9049
Var173: 4
Var181: 7


In [14]:
data['Var173'].unique()

array([ 0., nan,  2.,  4.])

# FILL NAN

We will use different method to fill on nan. The quality will be checked on validation roc auc xgboost 

In [15]:
# статистика среднее* ...
# ближайшие соседи*
# матричное разложение
# как прогноз по другим фичам*
# метод в деревьях

SyntaxError: invalid syntax (<ipython-input-15-28c1b7062389>, line 2)

In [15]:
model_classifier = xgb.XGBClassifier()

In [58]:
# class encoder cat
class LabelTransformer():
#     def __init__(self):
#         return self

    def fit(self, X, y=None):
        self.df = pd.DataFrame(X)
        self.columns_trans = self.df.columns.to_numpy()
        self.encoder_columns = dict()
        self.unique_fitted_value = dict()
        for cur_col in self.columns_trans:
            unique_val = np.append(self.df[cur_col].unique(), 'unk_val')
            self.unique_fitted_value[cur_col] = unique_val
            self.encoder_columns[cur_col] = LabelEncoder().fit(unique_val)
        return self

    def transform(self, X, y=None):
        self.df = pd.DataFrame(X)
        self.columns_trans = self.df.columns.to_numpy()
        for cur_col in self.columns_trans:
            
            unique_val_trans = self.df[cur_col].unique()
            unique_val_trans = unique_val_trans[~pd.isna(unique_val_trans)]
            
            unique_val_train = self.unique_fitted_value[cur_col]
            unique_val_train = unique_val_train[~pd.isna(unique_val_train)]
            
            dissapear_val = np.setdiff1d(unique_val_trans, unique_val_train)
            self.df.loc[self.df[cur_col].isin(dissapear_val), cur_col] = 'unk_val'
            self.df[cur_col] = self.encoder_columns[cur_col].transform(self.df[cur_col])
            
        return self.df


In [74]:
pipeline_numeric = Pipeline(steps =[
    ('selecting_num', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_numeric_column])),
    ('fill_nan_num', SimpleImputer(strategy='mean')),
    ('scaler_num', StandardScaler())
])

pipeline_cat = Pipeline(steps = [
    ('selecting_cat', preprocessing.FunctionTransformer(lambda data: data.loc[:, right_cat_column])),
    ('fill_nan_cat', SimpleImputer(strategy='most_frequent')),
    ('encoder_cat', LabelTransformer())
])

In [75]:
estimator = Pipeline(steps=[
    ('feature_preprocessing', FeatureUnion(transformer_list=[
        ('numeric_variables_processing', pipeline_numeric),
        ('category_variables_processing', pipeline_cat)
    ])),
    ('model',model_classifier)
])

In [76]:
train_data, test_data, train_target, test_target = train_test_split(data, labels, random_state = 0, test_size=0.3, shuffle=True,
                                                                    stratify=labels)

In [77]:
estimator.fit(train_data, train_target)



Pipeline(steps=[('feature_preprocessing',
                 FeatureUnion(transformer_list=[('numeric_variables_processing',
                                                 Pipeline(steps=[('selecting_num',
                                                                  FunctionTransformer(func=<function <lambda> at 0x0000021C8E8E8820>)),
                                                                 ('fill_nan_num',
                                                                  SimpleImputer(strategy='median')),
                                                                 ('scaler_num',
                                                                  StandardScaler())])),
                                                ('category_variables_processing',
                                                 Pipeline(steps=[('selecting_c...
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='',
           

In [78]:
predicted = estimator.predict(test_data)

In [79]:
roc_auc_score(predicted, test_target)

0.6219658176497972

In [80]:
cv_strategy = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

In [81]:
cv_xgb = cross_val_score(estimator, train_data, train_target, cv=cv_strategy, scoring='f1')



In [82]:
cv_xgb 

array([0.00662252, 0.03947368, 0.02675585, 0.02614379, 0.01973684])