In [51]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedShuffleSplit
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Download data

In [52]:
data = pd.read_csv('./data/orange_small_churn_train_data.csv')

In [53]:
data.head()

Unnamed: 0,ID,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,labels
0,0,,,,,,3052.0,,,,...,vr93T2a,LM8l689qOp,,,fKCe,02N6s8f,xwM2aC7IdeMC0,,,-1.0
1,1,,,,,,1813.0,7.0,,,...,6hQ9lNX,LM8l689qOp,,ELof,xb3V,RAYp,55YFVY9,mj86,,-1.0
2,2,,,,,,1953.0,7.0,,,...,catzS2D,LM8l689qOp,,,FSa2,ZI9m,ib5G6X1eUxUn6,mj86,,-1.0
3,3,,,,,,1533.0,7.0,,,...,e4lqvY0,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,,1.0
4,4,,,,,,686.0,7.0,,,...,MAz3HNj,LM8l689qOp,,,WqMG,RAYp,F2FyR07IdsN7I,,,-1.0


In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18299 entries, 0 to 18298
Columns: 232 entries, ID to labels
dtypes: float64(192), int64(2), object(38)
memory usage: 32.4+ MB


In [55]:
# check null in features
dict_nan = {name: np.sum(data[name].isnull()) for name in data.columns}
dict_nan

{'ID': 0,
 'Var1': 18038,
 'Var2': 17831,
 'Var3': 17832,
 'Var4': 17736,
 'Var5': 17799,
 'Var6': 1991,
 'Var7': 1976,
 'Var8': 18299,
 'Var9': 18038,
 'Var10': 17799,
 'Var11': 17832,
 'Var12': 18085,
 'Var13': 1976,
 'Var14': 17832,
 'Var15': 18299,
 'Var16': 17799,
 'Var17': 17736,
 'Var18': 17736,
 'Var19': 17736,
 'Var20': 18299,
 'Var21': 1991,
 'Var22': 1792,
 'Var23': 17799,
 'Var24': 2616,
 'Var25': 1792,
 'Var26': 17799,
 'Var27': 17799,
 'Var28': 1793,
 'Var29': 18038,
 'Var30': 18038,
 'Var31': 18299,
 'Var32': 18299,
 'Var33': 18010,
 'Var34': 17831,
 'Var35': 1792,
 'Var36': 17831,
 'Var37': 17736,
 'Var38': 1792,
 'Var39': 18299,
 'Var40': 17831,
 'Var41': 18038,
 'Var42': 18299,
 'Var43': 17831,
 'Var44': 1792,
 'Var45': 18171,
 'Var46': 17831,
 'Var47': 18038,
 'Var48': 18299,
 'Var49': 17831,
 'Var50': 18038,
 'Var51': 16888,
 'Var52': 18299,
 'Var53': 18038,
 'Var54': 17831,
 'Var55': 18299,
 'Var56': 18055,
 'Var57': 0,
 'Var58': 18038,
 'Var59': 17990,
 'Var60': 1

In [56]:
# check disbalance in classes
print(data['labels'].value_counts())
print(data['labels'].unique())

-1.0    16921
 1.0     1377
Name: labels, dtype: int64
[-1.  1. nan]


In [57]:
data['labels'].replace(-1, 0, inplace=True)
data.dropna(subset=['labels'], inplace=True)
labels = data['labels']
data = data.iloc[:,:-1]

In [58]:
labels.value_counts()

0.0    16921
1.0     1377
Name: labels, dtype: int64

# Preprocessing

In [59]:
# drop feture with null more half of dataset
def check_feat_without(data, columns, threshold = 0):
    flag_nan_col = [np.sum(data[name].isnull()) < threshold for name in columns]
    return columns[flag_nan_col]
    

In [60]:
numeric_col = data.columns[1:190]
cat_col = data.columns[190:]
print(len(numeric_col))
print(len(cat_col))

189
41


In [61]:
right_numeric_column = check_feat_without(data, numeric_col, threshold = len(data)//2)
right_cat_column = check_feat_without(data, cat_col, threshold = len(data)//2)
print(len(right_numeric_column))
print(len(right_cat_column))

41
28


In [62]:
right_numeric_column

Index(['Var6', 'Var7', 'Var13', 'Var21', 'Var22', 'Var24', 'Var25', 'Var28',
       'Var35', 'Var38', 'Var44', 'Var57', 'Var65', 'Var72', 'Var73', 'Var74',
       'Var76', 'Var78', 'Var81', 'Var83', 'Var85', 'Var94', 'Var109',
       'Var112', 'Var113', 'Var119', 'Var123', 'Var125', 'Var126', 'Var132',
       'Var133', 'Var134', 'Var140', 'Var143', 'Var144', 'Var149', 'Var153',
       'Var160', 'Var163', 'Var173', 'Var181'],
      dtype='object')

In [63]:
# balance class by undersampling
count_add = int((len(labels[labels == 0]) - len(labels[labels == 1]))*0.7)
index_for_balance = labels[labels == 1].index
add_index = np.random.choice(index_for_balance, size = count_add)
data_add = data.iloc[add_index]
labels_add = labels.iloc[add_index]

data = pd.concat([data, data_add])
labels = pd.concat([labels, labels_add])

In [64]:
# # balance class by oversampling
# count_add = int((len(labels[labels == 0]) - len(labels[labels == 1]))*0.9)
# index_for_balance = labels[labels == 0].index
# drop_index = np.random.choice(index_for_balance, size = count_add, replace=False)

# data.drop(index=drop_index, inplace=True)
# labels.drop(index=drop_index, inplace=True)

In [65]:
labels.value_counts()

0.0    16921
1.0    12257
Name: labels, dtype: int64

In [66]:
data = data[np.union1d(right_numeric_column, right_cat_column)]
train_data, test_data, train_target, test_target = train_test_split(data, labels, random_state = 0, test_size=0.3, shuffle=True,
                                                                    stratify=labels)

In [67]:
train_data

Unnamed: 0,Var109,Var112,Var113,Var119,Var123,Var125,Var126,Var13,Var132,Var133,...,Var7,Var72,Var73,Var74,Var76,Var78,Var81,Var83,Var85,Var94
5449,32.0,48.0,118872.8,515.0,30.0,711.0,,13636.0,0.0,7776000.0,...,35.0,3.0,140,1169.0,4147200.0,0.0,211070.40,10.0,0.0,948.0
9197,56.0,32.0,-161317.2,705.0,12.0,7281.0,64.0,276.0,0.0,65535.0,...,7.0,3.0,88,35.0,18496.0,0.0,4154.01,0.0,10.0,13638.0
12627,32.0,24.0,139156.0,610.0,42.0,738.0,4.0,308.0,8.0,140615.0,...,7.0,3.0,30,7.0,361912.0,3.0,128677.80,10.0,6.0,708.0
14964,24.0,32.0,442564.0,1595.0,66.0,60183.0,8.0,328.0,8.0,2872460.0,...,14.0,3.0,34,231.0,2764800.0,0.0,35546.10,15.0,0.0,5769.0
3874,32.0,72.0,823760.0,630.0,60.0,0.0,4.0,0.0,0.0,4548765.0,...,0.0,3.0,16,0.0,1586384.0,0.0,147838.50,35.0,4.0,18918.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12950,80.0,64.0,-164135.6,515.0,36.0,5805.0,-8.0,40.0,0.0,3169555.0,...,7.0,3.0,132,0.0,1234952.0,0.0,94897.20,30.0,24.0,5316.0
9880,48.0,64.0,177897.6,785.0,0.0,0.0,4.0,0.0,0.0,727525.0,...,0.0,,18,0.0,317520.0,0.0,21875.01,0.0,10.0,
10095,24.0,40.0,23133.0,565.0,54.0,0.0,-8.0,0.0,0.0,5704950.0,...,0.0,3.0,18,0.0,3456000.0,0.0,135768.90,10.0,0.0,81975.0
3654,32.0,8.0,-672668.0,425.0,66.0,4554.0,10.0,12.0,24.0,570075.0,...,7.0,3.0,36,0.0,5608.0,9.0,117028.80,25.0,10.0,144.0


In [68]:
# Prerocessing numeric features
numeric_data = train_data[right_numeric_column]

# fill on null by mean
mean_data = numeric_data.mean(axis='index')

for i, name in enumerate(right_numeric_column):
    numeric_data[name].fillna(mean_data[i], inplace=True)
    
scaler = StandardScaler().fit(numeric_data)
scaled_numeric_data = scaler.transform(numeric_data)

In [69]:
# Preprocessing cat features
cat_data = train_data[right_cat_column]
transform_cat_data = cat_data.copy()
dict_encoders = dict()
for name in cat_data.columns:
    unique_val = np.append(cat_data[name].unique(), 'unk_val')
    encoder_cur = LabelEncoder().fit(unique_val)
    dict_encoders[name] = encoder_cur
    transform_cat_data[name] = encoder_cur.transform(cat_data[name])


In [70]:
train_data = np.hstack([scaled_numeric_data, transform_cat_data.to_numpy()])

In [71]:
train_data.shape

(20424, 69)

# Modelling

In [72]:
cv_strategy = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

In [73]:
cv_res = cross_val_score(LogisticRegression(), train_data, train_target, cv=cv_strategy, scoring='f1')

In [74]:
cv_res

array([0.19724483, 0.1837388 , 0.07784649, 0.186875  , 0.22376357])

In [75]:
cv_xgb = cross_val_score(xgb.XGBClassifier(), train_data, train_target, cv=cv_strategy, scoring='f1')

In [76]:
cv_xgb

array([0.93388585, 0.93851377, 0.93758201, 0.94128752, 0.9386064 ])

In [77]:
# preprocessing test data 

# Prerocessing numeric features test
numeric_data_test = test_data[right_numeric_column]

for i, name in enumerate(right_numeric_column):
    numeric_data_test[name].fillna(mean_data[i], inplace=True)
    
scaled_numeric_data_test = scaler.transform(numeric_data_test)

# Prerocessing cat features test
cat_data_test = test_data[right_cat_column]
transform_cat_data_test = cat_data_test.copy()

for name in cat_data_test.columns:
    unique_val_test = cat_data_test[name].unique()
    unique_val_test = unique_val_test[~pd.isna(unique_val_test)]
    
    unique_val_train = cat_data[name].unique()
    unique_val_train = unique_val_train[~pd.isna(unique_val_train)]
        
    dissapear_val = np.setdiff1d(unique_val_test, unique_val_train)
    cat_data_test[cat_data_test[name].isin(dissapear_val)] = 'unk_val'
    transform_cat_data_test[name] = dict_encoders[name].transform(cat_data_test[name])

In [78]:
test_data = np.hstack([scaled_numeric_data_test, transform_cat_data_test.to_numpy()])

In [79]:
model_xgb = xgb.XGBClassifier().fit(train_data, train_target)
predicted_train = model_xgb.predict(train_data)
predicted_test = model_xgb.predict(test_data)

In [80]:
print(classification_report(predicted_train, train_target))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     11775
           1       0.99      0.98      0.99      8649

    accuracy                           0.99     20424
   macro avg       0.99      0.99      0.99     20424
weighted avg       0.99      0.99      0.99     20424



In [81]:
print(roc_auc_score(predicted_train, train_target))

0.989493354778322


In [82]:
print(classification_report(predicted_test, test_target))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4963
           1       0.98      0.95      0.96      3791

    accuracy                           0.97      8754
   macro avg       0.97      0.97      0.97      8754
weighted avg       0.97      0.97      0.97      8754



In [83]:
print(roc_auc_score(predicted_test, test_target))

0.9658881685963867


# check competition

In [34]:
data_comp = pd.read_csv('./data/orange_small_churn_test_data.csv')

In [35]:
data_comp.head()

Unnamed: 0,ID,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,0,,,,,,1225.0,7.0,,,...,zCkv,APgdzOv,jySVZNlOJy,,ELof,xb3V,6fzt,Zy3gnGM,,
1,1,,,,,,896.0,14.0,,,...,oslk,IIvC99a,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,
2,2,,,,,,791.0,7.0,,,...,oslk,6YSocsg,LM8l689qOp,,kG3k,rgKb,RAYp,F2FyR07IdsN7I,mj86,
3,3,,,,,,2296.0,7.0,,,...,oslk,5nQ7A2G,jySVZNlOJy,,kG3k,rgKb,RAYp,F2FyR07IdsN7I,am7c,
4,4,8.0,,,,,,,,28.0,...,oslk,MI8s5nE,LM8l689qOp,,,7P5s,RAYp,F2FyR07IdsN7I,,


In [123]:
data_comp = data_comp[np.union1d(right_numeric_column, right_cat_column)]

In [124]:
# Prerocessing numeric features test
numeric_data_comp = data_comp[right_numeric_column]

for i, name in enumerate(right_numeric_column):
    numeric_data_comp[name].fillna(mean_data[i], inplace=True)
    
scaled_numeric_data_comp = scaler.transform(numeric_data_comp)

# Prerocessing cat features test
cat_data_comp = data_comp[right_cat_column]
transform_cat_data_comp = cat_data_comp.copy()

for name in cat_data_comp.columns:
    unique_val_comp = cat_data_comp[name].unique()
    unique_val_comp = unique_val_comp[~pd.isna(unique_val_comp)]
    
    unique_val_train = cat_data[name].unique()
    unique_val_train = unique_val_train[~pd.isna(unique_val_train)]
        
    dissapear_val = np.setdiff1d(unique_val_comp, unique_val_train)
    cat_data_comp[cat_data_comp[name].isin(dissapear_val)] = 'unk_val'
    transform_cat_data_comp[name] = dict_encoders[name].transform(cat_data_comp[name])

In [125]:
comp_data = np.hstack([scaled_numeric_data_comp, transform_cat_data_comp.to_numpy()])

In [126]:
predicted_prob = model_xgb.predict_proba(comp_data)

In [127]:
predicted_prob

array([[0.8651889 , 0.1348111 ],
       [0.9489334 , 0.05106656],
       [0.87363195, 0.12636805],
       ...,
       [0.85287714, 0.14712289],
       [0.9737031 , 0.02629693],
       [0.9728888 , 0.0271112 ]], dtype=float32)

In [128]:
out_df = pd.DataFrame(enumerate(predicted_prob[:,1]), columns=['Id', 'result'])
out_df.to_csv('output_df.csv', sep=',', index=False)

We can use undersampling or oversampling but in both variant model is overfitted. ROC_AUC equals less 0.65 in data test competiton it can show about data shift