In [1]:
!pip install catboost
!pip install category_encoders



In [2]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from category_encoders import WOEEncoder
from sklearn.metrics import f1_score

In [3]:
train = pd.read_csv('churn_train.csv',index_col=0)
test = pd.read_csv('churn_test.csv',index_col=0)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128058 entries, 0 to 128057
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   age                  128058 non-null  float64
 1   marital_status       128053 non-null  float64
 2   job_position         128058 non-null  float64
 3   credit_sum           128057 non-null  float64
 4   credit_length        128058 non-null  float64
 5   scoring              128055 non-null  float64
 6   education            128055 non-null  float64
 7   tariff_id            128058 non-null  float64
 8   region               127910 non-null  float64
 9   average_region_wage  127910 non-null  float64
 10  income               128057 non-null  float64
 11  credit_count         121183 non-null  float64
 12  overdue_count        121183 non-null  float64
 13  churn                128058 non-null  int64  
dtypes: float64(13), int64(1)
memory usage: 14.7 MB


In [5]:
train.isna().mean()

age                    0.000000
marital_status         0.000039
job_position           0.000000
credit_sum             0.000008
credit_length          0.000000
scoring                0.000023
education              0.000023
tariff_id              0.000000
region                 0.001156
average_region_wage    0.001156
income                 0.000008
credit_count           0.053687
overdue_count          0.053687
churn                  0.000000
dtype: float64

In [6]:
train = train.fillna(0)

In [7]:
train

Unnamed: 0,age,marital_status,job_position,credit_sum,credit_length,scoring,education,tariff_id,region,average_region_wage,income,credit_count,overdue_count,churn
0,38.0,3.0,15.0,12340.91,12.0,0.365170,2.0,20.0,73.0,56000.0,43000.0,1.0,0.0,0
1,33.0,4.0,14.0,12118.00,10.0,0.507244,4.0,2.0,46.0,28160.0,20000.0,0.0,0.0,0
2,50.0,4.0,14.0,11691.00,6.0,0.382806,4.0,25.0,4.0,27390.0,12000.0,1.0,0.0,0
3,39.0,4.0,14.0,87467.00,12.0,0.508112,4.0,2.0,29.0,55880.0,80000.0,2.0,0.0,0
4,65.0,3.0,15.0,44879.00,10.0,0.491978,4.0,28.0,17.0,27060.0,45000.0,3.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128053,45.0,3.0,15.0,45978.00,10.0,0.429848,4.0,2.0,22.0,22550.0,25000.0,3.0,0.0,0
128054,59.0,4.0,10.0,27980.00,14.0,0.446021,2.0,1.0,62.0,27060.0,10500.0,0.0,0.0,0
128055,27.0,3.0,14.0,22756.00,15.0,0.520948,4.0,2.0,46.0,28160.0,25000.0,0.0,0.0,0
128056,36.0,3.0,14.0,55189.00,12.0,0.457322,2.0,2.0,63.0,45430.0,70000.0,2.0,0.0,0


In [8]:
train.describe()

Unnamed: 0,age,marital_status,job_position,credit_sum,credit_length,scoring,education,tariff_id,region,average_region_wage,income,credit_count,overdue_count,churn
count,128058.0,128058.0,128058.0,128058.0,128058.0,128058.0,128058.0,128058.0,128058.0,128058.0,128058.0,128058.0,128058.0,128058.0
mean,36.502444,3.196747,13.153969,26108.292673,10.981469,0.469499,3.202002,14.136563,42.936849,32903.113667,40099.100369,1.991746,0.043699,0.174585
std,10.557304,0.739799,3.142273,16206.853671,3.533701,0.124303,1.064161,11.548063,22.280543,12627.415453,24738.110978,1.790338,0.206137,0.379613
min,18.0,0.0,1.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.0,3.0,14.0,14919.0,10.0,0.379602,2.0,2.0,25.0,25520.0,25000.0,1.0,0.0,0.0
50%,34.0,3.0,14.0,21256.0,10.0,0.461519,4.0,19.0,40.0,27390.0,35000.0,2.0,0.0,0.0
75%,43.0,4.0,14.0,32068.0,12.0,0.552366,4.0,28.0,63.0,42460.0,50000.0,3.0,0.0,0.0
max,71.0,5.0,18.0,200000.0,36.0,1.128291,5.0,32.0,82.0,66880.0,950000.0,21.0,3.0,1.0


In [9]:
train.nunique().sort_values()

churn                      2
overdue_count              4
marital_status             6
education                  6
job_position              18
credit_count              21
credit_length             31
tariff_id                 32
age                       54
average_region_wage       65
region                    82
income                  1400
scoring                15607
credit_sum             36674
dtype: int64

In [10]:
train.churn.value_counts(normalize=True)

0    0.825415
1    0.174585
Name: churn, dtype: float64

In [11]:
train.groupby(['marital_status'])[['job_position','credit_sum','scoring','income']].agg(['mean','median'])

Unnamed: 0_level_0,job_position,job_position,credit_sum,credit_sum,scoring,scoring,income,income
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median
marital_status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.0,14.2,14.0,17999.876,19791.0,0.392169,0.356591,38800.0,36000.0
1.0,13.190041,14.0,26083.983145,21560.0,0.477458,0.472814,39423.128585,35000.0
2.0,13.086585,14.0,24702.261844,19994.0,0.472756,0.465877,39275.19229,35000.0
3.0,13.087867,14.0,25592.245541,20739.0,0.452773,0.443861,41479.382046,35000.0
4.0,13.33434,14.0,27749.863614,23016.0,0.497582,0.492142,38523.779827,35000.0
5.0,12.520092,14.0,22367.602752,18519.0,0.474652,0.469823,31319.431688,28000.0


In [12]:
train.groupby(['job_position'])[['credit_sum','scoring','income']].agg(['mean','median'])

Unnamed: 0_level_0,credit_sum,credit_sum,scoring,scoring,income,income
Unnamed: 0_level_1,mean,median,mean,median,mean,median
job_position,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1.0,21250.146623,17376.0,0.496555,0.492488,29549.405291,25000.0
2.0,32680.570323,26807.0,0.464027,0.455692,58552.981503,50000.0
3.0,26657.652333,17884.5,0.481349,0.476865,40928.888889,35000.0
4.0,34601.215295,28886.0,0.435689,0.4249,67574.815168,55000.0
5.0,14470.285714,12419.0,0.496721,0.44971,31428.571429,25000.0
6.0,29440.554624,22334.0,0.440666,0.447469,65463.978495,50000.0
7.0,15726.22,14778.0,0.578385,0.603547,15666.666667,12000.0
8.0,23248.013247,18558.0,0.512329,0.520279,36666.311688,32000.0
9.0,13900.0,13900.0,0.286377,0.286377,17000.0,17000.0
10.0,19810.640575,16841.175,0.471233,0.464093,22583.967398,19800.0


In [13]:
X = train.drop('churn',axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, train['churn'], test_size=0.2, random_state=42)

In [15]:
cat_features = ['marital_status','job_position', 'education', 'region']
for col in cat_features:
  X_train[col]= X_train[col].astype('str')

In [16]:
woe_enc = WOEEncoder(cols=cat_features, random_state=17).fit(X_train, y_train)
X_train_woe = woe_enc.transform(X_train.reset_index(drop=True))
X_test_woe = woe_enc.transform(X_test.reset_index(drop=True))

In [17]:
model = CatBoostClassifier(
   iterations=1e4, depth=5,
   loss_function='Logloss',
   verbose=True, metric_period=int(1e3))

model.fit(X_train_woe, y_train,eval_set=(X_test_woe,y_test))

Learning rate set to 0.036518
0:	learn: 0.6709753	test: 0.6709815	best: 0.6709815 (0)	total: 181ms	remaining: 30m 12s
1000:	learn: 0.3863447	test: 0.4050145	best: 0.4050145 (1000)	total: 1m 32s	remaining: 13m 53s
2000:	learn: 0.3747071	test: 0.4050774	best: 0.4050145 (1000)	total: 2m 49s	remaining: 11m 16s
3000:	learn: 0.3647733	test: 0.4059754	best: 0.4050145 (1000)	total: 4m 32s	remaining: 10m 34s
4000:	learn: 0.3560575	test: 0.4069742	best: 0.4050145 (1000)	total: 5m 41s	remaining: 8m 32s
5000:	learn: 0.3481595	test: 0.4081829	best: 0.4050145 (1000)	total: 7m 20s	remaining: 7m 20s
6000:	learn: 0.3410028	test: 0.4094760	best: 0.4050145 (1000)	total: 9m 1s	remaining: 6m 1s
7000:	learn: 0.3342859	test: 0.4108190	best: 0.4050145 (1000)	total: 10m 33s	remaining: 4m 31s
8000:	learn: 0.3280003	test: 0.4118384	best: 0.4050145 (1000)	total: 11m 49s	remaining: 2m 57s
9000:	learn: 0.3220558	test: 0.4131630	best: 0.4050145 (1000)	total: 13m 13s	remaining: 1m 28s
9999:	learn: 0.3163116	test: 0.4

<catboost.core.CatBoostClassifier at 0x7f56be866cb0>

In [18]:
y_pred = model.predict(X_test_woe)

In [19]:
f1_score(y_test, y_pred, average='weighted')

0.7656838783859802