In [1]:
import pandas as pd

In [2]:
train_data = pd.read_csv('./../data/train.csv', dtype={
                    'cust_id': 'int64',
                    'gender': 'category',
                    'age': 'int8',
                    'driving_license': 'category',
                    'region_code': 'category',
                    'previously_insured': 'category',
                    'vehicle_age': 'category',
                    'vehicle_damage': 'category',
                    'annual_premium': 'float16',
                    'policy_sales_channel': 'category',
                    'days_since_insured': 'int8',
                    'response': 'bool',
                 })

In [3]:
train_data.head()

Unnamed: 0,cust_id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,days_since_insured,response
0,167647,Male,22,1,7.0,1,< 1 Year,No,52.59375,152.0,16,False
1,17163,Male,42,1,28.0,0,1-2 Year,Yes,866.5,26.0,-121,False
2,32023,Female,66,1,33.0,0,1-2 Year,Yes,717.0,124.0,-3,False
3,87447,Female,22,1,33.0,0,< 1 Year,No,553.0,152.0,69,False
4,501933,Male,28,1,46.0,1,< 1 Year,No,580.5,152.0,-45,False


In [4]:
from pycaret.classification import *

### Experiment: Using the companyId as the categorical feature

In [5]:
reg = setup(data=train_data, target='response', ignore_features=['cust_id'],
            categorical_features=['gender', 'driving_license', 'previously_insured', 'vehicle_age', 'vehicle_damage'],
            high_cardinality_features=['policy_sales_channel', 'region_code'], experiment_name='churn1') 

Unnamed: 0,Description,Value
0,session_id,2400
1,Target,response
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(382154, 12)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,True


In [6]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [7]:
# best = compare_models(include=['nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'rf', 'qda', 'ada', 'gbc', 'et', 'xgboost', 'lightgbm', 'catboost', 'dummy'], fold=3)
best = compare_models(include=['nb', 'dt', 'svm', 'gpc', 'mlp', 'rf', 'qda', 'ada', 'gbc', 'et', 'xgboost', 'lightgbm', 'catboost', 'dummy'], fold=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8464,0.8957,0.3929,0.5423,0.4556,0.3689,0.3754,1.1733
catboost,CatBoost Classifier,0.8457,0.8954,0.4036,0.5378,0.4611,0.3732,0.3785,14.5167
gbc,Gradient Boosting Classifier,0.8452,0.8937,0.3015,0.5494,0.3893,0.3092,0.3275,8.03
ada,Ada Boost Classifier,0.8423,0.8904,0.2682,0.5366,0.3572,0.2786,0.3006,3.5867
rf,Random Forest Classifier,0.8391,0.8818,0.384,0.5109,0.4384,0.3467,0.3515,7.7233
dummy,Dummy Classifier,0.8364,0.5,0.0,0.0,0.0,0.0,0.0,0.2267
et,Extra Trees Classifier,0.836,0.8753,0.3868,0.4985,0.4356,0.3414,0.3452,5.77
mlp,MLP Classifier,0.8245,0.838,0.2686,0.3735,0.2445,0.1842,0.2005,249.22
nb,Naive Bayes,0.8204,0.8138,0.3729,0.4422,0.4046,0.2998,0.3013,2.4633
dt,Decision Tree Classifier,0.8134,0.6647,0.4434,0.4317,0.4374,0.3256,0.3257,1.7733


## Experiment: Using only region 28

In [8]:
train_data['region_code'].value_counts()

28.0    107199
8.0      33941
46.0     20203
41.0     19090
15.0     13071
30.0     12742
29.0     11650
50.0     10176
11.0      9392
3.0       9276
36.0      8619
33.0      8008
35.0      7254
47.0      7251
6.0       6565
45.0      5555
18.0      5392
37.0      5016
14.0      4884
10.0      4509
39.0      4476
21.0      4431
48.0      4333
13.0      4030
2.0       3723
7.0       3284
12.0      3162
9.0       2936
32.0      2714
43.0      2580
27.0      2544
17.0      2517
25.0      2471
26.0      2438
24.0      2385
38.0      2014
23.0      2005
16.0      2001
0.0       1839
4.0       1825
31.0      1752
20.0      1710
49.0      1709
34.0      1543
19.0      1473
22.0      1261
40.0      1242
5.0       1213
1.0        940
44.0       784
42.0       567
52.0       263
51.0       196
Name: region_code, dtype: int64

In [9]:
train_data_28 = train_data.loc[train_data['region_code'] == '28.0']

In [10]:
train_data_28.shape

(107199, 12)

In [11]:
train_data_28 = train_data_28.loc[:, train_data_28.columns!='region_code']

In [12]:
train_data_28.head()

Unnamed: 0,cust_id,gender,age,driving_license,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,days_since_insured,response
1,17163,Male,42,1,0,1-2 Year,Yes,866.5,26.0,-121,False
7,331781,Male,38,1,1,1-2 Year,No,52.59375,26.0,51,False
8,290704,Female,45,1,0,1-2 Year,Yes,1118.0,124.0,6,False
9,344792,Male,42,1,0,1-2 Year,Yes,556.0,122.0,-39,True
12,441774,Male,33,1,0,< 1 Year,Yes,508.75,122.0,-91,True


In [13]:
clf = setup(data=train_data_28, target='response', ignore_features=['cust_id'],
            categorical_features=['gender', 'driving_license', 'previously_insured', 'vehicle_age', 'vehicle_damage'],
            high_cardinality_features=['policy_sales_channel'], experiment_name='churn1') 

Unnamed: 0,Description,Value
0,session_id,1013
1,Target,response
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(107199, 11)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,True


In [14]:
best = compare_models(fold=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7556,0.8269,0.4749,0.5401,0.5053,0.3439,0.3452,5.2233
lightgbm,Light Gradient Boosting Machine,0.755,0.8283,0.4911,0.5374,0.5132,0.35,0.3506,0.3767
gbc,Gradient Boosting Classifier,0.7546,0.8274,0.4118,0.5439,0.4686,0.3131,0.3182,1.7733
ada,Ada Boost Classifier,0.7538,0.8238,0.4157,0.5413,0.4699,0.3133,0.318,0.5933
rf,Random Forest Classifier,0.7455,0.8103,0.4443,0.5188,0.4787,0.3117,0.3133,1.34
lr,Logistic Regression,0.7431,0.8038,0.2039,0.5331,0.2921,0.1729,0.2036,0.2733
lda,Linear Discriminant Analysis,0.7425,0.8097,0.2155,0.5251,0.3056,0.1801,0.2071,0.11
et,Extra Trees Classifier,0.7409,0.8019,0.4468,0.5084,0.4756,0.3045,0.3056,1.2667
ridge,Ridge Classifier,0.7396,0.0,0.0882,0.5282,0.1511,0.0821,0.1291,0.09
dummy,Dummy Classifier,0.7371,0.5,0.0,0.0,0.0,0.0,0.0,0.0833


# Experiment: Using region not 28

In [16]:
train_data_not_28 = train_data.loc[train_data['region_code'] != '28.0']

In [17]:
train_data_not_28.shape

(274955, 12)

In [18]:
train_data_not_28 = train_data_not_28.loc[:, train_data_not_28.columns!='region_code']

In [19]:
clf = setup(data=train_data_not_28, target='response', ignore_features=['cust_id'],
            categorical_features=['gender', 'driving_license', 'previously_insured', 'vehicle_age', 'vehicle_damage'],
            high_cardinality_features=['policy_sales_channel'], experiment_name='churn1') 

Unnamed: 0,Description,Value
0,session_id,8484
1,Target,response
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(274955, 11)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,True


In [20]:
best = compare_models(fold=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8792,0.9108,0.2561,0.539,0.3465,0.2893,0.314,0.8233
catboost,CatBoost Classifier,0.8791,0.9097,0.3013,0.5316,0.3846,0.3232,0.3393,10.9233
gbc,Gradient Boosting Classifier,0.8786,0.9103,0.1958,0.544,0.2877,0.2372,0.2747,4.7333
ada,Ada Boost Classifier,0.8771,0.907,0.1157,0.5483,0.1905,0.1537,0.21,1.62
ridge,Ridge Classifier,0.8746,0.0,0.0,0.0,0.0,0.0,0.0,0.1333
dummy,Dummy Classifier,0.8746,0.5,0.0,0.0,0.0,0.0,0.0,0.1533
lda,Linear Discriminant Analysis,0.873,0.8887,0.1832,0.4833,0.2656,0.2112,0.2414,0.3
lr,Logistic Regression,0.8725,0.8286,0.1373,0.3145,0.1912,0.1519,0.1678,2.6333
rf,Random Forest Classifier,0.8682,0.8887,0.3483,0.466,0.3987,0.3264,0.3308,3.6267
et,Extra Trees Classifier,0.8652,0.8743,0.3545,0.4522,0.3974,0.3227,0.3258,3.23


# Experiment: Using region 8

In [21]:
train_data_8 = train_data.loc[train_data['region_code'] == '8.0']

In [22]:
train_data_8 = train_data_8.loc[:, train_data_8.columns!='region_code']

In [23]:
clf = setup(data=train_data_8, target='response', ignore_features=['cust_id'],
            categorical_features=['gender', 'driving_license', 'previously_insured', 'vehicle_age', 'vehicle_damage'],
            high_cardinality_features=['policy_sales_channel'], experiment_name='churn1') 

Unnamed: 0,Description,Value
0,session_id,4175
1,Target,response
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(33941, 11)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,True


In [24]:
best = compare_models(fold=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8738,0.9091,0.138,0.5033,0.216,0.171,0.2144,0.5067
dummy,Dummy Classifier,0.8737,0.5,0.0,0.0,0.0,0.0,0.0,0.0233
ridge,Ridge Classifier,0.8736,0.0,0.0,0.0,0.0,-0.0002,-0.002,0.02
ada,Ada Boost Classifier,0.8731,0.9059,0.1936,0.4949,0.2776,0.2226,0.2526,0.1933
lightgbm,Light Gradient Boosting Machine,0.8731,0.906,0.2519,0.4946,0.3335,0.2716,0.2906,0.1233
catboost,CatBoost Classifier,0.8725,0.9052,0.2766,0.4916,0.354,0.2894,0.3041,2.6833
lr,Logistic Regression,0.8707,0.8908,0.09,0.4416,0.149,0.1113,0.1539,1.9933
rf,Random Forest Classifier,0.869,0.8945,0.3156,0.4725,0.3783,0.3083,0.3162,0.3
lda,Linear Discriminant Analysis,0.8689,0.8874,0.125,0.434,0.1938,0.1457,0.1798,0.0267
et,Extra Trees Classifier,0.8666,0.8891,0.3282,0.4604,0.3832,0.3107,0.3164,0.2733


# Experiment using region 46

In [25]:
train_data_46 = train_data.loc[train_data['region_code'] == '46.0']

In [26]:
train_data_46 = train_data_46.loc[:, train_data_46.columns!='region_code']

In [27]:
clf = setup(data=train_data_46, target='response', ignore_features=['cust_id'],
            categorical_features=['gender', 'driving_license', 'previously_insured', 'vehicle_age', 'vehicle_damage'],
            high_cardinality_features=['policy_sales_channel'], experiment_name='churn1') 

Unnamed: 0,Description,Value
0,session_id,4838
1,Target,response
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(20203, 11)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,True


In [28]:
best = compare_models(fold=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8697,0.9079,0.2564,0.5146,0.3422,0.2788,0.2998,0.3
ada,Ada Boost Classifier,0.8682,0.9025,0.2768,0.5016,0.3564,0.2898,0.3059,0.13
ridge,Ridge Classifier,0.8679,0.0,0.0,0.0,0.0,0.0,0.0,0.0133
dummy,Dummy Classifier,0.8679,0.5,0.0,0.0,0.0,0.0,0.0,0.0167
rf,Random Forest Classifier,0.8674,0.8933,0.3598,0.4982,0.4173,0.3447,0.3508,0.22
catboost,CatBoost Classifier,0.8672,0.9045,0.341,0.4973,0.4041,0.3323,0.3401,1.6933
lr,Logistic Regression,0.8665,0.8904,0.1312,0.4812,0.206,0.1584,0.1991,2.0433
lightgbm,Light Gradient Boosting Machine,0.8659,0.9021,0.3576,0.4906,0.4132,0.3397,0.3453,0.0833
lda,Linear Discriminant Analysis,0.8642,0.889,0.219,0.4696,0.2987,0.2343,0.2554,0.02
et,Extra Trees Classifier,0.863,0.8878,0.3673,0.4761,0.4142,0.3382,0.3421,0.1833


# Experiment using the region code 41

In [29]:
train_data_41 = train_data.loc[train_data['region_code'] == '41.0']

In [30]:
train_data_41 = train_data_41.loc[:, train_data_41.columns!='region_code']

In [31]:
clf = setup(data=train_data_41, target='response', ignore_features=['cust_id'],
            categorical_features=['gender', 'driving_license', 'previously_insured', 'vehicle_age', 'vehicle_damage'],
            high_cardinality_features=['policy_sales_channel'], experiment_name='churn1') 

Unnamed: 0,Description,Value
0,session_id,4581
1,Target,response
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(19090, 11)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,True


In [32]:
best = compare_models(fold=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8528,0.9093,0.5193,0.5384,0.5282,0.4411,0.4415,0.27
ada,Ada Boost Classifier,0.8523,0.9077,0.5221,0.5371,0.5293,0.4418,0.442,0.13
lda,Linear Discriminant Analysis,0.8503,0.9012,0.5936,0.5262,0.5577,0.468,0.4694,0.0233
catboost,CatBoost Classifier,0.8485,0.9055,0.5014,0.5255,0.5129,0.4233,0.4236,1.6
lr,Logistic Regression,0.8478,0.9017,0.3932,0.5306,0.4508,0.3648,0.3708,1.9867
lightgbm,Light Gradient Boosting Machine,0.8461,0.9023,0.5146,0.5163,0.5152,0.4238,0.4239,0.0933
rf,Random Forest Classifier,0.8443,0.8984,0.4459,0.5124,0.4768,0.3859,0.3872,0.1967
dummy,Dummy Classifier,0.8409,0.5,0.0,0.0,0.0,0.0,0.0,0.0167
et,Extra Trees Classifier,0.8396,0.8947,0.452,0.4956,0.4728,0.3785,0.379,0.1967
ridge,Ridge Classifier,0.837,0.0,0.0193,0.3047,0.0363,0.0177,0.0402,0.0167


In [34]:
train_data_generic = train_data.loc[~train_data['region_code'].isin(['41.0', '28.0', '8.0', '46.0'])]

In [35]:
train_data_generic.shape

(201721, 12)

In [36]:
train_data_generic['region_code'].unique()

['7.0', '33.0', '25.0', '39.0', '13.0', ..., '17.0', '51.0', '0.0', '23.0', '44.0']
Length: 49
Categories (53, object): ['0.0', '1.0', '10.0', '11.0', ..., '6.0', '7.0', '8.0', '9.0']

In [37]:
clf = setup(data=train_data_generic, target='response', ignore_features=['cust_id'],
            categorical_features=['gender', 'driving_license', 'previously_insured', 'vehicle_age', 'vehicle_damage'],
            high_cardinality_features=['policy_sales_channel'], experiment_name='churn1') 

Unnamed: 0,Description,Value
0,session_id,3572
1,Target,response
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(201721, 12)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,True


In [38]:
best = compare_models(fold=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8863,0.9154,0.3261,0.5483,0.4089,0.3505,0.365,0.71
catboost,CatBoost Classifier,0.8854,0.915,0.3479,0.5389,0.4228,0.3624,0.3732,9.3033
gbc,Gradient Boosting Classifier,0.8851,0.9139,0.1868,0.5745,0.2816,0.2364,0.2813,6.8033
ada,Ada Boost Classifier,0.8841,0.9113,0.2572,0.5416,0.3486,0.2937,0.3185,2.0933
lda,Linear Discriminant Analysis,0.8806,0.8987,0.3167,0.5083,0.3903,0.328,0.3393,0.8
rf,Random Forest Classifier,0.88,0.902,0.3347,0.5042,0.4023,0.3386,0.3474,4.35
ridge,Ridge Classifier,0.8793,0.0,0.0,0.0,0.0,-0.0,-0.001,0.19
dummy,Dummy Classifier,0.8793,0.5,0.0,0.0,0.0,0.0,0.0,0.15
lr,Logistic Regression,0.8779,0.8327,0.1555,0.3207,0.2094,0.1703,0.1839,0.84
et,Extra Trees Classifier,0.8736,0.8887,0.353,0.4685,0.4026,0.3335,0.3377,6.63


In [39]:
train_data_with_small_regions = train_data.loc[~train_data['region_code'].isin(['41.0', '28.0', '8.0', '46.0'])]

In [41]:
clf = setup(data=train_data_with_small_regions, target='response', ignore_features=['cust_id'],
            categorical_features=['gender', 'driving_license', 'previously_insured', 'vehicle_age', 'vehicle_damage'],
            high_cardinality_features=['policy_sales_channel', 'region_code'], experiment_name='churn1') 

Unnamed: 0,Description,Value
0,session_id,4022
1,Target,response
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(201721, 12)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,True


In [42]:
best = compare_models(fold=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8858,0.9148,0.3125,0.548,0.3976,0.3397,0.3564,0.65
catboost,CatBoost Classifier,0.8854,0.9143,0.3404,0.5401,0.4175,0.3575,0.3693,9.2667
gbc,Gradient Boosting Classifier,0.8853,0.9136,0.2345,0.5601,0.3302,0.2788,0.3109,3.8433
ada,Ada Boost Classifier,0.883,0.9096,0.1848,0.5469,0.275,0.2282,0.2689,1.2767
ridge,Ridge Classifier,0.8793,0.0,0.0,0.0,0.0,0.0,0.0,0.18
rf,Random Forest Classifier,0.8793,0.9013,0.319,0.5001,0.3894,0.3261,0.3363,2.38
dummy,Dummy Classifier,0.8793,0.5,0.0,0.0,0.0,0.0,0.0,0.1133
lda,Linear Discriminant Analysis,0.8779,0.8904,0.1899,0.4857,0.2728,0.2199,0.2492,0.2
et,Extra Trees Classifier,0.8769,0.8952,0.3239,0.4848,0.3883,0.3228,0.331,2.2233
lr,Logistic Regression,0.8766,0.8915,0.2163,0.4761,0.2973,0.24,0.2627,2.1933
