In [1]:
import pandas as pd

In [2]:
train_data = pd.read_csv('./../data/train.csv', dtype={
                    'cust_id': 'int64',
                    'gender': 'category',
                    'age': 'int8',
                    'driving_license': 'category',
                    'region_code': 'category',
                    'previously_insured': 'category',
                    'vehicle_age': 'category',
                    'vehicle_damage': 'category',
                    'annual_premium': 'float16',
                    'policy_sales_channel': 'category',
                    'days_since_insured': 'int8',
                    'response': 'bool',
                 })

In [3]:
train_data.head()

Unnamed: 0,cust_id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,days_since_insured,response
0,167647,Male,22,1,7.0,1,< 1 Year,No,52.59375,152.0,16,False
1,17163,Male,42,1,28.0,0,1-2 Year,Yes,866.5,26.0,-121,False
2,32023,Female,66,1,33.0,0,1-2 Year,Yes,717.0,124.0,-3,False
3,87447,Female,22,1,33.0,0,< 1 Year,No,553.0,152.0,69,False
4,501933,Male,28,1,46.0,1,< 1 Year,No,580.5,152.0,-45,False


In [4]:
from pycaret.classification import *

### Experiment: Using the companyId as the categorical feature

In [5]:
reg = setup(data=train_data, target='response', ignore_features=['cust_id'],
            categorical_features=['gender', 'driving_license', 'previously_insured', 'vehicle_age', 'vehicle_damage'],
            high_cardinality_features=['policy_sales_channel', 'region_code'], experiment_name='churn1') 

Unnamed: 0,Description,Value
0,session_id,8900
1,Target,response
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(382154, 12)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,True


In [6]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [None]:
# best = compare_models(include=['nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'rf', 'qda', 'ada', 'gbc', 'et', 'xgboost', 'lightgbm', 'catboost', 'dummy'], fold=3)
best = compare_models(include=['nb', 'dt', 'svm', 'gpc', 'mlp', 'rf', 'qda', 'ada', 'gbc', 'et', 'xgboost', 'lightgbm', 'catboost', 'dummy'], fold=3)

IntProgress(value=0, description='Processing: ', max=79)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.8206,0.8124,0.3742,0.4423,0.4054,0.3007,0.3021,1.5167
dt,Decision Tree Classifier,0.8135,0.6641,0.4417,0.4311,0.4363,0.3246,0.3247,1.0933
svm,SVM - Linear Kernel,0.6098,0.0,0.3352,0.1181,0.0971,-0.0022,-0.0062,8.3367




In [9]:
best

[LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=4096, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                           init=None, learning_rate=0.1, loss='ls', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                 

In [None]:
## comparing with the other boosting algorithms

In [15]:
best = compare_models(n_select=3, fold=5, exclude = ['rf', 'et'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,15.368,358.1481,18.9248,0.7613,0.158,0.1372,2.79
xgboost,Extreme Gradient Boosting,15.406,360.2976,18.9815,0.7599,0.1585,0.1374,54.642
gbr,Gradient Boosting Regressor,15.6569,375.5804,19.3799,0.7497,0.1627,0.141,70.988
lr,Linear Regression,15.8467,384.5224,19.6092,0.7437,0.1662,0.1424,3.356
ridge,Ridge Regression,15.8467,384.523,19.6092,0.7437,0.1662,0.1424,0.328
br,Bayesian Ridge,15.8467,384.5229,19.6092,0.7437,0.1662,0.1424,7.508
huber,Huber Regressor,15.8374,385.5695,19.6359,0.743,0.1654,0.1415,34.092
omp,Orthogonal Matching Pursuit,17.5081,485.5239,22.0346,0.6764,0.1873,0.1586,0.382
lasso,Lasso Regression,17.8893,496.7546,22.2879,0.6689,0.1954,0.1678,1.184
knn,K Neighbors Regressor,17.6717,501.157,22.3865,0.666,0.1882,0.1579,44.652


## Experiment: Comparing models by making companyId as a high cardinality feature

In [10]:
reg1 = setup(data=train_data, target='salary', ignore_features=['jobId'], high_cardinality_features = ['companyId'], log_experiment=True, experiment_name='salary2') 

Unnamed: 0,Description,Value
0,session_id,2015
1,Target,salary
2,Original Data,"(1000000, 9)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,5
6,Ordinal Features,False
7,High Cardinality Features,True
8,High Cardinality Method,frequency
9,Transformed Train Set,"(699999, 32)"


In [18]:
best1 = compare_models(n_select=3, fold=5, exclude = ['rf', 'et', 'knn'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,15.3087,355.0953,18.844,0.7636,0.1573,0.1365,26.792
lightgbm,Light Gradient Boosting Machine,15.3609,357.9536,18.9197,0.7617,0.1582,0.1372,1.798
xgboost,Extreme Gradient Boosting,15.3851,359.4595,18.9594,0.7607,0.1584,0.1373,26.754
gbr,Gradient Boosting Regressor,15.6577,375.5186,19.3783,0.75,0.1628,0.1411,36.782
lr,Linear Regression,15.8448,384.4681,19.6078,0.7441,0.1663,0.1424,1.542
ridge,Ridge Regression,15.8448,384.4675,19.6078,0.7441,0.1663,0.1424,0.15
br,Bayesian Ridge,15.8448,384.4675,19.6078,0.7441,0.1663,0.1424,1.028
lar,Least Angle Regression,15.8479,384.6529,19.6126,0.744,0.1665,0.1425,0.164
huber,Huber Regressor,17.5853,484.9315,21.8727,0.6771,0.1901,0.162,10.534
lasso,Lasso Regression,17.884,496.4574,22.2813,0.6695,0.1955,0.1678,0.852


In [19]:
best1

[<catboost.core.CatBoostRegressor at 0x1b29550c4f0>,
 LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=5179, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min

In [21]:
evaluate_model(best1[0])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [22]:
save_model(best1[0], 'my_best_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['jobId'], id_columns=[],
                                       ml_usecase='regression',
                                       numerical_features=[], target='salary',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_st...
                 ('binn', 'passthrough'), ('rem_outliers', 'passthrough'),
                 ('cluster_all', 'passthrough'),
                 ('dummy', Dummify(target='salary')),
                 ('fix_perfect', Remove_100(target='salary')),
                 ('clean_names', 

## Experiment: Using Polynomial Features

In [6]:
reg3 = setup(data=train_data, target='salary', ignore_features=['jobId'], high_cardinality_features = ['companyId'],
            log_experiment=True, polynomial_features = True, experiment_name='salary3') 

Unnamed: 0,Description,Value
0,session_id,1848
1,Target,salary
2,Original Data,"(1000000, 9)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,5
6,Ordinal Features,False
7,High Cardinality Features,True
8,High Cardinality Method,frequency
9,Transformed Train Set,"(699999, 34)"


In [7]:
best3 = compare_models(n_select=3, fold=5, exclude = ['rf', 'et', 'knn'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,15.3212,355.5497,18.856,0.7631,0.1572,0.1366,27.184
lightgbm,Light Gradient Boosting Machine,15.3737,358.446,18.9326,0.7611,0.1582,0.1373,1.912
xgboost,Extreme Gradient Boosting,15.407,360.2882,18.9812,0.7599,0.1585,0.1374,29.794
gbr,Gradient Boosting Regressor,15.6689,376.0139,19.391,0.7494,0.1629,0.1412,41.95
lr,Linear Regression,15.8574,384.9111,19.6191,0.7435,0.1663,0.1425,1.636
ridge,Ridge Regression,15.8559,384.8427,19.6173,0.7435,0.1663,0.1425,0.174
lar,Least Angle Regression,15.8571,384.8776,19.6182,0.7435,0.1663,0.1425,0.164
br,Bayesian Ridge,15.8559,384.8427,19.6173,0.7435,0.1663,0.1425,1.112
lasso,Lasso Regression,17.9035,497.3785,22.3019,0.6686,0.1956,0.168,1.362
huber,Huber Regressor,20.4629,648.2498,25.4572,0.568,0.2309,0.1961,12.188
