### These notebook details the model building using Catboost for actual payment predictions:

***1) Feature selection***

***2) Categorical feature transformation***

***3) Group K-Fold to avoid leakage in evaluation***

***4) Model building and predictions***

In [1]:
from sklearn.metrics import f1_score
from lightgbm import LGBMRegressor
from catboost import CatBoostClassifier
import warnings
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, StratifiedKFold, GroupKFold
from sklearn.metrics import balanced_accuracy_score, auc, mean_squared_error, roc_curve, confusion_matrix, precision_score, recall_score, f1_score,\
log_loss, roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import random
pd.set_option("display.max_columns",30)
rand = 40
random.seed(40)
np.random.seed(40)

In [2]:
data = pd.read_pickle("./data/data_prep.pkl", compression="gzip")
ss = pd.read_csv('./data/SampleSubmission.csv')

In [3]:
train_status = pd.read_csv("./data/Noleak Train Loan Status V2.csv")

test_status = pd.read_csv("./data/Noleak Test Loan Status V2.csv")

train_status = train_status.drop(['Unnamed: 0'], axis = 1)

test_status = test_status.drop(['Unnamed: 0'], axis = 1)

In [4]:
train_status

Unnamed: 0,ID,Target
0,ID_0IWQNPI,0
1,ID_IY8SYB9,1
2,ID_3TMLZ41,1
3,ID_GPL8VO8,1
4,ID_KMK2R2J,1
...,...,...
28002,ID_U45UBV4,1
28003,ID_M2BYCIV,1
28004,ID_C3ICA30,1
28005,ID_4EH9O8V,1


In [5]:
train_status.head()

Unnamed: 0,ID,Target
0,ID_0IWQNPI,0
1,ID_IY8SYB9,1
2,ID_3TMLZ41,1
3,ID_GPL8VO8,1
4,ID_KMK2R2J,1


In [6]:
test_status.head()

Unnamed: 0,ID,Target
0,ID_6L67PAA,1
1,ID_VJ80SX2,1
2,ID_7OU9HLK,0
3,ID_WVWTPGK,1
4,ID_04DSDQS,1


In [7]:
data.head()

Unnamed: 0,ID,Target m Month,Target m Payment,AccessoryRate,Region_ payment_std,Payment_window_until6months,Age,Occupation_ payment_sum,age_group_ payment_mean,Occupation_ payment_max,MainApplicantGender_ payment_sum,TotalContract/Term,age_group_ payment_min,rateTypeEntity_ payment_max,Town_ payment_mean,...,paymentmax_mstats,paymentstd_mstats,paymentsum_mstats,paymentlen_mstats,paymentmonth_m,paymentmedian_ystats,paymentmean_ystats,paymentmin_ystats,paymentmax_ystats,paymentstd_ystats,paymentsum_ystats,paymentlen_ystats,paymentyear_m,TermDateto_m,TermDatetoratio_m
0,ID_000RHRU,m1,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,11660.0,45.494505,220.0,2885.0,984.040404,...,480.0,140.0,680.0,2.0,11.0,120.0,210.0,40.0,700.0,224.425192,1260.0,6.0,2019.0,-458,0.932314
1,ID_000RHRU,m2,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,11660.0,45.494505,220.0,2885.0,984.040404,...,640.0,220.0,840.0,2.0,12.0,120.0,210.0,40.0,700.0,224.425192,1260.0,6.0,2019.0,-488,0.875
2,ID_000RHRU,m3,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,11660.0,45.494505,220.0,2885.0,984.040404,...,700.0,187.283208,1980.0,4.0,1.0,,,,,,,,2020.0,-519,0.822736
3,ID_000RHRU,m4,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,11660.0,45.494505,220.0,2885.0,984.040404,...,640.0,225.430144,1290.0,4.0,2.0,,,,,,,,2020.0,-550,0.776364
4,ID_000RHRU,m5,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,11660.0,45.494505,220.0,2885.0,984.040404,...,120.0,20.0,200.0,2.0,3.0,,,,,,,,2020.0,-579,0.737478


In [8]:
data.columns

Index(['ID', 'Target m Month', 'Target m Payment', 'AccessoryRate',
       'Region_ payment_std', 'Payment_window_until6months', 'Age',
       'Occupation_ payment_sum', 'age_group_ payment_mean',
       'Occupation_ payment_max',
       ...
       'paymentmedian_ystats', 'paymentmean_ystats', 'paymentmin_ystats',
       'paymentmax_ystats', 'paymentstd_ystats', 'paymentsum_ystats',
       'paymentlen_ystats', 'paymentyear_m', 'TermDateto_m',
       'TermDatetoratio_m'],
      dtype='object', length=128)

In [9]:
data.dtypes

ID                      object
Target m Month          object
Target m Payment       float64
AccessoryRate          float64
Region_ payment_std    float64
                        ...   
paymentsum_ystats      float64
paymentlen_ystats      float64
paymentyear_m          float64
TermDateto_m             int64
TermDatetoratio_m      float64
Length: 128, dtype: object

### Feature Selection

In [10]:
data = data.drop(['MainApplicantGender_ payment_std', 'MainApplicantGender', 'Term_brackets_ payment_max',\
                  'MainApplicantGender_ payment_mean','rateTypeEntity_ payment_sum','LastTransactionDate day',\
                  'rateTypeEntity', 'age_group_ payment_median','rateTypeEntity_ payment_mean',\
                  'Occupation_ payment_mean', 'age_group_ payment_max', 'Term_brackets_ payment_std',\
                  'MainApplicantGender_ payment_sum', 'Term_brackets_ payment_mean',\
                  'rateTypeEntity_ payment_median','MainApplicantGender_ payment_max','FirstPaymentDate year',\
                  'rateTypeEntity_ payment_max','Term_brackets','rateTypeEntity_ payment_std',\
                  'MainApplicantGender_ payment_median', 'RegisteredAtMonthEnd', 'Occupation_ payment_min'], axis = 1)

In [11]:
data = data.drop(['upsell_month', 'RegisteredInLeapYear', 'RegistrationDate year', 'RegisteredAtMonthStart', \
                 'RegistrationDate month', 'LastTransactionDate month', 'FirstPaymentDate day', \
                  'RegistrationDate day', 'Term', 'First_payment', 'FirstPaymentDate month', \
                  'payment_mean', 'payment_std', 'payment_median', 'paymentmedian_ystats',
                  'paymentmean_ystats', 'paymentmin_ystats', 'paymentmax_ystats',
                  'paymentstd_ystats', 'paymentsum_ystats', 'paymentlen_ystats', \
                  'paymentyear_m', 'TermDatetoratio_m', 'TermDatetoLastPayment', \
                  'LastTransactionDate year', 'TotalContract/Term', 'Payment_miss_count', \
                  'Tot_Amt_by_contractTenure', 'overpaid', 'Av_pay_miss', \
                  'Payment_miss_sum', 'Contract_Rate_vs_Average_pay', 'ContractRate_month'], axis = 1)

In [12]:
data.head()

Unnamed: 0,ID,Target m Month,Target m Payment,AccessoryRate,Region_ payment_std,Payment_window_until6months,Age,Occupation_ payment_sum,age_group_ payment_mean,Occupation_ payment_max,age_group_ payment_min,Town_ payment_mean,userpay_frequency,paymenthist_month 5,age_group_ payment_sum,...,Occupation,CurrentDuration,Town_ payment_max,Term_brackets_ payment_min,rateTypeEntity_ payment_min,DaysOnDeposit,paymentmedian_mstats,paymentmean_mstats,paymentmin_mstats,paymentmax_mstats,paymentstd_mstats,paymentsum_mstats,paymentlen_mstats,paymentmonth_m,TermDateto_m
0,ID_000RHRU,m1,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,Farmer,790,2720.0,280.0,240.0,7,340.0,340.0,200.0,480.0,140.0,680.0,2.0,11.0,-458
1,ID_000RHRU,m2,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,Farmer,790,2720.0,280.0,240.0,7,420.0,420.0,200.0,640.0,220.0,840.0,2.0,12.0,-488
2,ID_000RHRU,m3,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,Farmer,790,2720.0,280.0,240.0,7,540.0,495.0,200.0,700.0,187.283208,1980.0,4.0,1.0,-519
3,ID_000RHRU,m4,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,Farmer,790,2720.0,280.0,240.0,7,305.0,322.5,40.0,640.0,225.430144,1290.0,4.0,2.0,-550
4,ID_000RHRU,m5,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,Farmer,790,2720.0,280.0,240.0,7,100.0,100.0,80.0,120.0,20.0,200.0,2.0,3.0,-579


In [13]:
data.columns

Index(['ID', 'Target m Month', 'Target m Payment', 'AccessoryRate',
       'Region_ payment_std', 'Payment_window_until6months', 'Age',
       'Occupation_ payment_sum', 'age_group_ payment_mean',
       'Occupation_ payment_max', 'age_group_ payment_min',
       'Town_ payment_mean', 'userpay_frequency', 'paymenthist_month 5',
       'age_group_ payment_sum', 'Term_brackets_ payment_median',
       'Town_ payment_min', 'MainApplicantGender_ payment_min',
       'payedsum/TotalContract', 'paymenthist_month 2', 'monthshist_month 5',
       'Region_ payment_mean', 'Contract/Payhist window',
       'Region_ payment_max', 'Term_brackets_ payment_sum',
       'Region_ payment_min', 'Contract/until6monthswindow',
       'Region_ payment_median', 'Town', 'payhist_len', 'monthshist_month 2',
       'paymenthist_month 1', 'payLeft', 'monthshist_month 4',
       'paymenthist_month 3', 'Town_ payment_sum', 'Town_ payment_std',
       'Deposit', 'monthshist_month 1', 'RatePerUnit', 'is_upsell',
  

In [14]:
#Extract categorical

cat_cols = []

for col in data.columns:
    
    if data[col].dtype == 'O':
        
        cat_cols.append(col)

In [15]:
cat_cols.remove('tag')
cat_cols.remove('ID')

In [16]:
cat_cols

['Target m Month', 'Town', 'Region', 'age_group', 'Occupation']

In [17]:
data.columns

Index(['ID', 'Target m Month', 'Target m Payment', 'AccessoryRate',
       'Region_ payment_std', 'Payment_window_until6months', 'Age',
       'Occupation_ payment_sum', 'age_group_ payment_mean',
       'Occupation_ payment_max', 'age_group_ payment_min',
       'Town_ payment_mean', 'userpay_frequency', 'paymenthist_month 5',
       'age_group_ payment_sum', 'Term_brackets_ payment_median',
       'Town_ payment_min', 'MainApplicantGender_ payment_min',
       'payedsum/TotalContract', 'paymenthist_month 2', 'monthshist_month 5',
       'Region_ payment_mean', 'Contract/Payhist window',
       'Region_ payment_max', 'Term_brackets_ payment_sum',
       'Region_ payment_min', 'Contract/until6monthswindow',
       'Region_ payment_median', 'Town', 'payhist_len', 'monthshist_month 2',
       'paymenthist_month 1', 'payLeft', 'monthshist_month 4',
       'paymenthist_month 3', 'Town_ payment_sum', 'Town_ payment_std',
       'Deposit', 'monthshist_month 1', 'RatePerUnit', 'is_upsell',
  

In [18]:
data.isnull().sum()

ID                         0
Target m Month             0
Target m Payment       56016
AccessoryRate              0
Region_ payment_std    11604
                       ...  
paymentstd_mstats      61399
paymentsum_mstats      61399
paymentlen_mstats      61399
paymentmonth_m             0
TermDateto_m               0
Length: 72, dtype: int64

In [19]:
for col in cat_cols:

    data[col] = pd.factorize(data[col])[0]

In [20]:
for col in cat_cols:
    
    print(data[col].value_counts())

5    37343
4    37343
3    37343
2    37343
1    37343
0    37343
Name: Target m Month, dtype: int64
 2     12588
-1     11604
 11    11466
 24    10548
 10    10242
 12     9936
 28     9708
 4      9630
 5      9162
 23     8406
 34     7764
 3      7470
 1      7410
 8      6924
 14     6510
 0      6204
 20     5448
 6      5316
 7      5280
 33     5220
 17     5088
 19     4932
 22     4668
 13     4326
 30     4104
 16     3942
 9      3714
 41     3582
 26     3048
 21     2262
 31     2178
 35     1854
 44     1746
 25     1590
 36     1476
 18     1386
 42     1344
 29      978
 27      948
 37      708
 32      612
 39      528
 43      468
 45      432
 15      402
 40      336
 46      288
 38      282
Name: Town, dtype: int64
 3    41988
 0    32064
 4    30336
 6    28554
 1    28026
 2    27048
 5    24438
-1    11604
Name: Region, dtype: int64
 0    65208
 3    43554
-1    41634
 2    39738
 1    24408
 4     6816
 5     2700
Name: age_group, dtype: int64
1    74280
0 

In [21]:
data.head()

Unnamed: 0,ID,Target m Month,Target m Payment,AccessoryRate,Region_ payment_std,Payment_window_until6months,Age,Occupation_ payment_sum,age_group_ payment_mean,Occupation_ payment_max,age_group_ payment_min,Town_ payment_mean,userpay_frequency,paymenthist_month 5,age_group_ payment_sum,...,Occupation,CurrentDuration,Town_ payment_max,Term_brackets_ payment_min,rateTypeEntity_ payment_min,DaysOnDeposit,paymentmedian_mstats,paymentmean_mstats,paymentmin_mstats,paymentmax_mstats,paymentstd_mstats,paymentsum_mstats,paymentlen_mstats,paymentmonth_m,TermDateto_m
0,ID_000RHRU,0,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,0,790,2720.0,280.0,240.0,7,340.0,340.0,200.0,480.0,140.0,680.0,2.0,11.0,-458
1,ID_000RHRU,1,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,0,790,2720.0,280.0,240.0,7,420.0,420.0,200.0,640.0,220.0,840.0,2.0,12.0,-488
2,ID_000RHRU,2,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,0,790,2720.0,280.0,240.0,7,540.0,495.0,200.0,700.0,187.283208,1980.0,4.0,1.0,-519
3,ID_000RHRU,3,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,0,790,2720.0,280.0,240.0,7,305.0,322.5,40.0,640.0,225.430144,1290.0,4.0,2.0,-550
4,ID_000RHRU,4,,0.0,592.652439,27,30.0,11990.0,1017.071429,2880.0,220.0,984.040404,0.851852,200.0,11080.0,...,0,790,2720.0,280.0,240.0,7,100.0,100.0,80.0,120.0,20.0,200.0,2.0,3.0,-579


In [22]:
data.columns

Index(['ID', 'Target m Month', 'Target m Payment', 'AccessoryRate',
       'Region_ payment_std', 'Payment_window_until6months', 'Age',
       'Occupation_ payment_sum', 'age_group_ payment_mean',
       'Occupation_ payment_max', 'age_group_ payment_min',
       'Town_ payment_mean', 'userpay_frequency', 'paymenthist_month 5',
       'age_group_ payment_sum', 'Term_brackets_ payment_median',
       'Town_ payment_min', 'MainApplicantGender_ payment_min',
       'payedsum/TotalContract', 'paymenthist_month 2', 'monthshist_month 5',
       'Region_ payment_mean', 'Contract/Payhist window',
       'Region_ payment_max', 'Term_brackets_ payment_sum',
       'Region_ payment_min', 'Contract/until6monthswindow',
       'Region_ payment_median', 'Town', 'payhist_len', 'monthshist_month 2',
       'paymenthist_month 1', 'payLeft', 'monthshist_month 4',
       'paymenthist_month 3', 'Town_ payment_sum', 'Town_ payment_std',
       'Deposit', 'monthshist_month 1', 'RatePerUnit', 'is_upsell',
  

In [23]:
train = data[data['tag'] == 'train']

test = data[data['tag'] == 'test']


train = train.drop(['tag'], axis = 1)

test = test.drop(['tag'], axis = 1)

In [24]:
train = train.merge(train_status, on= 'ID', how = 'left')

In [25]:
test = test.merge(test_status, on= 'ID', how = 'left')

In [26]:
train.shape, test.shape

((168042, 72), (56016, 72))

In [27]:
train.reset_index(drop = True, inplace = True)

test.reset_index(drop = True, inplace = True)

In [28]:
train = train.sample(random_state = rand, frac = 1)

In [29]:
train.head()

Unnamed: 0,ID,Target m Month,Target m Payment,AccessoryRate,Region_ payment_std,Payment_window_until6months,Age,Occupation_ payment_sum,age_group_ payment_mean,Occupation_ payment_max,age_group_ payment_min,Town_ payment_mean,userpay_frequency,paymenthist_month 5,age_group_ payment_sum,...,CurrentDuration,Town_ payment_max,Term_brackets_ payment_min,rateTypeEntity_ payment_min,DaysOnDeposit,paymentmedian_mstats,paymentmean_mstats,paymentmin_mstats,paymentmax_mstats,paymentstd_mstats,paymentsum_mstats,paymentlen_mstats,paymentmonth_m,TermDateto_m,Target
29697,ID_6CWFB46,3,1750.0,0.0,614.635021,14,36.0,11707.0,1017.071429,3000.0,220.0,1074.714815,1.0,1525.0,11080.0,...,386,3200.0,200.0,240.0,3,1300.0,1300.0,1300.0,1300.0,0.0,1300.0,1.0,9.0,46,1
93769,ID_K5N1RC2,1,2200.0,0.0,609.040759,8,64.0,11990.0,1112.727273,2880.0,300.0,980.769231,0.75,4400.0,11377.5,...,197,3000.0,200.0,240.0,3,4400.0,4400.0,4400.0,4400.0,0.0,4400.0,1.0,1.0,297,1
84805,ID_I7U5Z0Y,1,500.0,0.0,595.421606,28,67.0,11990.0,1134.888889,2880.0,360.0,1115.555556,1.0,40.0,11405.0,...,817,2660.0,280.0,240.0,7,120.0,120.0,40.0,200.0,80.0,240.0,2.0,5.0,-515,0
32993,ID_73JJQD2,5,384.0,0.0,614.635021,20,,11320.0,,2850.0,,1074.714815,1.0,1076.0,,...,562,3200.0,200.0,240.0,3,970.0,970.0,495.0,1445.0,475.0,1940.0,2.0,5.0,-189,1
115803,ID_OSOK3IB,3,230.0,0.0,652.399418,11,36.0,11140.0,1017.071429,2850.0,220.0,1248.444444,1.0,460.0,11080.0,...,286,3302.5,280.0,240.0,3,1170.0,1170.0,1170.0,1170.0,0.0,1170.0,1.0,3.0,-137,0


***Splitting the dataset according to GroupKFold(to avoid data leakage)***

In [30]:
group_by_user_id = train.groupby(['ID', 'Target m Month']) 
group_by_user_id.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Target m Payment,AccessoryRate,Region_ payment_std,Payment_window_until6months,Age,Occupation_ payment_sum,age_group_ payment_mean,Occupation_ payment_max,age_group_ payment_min,Town_ payment_mean,userpay_frequency,paymenthist_month 5,age_group_ payment_sum,Term_brackets_ payment_median,Town_ payment_min,...,CurrentDuration,Town_ payment_max,Term_brackets_ payment_min,rateTypeEntity_ payment_min,DaysOnDeposit,paymentmedian_mstats,paymentmean_mstats,paymentmin_mstats,paymentmax_mstats,paymentstd_mstats,paymentsum_mstats,paymentlen_mstats,paymentmonth_m,TermDateto_m,Target
ID,Target m Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
ID_001AMM9,0,950.0,0.0,592.652439,10,53.0,11990.0,1112.727273,2880.0,300.0,999.310345,0.900000,1250.0,11377.5,880.0,250.0,...,250,2790.0,280.0,240.0,7,,,,,,,,6.0,82,1
ID_001AMM9,1,1280.0,0.0,592.652439,10,53.0,11990.0,1112.727273,2880.0,300.0,999.310345,0.900000,1250.0,11377.5,880.0,250.0,...,250,2790.0,280.0,240.0,7,,,,,,,,7.0,52,1
ID_001AMM9,2,750.0,0.0,592.652439,10,53.0,11990.0,1112.727273,2880.0,300.0,999.310345,0.900000,1250.0,11377.5,880.0,250.0,...,250,2790.0,280.0,240.0,7,2200.0,2200.000000,2200.0,2200.0,0.000000,2200.0,1.0,8.0,21,1
ID_001AMM9,3,1150.0,0.0,592.652439,10,53.0,11990.0,1112.727273,2880.0,300.0,999.310345,0.900000,1250.0,11377.5,880.0,250.0,...,250,2790.0,280.0,240.0,7,880.0,880.000000,880.0,880.0,0.000000,880.0,1.0,9.0,-10,1
ID_001AMM9,4,1250.0,0.0,592.652439,10,53.0,11990.0,1112.727273,2880.0,300.0,999.310345,0.900000,1250.0,11377.5,880.0,250.0,...,250,2790.0,280.0,240.0,7,,,,,,,,10.0,-40,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ID_ZZYZHG8,1,800.0,0.0,595.421606,38,30.0,11707.0,1017.071429,3000.0,220.0,1142.678571,0.868421,160.0,11080.0,880.0,350.0,...,1120,2810.0,280.0,240.0,7,280.0,360.000000,100.0,700.0,251.396102,1080.0,3.0,11.0,-818,0
ID_ZZYZHG8,2,720.0,0.0,595.421606,38,30.0,11707.0,1017.071429,3000.0,220.0,1142.678571,0.868421,160.0,11080.0,880.0,350.0,...,1120,2810.0,280.0,240.0,7,280.0,313.333333,100.0,560.0,189.267594,940.0,3.0,12.0,-848,0
ID_ZZYZHG8,3,190.0,0.0,595.421606,38,30.0,11707.0,1017.071429,3000.0,220.0,1142.678571,0.868421,160.0,11080.0,880.0,350.0,...,1120,2810.0,280.0,240.0,7,380.0,410.000000,100.0,800.0,273.922130,2460.0,6.0,1.0,-879,0
ID_ZZYZHG8,4,400.0,0.0,595.421606,38,30.0,11707.0,1017.071429,3000.0,220.0,1142.678571,0.868421,160.0,11080.0,880.0,350.0,...,1120,2810.0,280.0,240.0,7,280.0,358.000000,40.0,810.0,289.440840,1790.0,5.0,2.0,-910,0


In [31]:
groups_by_user_id_list = train['ID'].copy().tolist()

In [32]:
groups_by_user_id_list

['ID_6CWFB46',
 'ID_K5N1RC2',
 'ID_I7U5Z0Y',
 'ID_73JJQD2',
 'ID_OSOK3IB',
 'ID_BMAXX1S',
 'ID_L2S05DG',
 'ID_L9GDFOY',
 'ID_EX7YBWN',
 'ID_3XLV3HJ',
 'ID_CEP7FA2',
 'ID_HBN98WU',
 'ID_GO1NUWF',
 'ID_TO5NR4D',
 'ID_GO6XCAS',
 'ID_RQW83GH',
 'ID_IWAR1MV',
 'ID_UL8SEP7',
 'ID_7KMFK5F',
 'ID_CQS77NW',
 'ID_ZYRDHF0',
 'ID_EKVCXSS',
 'ID_AZ8H8EG',
 'ID_X7I2G78',
 'ID_10GI9WE',
 'ID_REDNKNJ',
 'ID_44C552G',
 'ID_P2D2K07',
 'ID_TS7XWFB',
 'ID_DBJ8EZD',
 'ID_MINLIM6',
 'ID_M87J7MR',
 'ID_NQW14QS',
 'ID_1BP7HHS',
 'ID_GTLZ16L',
 'ID_IAP10YR',
 'ID_EJ1R5AX',
 'ID_TGAH4QT',
 'ID_BJPSDZ4',
 'ID_NM97B13',
 'ID_2IIE00S',
 'ID_UECZ82T',
 'ID_DDLUE3F',
 'ID_VJVX658',
 'ID_NAXRYJJ',
 'ID_46JXAXT',
 'ID_5FDG34T',
 'ID_WD75AGY',
 'ID_CSYSUC9',
 'ID_YRPH7LL',
 'ID_LYP3R6R',
 'ID_AV372TH',
 'ID_8VQTW09',
 'ID_XV50PPR',
 'ID_YHNF25Y',
 'ID_97AR3V9',
 'ID_RYZFPIR',
 'ID_O9GRD7U',
 'ID_7I6E8ZN',
 'ID_ZVSNA47',
 'ID_81560CI',
 'ID_YRULEWP',
 'ID_TAOVK63',
 'ID_AO6HANA',
 'ID_QCY1YYE',
 'ID_ISNVBOQ',
 'ID_B3CFK

In [33]:
y = train['Target m Payment'].astype(int)

X = train.drop(['Target m Payment', 'ID'], axis = 1)

In [34]:
X.columns

Index(['Target m Month', 'AccessoryRate', 'Region_ payment_std',
       'Payment_window_until6months', 'Age', 'Occupation_ payment_sum',
       'age_group_ payment_mean', 'Occupation_ payment_max',
       'age_group_ payment_min', 'Town_ payment_mean', 'userpay_frequency',
       'paymenthist_month 5', 'age_group_ payment_sum',
       'Term_brackets_ payment_median', 'Town_ payment_min',
       'MainApplicantGender_ payment_min', 'payedsum/TotalContract',
       'paymenthist_month 2', 'monthshist_month 5', 'Region_ payment_mean',
       'Contract/Payhist window', 'Region_ payment_max',
       'Term_brackets_ payment_sum', 'Region_ payment_min',
       'Contract/until6monthswindow', 'Region_ payment_median', 'Town',
       'payhist_len', 'monthshist_month 2', 'paymenthist_month 1', 'payLeft',
       'monthshist_month 4', 'paymenthist_month 3', 'Town_ payment_sum',
       'Town_ payment_std', 'Deposit', 'monthshist_month 1', 'RatePerUnit',
       'is_upsell', 'age_group_ payment_std', 'R

### CATBOOST

In [35]:
def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error regression loss"""
    return np.sqrt(np.mean(np.square(y_true-y_pred)))

In [36]:
#Cross-Validation LGBM routine

def cat_crossval(model):

    rand = 40

    fold = GroupKFold(n_splits= 5)

    i = 1

    rmse_bin = list()
    
    
    predictions = []
    
    
    X_cols = list(X.columns)
    
    X_len = len(X_cols)
    
    test_new = test[X_cols]

    
    
    
    # Initialize an empty array to hold feature importances
    feature_importances = np.zeros(X_len)

    


    for train_index, test_index in fold.split(X, y, groups = groups_by_user_id_list):
        
    
        print('fold n°: ', i)
        

        x_data, x_val = X.iloc[train_index], X.iloc[test_index]

        y_data, y_val = y.iloc[train_index], y.iloc[test_index]
    
    
    
   
        
        model.fit(x_data, y_data, eval_set= [(x_val, y_val)],  verbose = 0)
        
    
        y_train_pred = model.predict(x_data)
        
        y_test_pred = model.predict(x_val)
        
        predictions.append(model.predict(test_new))

        
        
        feature_importances += model.feature_importances_ 
    

    
        print('rmse train', root_mean_squared_error(y_data, y_train_pred))
        print('rmse test',  root_mean_squared_error(y_val, y_test_pred))
    
    
    
        
        
        rmse_bin.append(root_mean_squared_error(y_val, y_test_pred))
    
    


        i = i + 1
        
    print('Mean rmse bin test: ', np.mean(rmse_bin)) 
    
    
    return predictions, feature_importances/10, rmse_bin

In [37]:
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor(iterations= 1500, learning_rate = 0.008, depth = 8, grow_policy = 'Lossguide', \
                              random_state = 40, reg_lambda = 15, loss_function='RMSE', num_leaves = 60)

In [38]:
test_predictions, importances, rmse_list= cat_crossval(cat_model)

fold n°:  1
rmse train 551.4533597348772
rmse test 610.2851246802442
fold n°:  2
rmse train 542.5204541817926
rmse test 806.5964328713676
fold n°:  3
rmse train 586.7447565805237
rmse test 689.9537741583015
fold n°:  4
rmse train 639.7105374864857
rmse test 604.0285505538487
fold n°:  5
rmse train 593.7422161776445
rmse test 790.5091208506061
Mean rmse bin test:  700.2746006228737


In [39]:
feature_importances = pd.DataFrame({'feature': list(X.columns), 'importance': importances}).sort_values('importance', 
                                    ascending = False)
    

In [40]:
feature_importances[:40]

Unnamed: 0,feature,importance
11,paymenthist_month 5,16.940591
30,payLeft,8.994852
38,is_upsell,4.276928
69,Target,4.199546
0,Target m Month,3.641541
52,paymenthist_month 4,2.057918
68,TermDateto_m,1.228526
48,payment_sum,0.953502
32,paymenthist_month 3,0.79201
16,payedsum/TotalContract,0.619761


***Create Test Submission***

In [41]:
#Fit entire model on data

cat_model.fit(X, y)

0:	learn: 894.3381910	total: 190ms	remaining: 4m 45s
1:	learn: 892.2149236	total: 370ms	remaining: 4m 37s
2:	learn: 889.9514116	total: 578ms	remaining: 4m 48s
3:	learn: 887.7980203	total: 775ms	remaining: 4m 49s
4:	learn: 885.7183186	total: 979ms	remaining: 4m 52s
5:	learn: 883.5716845	total: 1.18s	remaining: 4m 53s
6:	learn: 881.4908669	total: 1.4s	remaining: 4m 58s
7:	learn: 879.3541642	total: 1.59s	remaining: 4m 56s
8:	learn: 877.3147727	total: 1.79s	remaining: 4m 55s
9:	learn: 875.1634570	total: 1.96s	remaining: 4m 51s
10:	learn: 873.1772047	total: 2.16s	remaining: 4m 52s
11:	learn: 871.0887086	total: 2.35s	remaining: 4m 51s
12:	learn: 868.8847315	total: 2.55s	remaining: 4m 51s
13:	learn: 866.9534884	total: 2.74s	remaining: 4m 50s
14:	learn: 864.8185166	total: 2.97s	remaining: 4m 54s
15:	learn: 862.8533945	total: 3.23s	remaining: 4m 59s
16:	learn: 860.8062695	total: 3.44s	remaining: 4m 59s
17:	learn: 858.7034175	total: 3.66s	remaining: 5m 1s
18:	learn: 856.8847911	total: 3.84s	rema

153:	learn: 713.7642273	total: 32.1s	remaining: 4m 40s
154:	learn: 713.1985247	total: 32.3s	remaining: 4m 40s
155:	learn: 712.6475164	total: 32.5s	remaining: 4m 40s
156:	learn: 712.1131228	total: 32.7s	remaining: 4m 39s
157:	learn: 711.5865851	total: 32.9s	remaining: 4m 39s
158:	learn: 711.0756684	total: 33s	remaining: 4m 38s
159:	learn: 710.6122837	total: 33.2s	remaining: 4m 38s
160:	learn: 710.0765327	total: 33.4s	remaining: 4m 37s
161:	learn: 709.5402421	total: 33.6s	remaining: 4m 37s
162:	learn: 708.9861608	total: 33.8s	remaining: 4m 37s
163:	learn: 708.4971329	total: 34s	remaining: 4m 36s
164:	learn: 707.9032725	total: 34.2s	remaining: 4m 36s
165:	learn: 707.4077429	total: 34.3s	remaining: 4m 36s
166:	learn: 706.9223585	total: 34.5s	remaining: 4m 35s
167:	learn: 706.3705312	total: 34.7s	remaining: 4m 35s
168:	learn: 705.9010422	total: 34.9s	remaining: 4m 34s
169:	learn: 705.4454727	total: 35.1s	remaining: 4m 34s
170:	learn: 704.9511818	total: 35.3s	remaining: 4m 34s
171:	learn: 70

304:	learn: 661.4070013	total: 1m 1s	remaining: 4m 2s
305:	learn: 661.2414512	total: 1m 2s	remaining: 4m 1s
306:	learn: 660.9738377	total: 1m 2s	remaining: 4m 1s
307:	learn: 660.6863516	total: 1m 2s	remaining: 4m 1s
308:	learn: 660.5199481	total: 1m 2s	remaining: 4m 1s
309:	learn: 660.2033800	total: 1m 2s	remaining: 4m 1s
310:	learn: 659.9794744	total: 1m 3s	remaining: 4m
311:	learn: 659.6356674	total: 1m 3s	remaining: 4m
312:	learn: 659.4305618	total: 1m 3s	remaining: 4m
313:	learn: 659.2526553	total: 1m 3s	remaining: 4m
314:	learn: 659.0608930	total: 1m 3s	remaining: 4m
315:	learn: 658.7464332	total: 1m 4s	remaining: 3m 59s
316:	learn: 658.4281242	total: 1m 4s	remaining: 3m 59s
317:	learn: 658.2828353	total: 1m 4s	remaining: 3m 59s
318:	learn: 657.9350042	total: 1m 4s	remaining: 3m 59s
319:	learn: 657.6260843	total: 1m 4s	remaining: 3m 58s
320:	learn: 657.3742001	total: 1m 5s	remaining: 3m 58s
321:	learn: 657.1328194	total: 1m 5s	remaining: 3m 58s
322:	learn: 656.8880511	total: 1m 5s

452:	learn: 627.4455845	total: 1m 31s	remaining: 3m 32s
453:	learn: 627.2298572	total: 1m 31s	remaining: 3m 31s
454:	learn: 627.0167034	total: 1m 32s	remaining: 3m 31s
455:	learn: 626.8134985	total: 1m 32s	remaining: 3m 31s
456:	learn: 626.5893724	total: 1m 32s	remaining: 3m 31s
457:	learn: 626.3722374	total: 1m 32s	remaining: 3m 31s
458:	learn: 626.1454775	total: 1m 32s	remaining: 3m 30s
459:	learn: 625.9390011	total: 1m 33s	remaining: 3m 30s
460:	learn: 625.7201185	total: 1m 33s	remaining: 3m 30s
461:	learn: 625.5990102	total: 1m 33s	remaining: 3m 30s
462:	learn: 625.4547542	total: 1m 33s	remaining: 3m 30s
463:	learn: 625.2303799	total: 1m 34s	remaining: 3m 29s
464:	learn: 625.0384682	total: 1m 34s	remaining: 3m 29s
465:	learn: 624.8223711	total: 1m 34s	remaining: 3m 29s
466:	learn: 624.7044910	total: 1m 34s	remaining: 3m 29s
467:	learn: 624.5051523	total: 1m 34s	remaining: 3m 29s
468:	learn: 624.3715261	total: 1m 34s	remaining: 3m 28s
469:	learn: 624.2687057	total: 1m 35s	remaining:

600:	learn: 603.4622062	total: 2m 3s	remaining: 3m 4s
601:	learn: 603.3282308	total: 2m 3s	remaining: 3m 3s
602:	learn: 603.1660546	total: 2m 3s	remaining: 3m 3s
603:	learn: 603.0202436	total: 2m 3s	remaining: 3m 3s
604:	learn: 602.8679594	total: 2m 3s	remaining: 3m 3s
605:	learn: 602.7275502	total: 2m 4s	remaining: 3m 3s
606:	learn: 602.6142612	total: 2m 4s	remaining: 3m 2s
607:	learn: 602.4625399	total: 2m 4s	remaining: 3m 2s
608:	learn: 602.3300993	total: 2m 4s	remaining: 3m 2s
609:	learn: 602.1846395	total: 2m 4s	remaining: 3m 2s
610:	learn: 602.0848714	total: 2m 5s	remaining: 3m 2s
611:	learn: 601.9195714	total: 2m 5s	remaining: 3m 1s
612:	learn: 601.8031722	total: 2m 5s	remaining: 3m 1s
613:	learn: 601.6538710	total: 2m 5s	remaining: 3m 1s
614:	learn: 601.5466026	total: 2m 6s	remaining: 3m 1s
615:	learn: 601.4030698	total: 2m 6s	remaining: 3m 1s
616:	learn: 601.3136582	total: 2m 6s	remaining: 3m
617:	learn: 601.1496941	total: 2m 6s	remaining: 3m
618:	learn: 601.0107808	total: 2m 

749:	learn: 586.5000668	total: 2m 34s	remaining: 2m 34s
750:	learn: 586.4101980	total: 2m 34s	remaining: 2m 34s
751:	learn: 586.3006353	total: 2m 35s	remaining: 2m 34s
752:	learn: 586.1836879	total: 2m 35s	remaining: 2m 34s
753:	learn: 586.0744233	total: 2m 35s	remaining: 2m 33s
754:	learn: 585.9736460	total: 2m 35s	remaining: 2m 33s
755:	learn: 585.9072575	total: 2m 35s	remaining: 2m 33s
756:	learn: 585.8382078	total: 2m 36s	remaining: 2m 33s
757:	learn: 585.7316560	total: 2m 36s	remaining: 2m 33s
758:	learn: 585.6672734	total: 2m 36s	remaining: 2m 32s
759:	learn: 585.5962401	total: 2m 36s	remaining: 2m 32s
760:	learn: 585.5337874	total: 2m 37s	remaining: 2m 32s
761:	learn: 585.4271442	total: 2m 37s	remaining: 2m 32s
762:	learn: 585.3511808	total: 2m 37s	remaining: 2m 32s
763:	learn: 585.2460889	total: 2m 37s	remaining: 2m 31s
764:	learn: 585.1742978	total: 2m 37s	remaining: 2m 31s
765:	learn: 585.1071031	total: 2m 38s	remaining: 2m 31s
766:	learn: 585.0200298	total: 2m 38s	remaining:

898:	learn: 574.1118301	total: 3m 9s	remaining: 2m 6s
899:	learn: 574.0497070	total: 3m 9s	remaining: 2m 6s
900:	learn: 573.9953170	total: 3m 9s	remaining: 2m 6s
901:	learn: 573.9091270	total: 3m 9s	remaining: 2m 5s
902:	learn: 573.8584442	total: 3m 10s	remaining: 2m 5s
903:	learn: 573.7951424	total: 3m 10s	remaining: 2m 5s
904:	learn: 573.7134801	total: 3m 10s	remaining: 2m 5s
905:	learn: 573.6533117	total: 3m 10s	remaining: 2m 5s
906:	learn: 573.6056347	total: 3m 10s	remaining: 2m 4s
907:	learn: 573.5519918	total: 3m 11s	remaining: 2m 4s
908:	learn: 573.4978810	total: 3m 11s	remaining: 2m 4s
909:	learn: 573.4259553	total: 3m 11s	remaining: 2m 4s
910:	learn: 573.3613681	total: 3m 11s	remaining: 2m 4s
911:	learn: 573.2895521	total: 3m 12s	remaining: 2m 3s
912:	learn: 573.2368533	total: 3m 12s	remaining: 2m 3s
913:	learn: 573.1530947	total: 3m 12s	remaining: 2m 3s
914:	learn: 573.1044014	total: 3m 12s	remaining: 2m 3s
915:	learn: 573.0494924	total: 3m 12s	remaining: 2m 3s
916:	learn: 57

1045:	learn: 564.6187195	total: 3m 43s	remaining: 1m 37s
1046:	learn: 564.5557270	total: 3m 43s	remaining: 1m 36s
1047:	learn: 564.4801903	total: 3m 44s	remaining: 1m 36s
1048:	learn: 564.4309725	total: 3m 44s	remaining: 1m 36s
1049:	learn: 564.3626907	total: 3m 44s	remaining: 1m 36s
1050:	learn: 564.3243476	total: 3m 44s	remaining: 1m 36s
1051:	learn: 564.2406851	total: 3m 45s	remaining: 1m 35s
1052:	learn: 564.1768441	total: 3m 45s	remaining: 1m 35s
1053:	learn: 564.1380038	total: 3m 45s	remaining: 1m 35s
1054:	learn: 564.0661702	total: 3m 45s	remaining: 1m 35s
1055:	learn: 563.9986180	total: 3m 45s	remaining: 1m 35s
1056:	learn: 563.9247096	total: 3m 46s	remaining: 1m 34s
1057:	learn: 563.8737767	total: 3m 46s	remaining: 1m 34s
1058:	learn: 563.8166356	total: 3m 46s	remaining: 1m 34s
1059:	learn: 563.7801290	total: 3m 46s	remaining: 1m 34s
1060:	learn: 563.7231666	total: 3m 47s	remaining: 1m 33s
1061:	learn: 563.6627494	total: 3m 47s	remaining: 1m 33s
1062:	learn: 563.5940981	total:

1190:	learn: 556.5359947	total: 4m 17s	remaining: 1m 6s
1191:	learn: 556.4846052	total: 4m 18s	remaining: 1m 6s
1192:	learn: 556.4573711	total: 4m 18s	remaining: 1m 6s
1193:	learn: 556.4126148	total: 4m 18s	remaining: 1m 6s
1194:	learn: 556.3316121	total: 4m 18s	remaining: 1m 6s
1195:	learn: 556.2893701	total: 4m 19s	remaining: 1m 5s
1196:	learn: 556.2271338	total: 4m 19s	remaining: 1m 5s
1197:	learn: 556.1943108	total: 4m 19s	remaining: 1m 5s
1198:	learn: 556.1646004	total: 4m 19s	remaining: 1m 5s
1199:	learn: 556.1369560	total: 4m 20s	remaining: 1m 5s
1200:	learn: 556.0950609	total: 4m 20s	remaining: 1m 4s
1201:	learn: 556.0743697	total: 4m 20s	remaining: 1m 4s
1202:	learn: 556.0366414	total: 4m 20s	remaining: 1m 4s
1203:	learn: 556.0020715	total: 4m 20s	remaining: 1m 4s
1204:	learn: 555.9559173	total: 4m 21s	remaining: 1m 3s
1205:	learn: 555.9218244	total: 4m 21s	remaining: 1m 3s
1206:	learn: 555.8591584	total: 4m 21s	remaining: 1m 3s
1207:	learn: 555.8227865	total: 4m 21s	remaining

1338:	learn: 550.0454604	total: 4m 54s	remaining: 35.5s
1339:	learn: 550.0128845	total: 4m 55s	remaining: 35.2s
1340:	learn: 549.9346672	total: 4m 55s	remaining: 35s
1341:	learn: 549.9052224	total: 4m 55s	remaining: 34.8s
1342:	learn: 549.8618258	total: 4m 55s	remaining: 34.6s
1343:	learn: 549.8227054	total: 4m 56s	remaining: 34.4s
1344:	learn: 549.7818090	total: 4m 56s	remaining: 34.2s
1345:	learn: 549.7493531	total: 4m 56s	remaining: 34s
1346:	learn: 549.7060671	total: 4m 57s	remaining: 33.7s
1347:	learn: 549.6725463	total: 4m 57s	remaining: 33.5s
1348:	learn: 549.6335778	total: 4m 57s	remaining: 33.3s
1349:	learn: 549.5856928	total: 4m 58s	remaining: 33.1s
1350:	learn: 549.5517576	total: 4m 58s	remaining: 32.9s
1351:	learn: 549.5017594	total: 4m 58s	remaining: 32.7s
1352:	learn: 549.4527970	total: 4m 59s	remaining: 32.5s
1353:	learn: 549.4228961	total: 4m 59s	remaining: 32.3s
1354:	learn: 549.3869396	total: 4m 59s	remaining: 32.1s
1355:	learn: 549.3240746	total: 5m	remaining: 31.9s


1486:	learn: 544.2438901	total: 5m 30s	remaining: 2.89s
1487:	learn: 544.2146069	total: 5m 31s	remaining: 2.67s
1488:	learn: 544.1608281	total: 5m 31s	remaining: 2.45s
1489:	learn: 544.0982182	total: 5m 31s	remaining: 2.23s
1490:	learn: 544.0629633	total: 5m 31s	remaining: 2s
1491:	learn: 544.0332757	total: 5m 32s	remaining: 1.78s
1492:	learn: 543.9879230	total: 5m 32s	remaining: 1.56s
1493:	learn: 543.9257211	total: 5m 32s	remaining: 1.33s
1494:	learn: 543.8995949	total: 5m 32s	remaining: 1.11s
1495:	learn: 543.8634734	total: 5m 32s	remaining: 890ms
1496:	learn: 543.8210286	total: 5m 33s	remaining: 668ms
1497:	learn: 543.7973237	total: 5m 33s	remaining: 445ms
1498:	learn: 543.7492164	total: 5m 33s	remaining: 223ms
1499:	learn: 543.7228041	total: 5m 33s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x2772120e198>

In [42]:
X_cols = list(X.columns)

test_new = test[X_cols]


test['Target m Payment'] = cat_model.predict(test_new)

In [43]:
test['M submission'] = test['Target m Month'].map({0: 'm1', 1 : 'm2', 2 : 'm3', 3 : 'm4', 4 : 'm5', 5 : 'm6'})

In [44]:
def create_submission_format(df):
    
    return df['ID'] + ' x ' + str(df['M submission'])

In [45]:
test['submission_ID'] = test.apply(create_submission_format, axis = 1)

In [46]:
submission = test[['submission_ID', 'Target m Payment']]

In [47]:
submission = submission.rename(columns = {'submission_ID' : 'ID', 'Target m Payment' : 'Target'})

In [48]:
submission.head()

Unnamed: 0,ID,Target
0,ID_000RHRU x m1,208.173667
1,ID_000RHRU x m2,226.717025
2,ID_000RHRU x m3,213.552269
3,ID_000RHRU x m4,232.12374
4,ID_000RHRU x m5,255.000548


In [49]:
import gc
gc.collect()

0