## These notebook details the model building using LIGHTGBM for loan payment status:

***1) Feature selection***

***2) Categorical feature transformation***

***3) Stratified K-Fold cross validation to control imbalance in cross validation***

***4) Model building and generating predictions out of fold to avoid leakage which won't represent expected performance at test time***

In [1]:
from sklearn.metrics import f1_score
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostClassifier
import warnings
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, StratifiedKFold, GroupKFold
from sklearn.metrics import balanced_accuracy_score, auc, mean_squared_error, roc_curve, confusion_matrix, precision_score, recall_score, f1_score,\
log_loss, roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import random
pd.set_option("display.max_columns",30)
rand = 40
random.seed(40)
np.random.seed(40)

In [2]:
data = pd.read_pickle("./data/loan_prep.pkl", compression="gzip")
ss = pd.read_csv('./data/SampleSubmission.csv')

data = data.drop(['Unnamed: 0'], axis = 1)

In [3]:
data.head()

Unnamed: 0,ID,m1,m2,m3,m4,m5,m6,payhist_len,tag,Deposit,AccessoryRate,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,...,rateTypeEntity_ payment_min,rateTypeEntity_ payment_max,MainApplicantGender_ payment_mean,MainApplicantGender_ payment_median,MainApplicantGender_ payment_sum,MainApplicantGender_ payment_std,MainApplicantGender_ payment_min,MainApplicantGender_ payment_max,Term_brackets,Term_brackets_ payment_mean,Term_brackets_ payment_median,Term_brackets_ payment_sum,Term_brackets_ payment_std,Term_brackets_ payment_min,Term_brackets_ payment_max
0,ID_MR53LEX,880.0,930.0,495.0,715.0,220.0,385.0,31,train,2500,0.0,DAILY,55,3,Male,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,a year+,1158.083333,1100.0,24586.0,651.004326,200.0,3500.0
1,ID_3D7NQUH,660.0,935.0,935.0,825.0,770.0,935.0,30,train,2500,0.0,DAILY,55,3,Male,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,a year+,1158.083333,1100.0,24586.0,651.004326,200.0,3500.0
2,ID_0IWQNPI,700.0,1350.0,1550.0,1400.0,1450.0,1200.0,6,train,2400,0.0,DAILY,50,3,Male,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,12months,1005.714286,880.0,9844.0,586.976283,280.0,2560.0
3,ID_IY8SYB9,580.0,480.0,800.0,1260.0,1650.0,530.0,10,train,2000,0.0,DAILY,40,7,Female,...,240.0,2885.0,1035.651786,925.0,11660.0,597.796299,240.0,2840.0,12months,1005.714286,880.0,9844.0,586.976283,280.0,2560.0
4,ID_9XHL7VZ,40.0,440.0,460.0,360.0,80.0,330.0,22,train,2000,0.0,DAILY,40,7,Male,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,12months,1005.714286,880.0,9844.0,586.976283,280.0,2560.0


In [4]:
data = data.drop(['m1', 'm2', 'm3', 'm4', 'm5', 'm6'], axis = 1)

In [5]:
data.dtypes

ID                                object
payhist_len                        int64
tag                               object
Deposit                            int64
AccessoryRate                    float64
                                  ...   
Term_brackets_ payment_median    float64
Term_brackets_ payment_sum       float64
Term_brackets_ payment_std       float64
Term_brackets_ payment_min       float64
Term_brackets_ payment_max       float64
Length: 209, dtype: object

In [6]:
#Scale target to decimal

data['Target'] = data['Target']/100

In [7]:
#Extract categorical

cat_cols = []

for col in data.columns:
    
    if data[col].dtype == 'O':
        
        cat_cols.append(col)

In [8]:
cat_cols.remove('tag')
cat_cols.remove('ID')

In [9]:
cat_cols

['rateTypeEntity',
 'MainApplicantGender',
 'Region',
 'Town',
 'Occupation',
 'age_group',
 'Term_brackets']

In [10]:
data.columns

Index(['ID', 'payhist_len', 'tag', 'Deposit', 'AccessoryRate',
       'rateTypeEntity', 'RatePerUnit', 'DaysOnDeposit', 'MainApplicantGender',
       'Age',
       ...
       'MainApplicantGender_ payment_std', 'MainApplicantGender_ payment_min',
       'MainApplicantGender_ payment_max', 'Term_brackets',
       'Term_brackets_ payment_mean', 'Term_brackets_ payment_median',
       'Term_brackets_ payment_sum', 'Term_brackets_ payment_std',
       'Term_brackets_ payment_min', 'Term_brackets_ payment_max'],
      dtype='object', length=209)

In [11]:
data.isnull().sum()

ID                               0
payhist_len                      0
tag                              0
Deposit                          0
AccessoryRate                    0
                                ..
Term_brackets_ payment_median    0
Term_brackets_ payment_sum       0
Term_brackets_ payment_std       0
Term_brackets_ payment_min       0
Term_brackets_ payment_max       0
Length: 209, dtype: int64

In [12]:
for col in cat_cols:

    data[col] = pd.factorize(data[col])[0]

In [13]:
for col in cat_cols:
    
    print(data[col].value_counts())

0    37168
1       97
2       78
Name: rateTypeEntity, dtype: int64
0    23351
1    13992
Name: MainApplicantGender, dtype: int64
 4    6998
 3    5344
 6    5056
 1    4759
 0    4671
 5    4508
 2    4073
-1    1934
Name: Region, dtype: int64
 12    2098
-1     1934
 9     1911
 1     1758
 5     1707
 3     1656
 8     1618
 7     1605
 6     1527
 11    1401
 15    1294
 0     1245
 20    1235
 23    1154
 22    1085
 26    1034
 19     908
 17     886
 10     880
 2      870
 13     848
 14     822
 21     778
 4      721
 16     684
 37     657
 25     619
 28     597
 34     508
 27     377
 32     363
 29     309
 24     291
 35     265
 38     246
 42     231
 30     224
 39     163
 31     158
 43     118
 40     102
 33      88
 41      78
 36      72
 18      67
 44      56
 45      48
 46      47
Name: Town, dtype: int64
2    12380
1    11747
5     5228
4     2770
6     2384
0     1830
3     1004
Name: Occupation, dtype: int64
 3    10868
 0     7259
-1     6939
 2     662

In [14]:
data.head()

Unnamed: 0,ID,payhist_len,tag,Deposit,AccessoryRate,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,Term,TotalContractValue,...,rateTypeEntity_ payment_min,rateTypeEntity_ payment_max,MainApplicantGender_ payment_mean,MainApplicantGender_ payment_median,MainApplicantGender_ payment_sum,MainApplicantGender_ payment_std,MainApplicantGender_ payment_min,MainApplicantGender_ payment_max,Term_brackets,Term_brackets_ payment_mean,Term_brackets_ payment_median,Term_brackets_ payment_sum,Term_brackets_ payment_std,Term_brackets_ payment_min,Term_brackets_ payment_max
0,ID_MR53LEX,31,train,2500,0.0,0,55,3,0,,0,0,0,556,33080.0,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,0,1158.083333,1100.0,24586.0,651.004326,200.0,3500.0
1,ID_3D7NQUH,30,train,2500,0.0,0,55,3,0,26.0,1,1,1,556,33080.0,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,0,1158.083333,1100.0,24586.0,651.004326,200.0,3500.0
2,ID_0IWQNPI,6,train,2400,0.0,0,50,3,0,21.0,2,2,2,240,14400.0,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,1,1005.714286,880.0,9844.0,586.976283,280.0,2560.0
3,ID_IY8SYB9,10,train,2000,0.0,0,40,7,1,26.0,2,3,1,364,16560.0,...,240.0,2885.0,1035.651786,925.0,11660.0,597.796299,240.0,2840.0,1,1005.714286,880.0,9844.0,586.976283,280.0,2560.0
4,ID_9XHL7VZ,22,train,2000,0.0,0,40,7,0,27.0,3,4,1,364,16560.0,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,1,1005.714286,880.0,9844.0,586.976283,280.0,2560.0


In [15]:
data.columns

Index(['ID', 'payhist_len', 'tag', 'Deposit', 'AccessoryRate',
       'rateTypeEntity', 'RatePerUnit', 'DaysOnDeposit', 'MainApplicantGender',
       'Age',
       ...
       'MainApplicantGender_ payment_std', 'MainApplicantGender_ payment_min',
       'MainApplicantGender_ payment_max', 'Term_brackets',
       'Term_brackets_ payment_mean', 'Term_brackets_ payment_median',
       'Term_brackets_ payment_sum', 'Term_brackets_ payment_std',
       'Term_brackets_ payment_min', 'Term_brackets_ payment_max'],
      dtype='object', length=209)

In [16]:
train = data[data['tag'] == 'train']

test = data[data['tag'] == 'test']


train = train.drop(['tag'], axis = 1)

test = test.drop(['tag'], axis = 1)

In [17]:
train.shape, test.shape

((28007, 208), (9336, 208))

In [18]:
train.reset_index(drop = True, inplace = True)

test.reset_index(drop = True, inplace = True)

In [19]:
train.head()

Unnamed: 0,ID,payhist_len,Deposit,AccessoryRate,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,Term,TotalContractValue,Target,...,rateTypeEntity_ payment_min,rateTypeEntity_ payment_max,MainApplicantGender_ payment_mean,MainApplicantGender_ payment_median,MainApplicantGender_ payment_sum,MainApplicantGender_ payment_std,MainApplicantGender_ payment_min,MainApplicantGender_ payment_max,Term_brackets,Term_brackets_ payment_mean,Term_brackets_ payment_median,Term_brackets_ payment_sum,Term_brackets_ payment_std,Term_brackets_ payment_min,Term_brackets_ payment_max
0,ID_MR53LEX,31,2500,0.0,0,55,3,0,,0,0,0,556,33080.0,0.594317,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,0,1158.083333,1100.0,24586.0,651.004326,200.0,3500.0
1,ID_3D7NQUH,30,2500,0.0,0,55,3,0,26.0,1,1,1,556,33080.0,0.822128,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,0,1158.083333,1100.0,24586.0,651.004326,200.0,3500.0
2,ID_0IWQNPI,6,2400,0.0,0,50,3,0,21.0,2,2,2,240,14400.0,1.000694,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,1,1005.714286,880.0,9844.0,586.976283,280.0,2560.0
3,ID_IY8SYB9,10,2000,0.0,0,40,7,1,26.0,2,3,1,364,16560.0,1.0,...,240.0,2885.0,1035.651786,925.0,11660.0,597.796299,240.0,2840.0,1,1005.714286,880.0,9844.0,586.976283,280.0,2560.0
4,ID_9XHL7VZ,22,2000,0.0,0,40,7,0,27.0,3,4,1,364,16560.0,0.573007,...,240.0,2885.0,1065.0,950.0,11590.0,625.015119,250.0,2930.0,1,1005.714286,880.0,9844.0,586.976283,280.0,2560.0


In [20]:
train['Target'].value_counts()/len(train)

1.000000    0.399757
1.000030    0.009319
1.003125    0.006891
1.002415    0.005499
1.000604    0.004856
              ...   
0.834609    0.000036
0.768924    0.000036
0.756328    0.000036
0.462891    0.000036
0.846484    0.000036
Name: Target, Length: 9417, dtype: float64

In [21]:
train['Target'].value_counts()

1.000000    11196
1.000030      261
1.003125      193
1.002415      154
1.000604      136
            ...  
0.834609        1
0.768924        1
0.756328        1
0.462891        1
0.846484        1
Name: Target, Length: 9417, dtype: int64

In [22]:
#scaling all values less than 1 to be 0, values less than 2 and greater than 1 to be 1.

y = train['Target'].astype(int)

X = train.drop(['Target'], axis = 1)

In [23]:
#replacing values of 3 with 2.

y = y.replace(3, 2)

In [None]:
y.value_counts()

In [24]:
train.dtypes

ID                                object
payhist_len                        int64
Deposit                            int64
AccessoryRate                    float64
rateTypeEntity                     int64
                                  ...   
Term_brackets_ payment_median    float64
Term_brackets_ payment_sum       float64
Term_brackets_ payment_std       float64
Term_brackets_ payment_min       float64
Term_brackets_ payment_max       float64
Length: 208, dtype: object

In [25]:
X_cols = list(X.drop(['ID'], axis = 1).columns)
X_len = len(X_cols)



### LIGHTGBM

In [26]:
#Cross-Validation LGBM routine

def lgb_crossval(model):

    rand = 40

    fold = StratifiedKFold(n_splits= 5, random_state = rand, shuffle = True)

    i = 1

    acc_bin = list()
    
    
    predictions = []
    
    test_new = test[X_cols]
    

    combined = pd.DataFrame(columns = ['ID', 'Target'])
    
    
    # Initialize an empty array to hold feature importances
    feature_importances = np.zeros(X_len)

    


    for train_index, test_index in fold.split(X, y):
        
    
        print('fold n°: ', i)
        
        val_targs = pd.DataFrame(columns = ['ID', 'Target'])

        

        x_data, x_val = X.iloc[train_index], X.iloc[test_index]

        y_data, y_val = y.iloc[train_index], y.iloc[test_index]
        
        
        val_targs['ID'] = x_val['ID']
        
        x_data.drop(['ID'], axis = 1, inplace = True)
        
        x_val.drop(['ID'], axis = 1, inplace = True)
    
   
        
        model.fit(x_data, y_data, categorical_feature = cat_cols, eval_set= [(x_val, y_val)], verbose = 0)
    
        y_train_pred = model.predict(x_data)
        
        y_test_pred = model.predict(x_val)
        
        
        
        val_targs['Target'] = y_test_pred
       
        combined = pd.concat([combined, val_targs])
        
        
        
        
        predictions.append(model.predict(test_new))
        
        feature_importances += model.feature_importances_ 
    
        print('acc train', accuracy_score(y_data, y_train_pred))
        print('acc test',  accuracy_score(y_val, y_test_pred))
        
        
        acc_bin.append(accuracy_score(y_val, y_test_pred))
    
    
    
    
    
    


        i = i + 1
        

    
    print('Mean acc bin test: ', np.mean(acc_bin))

    
    return predictions, feature_importances/10, combined

In [27]:
lgb_model = LGBMClassifier(n_estimators = 1000, learning_rate = 0.04, 
                           boosting = 'gbdt', num_leaves = 60, importance_type = 'gain', \
                           random_state = rand, objective = "multiclass", \
                           reg_lambda = 20)

In [28]:
test_predictions, importances, combined= lgb_crossval(lgb_model)

fold n°:  1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
New categorical_feature is ['MainApplicantGender', 'Occupation', 'Region', 'Term_brackets', 'Town', 'age_group', 'rateTypeEntity']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


acc train 0.9995536710555679
acc test 0.8855765797929311
fold n°:  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
New categorical_feature is ['MainApplicantGender', 'Occupation', 'Region', 'Term_brackets', 'Town', 'age_group', 'rateTypeEntity']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


acc train 0.999776835527784
acc test 0.8868261335237415
fold n°:  3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
New categorical_feature is ['MainApplicantGender', 'Occupation', 'Region', 'Term_brackets', 'Town', 'age_group', 'rateTypeEntity']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


acc train 0.999419798268321
acc test 0.8876986252454919
fold n°:  4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
New categorical_feature is ['MainApplicantGender', 'Occupation', 'Region', 'Term_brackets', 'Town', 'age_group', 'rateTypeEntity']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


acc train 0.9997322145853789
acc test 0.8914479557221925
fold n°:  5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
New categorical_feature is ['MainApplicantGender', 'Occupation', 'Region', 'Term_brackets', 'Town', 'age_group', 'rateTypeEntity']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


acc train 0.9990627510488262
acc test 0.8944831280128549
Mean acc bin test:  0.8892064844594424


In [29]:
combined.to_csv('Noleak Train Loan Status V2.csv')

In [30]:
combined['Target']

2        0
3        1
7        1
8        1
9        1
        ..
27989    1
27991    1
27992    1
27995    1
27997    1
Name: Target, Length: 28007, dtype: object

In [31]:
combined['Target'].value_counts()

1    17883
0    10119
2        5
Name: Target, dtype: int64

In [32]:
feature_importances = pd.DataFrame({'feature': list(X_cols), 'importance': importances}).sort_values('importance', 
                                    ascending = False)
    

In [33]:
feature_importances

Unnamed: 0,feature,importance
125,paymentminyr_m2,42607.353921
159,payedsum/TotalContract,25741.922412
45,Date m6 year,12470.535937
9,Town,10209.922539
72,paymenthist_month 5,7755.834884
...,...,...
53,FirstPaymentDate year,0.000000
51,LastTransactionDate day,0.000000
164,age_group_ payment_sum,0.000000
167,age_group_ payment_max,0.000000


***Create Test File Submission***

In [34]:
test_predictions = np.array(test_predictions)

import scipy
test_predictions = list(scipy.stats.mode(test_predictions)[0][0])

In [35]:
test['Target'] = test_predictions

In [36]:
test['Target'].value_counts()

1    6003
0    3332
2       1
Name: Target, dtype: int64

In [37]:
test[['ID', 'Target']].to_csv('Noleak Test Loan Status V2.csv')