In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as gbm

Importing the data

In [2]:
train_path=os.path.join('..', 'data', 'processed', 'train.csv')

pd.set_option('display.max_columns', 80)

train_data=pd.read_csv(train_path)
train_data.head()

Unnamed: 0,id,hi_int_prncp_pd,acc_now_delinq,acc_open_past_24mths,addr_state,annual_inc,application_type,avg_cur_bal,bc_util,delinq_2yrs,delinq_amnt,dti,earliest_cr_line,emp_length,emp_title,grade,home_ownership,initial_list_status,int_rate,last_credit_pull_d,loan_amnt,mort_acc,num_sats,num_tl_120dpd_2m,num_tl_90g_dpd_24m,num_tl_30dpd,out_prncp_inv,pub_rec,pub_rec_bankruptcies,purpose,sub_grade,term,tot_coll_amt,tot_cur_bal,total_rec_late_fee,verification_status
0,3819,0,0,2,CT,21120.0,Individual,3662,86.1,0,0,33.69,2010,1.0,Security Guard,C,RENT,w,12.62,2019,7000,0,18,0,0,0,3828.54,0,0,debt_consolidation,C1,36,0,65921,0.0,1
1,3820,0,0,3,CA,32400.0,Individual,2973,30.2,0,0,15.78,1975,3.0,Security Guard,C,OWN,w,11.99,2016,16200,2,8,0,0,0,0.0,3,2,debt_consolidation,C1,36,0,17840,0.0,0
2,3821,0,0,3,NY,30251.0,Individual,1983,46.2,0,0,20.04,2010,2.0,Security Guard,B,RENT,w,11.49,2018,9600,0,10,0,0,0,0.0,0,0,debt_consolidation,B5,36,0,19829,0.0,1
3,3822,0,0,1,NJ,66976.0,Individual,1638,77.4,0,0,8.58,2012,1.0,Security Guard,C,OWN,w,12.62,2019,12000,0,6,0,0,0,9665.27,0,0,debt_consolidation,C1,60,0,9830,0.0,1
4,3823,0,0,2,MO,125000.0,Individual,577,0.0,0,0,2.34,1982,10.0,Administrator,A,MORTGAGE,w,6.49,2019,24000,4,13,0,0,0,11358.24,0,0,debt_consolidation,A2,60,0,6924,0.0,1


Creating the x matrix and the y label

In [3]:
y_train=train_data['hi_int_prncp_pd'].copy()

x_train=train_data.drop(columns=['addr_state', 'emp_title', 'hi_int_prncp_pd', 'id', 'purpose'])
x_train.head()

Unnamed: 0,acc_now_delinq,acc_open_past_24mths,annual_inc,application_type,avg_cur_bal,bc_util,delinq_2yrs,delinq_amnt,dti,earliest_cr_line,emp_length,grade,home_ownership,initial_list_status,int_rate,last_credit_pull_d,loan_amnt,mort_acc,num_sats,num_tl_120dpd_2m,num_tl_90g_dpd_24m,num_tl_30dpd,out_prncp_inv,pub_rec,pub_rec_bankruptcies,sub_grade,term,tot_coll_amt,tot_cur_bal,total_rec_late_fee,verification_status
0,0,2,21120.0,Individual,3662,86.1,0,0,33.69,2010,1.0,C,RENT,w,12.62,2019,7000,0,18,0,0,0,3828.54,0,0,C1,36,0,65921,0.0,1
1,0,3,32400.0,Individual,2973,30.2,0,0,15.78,1975,3.0,C,OWN,w,11.99,2016,16200,2,8,0,0,0,0.0,3,2,C1,36,0,17840,0.0,0
2,0,3,30251.0,Individual,1983,46.2,0,0,20.04,2010,2.0,B,RENT,w,11.49,2018,9600,0,10,0,0,0,0.0,0,0,B5,36,0,19829,0.0,1
3,0,1,66976.0,Individual,1638,77.4,0,0,8.58,2012,1.0,C,OWN,w,12.62,2019,12000,0,6,0,0,0,9665.27,0,0,C1,60,0,9830,0.0,1
4,0,2,125000.0,Individual,577,0.0,0,0,2.34,1982,10.0,A,MORTGAGE,w,6.49,2019,24000,4,13,0,0,0,11358.24,0,0,A2,60,0,6924,0.0,1


Importing the LightGBMClassifier

In [4]:
from lightgbm import LGBMClassifier

In [5]:
gbm.__version__

'2.3.1'

In [6]:
lgbm=LGBMClassifier(objective='binary',
                   random_state=402,
                   n_jobs=4)

Importing classes to tune GBM

In [7]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold

In [8]:
parameters={'num_leaves':[30,40,50,60,70],
           'learning_rate':[0.01,0.03,0.1,0.15,0.3],
           'n_estimators':[100,200,250],
           'subsample':[0.67,0.8],
           'colsample_bytree':[0.7,0.8]}

In [9]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5778 entries, 0 to 5777
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   acc_now_delinq        5778 non-null   int64  
 1   acc_open_past_24mths  5778 non-null   int64  
 2   annual_inc            5778 non-null   float64
 3   application_type      5778 non-null   object 
 4   avg_cur_bal           5778 non-null   int64  
 5   bc_util               5778 non-null   float64
 6   delinq_2yrs           5778 non-null   int64  
 7   delinq_amnt           5778 non-null   int64  
 8   dti                   5778 non-null   float64
 9   earliest_cr_line      5778 non-null   int64  
 10  emp_length            5777 non-null   float64
 11  grade                 5778 non-null   object 
 12  home_ownership        5778 non-null   object 
 13  initial_list_status   5778 non-null   object 
 14  int_rate              5778 non-null   float64
 15  last_credit_pull_d   

In [10]:
for column in x_train.columns:
    col_type=x_train[column].dtype
    
    if col_type=='object' or col_type.name=='category':
        x_train[column]=x_train[column].astype('category')

In [11]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5778 entries, 0 to 5777
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   acc_now_delinq        5778 non-null   int64   
 1   acc_open_past_24mths  5778 non-null   int64   
 2   annual_inc            5778 non-null   float64 
 3   application_type      5778 non-null   category
 4   avg_cur_bal           5778 non-null   int64   
 5   bc_util               5778 non-null   float64 
 6   delinq_2yrs           5778 non-null   int64   
 7   delinq_amnt           5778 non-null   int64   
 8   dti                   5778 non-null   float64 
 9   earliest_cr_line      5778 non-null   int64   
 10  emp_length            5777 non-null   float64 
 11  grade                 5778 non-null   category
 12  home_ownership        5778 non-null   category
 13  initial_list_status   5778 non-null   category
 14  int_rate              5778 non-null   float64 
 15  last

In [12]:
cat_attributes=list(x_train.select_dtypes(include='category'))
cat_attributes

['application_type',
 'grade',
 'home_ownership',
 'initial_list_status',
 'sub_grade']

In [13]:
num_attributes=list(x_train.select_dtypes(include=['float64', 'int64']))

pipeline=ColumnTransformer([
    ('num', KNNImputer(), num_attributes),
    ('cat', OrdinalEncoder(), cat_attributes)
])

In [14]:
# preparing the training data
x_train_prepared=pipeline.fit_transform(x_train)
x_train_prepared=pd.DataFrame(x_train_prepared)
x_train_prepared.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
0,0.0,2.0,21120.0,3662.0,86.1,0.0,0.0,33.69,2010.0,1.0,12.62,2019.0,7000.0,0.0,18.0,0.0,0.0,0.0,3828.54,0.0,0.0,36.0,0.0,65921.0,0.0,1.0,0.0,2.0,3.0,1.0,10.0
1,0.0,3.0,32400.0,2973.0,30.2,0.0,0.0,15.78,1975.0,3.0,11.99,2016.0,16200.0,2.0,8.0,0.0,0.0,0.0,0.0,3.0,2.0,36.0,0.0,17840.0,0.0,0.0,0.0,2.0,2.0,1.0,10.0
2,0.0,3.0,30251.0,1983.0,46.2,0.0,0.0,20.04,2010.0,2.0,11.49,2018.0,9600.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,19829.0,0.0,1.0,0.0,1.0,3.0,1.0,9.0
3,0.0,1.0,66976.0,1638.0,77.4,0.0,0.0,8.58,2012.0,1.0,12.62,2019.0,12000.0,0.0,6.0,0.0,0.0,0.0,9665.27,0.0,0.0,60.0,0.0,9830.0,0.0,1.0,0.0,2.0,2.0,1.0,10.0
4,0.0,2.0,125000.0,577.0,0.0,0.0,0.0,2.34,1982.0,10.0,6.49,2019.0,24000.0,4.0,13.0,0.0,0.0,0.0,11358.24,0.0,0.0,60.0,0.0,6924.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0


In [15]:
col_names=list(x_train.columns)

In [16]:
x_train_prepared.rename(columns=dict(zip(x_train_prepared.columns[0:], col_names)),inplace=True)
x_train_prepared.head()

Unnamed: 0,acc_now_delinq,acc_open_past_24mths,annual_inc,application_type,avg_cur_bal,bc_util,delinq_2yrs,delinq_amnt,dti,earliest_cr_line,emp_length,grade,home_ownership,initial_list_status,int_rate,last_credit_pull_d,loan_amnt,mort_acc,num_sats,num_tl_120dpd_2m,num_tl_90g_dpd_24m,num_tl_30dpd,out_prncp_inv,pub_rec,pub_rec_bankruptcies,sub_grade,term,tot_coll_amt,tot_cur_bal,total_rec_late_fee,verification_status
0,0.0,2.0,21120.0,3662.0,86.1,0.0,0.0,33.69,2010.0,1.0,12.62,2019.0,7000.0,0.0,18.0,0.0,0.0,0.0,3828.54,0.0,0.0,36.0,0.0,65921.0,0.0,1.0,0.0,2.0,3.0,1.0,10.0
1,0.0,3.0,32400.0,2973.0,30.2,0.0,0.0,15.78,1975.0,3.0,11.99,2016.0,16200.0,2.0,8.0,0.0,0.0,0.0,0.0,3.0,2.0,36.0,0.0,17840.0,0.0,0.0,0.0,2.0,2.0,1.0,10.0
2,0.0,3.0,30251.0,1983.0,46.2,0.0,0.0,20.04,2010.0,2.0,11.49,2018.0,9600.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,19829.0,0.0,1.0,0.0,1.0,3.0,1.0,9.0
3,0.0,1.0,66976.0,1638.0,77.4,0.0,0.0,8.58,2012.0,1.0,12.62,2019.0,12000.0,0.0,6.0,0.0,0.0,0.0,9665.27,0.0,0.0,60.0,0.0,9830.0,0.0,1.0,0.0,2.0,2.0,1.0,10.0
4,0.0,2.0,125000.0,577.0,0.0,0.0,0.0,2.34,1982.0,10.0,6.49,2019.0,24000.0,4.0,13.0,0.0,0.0,0.0,11358.24,0.0,0.0,60.0,0.0,6924.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0


In [17]:
fit_params={'eval_metric':'accuracy',
           'verbose':50,
           'feature_name':'auto',
           'categorical_feature':cat_attributes}

In [18]:
k_folds=StratifiedKFold(n_splits=10,
                       shuffle=True,
                       random_state=402)

In [19]:
model_search=RandomizedSearchCV(lgbm,
                               param_distributions=parameters,
                               n_iter=100,
                               scoring='accuracy',
                               n_jobs=-1,
                               cv=k_folds,
                               random_state=402)

In [20]:
%time model_search.fit(x_train_prepared, y_train, **fit_params)

New categorical_feature is ['application_type', 'grade', 'home_ownership', 'initial_list_status', 'sub_grade']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


CPU times: user 4.9 s, sys: 302 ms, total: 5.2 s
Wall time: 26min 44s


RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=402, shuffle=True),
                   error_score=nan,
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=4,
                                            num_leaves=31, objective=...
                                            subsample_for_bin=200000,
                                            subsample_freq=0),
                   iid='deprecated', n_iter=100, n_jobs=-1,
   

In [23]:
model_search.best_estimator_

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
               importance_type='split', learning_rate=0.03, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=4, num_leaves=50, objective='binary',
               random_state=402, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.67, subsample_for_bin=200000, subsample_freq=0)

In [25]:
lgbm_tuned=model_search.best_estimator_

Reading in the test data to make the predictions.

In [26]:
test_path=os.path.join('..', 'data', 'processed', 'test.csv')

test_data=pd.read_csv(test_path)
x_test=test_data.drop(columns=['addr_state', 'emp_title', 'id', 'purpose'])

In [27]:
for column in x_test.columns:
    col_type=x_test[column].dtype
    
    if col_type=='object' or col_type.name=='category':
        x_test[column]=x_test[column].astype('category')

Preparing the test data

In [41]:
x_test_prepared=pipeline.transform(x_test)
x_test_prepared=pd.DataFrame(x_test_prepared)

In [42]:
x_test_prepared.rename(columns=dict(zip(x_test_prepared.columns[0:], col_names)),inplace=True)
x_test_prepared.head()

Unnamed: 0,acc_now_delinq,acc_open_past_24mths,annual_inc,application_type,avg_cur_bal,bc_util,delinq_2yrs,delinq_amnt,dti,earliest_cr_line,emp_length,grade,home_ownership,initial_list_status,int_rate,last_credit_pull_d,loan_amnt,mort_acc,num_sats,num_tl_120dpd_2m,num_tl_90g_dpd_24m,num_tl_30dpd,out_prncp_inv,pub_rec,pub_rec_bankruptcies,sub_grade,term,tot_coll_amt,tot_cur_bal,total_rec_late_fee,verification_status
0,0.0,10.0,45000.0,3125.0,40.6,0.0,0.0,27.6,1987.0,10.0,17.57,2018.0,10000.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,379.0,65615.0,0.0,1.0,0.0,3.0,3.0,1.0,18.0
1,0.0,1.0,40000.0,1534.0,26.6,0.0,0.0,17.54,2010.0,3.0,8.24,2017.0,14400.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,13806.0,0.0,0.0,0.0,1.0,2.0,1.0,5.0
2,0.0,1.0,113000.0,49748.0,82.6,0.0,0.0,19.67,1999.0,8.0,7.49,2019.0,30000.0,4.0,9.0,0.0,0.0,0.0,8141.22,0.0,0.0,36.0,0.0,447731.0,0.0,1.0,0.0,0.0,1.0,1.0,3.0
3,0.0,6.0,105000.0,42642.0,81.8,0.0,0.0,30.47,1988.0,10.0,8.18,2019.0,10000.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,469067.0,0.0,0.0,0.0,1.0,1.0,1.0,5.0
4,0.0,9.0,80000.0,4766.0,52.2,0.0,0.0,22.94,1990.0,10.0,5.32,2019.0,6000.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,100087.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0


In [49]:
gbm_predictions=lgbm_tuned.predict(x_test_prepared)
gbm_predictions=pd.DataFrame(gbm_predictions).rename(columns={0:'Category'})
gbm_predictions.tail()

Unnamed: 0,Category
3813,1
3814,0
3815,1
3816,1
3817,1


In [50]:
observation_id=test_data['id'].copy()
observation_id=pd.DataFrame(observation_id).rename(columns={'id':'Id'})

In [51]:
gbm_preds=pd.concat([observation_id, gbm_predictions], axis='columns')
gbm_preds

Unnamed: 0,Id,Category
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
3813,3814,1
3814,3815,0
3815,3816,1
3816,3817,1


In [52]:
model_search.best_score_

0.9158890694620186

In [53]:
# turning the predictions column into character
gbm_preds['Category']=gbm_preds['Category'].apply(str)
gbm_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Id        3818 non-null   int64 
 1   Category  3818 non-null   object
dtypes: int64(1), object(1)
memory usage: 59.8+ KB


In [55]:
file_path=os.path.join('..', 'data', 'predictions', 'lgbm_predictions.csv')
gbm_preds.to_csv(file_path, index=False)

In [57]:
accuracy_score(y_train, lgbm_tuned.predict(x_train_prepared))

0.9574247144340602

95.7% accuracy on the training set. Will check what the sample accuracy on the test set is to determine if the model is overfitting.

The model got an accuracy of 92.3% for the testing set. Will need to implement some form of regularization to reduce the disparity between training and testing accuracy.