## Importing packages

In [0]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

Now I will load the data set that is located in the data folder of my project directory.

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
train_path='/content/drive/My Drive/Classification-Competition/data/processed/train.csv'
train_data=pd.read_csv(train_path)
train_data.head()

Unnamed: 0,id,hi_int_prncp_pd,acc_now_delinq,acc_open_past_24mths,addr_state,annual_inc,application_type,avg_cur_bal,bc_util,delinq_2yrs,delinq_amnt,dti,earliest_cr_line,emp_length,emp_title,grade,home_ownership,initial_list_status,int_rate,last_credit_pull_d,loan_amnt,mort_acc,num_sats,num_tl_120dpd_2m,num_tl_90g_dpd_24m,num_tl_30dpd,out_prncp_inv,pub_rec,pub_rec_bankruptcies,purpose,sub_grade,term,tot_coll_amt,tot_cur_bal,total_rec_late_fee,verification_status
0,3819,0,0,2,CT,21120.0,Individual,3662,86.1,0,0,33.69,2010,1.0,Security Guard,C,RENT,w,12.62,2019,7000,0,18,0,0,0,3828.54,0,0,debt_consolidation,C1,36,0,65921,0.0,1
1,3820,0,0,3,CA,32400.0,Individual,2973,30.2,0,0,15.78,1975,3.0,Security Guard,C,OWN,w,11.99,2016,16200,2,8,0,0,0,0.0,3,2,debt_consolidation,C1,36,0,17840,0.0,0
2,3821,0,0,3,NY,30251.0,Individual,1983,46.2,0,0,20.04,2010,2.0,Security Guard,B,RENT,w,11.49,2018,9600,0,10,0,0,0,0.0,0,0,debt_consolidation,B5,36,0,19829,0.0,1
3,3822,0,0,1,NJ,66976.0,Individual,1638,77.4,0,0,8.58,2012,1.0,Security Guard,C,OWN,w,12.62,2019,12000,0,6,0,0,0,9665.27,0,0,debt_consolidation,C1,60,0,9830,0.0,1
4,3823,0,0,2,MO,125000.0,Individual,577,0.0,0,0,2.34,1982,10.0,Administrator,A,MORTGAGE,w,6.49,2019,24000,4,13,0,0,0,11358.24,0,0,debt_consolidation,A2,60,0,6924,0.0,1


Now that I have the data loaded I can clean it up and in order to feed it into xgboost model.

In [0]:
# will first try without using addr_state, purpose and emp_title; naive approach
y_train=train_data['hi_int_prncp_pd'].copy()

x_train_rf=train_data.drop(columns=['addr_state', 'emp_title', 'hi_int_prncp_pd', 'id', 'purpose'])
x_train_rf.head()

Unnamed: 0,acc_now_delinq,acc_open_past_24mths,annual_inc,application_type,avg_cur_bal,bc_util,delinq_2yrs,delinq_amnt,dti,earliest_cr_line,emp_length,grade,home_ownership,initial_list_status,int_rate,last_credit_pull_d,loan_amnt,mort_acc,num_sats,num_tl_120dpd_2m,num_tl_90g_dpd_24m,num_tl_30dpd,out_prncp_inv,pub_rec,pub_rec_bankruptcies,sub_grade,term,tot_coll_amt,tot_cur_bal,total_rec_late_fee,verification_status
0,0,2,21120.0,Individual,3662,86.1,0,0,33.69,2010,1.0,C,RENT,w,12.62,2019,7000,0,18,0,0,0,3828.54,0,0,C1,36,0,65921,0.0,1
1,0,3,32400.0,Individual,2973,30.2,0,0,15.78,1975,3.0,C,OWN,w,11.99,2016,16200,2,8,0,0,0,0.0,3,2,C1,36,0,17840,0.0,0
2,0,3,30251.0,Individual,1983,46.2,0,0,20.04,2010,2.0,B,RENT,w,11.49,2018,9600,0,10,0,0,0,0.0,0,0,B5,36,0,19829,0.0,1
3,0,1,66976.0,Individual,1638,77.4,0,0,8.58,2012,1.0,C,OWN,w,12.62,2019,12000,0,6,0,0,0,9665.27,0,0,C1,60,0,9830,0.0,1
4,0,2,125000.0,Individual,577,0.0,0,0,2.34,1982,10.0,A,MORTGAGE,w,6.49,2019,24000,4,13,0,0,0,11358.24,0,0,A2,60,0,6924,0.0,1


In [0]:
x_train_rf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5778 entries, 0 to 5777
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   acc_now_delinq        5778 non-null   int64  
 1   acc_open_past_24mths  5778 non-null   int64  
 2   annual_inc            5778 non-null   float64
 3   application_type      5778 non-null   object 
 4   avg_cur_bal           5778 non-null   int64  
 5   bc_util               5778 non-null   float64
 6   delinq_2yrs           5778 non-null   int64  
 7   delinq_amnt           5778 non-null   int64  
 8   dti                   5778 non-null   float64
 9   earliest_cr_line      5778 non-null   int64  
 10  emp_length            5777 non-null   float64
 11  grade                 5778 non-null   object 
 12  home_ownership        5778 non-null   object 
 13  initial_list_status   5778 non-null   object 
 14  int_rate              5778 non-null   float64
 15  last_credit_pull_d   

Setting up a pipeline to one hot encode variables to feed into XGBoost

In [0]:
# splitting up the variables based on type
num_attribs=list(x_train_rf.select_dtypes(include=['int64', 'float64']))
cat_attribs=list(x_train_rf.select_dtypes(include=['object']))

In [0]:
knn_impute=KNNImputer(n_neighbors=5)

full_pipeline=ColumnTransformer([
    ('num', knn_impute, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

Preparing the data

In [0]:
x_train_prepared=full_pipeline.fit_transform(x_train_rf)
pd.DataFrame(x_train_prepared).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75
0,0.0,2.0,21120.0,3662.0,86.1,0.0,0.0,33.69,2010.0,1.0,12.62,2019.0,7000.0,0.0,18.0,0.0,0.0,0.0,3828.54,0.0,0.0,36.0,0.0,65921.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,3.0,32400.0,2973.0,30.2,0.0,0.0,15.78,1975.0,3.0,11.99,2016.0,16200.0,2.0,8.0,0.0,0.0,0.0,0.0,3.0,2.0,36.0,0.0,17840.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,3.0,30251.0,1983.0,46.2,0.0,0.0,20.04,2010.0,2.0,11.49,2018.0,9600.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,19829.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,66976.0,1638.0,77.4,0.0,0.0,8.58,2012.0,1.0,12.62,2019.0,12000.0,0.0,6.0,0.0,0.0,0.0,9665.27,0.0,0.0,60.0,0.0,9830.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,2.0,125000.0,577.0,0.0,0.0,0.0,2.34,1982.0,10.0,6.49,2019.0,24000.0,4.0,13.0,0.0,0.0,0.0,11358.24,0.0,0.0,60.0,0.0,6924.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
# !pip install -q xgboost==1.1.0      # ----necessary to install latest version----

[K     |████████████████████████████████| 127.6MB 106kB/s 
[?25h

In [0]:
import xgboost as xgb

In [0]:
xgb.__version__

'1.1.0'

In [0]:
# will use this to tune the boosted tree model
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from xgboost.sklearn import XGBClassifier

Doing random parameter search using RandomizedGridSearch

In [0]:
boosted_estimator=XGBClassifier(objective='binary:logistic',
                                tree_method='gpu_hist',
                                gpu_id=0,
                                random_state=402)

In [0]:
# creating a dictionary that has the space that will be searched over
parameters={'max_depth': randint(2, 6), # default 3
            'colsample_bytree': uniform(0.7, 0.3),
            'gamma': uniform(0, 0.5), # it is fine to set it to 0
            'learning_rate': uniform(0.03, 0.3), # default 0.1 
            'n_estimators': randint(100, 150), # default 100 trees
            'subsample': [0.6, 0.7, 0.8]}

In [0]:
estimator_search=RandomizedSearchCV(boosted_estimator,
                                    param_distributions=parameters,
                                    n_iter=200,
                                    scoring='accuracy',
                                    cv=10)

In [0]:
# testing for GPU connection
# import tensorflow as tf
# tf.test.gpu_device_name()

'/device:GPU:0'

In [0]:
%time estimator_search.fit(x_train_prepared, y_train)

CPU times: user 8min 18s, sys: 2min 55s, total: 11min 14s
Wall time: 11min 18s


RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=0, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_e...
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f58c0df4630>,
                                        'max_depth'

In [0]:
estimator_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.867296867206786,
              gamma=0.4087139515691656, gpu_id=0, importance_type='gain',
              interaction_constraints='', learning_rate=0.0313900047678075,
              max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=109, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=402, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.7,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [0]:
tuned_xgb=estimator_search.best_estimator_

In [0]:
test_path='/content/drive/My Drive/Classification-Competition/data/processed/test.csv'

test_data=pd.read_csv(test_path)
test_data.head()

Unnamed: 0,id,acc_now_delinq,acc_open_past_24mths,addr_state,annual_inc,application_type,avg_cur_bal,bc_util,delinq_2yrs,delinq_amnt,dti,earliest_cr_line,emp_length,emp_title,grade,home_ownership,initial_list_status,int_rate,last_credit_pull_d,loan_amnt,mort_acc,num_sats,num_tl_120dpd_2m,num_tl_90g_dpd_24m,num_tl_30dpd,out_prncp_inv,pub_rec,pub_rec_bankruptcies,purpose,sub_grade,term,tot_coll_amt,tot_cur_bal,total_rec_late_fee,verification_status
0,1,0,10,NY,45000.0,Individual,3125,40.6,0,0,27.6,1987,10.0,Security Guard,D,RENT,w,17.57,2018,10000,0,21,0,0,0,0.0,0,0,medical,D4,60,379,65615,0.0,1
1,2,0,1,NY,40000.0,Individual,1534,26.6,0,0,17.54,2010,3.0,Security Guard,B,OWN,w,8.24,2017,14400,0,10,0,0,0,0.0,0,0,credit_card,B1,36,0,13806,0.0,0
2,3,0,1,FL,113000.0,Individual,49748,82.6,0,0,19.67,1999,8.0,Administrator,A,MORTGAGE,w,7.49,2019,30000,4,9,0,0,0,8141.22,0,0,debt_consolidation,A4,36,0,447731,0.0,1
3,4,0,6,CA,105000.0,Individual,42642,81.8,0,0,30.47,1988,10.0,Administrator,B,MORTGAGE,w,8.18,2019,10000,2,12,0,0,0,0.0,0,0,credit_card,B1,36,0,469067,0.0,0
4,5,0,9,CA,80000.0,Individual,4766,52.2,0,0,22.94,1990,10.0,Administrator,A,RENT,w,5.32,2019,6000,0,21,0,0,0,0.0,0,0,home_improvement,A1,36,0,100087,0.0,0


In [0]:
test_data_dropped=test_data.drop(columns=['addr_state', 'emp_title', 'id', 'purpose'])
test_data_dropped

Unnamed: 0,acc_now_delinq,acc_open_past_24mths,annual_inc,application_type,avg_cur_bal,bc_util,delinq_2yrs,delinq_amnt,dti,earliest_cr_line,emp_length,grade,home_ownership,initial_list_status,int_rate,last_credit_pull_d,loan_amnt,mort_acc,num_sats,num_tl_120dpd_2m,num_tl_90g_dpd_24m,num_tl_30dpd,out_prncp_inv,pub_rec,pub_rec_bankruptcies,sub_grade,term,tot_coll_amt,tot_cur_bal,total_rec_late_fee,verification_status
0,0,10,45000.0,Individual,3125,40.6,0,0,27.60,1987,10.0,D,RENT,w,17.57,2018,10000,0,21,0,0,0,0.00,0,0,D4,60,379,65615,0.0,1
1,0,1,40000.0,Individual,1534,26.6,0,0,17.54,2010,3.0,B,OWN,w,8.24,2017,14400,0,10,0,0,0,0.00,0,0,B1,36,0,13806,0.0,0
2,0,1,113000.0,Individual,49748,82.6,0,0,19.67,1999,8.0,A,MORTGAGE,w,7.49,2019,30000,4,9,0,0,0,8141.22,0,0,A4,36,0,447731,0.0,1
3,0,6,105000.0,Individual,42642,81.8,0,0,30.47,1988,10.0,B,MORTGAGE,w,8.18,2019,10000,2,12,0,0,0,0.00,0,0,B1,36,0,469067,0.0,0
4,0,9,80000.0,Individual,4766,52.2,0,0,22.94,1990,10.0,A,RENT,w,5.32,2019,6000,0,21,0,0,0,0.00,0,0,A1,36,0,100087,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,0,5,135000.0,Individual,3176,17.3,0,0,16.43,2005,4.0,B,RENT,w,12.73,2019,40000,0,14,0,0,0,36268.19,0,0,B5,36,0,44466,0.0,1
3814,0,4,68000.0,Individual,5553,69.9,0,0,32.93,2002,10.0,B,RENT,f,10.99,2019,7700,0,11,0,0,0,1227.14,0,0,B4,36,0,61078,0.0,1
3815,0,3,70000.0,Individual,8901,92.4,0,0,21.02,2005,4.0,D,OWN,w,21.49,2019,21000,0,5,0,0,0,15130.68,1,0,D5,60,0,44505,28.7,1
3816,0,7,98000.0,Individual,4580,65.0,2,0,15.73,2001,7.0,C,OWN,w,16.91,2019,5000,0,11,0,1,0,4331.34,0,0,C5,36,3212,50384,0.0,0


In [0]:
test_data_prepared=full_pipeline.transform(test_data_dropped)
pd.DataFrame(test_data_prepared).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75
0,0.0,10.0,45000.0,3125.0,40.6,0.0,0.0,27.6,1987.0,10.0,17.57,2018.0,10000.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,379.0,65615.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,40000.0,1534.0,26.6,0.0,0.0,17.54,2010.0,3.0,8.24,2017.0,14400.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,13806.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,113000.0,49748.0,82.6,0.0,0.0,19.67,1999.0,8.0,7.49,2019.0,30000.0,4.0,9.0,0.0,0.0,0.0,8141.22,0.0,0.0,36.0,0.0,447731.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,6.0,105000.0,42642.0,81.8,0.0,0.0,30.47,1988.0,10.0,8.18,2019.0,10000.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,469067.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,9.0,80000.0,4766.0,52.2,0.0,0.0,22.94,1990.0,10.0,5.32,2019.0,6000.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,100087.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
predictions=tuned_xgb.predict(test_data_prepared)
predictions=pd.DataFrame(predictions).rename(columns={0:'Category'})
predictions.head()

Unnamed: 0,Category
0,0
1,0
2,0
3,0
4,0


In [0]:
observation_id=test_data['id'].copy()
pd.DataFrame(observation_id).rename(columns={'id':'Id'})

Unnamed: 0,Id
0,1
1,2
2,3
3,4
4,5
...,...
3813,3814
3814,3815
3815,3816
3816,3817


In [0]:
submission=pd.concat([observation_id, predictions], axis='columns')
submission

Unnamed: 0,id,Category
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
3813,3814,1
3814,3815,0
3815,3816,1
3816,3817,1


In [0]:
drive_path='/content/drive/My Drive/Classification-Competition/data/predictions/xgb_predictions.csv'
submission.to_csv(drive_path, index=False)

In [0]:
estimator_search.best_score_

0.91658111098451

In [0]:
accuracy_score(y_train, tuned_xgb.predict(x_train_prepared))

0.9323295257874697