In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas_profiling 

# 전처리
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 데이터 분할
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV

# 모델링
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgbm

# 평가지표
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
#
import warnings

In [6]:
from bayes_opt import BayesianOptimization

In [7]:
final = pd.read_csv(r'C:\Users\KYOBO\Desktop\dataset\final.csv')

In [8]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13141813 entries, 0 to 13141812
Data columns (total 34 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   user_id                              float64
 1   sum_applied                          float64
 2   application_id                       int64  
 3   gender                               float64
 4   insert_time                          object 
 5   yearly_income                        float64
 6   income_type                          object 
 7   employment_type                      object 
 8   houseown_type                        object 
 9   desired_amount                       float64
 10  purpose                              object 
 11  personal_rehabilitation_yn           float64
 12  personal_rehabilitation_complete_yn  float64
 13  existing_loan_cnt                    float64
 14  existing_loan_amt                    float64
 15  reage                         

In [9]:
final = final.drop(['user_id', 'application_id', 'insert_time', 'loanapply_insert_time','loanapply_insert_hour'], axis = 1)

In [10]:
final['income_type'] = final['income_type'].astype('category')
final['employment_type'] = final['employment_type'].astype('category')
final['houseown_type'] = final['houseown_type'].astype('category')
final['purpose'] = final['purpose'].astype('category')
final['first_event'] = final['first_event'].astype('category')
final['weekday'] = final['weekday'].astype('category')

In [11]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13141813 entries, 0 to 13141812
Data columns (total 29 columns):
 #   Column                               Dtype   
---  ------                               -----   
 0   sum_applied                          float64 
 1   gender                               float64 
 2   yearly_income                        float64 
 3   income_type                          category
 4   employment_type                      category
 5   houseown_type                        category
 6   desired_amount                       float64 
 7   purpose                              category
 8   personal_rehabilitation_yn           float64 
 9   personal_rehabilitation_complete_yn  float64 
 10  existing_loan_cnt                    float64 
 11  existing_loan_amt                    float64 
 12  reage                                float64 
 13  career                               float64 
 14  credit_grade                         float64 
 15  spec_clust   

In [12]:
final = pd.get_dummies(final, columns = ['income_type', 'employment_type', 'houseown_type',
                                        'purpose','weekday','first_event'])

In [13]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13141813 entries, 0 to 13141812
Data columns (total 57 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   sum_applied                          float64
 1   gender                               float64
 2   yearly_income                        float64
 3   desired_amount                       float64
 4   personal_rehabilitation_yn           float64
 5   personal_rehabilitation_complete_yn  float64
 6   existing_loan_cnt                    float64
 7   existing_loan_amt                    float64
 8   reage                                float64
 9   career                               float64
 10  credit_grade                         float64
 11  spec_clust                           float64
 12  bank_id                              int64  
 13  product_id                           int64  
 14  loan_limit                           float64
 15  loan_rate                     

In [14]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [15]:
final_train = final[final['is_applied'].notnull()]
final_test = final[final['is_applied'].isnull()]

In [16]:
data = final_train.drop('is_applied',axis=1)
target = final_train['is_applied']

In [17]:
undersample = RandomUnderSampler(sampling_strategy='majority')
data_under, target_under = undersample.fit_resample(data, target)
print(Counter(target_under))

Counter({0.0: 560336, 1.0: 560336})


In [20]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data_under, target_under, test_size=0.3, random_state=0)

In [29]:
model1_xgb = xgb.XGBClassifier()
model1_xgb.fit(X_train,
              y_train,
              verbose=True,
              early_stopping_rounds=10,
              eval_metric='aucpr',
              eval_set=[(X_val,y_val)])

[0]	validation_0-aucpr:0.81412
[1]	validation_0-aucpr:0.82547
[2]	validation_0-aucpr:0.83213
[3]	validation_0-aucpr:0.83788
[4]	validation_0-aucpr:0.84129
[5]	validation_0-aucpr:0.84559
[6]	validation_0-aucpr:0.84798
[7]	validation_0-aucpr:0.85048
[8]	validation_0-aucpr:0.85227
[9]	validation_0-aucpr:0.85431
[10]	validation_0-aucpr:0.85626
[11]	validation_0-aucpr:0.85795
[12]	validation_0-aucpr:0.85898
[13]	validation_0-aucpr:0.85983
[14]	validation_0-aucpr:0.86115
[15]	validation_0-aucpr:0.86193
[16]	validation_0-aucpr:0.86302
[17]	validation_0-aucpr:0.86378
[18]	validation_0-aucpr:0.86429
[19]	validation_0-aucpr:0.86506
[20]	validation_0-aucpr:0.86566
[21]	validation_0-aucpr:0.86601
[22]	validation_0-aucpr:0.86651
[23]	validation_0-aucpr:0.86702
[24]	validation_0-aucpr:0.86753
[25]	validation_0-aucpr:0.86819
[26]	validation_0-aucpr:0.86867
[27]	validation_0-aucpr:0.86925
[28]	validation_0-aucpr:0.86956
[29]	validation_0-aucpr:0.86975
[30]	validation_0-aucpr:0.87033
[31]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [30]:
y_pred = model1_xgb.predict(X_val)

In [28]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [33]:
#f1 score
print('f1 score : {:0.4f}'.format(f1_score(y_val,y_pred)))

f1 score : 0.8308


In [25]:
plot_confusion_matrix(model1_xgb,
                     X_val,
                     y_val,
                     values_format='d',
                     display_labels=["0","1"])

XGBoostError: [16:42:17] c:\users\administrator\workspace\xgboost-win64_release_1.4.0\src\c_api\c_api_utils.h:161: Invalid missing value: null

In [None]:
eval_metric=F1_eval