In [1]:
# 패키지 설치
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, precision_score, recall_score, auc
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report, accuracy_score, f1_score

from bayes_opt import BayesianOptimization

In [2]:
import lightgbm
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

In [3]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV

In [4]:
final_df = pd.read_csv('/Users/sungahwang/Desktop/BC_data/final.csv')
final_df.head()

Unnamed: 0,user_id,sum_applied,application_id,gender,insert_time,yearly_income,income_type,employment_type,houseown_type,desired_amount,...,is_applied,weekday,loanapply_insert_hour,bank_apply_rate,product_apply_rate,cluster2,action_cnt,use_day_cnt,first_event,timeout_sum(s)
0,545882.0,8.0,576643,1.0,2022-05-09 10:54:51,72000000.0,직장가입자(4대보험O),정규직,전월세,20000000.0,...,0.0,0,10,0.042501,0.050284,0,41.913806,5.765978,CompleteIDCertification,67.997282
1,545882.0,8.0,576643,1.0,2022-05-09 10:54:51,72000000.0,직장가입자(4대보험O),정규직,전월세,20000000.0,...,0.0,0,10,0.017018,0.016149,0,41.913806,5.765978,CompleteIDCertification,67.997282
2,558819.0,5.0,2136706,1.0,2022-05-09 10:41:05,39000000.0,직장가입자(4대보험O),정규직,전월세,80000000.0,...,0.0,0,10,0.024636,0.028622,0,41.913806,5.765978,CompleteIDCertification,67.997282
3,558819.0,5.0,2136706,1.0,2022-05-09 10:41:05,39000000.0,직장가입자(4대보험O),정규직,전월세,80000000.0,...,0.0,0,10,0.021634,0.035367,0,41.913806,5.765978,CompleteIDCertification,67.997282
4,558819.0,5.0,2136706,1.0,2022-05-09 10:41:05,39000000.0,직장가입자(4대보험O),정규직,전월세,80000000.0,...,0.0,0,10,0.027749,0.035265,4,41.913806,5.765978,CompleteIDCertification,67.997282


In [5]:
final = final_df.drop(['user_id', 'application_id', 'insert_time', 'loanapply_insert_time'], axis = 1)

In [6]:
final['income_type'] = final['income_type'].astype('category')
final['employment_type'] = final['employment_type'].astype('category')
final['houseown_type'] = final['houseown_type'].astype('category')
final['purpose'] = final['purpose'].astype('category')
final['first_event'] = final['first_event'].astype('category')
final['weekday'] = final['weekday'].astype('category')
final['personal_rehabilitation_yn'] = final['personal_rehabilitation_yn'].astype('category')
final['personal_rehabilitation_complete_yn'] = final['personal_rehabilitation_complete_yn'].astype('category')
final['gender'] = final['gender'].astype('category')
final['spec_clust'] = final['spec_clust'].astype('category')
final['cluster2'] = final['cluster2'].astype('category')

In [7]:
final = pd.get_dummies(final)

In [8]:
final['yearly_income']=np.log(final['yearly_income'])
final["desired_amount"]=np.log(final["desired_amount"])
final["existing_loan_amt"]=np.log(final["existing_loan_amt"])

In [9]:
final_train = final[final['is_applied'].notnull()]
final_test = final[final['is_applied'].isnull()]

In [10]:
data = final_train.drop('is_applied',axis=1)
target = final_train['is_applied']

In [11]:
final_test_train = final_test.drop('is_applied',axis=1)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data, target, test_size=0.3, random_state=0)

# lgb + random search

In [13]:
parameters = {'num_leaves':[50, 60, 70], 'min_child_samples':[13,15,17],'max_depth':[15,20,25],
             'learning_rate':[0.05,0.1,0.15], 'scale_pos_weight':[4,5,6,8]}

In [14]:
lgb = LGBMClassifier()

In [15]:
clf = RandomizedSearchCV(lgb, parameters, scoring='f1', n_iter=10)
clf.fit(X = X_train, y = y_train)
print(clf.best_params_)

{'scale_pos_weight': 4, 'num_leaves': 60, 'min_child_samples': 15, 'max_depth': 20, 'learning_rate': 0.15}


In [16]:
lgb_param = LGBMClassifier(scale_pos_weight = 4, num_leaves = 70, min_child_samples = 13, max_depth = 25, learning_rate = 0.15)
lgb_param.fit(X_train, y_train)
lgb_param_preds = lgb_param.predict(X_val)

In [17]:
#f1 score
print('f1 score : {:0.4f}'.format(f1_score(y_val, lgb_param_preds)))
#정확도
print('accuracy: {:0.4f}'.format(accuracy_score(y_val, lgb_param_preds)))
# 정밀도
print('precision: {:0.4f}'.format(precision_score(y_val, lgb_param_preds)))
#재현율
print('recall: {:0.4f}'.format(recall_score(y_val,lgb_param_preds)))
# ROC_AUC
print('roc_auc: {:0.4f}'.format(roc_auc_score(y_val, lgb_param_preds)))

f1 score : 0.4105
accuracy: 0.9158
precision: 0.3403
recall: 0.5174
roc_auc: 0.7285


* 추가

In [18]:
lgb_param = LGBMClassifier(scale_pos_weight = 4, n_estimators = 800,
                           objective = 'binary', metric='f1',
                           num_leaves = 72 ,
                           min_child_samples = 12, max_depth = 25,
                           learning_rate = 0.3)
lgb_param.fit(X_train, y_train)
lgb_param_preds = lgb_param.predict(X_val)

In [19]:
#f1 score
print('f1 score : {:0.4f}'.format(f1_score(y_val, lgb_param_preds)))
#정확도
print('accuracy: {:0.4f}'.format(accuracy_score(y_val, lgb_param_preds)))
# 정밀도
print('precision: {:0.4f}'.format(precision_score(y_val, lgb_param_preds)))
#재현율
print('recall: {:0.4f}'.format(recall_score(y_val,lgb_param_preds)))
# ROC_AUC
print('roc_auc: {:0.4f}'.format(roc_auc_score(y_val, lgb_param_preds)))

f1 score : 0.4263
accuracy: 0.9169
precision: 0.3501
recall: 0.5450
roc_auc: 0.7421


In [20]:
lgb_param = LGBMClassifier(scale_pos_weight = 4, n_estimators = 800,
                           objective = 'binary', metric='f1',
                           num_leaves = 72 ,
                           min_child_samples = 12, max_depth = 25,
                           learning_rate = 0.3)
lgb_param.fit(X_train, y_train)
lgb_param_preds = lgb_param.predict(X_train)

In [21]:
#f1 score
print('f1 score : {:0.4f}'.format(f1_score(y_train, lgb_param_preds)))
#정확도
print('accuracy: {:0.4f}'.format(accuracy_score(y_train, lgb_param_preds)))
# 정밀도
print('precision: {:0.4f}'.format(precision_score(y_train, lgb_param_preds)))
#재현율
print('recall: {:0.4f}'.format(recall_score(y_train,lgb_param_preds)))
# ROC_AUC
print('roc_auc: {:0.4f}'.format(roc_auc_score(y_train, lgb_param_preds)))

f1 score : 0.4592
accuracy: 0.9215
precision: 0.3766
recall: 0.5884
roc_auc: 0.7649


# lgb + rus

In [22]:
rus = RandomUnderSampler(random_state= 42,sampling_strategy= {0:600000, 1:392199 })

In [23]:
X_rus, y_rus = rus.fit_resample(X_train, y_train)
print(Counter(y_rus))

Counter({0.0: 600000, 1.0: 392199})


In [24]:
parameters2 = {'num_leaves':[80, 90, 100, 110, 120], 'min_child_samples':[8,10,12],'max_depth':[10,20,30],
             'learning_rate':[0.1,0.15, 0.2, 0.25]}

In [25]:
lgb = LGBMClassifier()

In [26]:
clf = RandomizedSearchCV(lgb, parameters2, scoring='f1',n_iter=10)
clf.fit(X = X_rus, y = y_rus)
print(clf.best_params_)

{'num_leaves': 120, 'min_child_samples': 12, 'max_depth': 20, 'learning_rate': 0.25}


In [27]:
lgb_param2 = LGBMClassifier(num_leaves = 110, min_child_samples = 10, max_depth = 30, learning_rate = 0.25)
lgb_param2.fit(X_rus, y_rus)
lgb_param_preds2 = lgb_param2.predict(X_val)

In [28]:
#f1 score
print('f1 score : {:0.4f}'.format(f1_score(y_val, lgb_param_preds2)))
#정확도
print('accuracy: {:0.4f}'.format(accuracy_score(y_val, lgb_param_preds2)))
# 정밀도
print('precision: {:0.4f}'.format(precision_score(y_val, lgb_param_preds2)))
#재현율
print('recall: {:0.4f}'.format(recall_score(y_val,lgb_param_preds2)))
# ROC_AUC
print('roc_auc: {:0.4f}'.format(roc_auc_score(y_val, lgb_param_preds2)))

f1 score : 0.3527
accuracy: 0.8329
precision: 0.2260
recall: 0.8030
roc_auc: 0.8189


* 추가

In [29]:
lgb_param2 = LGBMClassifier(n_estimators = 1500, num_leaves = 110,
                            objective = 'binary', metric='f1',
                            min_child_samples = 12, max_depth = 28,
                            learning_rate = 0.35)
lgb_param2.fit(X_rus, y_rus)
lgb_param_preds2 = lgb_param2.predict(X_val)

In [30]:
#f1 score
print('f1 score : {:0.4f}'.format(f1_score(y_val, lgb_param_preds2)))
#정확도
print('accuracy: {:0.4f}'.format(accuracy_score(y_val, lgb_param_preds2)))
# 정밀도
print('precision: {:0.4f}'.format(precision_score(y_val, lgb_param_preds2)))
#재현율
print('recall: {:0.4f}'.format(recall_score(y_val,lgb_param_preds2)))
# ROC_AUC
print('roc_auc: {:0.4f}'.format(roc_auc_score(y_val, lgb_param_preds2)))

f1 score : 0.3630
accuracy: 0.8412
precision: 0.2349
recall: 0.7981
roc_auc: 0.8210


In [31]:
lgb_param2 = LGBMClassifier(n_estimators = 1500, num_leaves = 110, objective = 'binary',
                            metric='f1', min_child_samples = 12,
                            max_depth = 28, learning_rate = 0.35)
lgb_param2.fit(X_rus, y_rus)
lgb_param_preds2 = lgb_param2.predict(X_train)

In [32]:
#f1 score
print('f1 score : {:0.4f}'.format(f1_score(y_train, lgb_param_preds2)))
#정확도
print('accuracy: {:0.4f}'.format(accuracy_score(y_train, lgb_param_preds2)))
# 정밀도
print('precision: {:0.4f}'.format(precision_score(y_train, lgb_param_preds2)))
#재현율
print('recall: {:0.4f}'.format(recall_score(y_train,lgb_param_preds2)))
# ROC_AUC
print('roc_auc: {:0.4f}'.format(roc_auc_score(y_train, lgb_param_preds2)))

f1 score : 0.4303
accuracy: 0.8584
precision: 0.2787
recall: 0.9440
roc_auc: 0.8986


# lgb+ random search : Best

In [33]:
lbg_preds_test = lgb_param.predict(final_test_train.to_numpy())
lbg_preds_test

array([1., 0., 0., ..., 0., 0., 0.])

In [34]:
final_test.loc[(final_test['is_applied'].isnull()), 'is_applied'] = lbg_preds_test

In [35]:
final_target = final_test['is_applied']

# Dataset: application_id, product_id, 대출 여부(예측)

In [36]:
final_test_df = final_df[final_df['is_applied'].isnull()]

In [37]:
final_test_df = final_test_df[['application_id', 'product_id']]
final_test_df

Unnamed: 0,application_id,product_id
9325995,1748340,191
9325996,1748340,169
9325997,1748340,7
9325998,1748340,268
9325999,1748340,118
...,...,...
12581472,1428218,200
12581473,1428218,7
12581474,1428218,257
12581475,1428218,110


In [38]:
df_concat = pd.concat([final_test_df, final_target], axis=1)

In [39]:
df_concat

Unnamed: 0,application_id,product_id,is_applied
9325995,1748340,191,1.0
9325996,1748340,169,0.0
9325997,1748340,7,0.0
9325998,1748340,268,1.0
9325999,1748340,118,0.0
...,...,...,...
12581472,1428218,200,0.0
12581473,1428218,7,0.0
12581474,1428218,257,0.0
12581475,1428218,110,0.0


In [40]:
df_concat.to_csv("/Users/sungahwang/Desktop/BC_data/lgb_test_target.csv", index = False)