In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate, train_test_split, StratifiedKFold
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
# Directory that holds the input CSVs. Kept in one place so the data can be
# relocated by editing a single constant instead of every read_csv call.
DATA_DIR = '/최종제출서류/추가제출파일'

# Load the labelled training rows and the rows to be predicted.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Class balance of the target, shown as fractions of all rows.
# `normalize` is a boolean flag; pass it by name instead of a truthy magic number.
train['is_applied'].value_counts(normalize=True)

0.0    0.939379
1.0    0.060621
Name: is_applied, dtype: float64

In [None]:
# Model training --> cross-validated baseline that uses the whole training set.
num_seed = 42
num_cv = 5

# Feature matrix: drop the target together with the columns excluded from the model.
X_features = train.drop(
    ['is_applied', 'application_id', 'user_id', 'gender', 'personal_rehabilitation_yn',
     'personal_rehabilitation_complete_yn', 'SignUp'],
    axis=1,
)
y_label = train['is_applied']

# boost_from_average=False --> used when the label values are imbalanced
lgbm = LGBMClassifier(
    n_estimators=1000,
    max_depth=6,
    boosting_type='gbdt',
    n_jobs=-1,
    boost_from_average=False,
    random_state=num_seed,
)

# Score the classifier on num_cv folds with several metrics at once.
scores = cross_validate(
    lgbm,
    X_features,
    y_label,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    cv=num_cv,
)

# Raw per-fold results first, then the mean of each test metric on its own line.
print(scores)
print('\n\n')
print('\n'.join(
    f"{label} : {np.mean(scores[key])}"
    for label, key in (
        ('acc', 'test_accuracy'),
        ('pre', 'test_precision'),
        ('rec', 'test_recall'),
        ('f1', 'test_f1'),
        ('roc_auc', 'test_roc_auc'),
    )
))

{'fit_time': array([784.8736949 , 762.23326349, 784.23655796, 784.22426629,
       790.60042787]), 'score_time': array([81.27342391, 83.86290574, 86.05246258, 85.93022633, 91.11342716]), 'test_accuracy': array([0.94151718, 0.94158085, 0.94144383, 0.94121294, 0.94165005]), 'test_precision': array([0.59380824, 0.59738214, 0.58742313, 0.58011612, 0.59696952]), 'test_recall': array([0.11163853, 0.11141249, 0.11444136, 0.1095364 , 0.11532289]), 'test_f1': array([0.18794296, 0.18780005, 0.19156262, 0.18427782, 0.19330334]), 'test_roc_auc': array([0.88637884, 0.88701592, 0.8842713 , 0.8839245 , 0.88443145])}



acc : 0.9414809718459048
pre : 0.5911398289518732
rec : 0.11247033294907438
f1 : 0.1889773570683686
roc_auc : 0.8852044022317376


In [None]:
# Hold-out experiment: fit on a stratified 80% split of the training data and
# monitor AUC on the remaining 20% for early stopping.
num_seed = 42
num_cv = 5
X_features = train.drop(columns=['is_applied', 'application_id', 'user_id', 'gender', 'personal_rehabilitation_yn',
                                 'personal_rehabilitation_complete_yn', 'SignUp'])
y_label = train['is_applied']

# Same column selection applied to the rows that will be predicted.
X_test = test.drop(columns=['is_applied', 'application_id', 'user_id', 'gender', 'personal_rehabilitation_yn',
                            'personal_rehabilitation_complete_yn', 'SignUp'])

# boost_from_average=False --> used when the label values are imbalanced
lgbm = LGBMClassifier(n_estimators=1000, max_depth=6, boosting_type='gbdt', n_jobs=-1, boost_from_average=False, random_state=num_seed)

# Stratify on the target so both splits keep the same class ratio; reuse num_seed
# so the split is reproducible with the same seed as the model.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_features, y_label, shuffle=True, test_size=0.2, random_state=num_seed, stratify=y_label)

# Fit on the training split's own labels (y_train). Passing the full y_label here
# would pair X_train with a target vector of a different length.
# NOTE(review): `verbose` / `early_stopping_rounds` as fit() keyword arguments appear
# to have been removed in LightGBM 4.x in favour of callbacks - confirm the installed version.
lgbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=1, early_stopping_rounds=500, eval_metric='auc')


In [None]:
# Final prediction - train one model per StratifiedKFold fold and predict the
# test data with each fitted model's predict method.

num_seed = 42
num_cv = 5

X_features = train.drop(columns=['is_applied', 'application_id', 'user_id', 'gender', 'personal_rehabilitation_yn',
                                 'personal_rehabilitation_complete_yn', 'SignUp'])
y_label = train['is_applied']

X_test = test.drop(columns=['is_applied', 'application_id', 'user_id', 'gender', 'personal_rehabilitation_yn',
                            'personal_rehabilitation_complete_yn', 'SignUp'])

skf = StratifiedKFold(n_splits=num_cv, shuffle=True, random_state=num_seed)
# boost_from_average=False --> used when the label values are imbalanced
lgbm = LGBMClassifier(n_estimators=1000, max_depth=6, boosting_type='gbdt', n_jobs=-1, boost_from_average=False, random_state=num_seed)

# Running mean of the per-fold class predictions, one entry per test row.
# predict() returns class labels, so with 0/1 labels this ends up being the
# fraction of fold models that voted for class 1.
result = np.zeros(len(X_test))

for train_index, valid_index in skf.split(X_features, y_label):
    X_train = X_features.iloc[train_index]
    X_valid = X_features.iloc[valid_index]
    y_train = y_label.iloc[train_index]
    y_valid = y_label.iloc[valid_index]

    # Refit the same estimator on this fold; the held-out fold drives early stopping.
    lgbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=1, early_stopping_rounds=500, eval_metric='auc')

    # Divide by the actual number of folds (not a literal 5) so the average
    # stays correct when num_cv is changed.
    result += lgbm.predict(X_test) / num_cv
  

In [None]:
# Predict 1 when the average of the fold models' predictions exceeds the
# threshold, otherwise 0.
VOTE_THRESHOLD = 0.5

test['is_applied'] = result
# Vectorized comparison instead of a row-wise apply(lambda ...); the strict ">"
# is kept, so a value exactly equal to the threshold still maps to 0.
test['is_applied'] = (test['is_applied'] > VOTE_THRESHOLD).astype('int64')

# Drop every listed column before export; whatever columns remain, including
# the new is_applied, are written out.
# NOTE: `test` is overwritten here, so re-running this cell without reloading
# the test CSV raises KeyError on the already-dropped columns.
test = test.drop(columns = ['user_id', 'gender', 'credit_score', 'yearly_income',
       'company_enter_month', 'houseown_type', 'desired_amount', 'purpose',
       'personal_rehabilitation_yn', 'personal_rehabilitation_complete_yn',
       'existing_loan_cnt', 'existing_loan_amt', 'num_event', 'time_gap',
       'CompleteIDCertification', 'EndLoanApply', 'GetCreditInfo', 'Login',
       'OpenApp', 'SignUp', 'StartLoanApply', 'UseDSRCalc', 'UseLoanManage',
       'UsePrepayCalc', 'ViewLoanApplyIntro', 'rehabailitation_total',
       'age_group', 'income_employment_type', 'existing_avg_loan_amt',
       'year_income_age', 'bank_id','loan_limit', 'loan_rate',
       'insert_year', 'insert_month', 'insert_day', 'loan_day',
       'loan_hour', 'loan_minute'])
test.to_csv('result.csv', index=False)