In [1]:
import pandas as pd 
import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('font', family = 'Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'Malgun Gothic'
import seaborn as sns
import scipy.stats as stats

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [3]:
df_raw = pd.read_csv('data/bank.csv')
df_raw['JOB'].fillna('Other', inplace = True)
df_raw.fillna(df_raw.mean(), inplace = True)

df_raw_dummy = pd.get_dummies(df_raw)

df_train, df_test = train_test_split(df_raw_dummy, test_size= 0.2, random_state=1234)
print('학습용 데이터의 크기: {}'.format(df_train.shape))
print('평가용 데이터의 크기: {}'.format(df_test.shape))

df_train_y = df_train['BAD']
df_train_x = df_train.drop(columns = 'BAD', inplace = False)
df_test_y = df_test['BAD']
df_test_x = df_test.drop(columns = 'BAD', inplace = False)

학습용 데이터의 크기: (2998, 19)
평가용 데이터의 크기: (750, 19)


  df_raw.fillna(df_raw.mean(), inplace = True)


In [5]:
# 개별 모델 생성

dt_model = DecisionTreeClassifier(max_depth = 6, random_state = 1234)
rf_model = RandomForestClassifier(n_estimators=100, random_state=1234)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=1234)

dt_model.fit(df_train_x, df_train_y)
rf_model.fit(df_train_x, df_train_y)
gb_model.fit(df_train_x, df_train_y)

dt_pred = dt_model.predict(df_test_x)
rf_pred = rf_model.predict(df_test_x)
gb_pred = gb_model.predict(df_test_x)


print('의사결정나무의 정확도: {:.3f}'.format(accuracy_score(df_test_y, dt_pred)))
print('랜덤포레스트의 정확도: {:.3f}'.format(accuracy_score(df_test_y, rf_pred)))
print('그래디언트부스팅의 정확도: {:.3f}'.format(accuracy_score(df_test_y, gb_pred)))




의사결정나무의 정확도: 0.932
랜덤포레스트의 정확도: 0.949
그래디언트부스팅의 정확도: 0.941


In [9]:
# 최종 데이터셋 구성

pred = np.array([dt_pred, rf_pred, gb_pred])
print(pred.shape)

(3, 750)


In [10]:
pred = np.transpose(pred)
print(pred.shape)

(750, 3)


In [12]:
# 최종 모델

log_model = LogisticRegression(C = 10)
log_model.fit(pred, df_test_y)
final = log_model.predict(pred)

print('최종정확도: {:.3f}'.format(accuracy_score(df_test_y, final)))
print(classification_report(df_test_y, final))

최종정확도: 0.951
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       684
           1       1.00      0.44      0.61        66

    accuracy                           0.95       750
   macro avg       0.97      0.72      0.79       750
weighted avg       0.95      0.95      0.94       750



In [16]:
from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

#1

In [19]:
# 개별 모델 생성


gb_model = GradientBoostingClassifier(n_estimators=100, random_state=1234)
xgb_model = XGBClassifier(learning_rate = 0.3, booster = 'gbtree', max_depth = 5, n_estimators = 150, gamma = 0,
                          objective = 'binary:logistic', random_state = 1234)
lgbm_model = LGBMClassifier(learning_rate=0.3, max_depth=9, num_leaves = 33, random_state=1234)

lgbm_model.fit(df_train_x, df_train_y)
xgb_model.fit(df_train_x, df_train_y)
gb_model.fit(df_train_x, df_train_y)

lgbm_pred = lgbm_model.predict(df_test_x)
xgb_pred = xgb_model.predict(df_test_x)
gb_pred = gb_model.predict(df_test_x)


print('Light GBM의 정확도: {:.3f}'.format(accuracy_score(df_test_y, lgbm_pred)))
print('XGBoosting의 정확도: {:.3f}'.format(accuracy_score(df_test_y, xgb_pred)))
print('그래디언트부스팅의 정확도: {:.3f}'.format(accuracy_score(df_test_y, gb_pred)))



Light GBM의 정확도: 0.953
XGBoosting의 정확도: 0.953
그래디언트부스팅의 정확도: 0.941


In [26]:
pred = np.array([lgbm_pred, xgb_pred, gb_pred])

In [27]:
pred.shape

(3, 750)

In [28]:
pred = np.transpose(pred)

In [29]:
pred.shape

(750, 3)

In [34]:
model = LogisticRegression(C = 10)
model.fit(pred, df_test_y)
final = model.predict(pred)
print(classification_report(df_test_y, final))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       684
           1       0.93      0.56      0.70        66

    accuracy                           0.96       750
   macro avg       0.94      0.78      0.84       750
weighted avg       0.96      0.96      0.95       750



#2

In [35]:
# 개별 모델 생성

gb_model = GradientBoostingClassifier(n_estimators=100, random_state=1234)
xgb_model = XGBClassifier(learning_rate = 0.3, booster = 'gbtree', max_depth = 5, n_estimators = 150, gamma = 0,
                          objective = 'binary:logistic', random_state = 1234)
lgbm_model = LGBMClassifier(learning_rate=0.3, max_depth=9, num_leaves = 33, random_state=1234)

lgbm_model.fit(df_train_x, df_train_y)
xgb_model.fit(df_train_x, df_train_y)
gb_model.fit(df_train_x, df_train_y)

lgbm_pred = lgbm_model.predict(df_train_x)
xgb_pred = xgb_model.predict(df_train_x)
gb_pred = gb_model.predict(df_train_x)


# print('Light GBM의 정확도: {:.3f}'.format(accuracy_score(df_test_y, lgbm_pred)))
# print('XGBoosting의 정확도: {:.3f}'.format(accuracy_score(df_test_y, xgb_pred)))
# print('그래디언트부스팅의 정확도: {:.3f}'.format(accuracy_score(df_test_y, gb_pred)))





In [44]:
pred = np.array([lgbm_pred, xgb_pred, gb_pred])

In [45]:
pred.shape

(3, 2998)

In [46]:
pred = np.transpose(pred)
pred.shape

(2998, 3)

In [41]:
df_train_y.shape

(2998,)

In [43]:
df_test_x.shape

(750, 18)

In [49]:
model = LogisticRegression(C = 10)
model.fit(pred, df_train_y)
final = model.predict(pred)
print(classification_report(df_train_y, final))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2719
           1       1.00      1.00      1.00       279

    accuracy                           1.00      2998
   macro avg       1.00      1.00      1.00      2998
weighted avg       1.00      1.00      1.00      2998

