In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

data = pd.read_csv("DATA/Standard_Scaler_result.csv",encoding="cp949")
data.head()

Unnamed: 0,CUSTNO,GNO,CBSCORE,CBSCOREGRD,CREDITOTAMT,YSALEAMT,ESTMM,ASSETAMT,IMSAAMT,IMJUAMT,...,환율,GDP,소상공인체감지수,실업률,물가지수,국고채,금리,유가등락률,소비자심리지수,ONEHOT
0,475821,l180202101898,1.483693,-1.008776,-0.182324,5.59941,-0.351065,-0.497246,-0.176818,-0.327516,...,-0.06809,2.274533,-1.302456,-0.290076,2.728294,-1.492204,-3.80167,2.712699,1.30053,3
1,74417,l230201700120,0.43662,-0.269293,-0.187359,3.004204,-0.774902,-0.434147,-0.395499,0.047573,...,0.509001,-1.236236,-1.058419,0.571922,-0.733929,-0.36394,-0.649856,1.705658,-1.167094,8
2,387787,l110201603233,0.68299,-1.008776,-0.283032,1.765241,1.889216,-0.497246,-0.176818,-0.327516,...,0.991659,-1.589789,0.8532,-0.290076,-1.872513,-0.546902,-0.649856,0.170943,-0.965339,7
3,395418,l230201700254,0.169719,-0.269293,-0.212536,1.64796,-0.744628,-0.497246,-0.176818,-0.327516,...,0.490639,-1.236236,1.219254,-0.721075,-0.788147,-0.241965,-0.649856,0.665173,-0.810142,3
4,190372,l200201601418,-3.033092,2.688638,-0.247784,0.478163,1.616749,-0.202785,-0.351763,-0.327516,...,0.409321,-1.809655,-0.10261,-0.721075,-2.530877,-1.736154,-0.649856,-0.958727,-0.142797,7


In [4]:
features = data[['CBSCORE', 'CREDITOTAMT', 'YSALEAMT',
       'ESTMM',  'IMSAAMT', 'IMJUAMT', 'BUSAAMT', 'BUJUAMT',
       'BU1TOTAMT', 'GAMT', 'LABORCNT', 'KOSPI', '환율', 'GDP', 
       '소상공인체감지수', '실업률', '물가지수', '국고채', '금리', '유가등락률',
       '소비자심리지수','ONEHOT']]
act = data['ACTCD']

In [5]:
from sklearn.model_selection import train_test_split

train_x, test_x,train_y, test_y = train_test_split(features, act,
stratify=act,train_size=0.7,test_size=0.3,random_state=1)

print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(27381, 22) (11736, 22) (27381,) (11736,)


In [6]:
from imblearn.over_sampling import BorderlineSMOTE, ADASYN
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks
from imblearn.combine import SMOTEENN, SMOTETomek

#언더 샘플링
enn = EditedNearestNeighbours(kind_sel="all", n_neighbors=10)
tomekl = TomekLinks()

#오버 샘플링
bsmote = BorderlineSMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

#혼합 샘플링
smotee = SMOTEENN(random_state=42)
smoteT = SMOTETomek(random_state=42)


X_under1_train, Y_under1_train = enn.fit_resample(train_x, train_y)
X_under2_train, Y_under2_train = tomekl.fit_resample(train_x, train_y)

X_over1_train, Y_over1_train = bsmote.fit_resample(train_x,train_y)
X_over2_train, Y_over2_train = adasyn.fit_resample(train_x,train_y)

X_comb1_train, Y_comb1_train = smotee.fit_resample(train_x, train_y)
X_comb2_train, Y_comb2_train = smoteT.fit_resample(train_x, train_y)

BorderlineSMOTE

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2, algorithm='kd_tree',weights='uniform')
abst_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=100, learning_rate=0.1)

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('knn', knn_clf), ('adaboost', abst_clf)],
    voting = 'soft'
)

voting_clf.fit(X_over1_train,Y_over1_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2)),
                             ('adaboost',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                                                 learning_rate=0.1,
                                                 n_estimators=100))],
                 voting='soft')

In [8]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

for clf in (log_clf, rnd_clf, knn_clf, voting_clf):
    clf.fit(X_over1_train, Y_over1_train)
    y_pred = clf.predict(test_x)
    cfmat = confusion_matrix(test_y,y_pred)
    print(clf)
    print(cfmat)
    print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

LogisticRegression(random_state=42)
[[7533 3133]
 [ 361  709]]
              precision    recall  f1-score   support

        정상보증       0.95      0.71      0.81     10666
        사고보증       0.18      0.66      0.29      1070

    accuracy                           0.70     11736
   macro avg       0.57      0.68      0.55     11736
weighted avg       0.88      0.70      0.76     11736

RandomForestClassifier(max_depth=50, max_features=1, n_estimators=200)
[[10381   285]
 [  754   316]]
              precision    recall  f1-score   support

        정상보증       0.93      0.97      0.95     10666
        사고보증       0.53      0.30      0.38      1070

    accuracy                           0.91     11736
   macro avg       0.73      0.63      0.67     11736
weighted avg       0.90      0.91      0.90     11736

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=2)
[[9560 1106]
 [ 694  376]]
              precision    recall  f1-score   support

        정상보증       0.93      0.90      0.9

In [9]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf1', rnd_clf), ('rf2', rnd_clf), ('rf3', rnd_clf), ('knn', knn_clf)],
    voting = 'soft', weights=[1,2,2,2,1]
)

voting_clf.fit(X_over1_train,Y_over1_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf1',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf2',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf3',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
          

In [10]:
voting_clf.fit(X_over1_train, Y_over1_train)
y_pred = voting_clf.predict(test_x)
cfmat = confusion_matrix(test_y,y_pred)

print(cfmat)
print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

[[10226   440]
 [  714   356]]
              precision    recall  f1-score   support

        정상보증       0.93      0.96      0.95     10666
        사고보증       0.45      0.33      0.38      1070

    accuracy                           0.90     11736
   macro avg       0.69      0.65      0.66     11736
weighted avg       0.89      0.90      0.90     11736



Adasyn

In [11]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2, algorithm='kd_tree',weights='uniform')
abst_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=100, learning_rate=0.1)

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('knn', knn_clf), ('adaboost', abst_clf)],
    voting = 'soft'
)

voting_clf.fit(X_over2_train,Y_over2_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2)),
                             ('adaboost',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                                                 learning_rate=0.1,
                                                 n_estimators=100))],
                 voting='soft')

In [12]:
for clf in (log_clf, rnd_clf, knn_clf, voting_clf):
    clf.fit(X_over2_train, Y_over2_train)
    y_pred = clf.predict(test_x)
    cfmat = confusion_matrix(test_y,y_pred)
    print(clf)
    print(cfmat)
    print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

LogisticRegression(random_state=42)
[[7034 3632]
 [ 307  763]]
              precision    recall  f1-score   support

        정상보증       0.96      0.66      0.78     10666
        사고보증       0.17      0.71      0.28      1070

    accuracy                           0.66     11736
   macro avg       0.57      0.69      0.53     11736
weighted avg       0.89      0.66      0.74     11736

RandomForestClassifier(max_depth=50, max_features=1, n_estimators=200)
[[10354   312]
 [  750   320]]
              precision    recall  f1-score   support

        정상보증       0.93      0.97      0.95     10666
        사고보증       0.51      0.30      0.38      1070

    accuracy                           0.91     11736
   macro avg       0.72      0.63      0.66     11736
weighted avg       0.89      0.91      0.90     11736

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=2)
[[9351 1315]
 [ 663  407]]
              precision    recall  f1-score   support

        정상보증       0.93      0.88      0.9

In [11]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf1', rnd_clf), ('rf2', rnd_clf), ('rf3', rnd_clf), ('knn', knn_clf)],
    voting = 'soft', weights=[1,2,2,2,1]
)

voting_clf.fit(X_over2_train,Y_over2_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf1',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf2',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf3',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
          

In [12]:
voting_clf.fit(X_over2_train, Y_over2_train)
y_pred = voting_clf.predict(test_x)
cfmat = confusion_matrix(test_y,y_pred)

print(cfmat)
print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

[[10160   506]
 [  718   352]]
              precision    recall  f1-score   support

        정상보증       0.93      0.95      0.94     10666
        사고보증       0.41      0.33      0.37      1070

    accuracy                           0.90     11736
   macro avg       0.67      0.64      0.65     11736
weighted avg       0.89      0.90      0.89     11736



ENN

In [13]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('knn', knn_clf)],
    voting = 'soft', weights=[1,2,1]
)

voting_clf.fit(X_under1_train,Y_under1_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
                 voting='soft', weights=[1, 2, 1])

In [14]:
for clf in (log_clf, rnd_clf, knn_clf, voting_clf):
    clf.fit(X_under1_train, Y_under1_train)
    y_pred = clf.predict(test_x)
    cfmat = confusion_matrix(test_y,y_pred)
    print(clf)
    print(cfmat)
    print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

LogisticRegression(random_state=42)
[[9503 1163]
 [ 675  395]]
              precision    recall  f1-score   support

        정상보증       0.93      0.89      0.91     10666
        사고보증       0.25      0.37      0.30      1070

    accuracy                           0.84     11736
   macro avg       0.59      0.63      0.61     11736
weighted avg       0.87      0.84      0.86     11736

RandomForestClassifier(max_depth=50, max_features=1, n_estimators=200)
[[9876  790]
 [ 615  455]]
              precision    recall  f1-score   support

        정상보증       0.94      0.93      0.93     10666
        사고보증       0.37      0.43      0.39      1070

    accuracy                           0.88     11736
   macro avg       0.65      0.68      0.66     11736
weighted avg       0.89      0.88      0.88     11736

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=2)
[[8926 1740]
 [ 595  475]]
              precision    recall  f1-score   support

        정상보증       0.94      0.84      0.88   

3 RANDOMFOREST, LR, KNN

In [15]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf1', rnd_clf), ('rf2', rnd_clf), ('rf3', rnd_clf), ('knn', knn_clf)],
    voting = 'soft', weights=[1,2,2,2,1]
)

voting_clf.fit(X_under1_train,Y_under1_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf1',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf2',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf3',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
          

In [16]:
voting_clf.fit(X_under1_train, Y_under1_train)
y_pred = voting_clf.predict(test_x)
cfmat = confusion_matrix(test_y,y_pred)

print(cfmat)
print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

[[9691  975]
 [ 578  492]]
              precision    recall  f1-score   support

        정상보증       0.94      0.91      0.93     10666
        사고보증       0.34      0.46      0.39      1070

    accuracy                           0.87     11736
   macro avg       0.64      0.68      0.66     11736
weighted avg       0.89      0.87      0.88     11736



TomekLinks

In [19]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('knn', knn_clf)],
    voting = 'soft', weights=[1,2,1]
)

voting_clf.fit(X_under2_train,Y_under2_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
                 voting='soft', weights=[1, 2, 1])

In [20]:
for clf in (log_clf, rnd_clf, knn_clf, voting_clf):
    clf.fit(X_under2_train, Y_under2_train)
    y_pred = clf.predict(test_x)
    cfmat = confusion_matrix(test_y,y_pred)
    print(clf)
    print(cfmat)
    print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

LogisticRegression(random_state=42)
[[10640    26]
 [ 1055    15]]
              precision    recall  f1-score   support

        정상보증       0.91      1.00      0.95     10666
        사고보증       0.37      0.01      0.03      1070

    accuracy                           0.91     11736
   macro avg       0.64      0.51      0.49     11736
weighted avg       0.86      0.91      0.87     11736

RandomForestClassifier(max_depth=50, max_features=1, n_estimators=200)
[[10643    23]
 [  844   226]]
              precision    recall  f1-score   support

        정상보증       0.93      1.00      0.96     10666
        사고보증       0.91      0.21      0.34      1070

    accuracy                           0.93     11736
   macro avg       0.92      0.60      0.65     11736
weighted avg       0.92      0.93      0.90     11736

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=2)
[[10455   211]
 [  915   155]]
              precision    recall  f1-score   support

        정상보증       0.92      0.98 

In [17]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf1', rnd_clf), ('rf2', rnd_clf), ('rf3', rnd_clf), ('knn', knn_clf)],
    voting = 'soft', weights=[1,2,2,2,1]
)

voting_clf.fit(X_under2_train,Y_under2_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf1',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf2',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf3',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
          

In [18]:
#for clf in (log_clf, rnd_clf, knn_clf, voting_clf):
voting_clf.fit(X_under2_train, Y_under2_train)
y_pred = voting_clf.predict(test_x)
cfmat = confusion_matrix(test_y,y_pred)
#print(clf)
print(cfmat)
print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

[[10641    25]
 [  843   227]]
              precision    recall  f1-score   support

        정상보증       0.93      1.00      0.96     10666
        사고보증       0.90      0.21      0.34      1070

    accuracy                           0.93     11736
   macro avg       0.91      0.60      0.65     11736
weighted avg       0.92      0.93      0.90     11736



SMOTE ENN

In [21]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('knn', knn_clf)],
    voting = 'soft', weights=[1,2,1]
)

voting_clf.fit(X_comb1_train,Y_comb1_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
                 voting='soft', weights=[1, 2, 1])

In [22]:
for clf in (log_clf, rnd_clf, knn_clf, voting_clf):
    clf.fit(X_comb1_train, Y_comb1_train)
    y_pred = clf.predict(test_x)
    cfmat = confusion_matrix(test_y,y_pred)
    print(clf)
    print(cfmat)
    print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

LogisticRegression(random_state=42)
[[5695 4971]
 [ 207  863]]
              precision    recall  f1-score   support

        정상보증       0.96      0.53      0.69     10666
        사고보증       0.15      0.81      0.25      1070

    accuracy                           0.56     11736
   macro avg       0.56      0.67      0.47     11736
weighted avg       0.89      0.56      0.65     11736

RandomForestClassifier(max_depth=50, max_features=1, n_estimators=200)
[[9485 1181]
 [ 575  495]]
              precision    recall  f1-score   support

        정상보증       0.94      0.89      0.92     10666
        사고보증       0.30      0.46      0.36      1070

    accuracy                           0.85     11736
   macro avg       0.62      0.68      0.64     11736
weighted avg       0.88      0.85      0.86     11736

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=2)
[[8095 2571]
 [ 476  594]]
              precision    recall  f1-score   support

        정상보증       0.94      0.76      0.84   

In [23]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf1', rnd_clf), ('rf2', rnd_clf), ('rf3', rnd_clf), ('knn', knn_clf)],
    voting = 'soft'
)

voting_clf.fit(X_comb1_train,Y_comb1_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf1',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf2',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf3',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
          

In [24]:
#for clf in (log_clf, rnd_clf, knn_clf, voting_clf):
voting_clf.fit(X_comb1_train, Y_comb1_train)
y_pred = voting_clf.predict(test_x)
cfmat = confusion_matrix(test_y,y_pred)
#print(clf)
print(cfmat)
print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

[[8530 2136]
 [ 428  642]]
              precision    recall  f1-score   support

        정상보증       0.95      0.80      0.87     10666
        사고보증       0.23      0.60      0.33      1070

    accuracy                           0.78     11736
   macro avg       0.59      0.70      0.60     11736
weighted avg       0.89      0.78      0.82     11736



=> randomforest 3 계층 추가 후, 성능 개선됨!

SMOTE TOMEK

In [19]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('knn', knn_clf)],
    voting = 'soft', weights=[1,2,1]
)

voting_clf.fit(X_comb2_train,Y_comb2_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
                 voting='soft', weights=[1, 2, 1])

In [20]:
for clf in (log_clf, rnd_clf, knn_clf, voting_clf):
    clf.fit(X_comb1_train, Y_comb1_train)
    y_pred = clf.predict(test_x)
    cfmat = confusion_matrix(test_y,y_pred)
    print(clf)
    print(cfmat)
    print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

LogisticRegression(random_state=42)
[[5685 4981]
 [ 206  864]]
              precision    recall  f1-score   support

        정상보증       0.97      0.53      0.69     10666
        사고보증       0.15      0.81      0.25      1070

    accuracy                           0.56     11736
   macro avg       0.56      0.67      0.47     11736
weighted avg       0.89      0.56      0.65     11736

RandomForestClassifier(max_depth=50, max_features=1, n_estimators=200)
[[9445 1221]
 [ 563  507]]
              precision    recall  f1-score   support

        정상보증       0.94      0.89      0.91     10666
        사고보증       0.29      0.47      0.36      1070

    accuracy                           0.85     11736
   macro avg       0.62      0.68      0.64     11736
weighted avg       0.88      0.85      0.86     11736

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=2)
[[8095 2571]
 [ 476  594]]
              precision    recall  f1-score   support

        정상보증       0.94      0.76      0.84   

In [33]:
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf1', rnd_clf), ('rf2', rnd_clf), ('rf3', rnd_clf), ('knn', knn_clf)],
    voting = 'soft'
)

voting_clf.fit(X_comb2_train,Y_comb2_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf1',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf2',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('rf3',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))],
          

In [35]:
voting_clf.fit(X_comb2_train, Y_comb2_train)
y_pred = voting_clf.predict(test_x)
cfmat = confusion_matrix(test_y,y_pred)
print(cfmat)
print(classification_report(test_y, y_pred, target_names=['정상보증', '사고보증']))

[[9921  745]
 [ 663  407]]
              precision    recall  f1-score   support

        정상보증       0.94      0.93      0.93     10666
        사고보증       0.35      0.38      0.37      1070

    accuracy                           0.88     11736
   macro avg       0.65      0.66      0.65     11736
weighted avg       0.88      0.88      0.88     11736

