In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Load the pre-processed dataset. The file name suggests it is the output of a
# RobustScaler step -- TODO confirm against the preprocessing notebook.
# The file is read as cp949 (a Korean code page); several column names are Korean.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved DataFrame index; it is not used as a feature below.
csv_path = "DATA/Robust_Scaler_result.csv"
data = pd.read_csv(csv_path, encoding="cp949")

# Last expression of the cell: render the first five rows.
data.head()

Unnamed: 0,CUSTNO,GNO,CBSCORE,CBSCOREGRD,CREDITOTAMT,YSALEAMT,ESTMM,ASSETAMT,IMSAAMT,IMJUAMT,...,환율,GDP,소상공인체감지수,실업률,물가지수,국고채,금리,유가등락률,소비자심리지수,ONEHOT
0,475821,l180202101898,1.015873,1,0.230769,10.743169,0.014085,10,0.0,0.0,...,0.009592,1.885958,51.4,0.0,1.724868,-1.25641,-4.0,1.7506,0.798246,3
1,74417,l230201700120,0.206349,2,0.205128,6.027322,-0.380282,25,-0.5,25.0,...,0.53717,-0.901805,53.8,0.666667,-0.640212,-0.307692,-1.0,1.100719,-0.596491,8
2,387787,l110201603233,0.396825,1,-0.282051,3.775956,2.098592,10,0.0,0.0,...,0.978417,-1.182547,72.6,0.0,-1.417989,-0.461538,-1.0,0.110312,-0.482456,7
3,395418,l230201700254,0.0,2,0.076923,3.562842,-0.352113,10,0.0,0.0,...,0.520384,-0.901805,76.2,-0.333333,-0.677249,-0.205128,-1.0,0.429257,-0.394737,3
4,190372,l200201601418,-2.47619,6,-0.102564,1.437158,1.84507,80,-0.4,0.0,...,0.446043,-1.357134,63.2,-0.333333,-1.867725,-1.461538,-1.0,-0.618705,-0.017544,7


In [3]:
# Columns used as model inputs (22 in total), kept in their original order.
# NOTE(review): 'ONEHOT' holds small integers in the preview and is passed to
# the models as a plain numeric column -- confirm that this is intended.
feature_columns = [
    'CBSCORE', 'CREDITOTAMT', 'YSALEAMT', 'ESTMM',
    'IMSAAMT', 'IMJUAMT', 'BUSAAMT', 'BUJUAMT',
    'BU1TOTAMT', 'GAMT', 'LABORCNT', 'KOSPI',
    '환율', 'GDP', '소상공인체감지수', '실업률',
    '물가지수', '국고채', '금리', '유가등락률',
    '소비자심리지수', 'ONEHOT',
]
features = data[feature_columns]

# Target labels for the classifiers.
act = data['ACTCD']

In [4]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split, stratified on the target so that both partitions keep
# the class proportions of `act`; the seed is fixed so the split is repeatable.
split_parts = train_test_split(
    features,
    act,
    train_size=0.7,
    test_size=0.3,
    stratify=act,
    random_state=1,
)
train_x, test_x, train_y, test_y = split_parts

# Show the shape of each partition in the order train_x, test_x, train_y, test_y.
print(*(part.shape for part in split_parts))

(27381, 22) (11736, 22) (27381,) (11736,)


In [5]:
import statsmodels.api as sm
from imblearn.over_sampling import BorderlineSMOTE

# Oversample the training split only, so the test split stays untouched.
# The sampler gets its own name (`smote`): binding it to `sm` would overwrite
# the `statsmodels.api` alias imported above and break any later `sm.<...>`
# statsmodels call.
smote = BorderlineSMOTE(random_state=42)
over_train_x, over_train_y = smote.fit_resample(train_x, train_y)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Base learners for the ensemble.
log_clf = LogisticRegression(random_state = 42, solver = 'lbfgs', max_iter = 3000)
# random_state is fixed here as well: the forest is otherwise the only unseeded
# stochastic step in the notebook (the split, SMOTE and logistic regression are
# all seeded), so its metrics would change on every re-run.
# max_features=1 makes each split consider a single candidate feature.
rnd_clf = RandomForestClassifier(n_estimators=200,max_depth=50,max_features=1,
                                 random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors= 2,algorithm='kd_tree',weights='uniform')

# Hard voting: the ensemble predicts the class label chosen by the majority of
# the three base learners.
voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('knn', knn_clf)],
    voting = 'hard'
)

# Fit on the oversampled training data; as the last expression of the cell,
# the fitted estimator is also what gets displayed.
voting_clf.fit(over_train_x,over_train_y)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(max_iter=3000,
                                                 random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=50,
                                                     max_features=1,
                                                     n_estimators=200)),
                             ('knn',
                              KNeighborsClassifier(algorithm='kd_tree',
                                                   n_neighbors=2))])

In [7]:
from sklearn.metrics import classification_report, confusion_matrix

# Display names for the two classes, in the order classification_report
# assigns them (sorted label order).
# NOTE(review): assumes the smaller ACTCD label is the "정상보증" class -- confirm.
label_names = ['정상보증', '사고보증']

# Fit every base learner and the voting ensemble on the oversampled training
# data, then score each one on the original (non-resampled) test split.
for clf in (log_clf, rnd_clf, knn_clf, voting_clf):
    y_pred = clf.fit(over_train_x, over_train_y).predict(test_x)
    cfmat = confusion_matrix(test_y, y_pred)
    report = classification_report(test_y, y_pred, target_names=label_names)
    # One line-separated print: estimator repr, confusion matrix, report.
    print(clf, cfmat, report, sep="\n")

LogisticRegression(max_iter=3000, random_state=42)
[[7383 3283]
 [ 365  705]]
              precision    recall  f1-score   support

        정상보증       0.95      0.69      0.80     10666
        사고보증       0.18      0.66      0.28      1070

    accuracy                           0.69     11736
   macro avg       0.56      0.68      0.54     11736
weighted avg       0.88      0.69      0.75     11736

RandomForestClassifier(max_depth=50, max_features=1, n_estimators=200)
[[10414   252]
 [  773   297]]
              precision    recall  f1-score   support

        정상보증       0.93      0.98      0.95     10666
        사고보증       0.54      0.28      0.37      1070

    accuracy                           0.91     11736
   macro avg       0.74      0.63      0.66     11736
weighted avg       0.90      0.91      0.90     11736

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=2)
[[9593 1073]
 [ 719  351]]
              precision    recall  f1-score   support

        정상보증       0.93    