In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

data = pd.read_csv("MinMax_Scaler_result.csv",encoding="cp949")
data.head()

Unnamed: 0,CUSTNO,GNO,CBSCORE,CBSCOREGRD,CREDITOTAMT,YSALEAMT,ESTMM,ASSETAMT,IMSAAMT,IMJUAMT,...,KIND_JOB_K,KIND_JOB_L,KIND_JOB_M,KIND_JOB_N,KIND_JOB_O,KIND_JOB_P,KIND_JOB_Q,KIND_JOB_R,KIND_JOB_S,KIND_JOB_T
0,475821,l180202101898,0.94929,0.0,0.002338,0.223956,0.058085,0.000768,0.004153,0.0,...,0,0,0,0,0,0,0,0,0,0
1,74417,l230201700120,0.897566,0.166667,0.002221,0.129121,0.014129,0.00192,0.0,0.015489,...,0,0,0,0,0,0,0,0,0,0
2,387787,l110201603233,0.909736,0.0,0.0,0.083846,0.290424,0.000768,0.004153,0.0,...,0,0,0,0,0,0,0,0,0,0
3,395418,l230201700254,0.884381,0.166667,0.001637,0.07956,0.017268,0.000768,0.004153,0.0,...,0,0,0,0,0,0,0,0,0,0
4,190372,l200201601418,0.726166,0.833333,0.000818,0.036813,0.262166,0.006143,0.000831,0.0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
features = data[['CBSCORE', 'CREDITOTAMT', 'YSALEAMT',
       'ESTMM',  'IMSAAMT', 'IMJUAMT', 'BUJUAMT',
       'BU1TOTAMT', 'GAMT', 'LABORCNT', 'KOSPI', '환율', 'GDP', 
       '실업률', '물가지수', '국고채', '금리', '유가등락률',
       '소비자심리지수','KIND_JOB_A', 'KIND_JOB_B', 'KIND_JOB_C', 'KIND_JOB_D',
       'KIND_JOB_E', 'KIND_JOB_F', 'KIND_JOB_G', 'KIND_JOB_H', 'KIND_JOB_I',
       'KIND_JOB_J', 'KIND_JOB_K', 'KIND_JOB_L', 'KIND_JOB_M', 'KIND_JOB_N',
       'KIND_JOB_O', 'KIND_JOB_P', 'KIND_JOB_Q', 'KIND_JOB_R', 'KIND_JOB_S',
       'KIND_JOB_T']]
act = data['ACTCD']

In [3]:
from sklearn.model_selection import train_test_split

train_x, test_x,train_y, test_y = train_test_split(features, act,
stratify=act,train_size=0.7,test_size=0.3,random_state=1)

print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(27381, 39) (11736, 39) (27381,) (11736,)


In [4]:
from imblearn.over_sampling import BorderlineSMOTE, ADASYN
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks
from imblearn.combine import SMOTEENN, SMOTETomek

#언더 샘플링
enn = EditedNearestNeighbours(kind_sel="all", n_neighbors=10)
tomekl = TomekLinks()

#오버 샘플링
bsmote = BorderlineSMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

#혼합 샘플링
smotee = SMOTEENN(random_state=42)
smoteT = SMOTETomek(random_state=42)


X_under1_train, Y_under1_train = enn.fit_resample(train_x, train_y)
X_under2_train, Y_under2_train = tomekl.fit_resample(train_x, train_y)

X_over1_train, Y_over1_train = bsmote.fit_resample(train_x,train_y)
X_over2_train, Y_over2_train = adasyn.fit_resample(train_x,train_y)

X_comb1_train, Y_comb1_train = smotee.fit_resample(train_x, train_y)
X_comb2_train, Y_comb2_train = smoteT.fit_resample(train_x, train_y)

### 아래 분석 결과, Logistic Regression 하이퍼 파라미터: C와 max_iter 조정 결과 변화 없음

## 오버샘플링

1. BorderlineSMOTE

In [5]:
# 임계값 함수

def cut_off(y, threshold) :
    Y = y.copy()  # 대문자 Y를 새로운 변수로 하여 기존의 y값에 영향이 가지 않도록 한다.
    Y[Y>threshold] = 1
    Y[Y<threshold] = 0
    return Y.astype(int)

from sklearn.metrics import confusion_matrix
# confusion matrix accuracy(정확도) 계산함수

def acc(cfmat):
    return (cfmat[0,0] + cfmat[1,1])/(cfmat[0,0] + cfmat[1,1] + cfmat[0,1] + cfmat[1,0])

In [6]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score

anal_result = []
m = [1500, 2000, 2500, 3000]
c = [0.01, 0.1, 1, 10, 100]

for i in c:
    model = LogisticRegression(random_state = 42, solver = 'lbfgs', max_iter = 1000, C = i)
    results = model.fit(X_over1_train,Y_over1_train)
    
    #로지스틱 모형 적합
    #model = sm.Logit(Y_over1_train,X_over1_train)
    #results = model.fit()

    pred_y = results.predict(test_x)
    pred_Y = cut_off(pred_y, 0.5)

    cfmat = confusion_matrix(test_y,pred_Y)
    accuracy = acc(cfmat)

    pr = precision_score(test_y,pred_Y)
    r = recall_score(test_y,pred_Y)
    f1= f1_score(test_y,pred_Y)
    auc_score = roc_auc_score(test_y,pred_y)
    
    anal_result.append((i, accuracy, cfmat, pr, r, f1, auc_score))

In [7]:
print(anal_result)

[(0.01, 0.6511588275391956, array([[6930, 3736],
       [ 358,  712]], dtype=int64), 0.16007194244604317, 0.6654205607476635, 0.2580645161290323, 0.657574334377207), (0.1, 0.684986366734833, array([[7327, 3339],
       [ 358,  712]], dtype=int64), 0.17575907183411504, 0.6654205607476635, 0.2780706893184925, 0.6761848725358419), (1, 0.6999829584185412, array([[7500, 3166],
       [ 355,  715]], dtype=int64), 0.1842308683329039, 0.6682242990654206, 0.2888305392849929, 0.6856966235623372), (10, 0.701346284935242, array([[7510, 3156],
       [ 349,  721]], dtype=int64), 0.1859685323703895, 0.6738317757009346, 0.2914897917930059, 0.6889691411788004), (100, 0.7012610770279482, array([[7508, 3158],
       [ 348,  722]], dtype=int64), 0.18608247422680413, 0.6747663551401869, 0.2917171717171717, 0.6893426750386853)]


In [8]:
print(anal_result)

[(0.01, 0.6511588275391956, array([[6930, 3736],
       [ 358,  712]], dtype=int64), 0.16007194244604317, 0.6654205607476635, 0.2580645161290323, 0.657574334377207), (0.1, 0.684986366734833, array([[7327, 3339],
       [ 358,  712]], dtype=int64), 0.17575907183411504, 0.6654205607476635, 0.2780706893184925, 0.6761848725358419), (1, 0.6999829584185412, array([[7500, 3166],
       [ 355,  715]], dtype=int64), 0.1842308683329039, 0.6682242990654206, 0.2888305392849929, 0.6856966235623372), (10, 0.701346284935242, array([[7510, 3156],
       [ 349,  721]], dtype=int64), 0.1859685323703895, 0.6738317757009346, 0.2914897917930059, 0.6889691411788004), (100, 0.7012610770279482, array([[7508, 3158],
       [ 348,  722]], dtype=int64), 0.18608247422680413, 0.6747663551401869, 0.2917171717171717, 0.6893426750386853)]


2. ADASYN

In [9]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score

anal_result = []
m = [1500, 2000, 2500, 3000]
c = [0.01, 0.1, 1, 10, 100]

for i in m:
    model = LogisticRegression(random_state = 42, solver = 'lbfgs', max_iter = i)
    results = model.fit(X_over2_train,Y_over2_train)
    
    #로지스틱 모형 적합
    #model = sm.Logit(Y_over2_train,X_over2_train)
    #results = model.fit()

    pred_y = results.predict(test_x)
    pred_Y = cut_off(pred_y, 0.5)

    cfmat = confusion_matrix(test_y,pred_Y)
    accuracy = acc(cfmat)

    pr = precision_score(test_y,pred_Y)
    r = recall_score(test_y,pred_Y)
    f1= f1_score(test_y,pred_Y)
    auc_score = roc_auc_score(test_y,pred_y)
    
    anal_result.append((i, accuracy, cfmat, pr, r, f1, auc_score))

In [10]:
print(anal_result)

[(1500, 0.6664962508520791, array([[7074, 3592],
       [ 322,  748]], dtype=int64), 0.17235023041474654, 0.6990654205607477, 0.2765249537892791, 0.6811471861851178), (2000, 0.6664962508520791, array([[7074, 3592],
       [ 322,  748]], dtype=int64), 0.17235023041474654, 0.6990654205607477, 0.2765249537892791, 0.6811471861851178), (2500, 0.6664962508520791, array([[7074, 3592],
       [ 322,  748]], dtype=int64), 0.17235023041474654, 0.6990654205607477, 0.2765249537892791, 0.6811471861851178), (3000, 0.6664962508520791, array([[7074, 3592],
       [ 322,  748]], dtype=int64), 0.17235023041474654, 0.6990654205607477, 0.2765249537892791, 0.6811471861851178)]


## 언더샘플링

1. EditedNearestNeighbours

In [11]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score

anal_result = []
m = [1500, 2000, 2500, 3000]
c = [0.01, 0.1, 1, 10, 100]

for i in m:
    model = LogisticRegression(random_state = 42, solver = 'lbfgs', max_iter = i)
    results = model.fit(X_under1_train,Y_under1_train)
    
    #로지스틱 모형 적합
    #model = sm.Logit(Y_under1_train,X_under1_train)
    #results = model.fit()

    pred_y = results.predict(test_x)
    pred_Y = cut_off(pred_y, 0.5)

    cfmat = confusion_matrix(test_y,pred_Y)
    accuracy = acc(cfmat)

    pr = precision_score(test_y,pred_Y)
    r = recall_score(test_y,pred_Y)
    f1= f1_score(test_y,pred_Y)
    auc_score = roc_auc_score(test_y,pred_y)
    
    anal_result.append((i, accuracy, cfmat, pr, r, f1, auc_score))

In [12]:
print(anal_result)

[(1500, 0.8602590320381731, array([[9760,  906],
       [ 734,  336]], dtype=int64), 0.27053140096618356, 0.31401869158878504, 0.2906574394463668, 0.6145379413316135), (2000, 0.8602590320381731, array([[9760,  906],
       [ 734,  336]], dtype=int64), 0.27053140096618356, 0.31401869158878504, 0.2906574394463668, 0.6145379413316135), (2500, 0.8602590320381731, array([[9760,  906],
       [ 734,  336]], dtype=int64), 0.27053140096618356, 0.31401869158878504, 0.2906574394463668, 0.6145379413316135), (3000, 0.8602590320381731, array([[9760,  906],
       [ 734,  336]], dtype=int64), 0.27053140096618356, 0.31401869158878504, 0.2906574394463668, 0.6145379413316135)]


2. Borderline SMOTE

In [13]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score

anal_result = []
m = [1500, 2000, 2500, 3000]
c = [0.01, 0.1, 1, 10, 100]

for i in m:
    model = LogisticRegression(random_state = 42, solver = 'lbfgs', max_iter = i)
    results = model.fit(X_under2_train,Y_under2_train)
    
    #로지스틱 모형 적합
    #model = sm.Logit(Y_under2_train,X_under2_train)
    #results = model.fit()

    pred_y = results.predict(test_x)
    pred_Y = cut_off(pred_y, 0.5)

    cfmat = confusion_matrix(test_y,pred_Y)
    accuracy = acc(cfmat)

    pr = precision_score(test_y,pred_Y)
    r = recall_score(test_y,pred_Y)
    f1= f1_score(test_y,pred_Y)
    auc_score = roc_auc_score(test_y,pred_y)
    
    anal_result.append((i, accuracy, cfmat, pr, r, f1, auc_score))

In [14]:
print(anal_result)

[(1500, 0.9083162917518746, array([[10655,    11],
       [ 1065,     5]], dtype=int64), 0.3125, 0.004672897196261682, 0.009208103130755063, 0.501820791369554), (2000, 0.9083162917518746, array([[10655,    11],
       [ 1065,     5]], dtype=int64), 0.3125, 0.004672897196261682, 0.009208103130755063, 0.501820791369554), (2500, 0.9083162917518746, array([[10655,    11],
       [ 1065,     5]], dtype=int64), 0.3125, 0.004672897196261682, 0.009208103130755063, 0.501820791369554), (3000, 0.9083162917518746, array([[10655,    11],
       [ 1065,     5]], dtype=int64), 0.3125, 0.004672897196261682, 0.009208103130755063, 0.501820791369554)]


# 혼합 샘플링

1. SMOTEENN

In [15]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score

anal_result = []
m = [1500, 2000, 2500, 3000]
c = [0.01, 0.1, 1, 10, 100]

for i in m:
    model = LogisticRegression(random_state = 42, solver = 'lbfgs', max_iter = i)
    results = model.fit(X_comb1_train,Y_comb1_train)
    
    #로지스틱 모형 적합
    #model = sm.Logit(Y_comb1_train,X_comb1_train)
    #results = model.fit()

    pred_y = results.predict(test_x)
    pred_Y = cut_off(pred_y, 0.5)

    cfmat = confusion_matrix(test_y,pred_Y)
    accuracy = acc(cfmat)

    pr = precision_score(test_y,pred_Y)
    r = recall_score(test_y,pred_Y)
    f1= f1_score(test_y,pred_Y)
    auc_score = roc_auc_score(test_y,pred_y)
    
    anal_result.append((i, accuracy, cfmat, pr, r, f1, auc_score))

In [16]:
print(anal_result)

[(1500, 0.5613496932515337, array([[5726, 4940],
       [ 208,  862]], dtype=int64), 0.14856945880730782, 0.805607476635514, 0.2508731082654249, 0.6712267647569095), (2000, 0.5613496932515337, array([[5726, 4940],
       [ 208,  862]], dtype=int64), 0.14856945880730782, 0.805607476635514, 0.2508731082654249, 0.6712267647569095), (2500, 0.5613496932515337, array([[5726, 4940],
       [ 208,  862]], dtype=int64), 0.14856945880730782, 0.805607476635514, 0.2508731082654249, 0.6712267647569095), (3000, 0.5613496932515337, array([[5726, 4940],
       [ 208,  862]], dtype=int64), 0.14856945880730782, 0.805607476635514, 0.2508731082654249, 0.6712267647569095)]


2. SMOTE Tomek

In [17]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score

anal_result = []
m = [1500, 2000, 2500, 3000]
c = [0.01, 0.1, 1, 10, 100]

for i in m:
    model = LogisticRegression(random_state = 42, solver = 'lbfgs', max_iter = i)
    results = model.fit(X_comb2_train,Y_comb2_train)
    
    #로지스틱 모형 적합
   # model = sm.Logit(Y_comb2_train,X_comb2_train)
    #results = model.fit()

    pred_y = results.predict(test_x)
    pred_Y = cut_off(pred_y, 0.5)

    cfmat = confusion_matrix(test_y,pred_Y)
    accuracy = acc(cfmat)

    pr = precision_score(test_y,pred_Y)
    r = recall_score(test_y,pred_Y)
    f1= f1_score(test_y,pred_Y)
    auc_score = roc_auc_score(test_y,pred_y)
    
    anal_result.append((i, accuracy, cfmat, pr, r, f1, auc_score))

In [18]:
sorted(anal_result,key=lambda x:x[4],reverse=True)

[(1500,
  0.6796182685753238,
  array([[7231, 3435],
         [ 325,  745]], dtype=int64),
  0.17822966507177032,
  0.6962616822429907,
  0.2838095238095238,
  0.6871051520159263),
 (2000,
  0.6796182685753238,
  array([[7231, 3435],
         [ 325,  745]], dtype=int64),
  0.17822966507177032,
  0.6962616822429907,
  0.2838095238095238,
  0.6871051520159263),
 (2500,
  0.6796182685753238,
  array([[7231, 3435],
         [ 325,  745]], dtype=int64),
  0.17822966507177032,
  0.6962616822429907,
  0.2838095238095238,
  0.6871051520159263),
 (3000,
  0.6796182685753238,
  array([[7231, 3435],
         [ 325,  745]], dtype=int64),
  0.17822966507177032,
  0.6962616822429907,
  0.2838095238095238,
  0.6871051520159263)]