In [1]:
import pandas as pd 
import numpy as np

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Read the raw dataset from the CSV file and preview its first five rows.
df_raw = pd.read_csv(filepath_or_buffer='data/bank.csv')
df_raw.head(n=5)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1700,30548,40320.0,HomeImp,Other,9.0,0,0.0,101.466002,1.0,8,37.113614
1,1,1800,28502,43034.0,HomeImp,Other,11.0,0,0.0,88.76603,0.0,8,36.884894
2,0,2300,102370,120953.0,HomeImp,Office,2.0,0,0.0,90.992533,0.0,13,31.588503
3,1,2400,34863,47471.0,HomeImp,Mgr,12.0,0,0.0,70.49108,1.0,21,38.263601
4,0,2400,98449,117195.0,HomeImp,Office,4.0,0,0.0,93.811775,0.0,13,29.681827


In [3]:
# Number of missing values in each column of the raw frame.
df_raw.isna().sum(axis=0)

BAD          0
LOAN         0
MORTDUE      0
VALUE       14
REASON       0
JOB         81
YOJ        205
DEROG        0
DELINQ      30
CLAGE       37
NINQ        56
CLNO         0
DEBTINC      0
dtype: int64

In [4]:
# Keep only the rows that have no missing value in any column.
df_raw_t = df_raw.dropna(axis='index', how='any')

In [5]:
# Create dummy (one-hot) columns for the string-typed variables;
# the remaining columns are carried over as they are.
df_raw_dummy = pd.get_dummies(data=df_raw_t)
df_raw_dummy.head(n=5)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,REASON_DebtCon,REASON_HomeImp,JOB_Mgr,JOB_Office,JOB_Other,JOB_ProfExe,JOB_Sales,JOB_Self
0,1,1700,30548,40320.0,9.0,0,0.0,101.466002,1.0,8,37.113614,0,1,0,0,1,0,0,0
1,1,1800,28502,43034.0,11.0,0,0.0,88.76603,0.0,8,36.884894,0,1,0,0,1,0,0,0
2,0,2300,102370,120953.0,2.0,0,0.0,90.992533,0.0,13,31.588503,0,1,0,1,0,0,0,0
3,1,2400,34863,47471.0,12.0,0,0.0,70.49108,1.0,21,38.263601,0,1,1,0,0,0,0,0
4,0,2400,98449,117195.0,4.0,0,0.0,93.811775,0.0,13,29.681827,0,1,0,1,0,0,0,0


In [6]:
# Separate the explanatory variables (x) from the target (y):
# x is every column except 'BAD', y is the 'BAD' column itself.
df_raw_x = df_raw_dummy.drop(labels='BAD', axis='columns')
df_raw_y = df_raw_dummy['BAD']

###### 오버 샘플링 데이터 생성하기

In [7]:
# Oversample with SMOTE using 5 nearest neighbours and a fixed seed.
# NOTE(review): df_raw_x contains one-hot dummy columns; SMOTE presumably
# interpolates between neighbours, so synthetic rows may get non-binary dummy
# values -- consider SMOTENC; confirm against the imbalanced-learn docs.
smote = SMOTE(random_state=0, k_neighbors=5)
df_raw_x_over, df_raw_y_over = smote.fit_resample(df_raw_x, df_raw_y)

# Report the feature-matrix shape and the class counts before and after resampling.
for stage_msg, stage_x, stage_y in (
    ('SMOTE 적용 전 데이터 세트: ', df_raw_x, df_raw_y),
    ('SMOTE 적용 후 데이터 세트: ', df_raw_x_over, df_raw_y_over),
):
    print(stage_msg, stage_x.shape, stage_y.value_counts())

SMOTE 적용 전 데이터 세트:  (3364, 18) 0    3064
1     300
Name: BAD, dtype: int64
SMOTE 적용 후 데이터 세트:  (6128, 18) 1    3064
0    3064
Name: BAD, dtype: int64


###### 언더 샘플링 데이터 생성하기

In [8]:
# Undersample with RandomUnderSampler using a fixed seed.
down = RandomUnderSampler(random_state=1234)
df_raw_x_down, df_raw_y_down = down.fit_resample(df_raw_x, df_raw_y)

# Report the feature-matrix shape and the class counts before and after resampling.
for stage_msg, stage_x, stage_y in (
    ('언더샘플링 적용 전 데이터 세트: ', df_raw_x, df_raw_y),
    ('언더샘플링 적용 후 데이터 세트: ', df_raw_x_down, df_raw_y_down),
):
    print(stage_msg, stage_x.shape, stage_y.value_counts())

언더샘플링 적용 전 데이터 세트:  (3364, 18) 0    3064
1     300
Name: BAD, dtype: int64
언더샘플링 적용 후 데이터 세트:  (600, 18) 0    300
1    300
Name: BAD, dtype: int64


In [9]:
# Column means of the raw frame, restricted to numeric columns.
# numeric_only=True is passed explicitly because REASON and JOB hold strings:
# relying on pandas to drop such columns silently is deprecated and raises a
# TypeError on pandas >= 2.0.
df_raw.mean(numeric_only=True)

  df_raw.mean()


BAD             0.092049
LOAN        19483.564568
MORTDUE     75117.309232
VALUE      106791.045795
YOJ             9.130398
DEROG           0.161153
DELINQ          0.269769
CLAGE         180.873315
NINQ            1.028982
CLNO           21.649680
DEBTINC        34.130636
dtype: float64