In [26]:
import pandas as pd

In [27]:
bank_df = pd.read_csv('../rawdata/bank_prep.csv')

In [28]:
bank_df.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,dec,feb,jan,jul,jun,mar,may,nov,oct,sep
0,58,0,2143,1,0,5,261,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
1,44,0,29,1,0,5,151,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
2,33,0,2,1,1,5,76,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
3,47,0,1506,1,0,5,92,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
4,33,0,1,0,0,5,198,1,-1,0,...,0,0,0,0,0,0,1,0,0,0


In [29]:
# yes와 no는 불균형 상태 
bank_df['housing'].value_counts()

1    25130
0    20081
Name: housing, dtype: int64

In [30]:
bank_df['default'].value_counts()

0    44396
1      815
Name: default, dtype: int64

In [31]:
bank_df['loan'].value_counts()

0    37967
1     7244
Name: loan, dtype: int64

In [32]:
!pip show imbalanced-learn

Name: imbalanced-learn
Version: 0.8.0
Summary: Toolbox for imbalanced dataset in machine learning.
Home-page: https://github.com/scikit-learn-contrib/imbalanced-learn
Author: None
Author-email: None
License: MIT
Location: c:\python39\lib\site-packages
Requires: scikit-learn, numpy, scipy, joblib
Required-by: 


In [33]:
!pip install imbalanced-learn



You should consider upgrading via the 'c:\python39\python.exe -m pip install --upgrade pip' command.


In [46]:
# 언더샘플링
import numpy as np 
from imblearn.under_sampling import RandomUnderSampler

X = np.array(bank_df.drop('y', axis=1))
Y = np.array(bank_df['y'])
print(np.sum(Y==1), np.sum(Y==0))

sampler = RandomUnderSampler(random_state=42)
X, Y = sampler.fit_resample(X, Y)
print(np.sum(Y == 1), np.sum(Y == 0))

5289 39922
5289 5289


In [47]:
# 오버샘플링
import numpy as np 
from imblearn.over_sampling import RandomOverSampler

X = np.array(bank_df.drop('y', axis=1))
Y = np.array(bank_df['y'])
print(np.sum(Y == 1), np.sum(Y == 0))

sampler = RandomOverSampler(random_state=42)
X, Y = sampler.fit_resample(X, Y)
print(np.sum(Y==1), np.sum(Y==0))

5289 39922
39922 39922


In [48]:
# SMOTE
import numpy as np 
from imblearn.over_sampling import SMOTE 

X = np.array(bank_df.drop('y', axis=1))
Y = np.array(bank_df['y'])
print(np.sum(Y == 1), np.sum(Y == 0))

sm = SMOTE(random_state=42)
X, Y = sm.fit_resample(X, Y)
print(np.sum(Y==1), np.sum(Y==0))

5289 39922
39922 39922


In [54]:
# 결정트리
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10, shuffle=True)
scores = []

for train_id, test_id in kf.split(X):
    x = X[train_id]
    y = Y[train_id]
    clf = DecisionTreeClassifier()
    clf.fit(x, y)
    pred_y = clf.predict(X[test_id])
    score = accuracy_score(Y[test_id], pred_y)
    scores.append(score)
    
scores = np.array(scores)
print(scores.mean(), scores.std())

0.9249034920184765 0.002552270957002199


In [55]:
# precision, recall
from sklearn.metrics import recall_score, precision_score

print(recall_score(Y[test_id], pred_y))
print(precision_score(Y[test_id], pred_y))

0.92875
0.9195544554455446


In [56]:
print(clf)

DecisionTreeClassifier()


In [57]:
# GridSearch
from sklearn.model_selection import GridSearchCV

params = {
    'criterion' : ['entropy'],
    'max_depth' : [2, 4, 6, 8, 10],
    'min_samples_leaf' : [10, 20, 30, 40, 50]
}

clf_gs = GridSearchCV(DecisionTreeClassifier(), params,
                     cv = KFold(n_splits=10, shuffle=True), scoring='accuracy')
clf_gs.fit(X, Y)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
             estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy'],
                         'max_depth': [2, 4, 6, 8, 10],
                         'min_samples_leaf': [10, 20, 30, 40, 50]},
             scoring='accuracy')

In [58]:
print(clf_gs.best_score_)
print(clf_gs.best_params_)

0.915635543158954
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 10}


In [61]:
clf_best = DecisionTreeClassifier(criterion='entropy', max_depth= 10, min_samples_leaf=10)
clf_best.fit(X, Y)

print(clf_best.feature_importances_)

[5.30149730e-03 0.00000000e+00 8.40474320e-03 1.20219584e-01
 2.58812030e-03 2.10789409e-02 3.40083878e-01 4.46287372e-03
 5.73725480e-02 3.22550273e-04 1.17659859e-02 7.07698599e-03
 7.06740159e-03 7.46999333e-04 1.98760064e-03 1.13526530e-03
 3.76232990e-04 9.22014225e-03 8.00784146e-03 1.18245735e-01
 1.61435748e-02 5.58667357e-02 0.00000000e+00 3.19931370e-02
 2.84430928e-02 6.71802614e-02 5.73771331e-03 0.00000000e+00
 2.56573137e-02 4.35132453e-02 0.00000000e+00 0.00000000e+00]


In [63]:
from sklearn.feature_selection import SelectKBest

selector = SelectKBest(k=5)
selector.fit(X, Y)
mask = selector.get_support()

print(bank_df.drop('y', axis=1).columns)
print(mask)

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration',
       'campaign', 'pdays', 'previous', 'divorced', 'married', 'single',
       'primary', 'secondary', 'tertiary', 'unknown', 'cellular', 'telephone',
       'unknown.1', 'apr', 'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar',
       'may', 'nov', 'oct', 'sep'],
      dtype='object')
[False False False  True False False  True False False False False  True
 False False False False False False False  True False False False False
 False False False False  True False False False]


In [71]:
mask[1]

False

In [75]:
# duration은 공선성 조심
bank_df.drop('y', axis=1).columns[[i for i, x in enumerate(mask) if x]]

Index(['housing', 'duration', 'married', 'unknown.1', 'may'], dtype='object')