In [29]:
import pandas as pd 
import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('font', family = 'Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'Malgun Gothic'
import seaborn as sns
import scipy.stats as stats

In [2]:
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [3]:
df_raw = pd.read_csv('data/bank.csv')

In [4]:
df_raw['JOB'].fillna('Other', inplace = True)
df_raw.fillna(df_raw.mean(), inplace = True)

  df_raw.fillna(df_raw.mean(), inplace = True)


In [5]:
df_raw.isnull().sum()

BAD        0
LOAN       0
MORTDUE    0
VALUE      0
REASON     0
JOB        0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
DEBTINC    0
dtype: int64

In [6]:
df_raw_dummy = pd.get_dummies(df_raw)
df_raw_dummy.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,REASON_DebtCon,REASON_HomeImp,JOB_Mgr,JOB_Office,JOB_Other,JOB_ProfExe,JOB_Sales,JOB_Self
0,1,1700,30548,40320.0,9.0,0,0.0,101.466002,1.0,8,37.113614,0,1,0,0,1,0,0,0
1,1,1800,28502,43034.0,11.0,0,0.0,88.76603,0.0,8,36.884894,0,1,0,0,1,0,0,0
2,0,2300,102370,120953.0,2.0,0,0.0,90.992533,0.0,13,31.588503,0,1,0,1,0,0,0,0
3,1,2400,34863,47471.0,12.0,0,0.0,70.49108,1.0,21,38.263601,0,1,1,0,0,0,0,0
4,0,2400,98449,117195.0,4.0,0,0.0,93.811775,0.0,13,29.681827,0,1,0,1,0,0,0,0


In [7]:
df_train, df_test = train_test_split(df_raw_dummy, test_size=0.3, random_state=1234)

print('학습용 데이터의 크기: {}'.format(df_train.shape))
print('평가용 데이터의 크기: {}'.format(df_test.shape))

학습용 데이터의 크기: (2623, 19)
평가용 데이터의 크기: (1125, 19)


In [8]:
df_train_y = df_train['BAD']
df_train_x = df_train.drop(columns = 'BAD', inplace = False)
df_test_y = df_test['BAD']
df_test_x = df_test.drop(columns = 'BAD', inplace = False)

- 모델 생성

In [10]:
log_model = LogisticRegression()
dt_model = DecisionTreeClassifier(max_depth = 6)

vo_model = VotingClassifier(estimators = [('LR', log_model), ('DT', dt_model)], voting = 'soft')

In [15]:
vo_model.fit(df_train_x, df_train_y)
print('보팅 분류기 학습 정확도: {:.3f}'.format(vo_model.score(df_train_x, df_train_y)))

y_pred = vo_model.predict(df_test_x)
print('보팅 분류기 평가 정확도: {:.5f}'.format(accuracy_score(df_test_y, y_pred)))
print(vo_model.score(df_test_x, df_test_y))

보팅 분류기 학습 정확도: 0.941
보팅 분류기 평가 정확도: 0.93244
0.9324444444444444


- 모델 생성

In [21]:
classifiers = [log_model, dt_model]
for classifier in classifiers:
    classifier.fit(df_train_x, df_train_y)
    y_pred = classifier.predict(df_test_x)
    class_name = classifier.__class__.__name__
    print('{0} 평가 정확도 : {1:.4f}'.format(class_name, accuracy_score(df_test_y, y_pred)))
    
print('\n보팅 분류기 평가 지표: \n\n', classification_report(df_test_y, y_pred))

LogisticRegression 평가 정확도 : 0.9058
DecisionTreeClassifier 평가 정확도 : 0.9289

보팅 분류기 평가 지표: 

               precision    recall  f1-score   support

           0       0.93      1.00      0.96      1021
           1       0.85      0.28      0.42       104

    accuracy                           0.93      1125
   macro avg       0.89      0.64      0.69      1125
weighted avg       0.92      0.93      0.91      1125



# 실습

In [22]:
df_tel = pd.read_csv('data/tele_customer.csv')
df_tel.head()

Unnamed: 0,CHURN,CUSTOMER_ID,GENDER,AGE,CHARGE_TYPE,HANDSET,USAGE_BAND,SERVICE_DURATION,DROPPED_CALLS,PEAK_CALLS_NO,PEAK_CALLS_TIME,WEEKEND_CALLS_NO,WEEKEND_CALLS_TIME,TOTAL_CALLS_NO,TOTAL_CALLS_TIME
0,Active,K102990,F,31,CAT 100,ASAD170,Med,33.33,6,218,379.8,9,21.6,366,632.4
1,Active,K103280,M,27,CAT 100,S50,Med,30.87,10,373,656.4,11,28.0,411,810.4
2,Active,K103660,M,27,CAT 100,ASAD170,Med,49.4,11,260,582.0,0,0.0,265,614.1
3,Active,K103730,M,61,CAT 100,ASAD170,Med,48.6,1,294,661.8,14,101.5,326,844.0
4,Active,K104560,F,52,CAT 100,S50,Med,56.03,0,221,555.6,7,25.0,274,673.0


In [23]:
df_tel.isnull().sum()

CHURN                 0
CUSTOMER_ID           0
GENDER                0
AGE                   0
CHARGE_TYPE           0
HANDSET               0
USAGE_BAND            0
SERVICE_DURATION      0
DROPPED_CALLS         0
PEAK_CALLS_NO         0
PEAK_CALLS_TIME       0
WEEKEND_CALLS_NO      0
WEEKEND_CALLS_TIME    0
TOTAL_CALLS_NO        0
TOTAL_CALLS_TIME      0
dtype: int64

In [24]:
df_tel['CHURN'].unique()

array(['Active', 'Churned'], dtype=object)

In [25]:
df_tel['CHURN'].replace({'Active': 0, 'Churned': 1}, inplace = True)

In [26]:
df_tel_dummy = pd.get_dummies(df_tel)
df_tel_dummy

Unnamed: 0,CHURN,AGE,SERVICE_DURATION,DROPPED_CALLS,PEAK_CALLS_NO,PEAK_CALLS_TIME,WEEKEND_CALLS_NO,WEEKEND_CALLS_TIME,TOTAL_CALLS_NO,TOTAL_CALLS_TIME,...,HANDSET_S80,HANDSET_SOP10,HANDSET_SOP20,HANDSET_WC95,USAGE_BAND_High,USAGE_BAND_Low,USAGE_BAND_Med,USAGE_BAND_MedHigh,USAGE_BAND_MedLow,USAGE_BAND_None
0,0,31,33.33,6,218,379.8,9,21.6,366,632.4,...,0,0,0,0,0,0,1,0,0,0
1,0,27,30.87,10,373,656.4,11,28.0,411,810.4,...,0,0,0,0,0,0,1,0,0,0
2,0,27,49.40,11,260,582.0,0,0.0,265,614.1,...,0,0,0,0,0,0,1,0,0,0
3,0,61,48.60,1,294,661.8,14,101.5,326,844.0,...,0,0,0,0,0,0,1,0,0,0
4,0,52,56.03,0,221,555.6,7,25.0,274,673.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
925,1,19,17.40,12,60,100.8,27,59.4,127,628.2,...,0,1,0,0,0,0,1,0,0,0
926,1,59,17.37,15,82,130.2,12,25.8,357,650.7,...,0,0,1,0,0,0,1,0,0,0
927,1,33,44.93,12,174,286.2,25,67.0,596,1039.0,...,0,0,0,0,0,0,1,0,0,0
928,1,25,40.43,14,315,552.0,0,0.0,609,1188.3,...,0,0,0,0,0,0,1,0,0,0


- 데이터 분리

In [27]:
df_tel_y = df_tel_dummy['CHURN']
df_tel_x = df_tel_dummy.drop(columns = 'CHURN')

In [28]:
df_train_x, df_test_x, df_train_y, df_train_y = train_test_split(df_tel_x, df_tel_y, test_size=0.2, random_state=1234)

In [None]:
log_model = LogisticRegression()
dt_model = DecisionTreeClassifier(max_depth = 6)

vo_model = VotingClassifier(estimators = [('LR', log_model), ('DT', dt_model)], voting = 'soft')