## 연습문제. 모델 성능평가

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import warnings
warnings.filterwarnings(action='ignore')

### 문제.

타이타닉 데이터에 결정트리와 로지스틱 회귀를 적용하여 생존자를 예측하고, 앞에서 작성한 print_eval_score()로 예측 성능을 출력하여 어떤 모델이 더 좋은 결과를 냈는지 비교하시오.

- 데이터 전처리에서 범주형 데이터는 원-핫 인코딩을 사용한 경우와 라벨 인코딩을 사용한 경우로 나누어 각각 모델을 학습하고 예측한 결과를 비교하시오.

    1. [#Case1] 모델: 로지스틱회귀, 전처리 : 라벨인코딩 적용
    2. [#Case2] 모델: 로지스틱회귀, 전처리 : 원핫인코딩 적용
    3. [#Case3] 모델: 결정트리, 전처리 : 라벨인코딩 적용
    4. [#Case4] 모델: 결정트리, 전처리 : 원핫인코딩 적용


- 모듈 임포트

In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, classification_report
from sklearn.metrics import roc_curve, precision_recall_curve

#### 전처리 함수

In [36]:
# Missing-value handling
def fillna(df):
    """Return a copy of ``df`` with missing Age, Cabin and Embarked values filled.

    Age gaps are replaced by the mean of the Age column; Cabin and Embarked
    gaps are replaced by the placeholder string 'N'. The input frame is left
    untouched.
    """
    filled = df.copy()
    replacements = {
        'Age': filled['Age'].mean(),
        'Cabin': 'N',
        'Embarked': 'N',
    }
    for column, value in replacements.items():
        filled[column] = filled[column].fillna(value)
    return filled

# Label encoding
def encode_ftrs(df):
    """Return a copy of ``df`` with Sex, Cabin and Embarked label-encoded.

    Cabin is first reduced to its leading character. Each of the three
    columns is then encoded with its own freshly fitted ``LabelEncoder``.
    The input frame is not modified.
    """
    encoded = df.copy()
    encoded['Cabin'] = df['Cabin'].str[:1]
    for column in ('Sex', 'Cabin', 'Embarked'):
        # A separate encoder per column: each column gets its own class set.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

# One-hot encoding
def encode_ftrs2(df):
    """Return a copy of ``df`` with Sex, Cabin and Embarked one-hot encoded.

    Cabin is first reduced to its leading character. The three columns are
    removed and their integer indicator columns (from ``pd.get_dummies``) are
    appended after the remaining columns. The input frame is not modified.
    """
    categorical = ['Sex', 'Cabin', 'Embarked']
    frame = df.copy()
    frame['Cabin'] = df['Cabin'].str[:1]
    indicators = pd.get_dummies(frame[categorical], dtype=int)
    remaining = frame.drop(categorical, axis=1)
    return pd.concat([remaining, indicators], axis=1)

# Feature removal
def drop_ftrs(df):
    """Return ``df`` without the PassengerId, Name and Ticket columns.

    A new frame is returned; the input frame keeps all of its columns.
    """
    unused = ['PassengerId', 'Name', 'Ticket']
    return df.drop(columns=unused)

def preprocess(df, encode='label'):
    """Run the preprocessing pipeline: drop columns, fill gaps, encode.

    Parameters
    ----------
    df : DataFrame
        Raw passenger data.
    encode : str, default 'label'
        'label' applies ``encode_ftrs``; 'onehot' applies ``encode_ftrs2``.
        Any other value returns the cleaned frame without encoding.

    Returns
    -------
    DataFrame
        The preprocessed copy of ``df``.
    """
    cleaned = fillna(drop_ftrs(df))
    if encode == 'label':
        return encode_ftrs(cleaned)
    if encode == 'onehot':
        return encode_ftrs2(cleaned)
    return cleaned

#### 분류모델 성능평가지표 출력 함수 작성

In [25]:
def print_eval_score(y_test, y_pred, y_pred_proba_c1=None):
    """Print classification metrics for a set of predictions.

    Prints the confusion matrix, then one line with accuracy, recall,
    precision, F1 and the G-measure (square root of precision * recall),
    followed by the AUC when ``y_pred_proba_c1`` is given, and finally
    scikit-learn's classification report.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    y_pred_proba_c1 : array-like, optional
        Scores passed to ``roc_auc_score``; the AUC is skipped when None.
    """
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1_val = f1_score(y_test, y_pred)
    g_measure = np.sqrt(prec * rec)

    print(f'Confusion Matrix:\n{cm}')
    # The metric line is built from several prints that suppress the newline.
    print(f'accuracy: {accuracy:.4f}, recall: {rec:.4f} precision: {prec:.4f}', end='  ')
    print(f'f1:{f1_val:.4f} g-measure:{g_measure:.4f}', end=' ')
    if y_pred_proba_c1 is not None:
        auc_val = roc_auc_score(y_test, y_pred_proba_c1)
        print(f'AUC:{auc_val:.4f}')
    # Terminates the metric line (no AUC) or adds a blank line (with AUC).
    print()
    print(classification_report(y_test, y_pred))

#### 정밀도와 재현율 plot

In [26]:
def precision_recall_curve_plot(y_test, y_pred_proba_c1):
    """Plot precision and recall as functions of the decision threshold.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred_proba_c1 : array-like
        Scores for the positive class, passed to ``precision_recall_curve``.
    """
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_c1)
    # The last precision/recall entry is dropped so both curves line up with
    # `thresholds` on the x-axis.
    # Each curve is drawn exactly once so the legend has one entry per metric.
    plt.plot(thresholds, precision[:-1], label='precision')
    plt.plot(thresholds, recall[:-1], label='recall')
    plt.xlabel('threshold')
    plt.ylabel('Precision & Recall')
    plt.legend()
    plt.xticks(np.arange(0, 1, 0.1))
    plt.grid()
    plt.show()

#### roc_curve plot

In [27]:
def roc_curve_plot(y_test, pred_proba_c1):
    """Plot the ROC curve together with the diagonal reference line.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    pred_proba_c1 : array-like
        Scores for the positive class, passed to ``roc_curve``.
    """
    # Only the rates are plotted; the thresholds are not needed here.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, pred_proba_c1)

    axes = plt.gca()
    axes.plot(false_pos_rate, true_pos_rate, label='ROC')
    axes.plot([0, 1], [0, 1], 'k--', label='Random')
    axes.set_xlabel('FPR')
    axes.set_ylabel('TPR')
    axes.legend()
    plt.show()

### 데이터 준비

In [16]:
def prepareData(data, encode='label'):
    """Preprocess ``data`` and split it into train and test sets.

    The 'Survived' column is used as the target and every other preprocessed
    column as a feature. 20% of the rows go to the test set, with a fixed
    random seed so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    frame = preprocess(data, encode=encode)
    target = frame['Survived']
    features = frame.drop(columns='Survived')
    splits = train_test_split(features, target, test_size=0.2, random_state=11)
    return tuple(splits)

#### 모델학습 함수

In [18]:
def modeling(data, model='lr', encode='label'):
    """Train a classifier on ``data`` and print its evaluation metrics.

    Parameters
    ----------
    data : DataFrame
        Raw passenger data including the 'Survived' column.
    model : str, default 'lr'
        'lr' for logistic regression (liblinear solver) or 'dt' for a
        decision tree.
    encode : str, default 'label'
        Encoding option forwarded to ``prepareData``.

    Raises
    ------
    ValueError
        If ``model`` is neither 'lr' nor 'dt'.
    """
    # Create the model first so an unsupported name fails immediately with a
    # clear message instead of an unbound-variable error at fit time.
    if model == 'lr':
        clf = LogisticRegression(solver='liblinear')
    elif model == 'dt':
        clf = DecisionTreeClassifier(random_state=156)
    else:
        raise ValueError(f"unsupported model {model!r}; expected 'lr' or 'dt'")

    X_train, X_test, y_train, y_test = prepareData(data, encode=encode)

    # Train
    clf.fit(X_train, y_train)

    # Predict labels and the score of class 1 (second predict_proba column)
    pred = clf.predict(X_test)
    pred_proba_c1 = clf.predict_proba(X_test)[:, 1]

    # Evaluate
    print_eval_score(y_test, pred, pred_proba_c1)

In [19]:
# Load the Titanic training CSV; the path is relative to the working directory.
titan = pd.read_csv('data/titanic/train.csv')

### 1. 로지스틱 회귀 모델로 생존자 예측

#### 1) 전처리를 라벨 인코딩 적용

In [28]:
# Case 1: logistic regression on label-encoded features.
modeling(titan, model='lr', encode='label')

Confusion Matrix:
[[108  10]
 [ 14  47]]
accuracy: 0.8659, recall: 0.7705 precision: 0.8246  f1:0.7966 g-measure:0.7971 AUC:0.8987

              precision    recall  f1-score   support

           0       0.89      0.92      0.90       118
           1       0.82      0.77      0.80        61

    accuracy                           0.87       179
   macro avg       0.85      0.84      0.85       179
weighted avg       0.86      0.87      0.86       179



#### 2) 전처리를 원핫 인코딩 적용

In [37]:
# Case 2: logistic regression on one-hot-encoded features.
modeling(titan, model='lr', encode='onehot')

Confusion Matrix:
[[106  12]
 [ 15  46]]
accuracy: 0.8492, recall: 0.7541 precision: 0.7931  f1:0.7731 g-measure:0.7734 AUC:0.9003

              precision    recall  f1-score   support

           0       0.88      0.90      0.89       118
           1       0.79      0.75      0.77        61

    accuracy                           0.85       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.85      0.85      0.85       179



### 2. 결정트리 모델로 생존자 예측

#### 1) 전처리를 라벨 인코딩 적용

In [31]:
# Case 3: decision tree on label-encoded features.
modeling(titan, model='dt', encode='label')

Confusion Matrix:
[[99 19]
 [16 45]]
accuracy: 0.8045, recall: 0.7377 precision: 0.7031  f1:0.7200 g-measure:0.7202 AUC:0.7824

              precision    recall  f1-score   support

           0       0.86      0.84      0.85       118
           1       0.70      0.74      0.72        61

    accuracy                           0.80       179
   macro avg       0.78      0.79      0.78       179
weighted avg       0.81      0.80      0.81       179



#### 2) 전처리를 원핫 인코딩 적용

In [38]:
# Case 4: decision tree on one-hot-encoded features.
modeling(titan, model='dt', encode='onehot')

Confusion Matrix:
[[102  16]
 [ 14  47]]
accuracy: 0.8324, recall: 0.7705 precision: 0.7460  f1:0.7581 g-measure:0.7582 AUC:0.8123

              precision    recall  f1-score   support

           0       0.88      0.86      0.87       118
           1       0.75      0.77      0.76        61

    accuracy                           0.83       179
   macro avg       0.81      0.82      0.81       179
weighted avg       0.83      0.83      0.83       179



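[분석결과]

- 정확도·F1 기준으로는 Case1(로지스틱 회귀 + 라벨 인코딩)이 가장 좋다 (accuracy 0.8659, f1 0.7966).
- AUC는 Case2(로지스틱 회귀 + 원핫 인코딩)가 0.9003으로 가장 높지만, Case1(0.8987)과의 차이는 매우 작다.
- 결정트리는 두 인코딩 모두에서 로지스틱 회귀보다 accuracy, f1, AUC가 낮다 (Case3: 0.8045 / 0.7200 / 0.7824, Case4: 0.8324 / 0.7581 / 0.8123).
- 원핫 인코딩은 결정트리의 성능을 높였고(accuracy 0.8045 → 0.8324), 로지스틱 회귀에서는 accuracy와 f1이 소폭 낮아졌다 (0.8659 → 0.8492, 0.7966 → 0.7731).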


---------