
# ch03. 평가(evaluation)

In [4]:
import numpy as np
from sklearn.base import BaseEstimator

In [51]:
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

    
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

def format_features(df):
    from sklearn.preprocessing import LabelEncoder
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [55]:
class MyDummyClassfier(BaseEstimator):
    def fit(self, x, y=None):
        pass
    def predict(self, X):
        pred = np.zeros( (X.shape[0], 1))
        for i in range (X.shape[0]) : 
            if X['Sex'].iloc[i] == 1:
                pred[i] = 0
            else :
                pred[i] = 1
                
        return pred

In [56]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score

In [62]:
titanic_df = pd.read_csv('data/titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)
X_titanic_df = transform_features(X_titanic_df)
X_train, X_test, t_train, y_test=train_test_split(X_titanic_df, y_titanic_df,
                                                 test_size=0.2, random_state=0)
myclf = MyDummyClassfier()
myclf.fit(X_train, y_train)

Myprediction = myclf.predict(X_test)
print('Dummy Classifier의 정확도는 : {0:.4f}'.format(accuracy_score(y_test, Myprediction)))

Dummy Classifier의 정확도는 : 0.7877


In [63]:
from sklearn.datasets import load_digits


class MyFakeClassifier(BaseEstimator):
    def fit(self,x,y):
        pass
    
    def predict(self,x,y):
        return np.zeros( (len(X), 1), dtype=bool)
    
digits = load_digits()

y = (digits.target == 7).astype(int)
X_train, X_test, y_train, y_test = train_test_split( digits.data, y, random_state=11)

## 정밀도와 재현율

In [71]:
from sklearn.linear_model import LogisticRegression

In [72]:
from sklearn.metrics import accuracy_score, precision_score,recall_score, confusion_matrix

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score( y_test, pred)
    precision = precision_score( y_test, pred)
    recall = recall_score( y_test, pred)
    print('오차 행렬')
    print(confusion)
    print('정확도 : {0:.4f}, 정밀도 : {1:.4f}, 재현율 : {2:.4f}'.format(accuracy, precision, recall))

In [75]:
titanic_df = pd.read_csv('data/titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)
X_titanic_df = transform_features(X_titanic_df)

X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df,
                                                   test_size=20, random_state=11)

lr_clf = LogisticRegression()

lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test, pred)

오차 행렬
[[14  1]
 [ 1  4]]
정확도 : 0.9000, 정밀도 : 0.8000, 재현율 : 0.8000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### precision/recall trade-off

In [92]:
pred_proba = lr_clf.predict_proba(X_test)
pred_proba

array([[0.46122621, 0.53877379],
       [0.89654576, 0.10345424],
       [0.89122111, 0.10877889],
       [0.90126135, 0.09873865],
       [0.89063535, 0.10936465],
       [0.90367358, 0.09632642],
       [0.90587359, 0.09412641],
       [0.16218097, 0.83781903],
       [0.81962869, 0.18037131],
       [0.35314585, 0.64685415],
       [0.92029435, 0.07970565],
       [0.90026025, 0.09973975],
       [0.89120849, 0.10879151],
       [0.90578098, 0.09421902],
       [0.47511584, 0.52488416],
       [0.87770173, 0.12229827],
       [0.92227642, 0.07772358],
       [0.75960686, 0.24039314],
       [0.77551314, 0.22448686],
       [0.12820936, 0.87179064]])

In [93]:
pred

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
      dtype=int64)

In [94]:
from sklearn.preprocessing import Binarizer

X = [[1,-1,2],
    [2,0,0],[0,1.1,1.2]]
     
binarizer = Binarizer(threshold=1.1)
print(binarizer.fit_transform(X))

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [95]:
custom_threshold = 0.5

pred_proba_1 = pred_proba[:, 1].reshape(-1,1)
binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1)
custom_predict = binarizer.transform(pred_proba_1)

get_clf_eval(y_test, custom_predict)

오차 행렬
[[14  1]
 [ 1  4]]
정확도 : 0.9000, 정밀도 : 0.8000, 재현율 : 0.8000


In [98]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, pred)
print('F1 스코어 : {0:.4f}'.format(f1))

F1 스코어 : 0.8000


In [101]:
thresholds = [0.4, 0.45, 0.50, 0.55, 0.60]

def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임곗값 : ', custom_threshold)
        get_clf_eval(y_test, custom_predict)
    
get_eval_by_threshold(y_test, pred_proba[:, 1].reshape(-1,1), thresholds)

임곗값 :  0.4
오차 행렬
[[14  1]
 [ 1  4]]
정확도 : 0.9000, 정밀도 : 0.8000, 재현율 : 0.8000, F1 : 0.8000
임곗값 :  0.45
오차 행렬
[[14  1]
 [ 1  4]]
정확도 : 0.9000, 정밀도 : 0.8000, 재현율 : 0.8000, F1 : 0.8000
임곗값 :  0.5
오차 행렬
[[14  1]
 [ 1  4]]
정확도 : 0.9000, 정밀도 : 0.8000, 재현율 : 0.8000, F1 : 0.8000
임곗값 :  0.55
오차 행렬
[[15  0]
 [ 2  3]]
정확도 : 0.9000, 정밀도 : 1.0000, 재현율 : 0.6000, F1 : 0.7500
임곗값 :  0.6
오차 행렬
[[15  0]
 [ 2  3]]
정확도 : 0.9000, 정밀도 : 1.0000, 재현율 : 0.6000, F1 : 0.7500


In [102]:
def get_clf_eval(y_test,pred):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    
    f1 = f1_score(y_test,pred)
    print('오차 행렬')
    print(confusion)
    print('정확도 : {0:.4f}, 정밀도 : {1:.4f}, 재현율 : {2:.4f}, F1 : {3:.4f}'.format(accuracy, precision, recall, f1))
          
thresholds = [0.4, 0.45, 0.50, 0.55, 0.60]
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds)

임곗값 :  0.4
오차 행렬
[[14  1]
 [ 1  4]]
정확도 : 0.9000, 정밀도 : 0.8000, 재현율 : 0.8000, F1 : 0.8000
임곗값 :  0.45
오차 행렬
[[14  1]
 [ 1  4]]
정확도 : 0.9000, 정밀도 : 0.8000, 재현율 : 0.8000, F1 : 0.8000
임곗값 :  0.5
오차 행렬
[[14  1]
 [ 1  4]]
정확도 : 0.9000, 정밀도 : 0.8000, 재현율 : 0.8000, F1 : 0.8000
임곗값 :  0.55
오차 행렬
[[15  0]
 [ 2  3]]
정확도 : 0.9000, 정밀도 : 1.0000, 재현율 : 0.6000, F1 : 0.7500
임곗값 :  0.6
오차 행렬
[[15  0]
 [ 2  3]]
정확도 : 0.9000, 정밀도 : 1.0000, 재현율 : 0.6000, F1 : 0.7500


## ROC곡선과 AUC곡선

In [107]:
from sklearn.metrics import roc_curve

pred_proba_class1 = lr_clf.predict_proba(X_test)[:, 1]

fprs, tprs, thresholds = roc_curve(y_test, pred_proba_class1)
thr_index = np.arange(1, thresholds.shape[0], 5)
print('샘플 추출을 위한 임곗값 배열의 index 10개 : ', thr_index)
print('샘플용 10개의 임곗값 : ', np.round(thresholds[thr_index], 2))
print('샘플 입곗값별 FPR : ', np.round(fprs[thr_index], 3))
print('샘플 입곗값별 TPR : ', np.round(tprs[thr_index], 3))

샘플 추출을 위한 임곗값 배열의 index 10개 :  [1]
샘플용 10개의 임곗값 :  [0.87]
샘플 입곗값별 FPR :  [0.]
샘플 입곗값별 TPR :  [0.2]


In [None]:
피마, 타이타닉 과제.

### 피마 인디언 당뇨병 사례

In [None]:
태글