**predict_proba 메서드**

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score,confusion_matrix

def get_clf_eval(y_test, pred):
    """Print the confusion matrix, accuracy, precision and recall for a prediction.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted labels, scored against ``y_test``.

    Returns:
        None. All results are written to stdout.
    """
    # Compute everything up front so nothing is printed if a metric raises.
    confusion = confusion_matrix(y_test, pred)
    scorers = (accuracy_score, precision_score, recall_score)
    scores = [scorer(y_test, pred) for scorer in scorers]

    print('오차행렬')
    print(confusion)

    # One output template per score, in the same order as ``scorers``.
    templates = ('정확도 : {0:.3f}', '정밀도 : {0:.3f}', '재현율 : {0:.3f}')
    for template, score in zip(templates, scores):
        print(template.format(score))

In [8]:
from sklearn.preprocessing import LabelEncoder

# Null handling
def fillna(df):
    """Fill the missing values of the Titanic frame in place and return it.

    ``Age`` gaps receive the column mean, ``Fare`` gaps receive 0, and every
    missing value still left in any column is replaced by the string ``'N'``.

    Args:
        df: DataFrame containing at least the ``Age`` and ``Fare`` columns.
            It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``. That form is a chained in-place
    # update on a column selection: pandas 2.x warns about it, and under
    # Copy-on-Write it no longer changes ``df`` at all.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(0)
    # Called on the frame itself, so the in-place form is still effective here.
    df.fillna('N', inplace=True)
    return df

# Remove attributes that are not needed
def drop_features(df):
    """Drop the ``PassengerId``, ``Name`` and ``Ticket`` columns from ``df``.

    The frame is modified in place and the same object is returned.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Label encoding
def format_features(df):
    """Label-encode the ``Cabin``, ``Sex`` and ``Embarked`` columns of ``df``.

    ``Cabin`` is first cut down to its leading character. Each of the three
    columns is then replaced by the codes of its own ``LabelEncoder``.
    The frame is modified in place and returned.
    """
    # Keep only the first character of every Cabin value.
    df['Cabin'] = df['Cabin'].str[0]
    for column in ('Cabin', 'Sex', 'Embarked'):
        # A fresh encoder per column, fitted and applied in one step.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

# Run the data preprocessing functions
def transform_features(df):
    """Apply ``fillna``, ``drop_features`` and ``format_features`` to ``df``.

    The steps run in that order, each receiving the previous step's result,
    and the result of the last one is returned.
    """
    return format_features(drop_features(fillna(df)))

In [9]:
from sklearn.linear_model import LogisticRegression

# Load the data
titanic_df = pd.read_csv('./data/titanic_train.csv')

# Target values
y_titanic_df = titanic_df['Survived']
# Training features: every column except the target, then preprocessed
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))

X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=11
)

# NOTE(review): the recorded output of this cell reports that the solver
# stopped at its iteration limit. Raising ``max_iter`` or scaling the features
# (as that message suggests) would address it, but would also change the
# metrics recorded in the rest of the notebook.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
get_clf_eval(y_test, lr_pred)

오차행렬
[[104  14]
 [ 13  48]]
정확도 : 0.849
정밀도 : 0.774
재현율 : 0.787


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [10]:
# Class-probability estimates for the test set
pred_proba = lr_clf.predict_proba(X_test)
# Show the first 10 rows
pred_proba[:10, :]

array([[0.46198241, 0.53801759],
       [0.87863883, 0.12136117],
       [0.87716256, 0.12283744],
       [0.88266765, 0.11733235],
       [0.85534206, 0.14465794],
       [0.88224043, 0.11775957],
       [0.88839194, 0.11160806],
       [0.20879521, 0.79120479],
       [0.78287265, 0.21712735],
       [0.36921677, 0.63078323]])

In [12]:
# Predicted labels for the test set, kept for the side-by-side comparison with pred_proba
pred = lr_clf.predict(X_test)

In [14]:
# Append the predicted label as an extra column next to the class probabilities
pred_column = pred.reshape(-1, 1)
pred_proba_result = np.hstack((pred_proba, pred_column))
pred_proba_result[:10]

array([[0.46198241, 0.53801759, 1.        ],
       [0.87863883, 0.12136117, 0.        ],
       [0.87716256, 0.12283744, 0.        ],
       [0.88266765, 0.11733235, 0.        ],
       [0.85534206, 0.14465794, 0.        ],
       [0.88224043, 0.11775957, 0.        ],
       [0.88839194, 0.11160806, 0.        ],
       [0.20879521, 0.79120479, 1.        ],
       [0.78287265, 0.21712735, 0.        ],
       [0.36921677, 0.63078323, 1.        ]])

## Binarizer 클래스
- threshold 변수를 특정 값으로 설정하고
- Binarizer 클래스의 fit_transform 메서드를 이용해서
- 넘파이 ndarray 입력 값이 지정된 threshold보다 작거나 같으면 0으로,
- threshold보다 크면 1로 변환해서 반환

In [15]:
from sklearn.preprocessing import Binarizer
# Small demo matrix: with threshold 1.5 only the entries above 1.5 become 1
X = [
    [1, -1, 2],
    [2, 0, 0],
    [0, 1.1, 1.2],
]

binarizer = Binarizer(threshold=1.5).fit(X)
print(binarizer.transform(X))

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 0.]]


In [16]:
from sklearn.preprocessing import Binarizer

# Decision threshold applied to the class-1 probability
c_threshold = 0.5

# Second column of the predict_proba output, kept 2-D (one column) for Binarizer
pred_proba_1 = pred_proba[:, 1:2]

bina = Binarizer(threshold=c_threshold)
custom_predict = bina.fit_transform(pred_proba_1)

get_clf_eval(y_test, custom_predict)

오차행렬
[[104  14]
 [ 13  48]]
정확도 : 0.849
정밀도 : 0.774
재현율 : 0.787


In [17]:
# Lower decision threshold applied to the class-1 probability
c_threshold = 0.4

# Second column of the predict_proba output, kept 2-D (one column) for Binarizer
pred_proba_1 = pred_proba[:, 1:2]

bina = Binarizer(threshold=c_threshold)
custom_predict = bina.fit_transform(pred_proba_1)

get_clf_eval(y_test, custom_predict)

오차행렬
[[98 20]
 [10 51]]
정확도 : 0.832
정밀도 : 0.718
재현율 : 0.836
