# 분류기 만들기

타이타닉 데이터의 생존여부 분류   
- 규칙 : 성별(sex) = 1 생존하지 않은 것으로

In [15]:
import numpy as np
from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):
    
    def fit(self, X, y):
        pass              # 학습은 하지 않음
    
    def predict(self, X):
        pred = np.zeros((X.shape[0], 1))        # 기본값은 전부 0 (사망)
        
        for i in range(X.shape[0]):
            if X['Sex'].iloc[i] == 1:            # 남자면 0 (사망)
                pred[i] = 0
            else:                                # 여자면 1 (생존)
                pred[i] = 1
                
        return pred
    
    # 단순한 룰 기반 모델 (남자 -> 사망, 여자 -> 생존)
    # 기계학습 없이 성별만으로 예측하는 가짜 모델

In [16]:
import pandas as pd
load_titanic = pd.read_csv('titanic.csv')
X_titanic_df = load_titanic.drop('Survived', axis=1)
y_titanic_df = load_titanic['Survived']

In [17]:
from sklearn.preprocessing import LabelEncoder

# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

# 레이블 인코딩 수행 함수
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# 앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df)            # 결측치 처리
    df = drop_features(df)     # 필요 없는 열 제거
    df = format_features(df)   # 범주형 변수 레이블 인코딩
    return df

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_titanic_df = transform_features(X_titanic_df)

X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=0)

myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)

my_pred = myclf.predict(X_test)
accuracy_score(y_test, my_pred)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

0.7877094972067039

# 혼동행렬

In [19]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, my_pred)

array([[92, 18],
       [20, 49]])

In [20]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, my_pred), recall_score(y_test, my_pred)

(np.float64(0.7313432835820896), np.float64(0.7101449275362319))

# 로지스틱 회귀, 랜덤포레스트, KNN의 정밀도, 재현율 비교

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

models = {
    'knn' : KNeighborsClassifier(n_neighbors=5),
    'rf' : RandomForestClassifier(),
    'lr' : LogisticRegression(max_iter=3000)
}

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print(confusion)
    print(accuracy, precision, recall, f1)
    print('-'*20)
    

for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = get_clf_eval(y_test, pred)

[[94 16]
 [31 38]]
0.7374301675977654 0.7037037037037037 0.5507246376811594 0.6178861788617886
--------------------
[[98 12]
 [22 47]]
0.8100558659217877 0.7966101694915254 0.6811594202898551 0.734375
--------------------
[[92 18]
 [16 53]]
0.8100558659217877 0.7464788732394366 0.7681159420289855 0.7571428571428571
--------------------


In [22]:
lr = LogisticRegression(max_iter=3000)
lr.fit(X_train, y_train)

pred_proba = lr.predict_proba(X_test)  # 클래스별 확률값 반환 -> [사망확률, 생존확률]
pos_proba = pred_proba[:,1]  # 양성클래스일 확률 (생존일 확률)

threshold = 0.5  # 임계치
custom_proba = (pos_proba >= threshold).astype(int)  # 임계치보다 크면 1넣어줌 (True/False -> 1/0 변환)
confusion_matrix(y_test, custom_proba)
get_clf_eval(y_test, custom_proba)

[[92 18]
 [16 53]]
0.8100558659217877 0.7464788732394366 0.7681159420289855 0.7571428571428571
--------------------


# 정밀도와 재현율의 변화
 정밀도와 재현율의 불균형이 심할 때, 혹은 비지니스의 요구사항이 있을 때 임계치를 조정해야 함

 임계치를 낮추면 정밀도는 낮아지고, 재현율은 올라간다.

# 평가결과 확인하기

In [23]:
f1_score(y_test, pred)

np.float64(0.7571428571428571)

In [24]:
from sklearn.metrics import classification_report

print(classification_report(y_test, pred))  # 평가보고서

              precision    recall  f1-score   support

           0       0.85      0.84      0.84       110
           1       0.75      0.77      0.76        69

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [26]:
pd.Series(lr.coef_[0], index=X_train.columns).sort_values()  # 피처의 중요도 = 계수

Sex        -2.593416
Pclass     -0.901628
SibSp      -0.368137
Embarked   -0.107352
Parch      -0.059052
Cabin      -0.058762
Age        -0.042756
Fare        0.001286
dtype: float64