In [1]:
import seaborn as sns
import numpy as np
import pandas as pd


In [2]:
df = sns.load_dataset("titanic")
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


### 1. 데이터 전처리

- Feature selection

In [3]:
df = df[['survived','pclass','sex','age','sibsp','parch','fare','embarked','deck']]
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


- 결측지 처리


In [13]:
print(df.shape)
print(df.isna().sum())

(891, 9)
survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64


In [16]:
# age - 평균
df["age"].fillna(df["age"].mean(), inplace = True)
df.age.isnull().sum()

0

In [17]:
# embarked - 최빈값으로 대체
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [18]:
df.embarked.fillna("S", inplace = True)
df.isnull().sum()

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
deck        688
dtype: int64

In [23]:
# deck - 열 삭제
df.drop(columns = ["deck"], inplace = True)
df.head()
df.isnull().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

- 카테고리 값(sex, embarked)을 숫자로 변환


In [24]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [26]:
df.sex = le.fit_transform(df.sex)
df.embarked = le.fit_transform(df.embarked)
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2


2. Train//Test dataset으로 분리


In [27]:
# X 와 Y를 넘파이 배열로
X = df.iloc[:,0:].values
y = df.survived.values
X.shape, y.shape

((891, 8), (891,))

In [29]:
# y 값의 분포
# df.survived.value_counts()
np.unique(y, return_counts = True)

(array([0, 1]), array([549, 342]))

In [50]:
# train/test dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, random_state = 2022, test_size = 0.2 
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((712, 8), (179, 8), (712,), (179,))

In [51]:
np.unique(y_train, return_counts = True)

(array([0, 1]), array([439, 273]))

### 3. RandomForest 모델로 학습

In [52]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 2022)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [36]:
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=2022)

4. 모델 예측 및 평가

In [37]:
rfc.score(X_test, y_test)

1.0

5. 3, 4 대신에 GridSearchCV수행

In [53]:
params = {
    "max_depth" : [2,4,6,8],
    "min_samples_split": [2,4,6]
}

In [54]:
from sklearn.model_selection import GridSearchCV
grid_rf = GridSearchCV(rfc, params, scoring = "accuracy", cv = 5)
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2022),
             param_grid={'max_depth': [2, 4, 6, 8],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [45]:
grid_rf.best_params_

{'max_depth': 4, 'min_samples_split': 2}

In [46]:
params = {
    "max_depth" : [2,4,6,8],
    "min_samples_split": [2,4,6]
}
grid_rf = GridSearchCV(rfc, params, scoring = "accuracy", cv = 5)
%time grid_rf.fit(X_train, y_train)

CPU times: user 13.6 s, sys: 99.7 ms, total: 13.7 s
Wall time: 14.6 s


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2022),
             param_grid={'max_depth': [2, 4, 6, 8],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [None]:
grid_rf.best_params_

In [55]:
best_rf = grid_rf.best_estimator_

In [56]:
best_rf.score(X_test, y_test)

1.0

### 5. 테이스 데이터에 적용

In [57]:
X_test[25], y_test[25]

(array([ 1.  ,  3.  ,  1.  , 45.  ,  0.  ,  0.  ,  8.05,  2.  ]), 1)

In [60]:
best_rf.predict(X_test[25].reshape(1,-1))[0]

1

### 7. 엉터리 분류기

In [62]:
# 여성의 생존률
df.groupby('sex')["survived"].mean()

sex
0    0.742038
1    0.188908
Name: survived, dtype: float64

In [63]:
from sklearn.base import BaseEstimator

class MyClassifier(BaseEstimator):
  # fir(), predict() method만 재정이 (Overriding)
  def fit(self, X, y):
    pass
  def predict(self, X):
    pred = np.zeros(X.shape[0]) # X 의 행의 갯수만큼 0으로 초기화한 배열을 생성
    for i in range(X.shape[0]):
      if X[i,1] == 0: # 여성이면
        pred[i] = 1   # 생존으로 처리
    return pred

In [65]:
my_clf = MyClassifier()
my_clf.fit(X_train, y_train)
pred_my = my_clf.predict(X_test)

In [67]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_my)

0.6145251396648045

- 모델의 성능을 평가할 때 무조건적으로 정확도를 사용하는 것은 지양해야함

In [71]:
pred_rf = best_rf.predict(X_test)
sdf = pd.DataFrame({"y_test":y_test, "RF":pred_rf,"My":pred_my})
sdf

Unnamed: 0,y_test,RF,My
0,1,1,0.0
1,0,0,0.0
2,1,1,0.0
3,0,0,0.0
4,0,0,0.0
...,...,...,...
174,0,0,0.0
175,1,1,0.0
176,0,0,0.0
177,1,1,0.0


In [72]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_rf)

array([[110,   0],
       [  0,  69]])

In [73]:
confusion_matrix(y_test, pred_my)

array([[110,   0],
       [ 69,   0]])

In [75]:
from sklearn.metrics import precision_score, recall_score


In [78]:
# 정밀도 (precision_score TP/(FP+TP))
precision_score(y_test, pred_rf), precision_score(y_test,pred_my)

  _warn_prf(average, modifier, msg_start, len(result))


(1.0, 0.0)

In [80]:
# 재현율(Recall TP / (FN + TP))
recall_score(y_test, pred_rf), recall_score(y_test, pred_my)

(1.0, 0.0)

In [82]:
# F1 Score
from sklearn.metrics import f1_score
f1_score(y_test,pred_rf), f1_score(y_test, pred_my)

(1.0, 0.0)

In [83]:
# AUC Score
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test,pred_rf), roc_auc_score(y_test, pred_my)

(1.0, 0.5)