# 타이타닉호 생존자 예측

### 1. 데이터 전처리

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the Titanic passenger data with seaborn's load_dataset and preview
# the first five rows.
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Print each column's dtype and non-null count; columns with fewer non-null
# entries than rows contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


- 필요한 컬럼만 추출

In [5]:
# Names of the columns kept for modelling, written as one space-separated
# string: the target `survived` first, followed by the seven features.
cols = 'survived pclass sex age sibsp parch fare embarked'
columns = [name for name in cols.split(' ') if name]
columns

['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']

In [6]:
# Keep only the selected columns. The explicit .copy() makes `df` an
# independent frame, so the column writes performed in the following
# preprocessing steps are unambiguous and do not trigger pandas'
# SettingWithCopyWarning.
df = df[columns].copy()
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


- 결측치 처리

In [7]:
# Count how often each embarkation port occurs; missing `embarked` values
# are to be replaced with the most frequent port.
df['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [8]:
# Fill missing embarkation ports with the most frequent port.
# The mode is computed from the data instead of being hard-coded, and the
# result is assigned back to the frame: calling fillna(inplace=True) on
# `df.embarked` is a chained in-place update that pandas warns about and
# that no longer modifies `df` when copy-on-write is enabled.
most_common_port = df['embarked'].mode()[0]
df = df.assign(embarked=df['embarked'].fillna(most_common_port))
df['embarked'].value_counts()

S    646
C    168
Q     77
Name: embarked, dtype: int64

In [9]:
# Replace missing ages with the mean age.
# The filled column is assigned back to the frame rather than using
# fillna(inplace=True) on `df.age`, which is a chained in-place update that
# does not reliably write through to `df` (it is a no-op under pandas
# copy-on-write).
df = df.assign(age=df['age'].fillna(df['age'].mean()))

In [10]:
# Final check: total number of missing cells in the whole frame
# (0 means every missing value has been handled).
df.isna().to_numpy().sum()

0

- Label Encoding 수행

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Convert the string-valued categorical columns to integer codes.
# Each fitted encoder is kept in `label_encoders` (keyed by column name) so
# the code -> label mapping can be inspected through `classes_` or undone
# with `inverse_transform`; previously the encoder was overwritten on every
# iteration and the mapping for 'sex' was lost.
label_encoders = {}
for feature in ['sex', 'embarked']:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le

df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


### 2. Train/test data set 분리

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the passengers for testing. The split is stratified on the
# target so both parts keep the same survived/not-survived ratio, and the
# seed is fixed for reproducibility. `survived` is the first column of df,
# so dropping it leaves exactly the seven feature columns.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='survived').to_numpy(),
    df['survived'].to_numpy(),
    test_size=0.2,
    stratify=df['survived'].to_numpy(),
    random_state=2021,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 7), (179, 7), (712,), (179,))

### 3. GridSearchCV를 이용한 학습 및 하이퍼파라미터 탐색
- Decision Tree

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [16]:
# Coarse hyper-parameter search for a decision tree: every combination in
# the grid is scored by 5-fold cross-validated accuracy on the training set.
dtc = DecisionTreeClassifier(random_state=2021)
params = dict(
    max_depth=[2, 4, 6, 8],
    min_samples_leaf=[2, 4, 6, 8],
    min_samples_split=[2, 4, 6, 8],
)
grid_dt = GridSearchCV(estimator=dtc, param_grid=params, cv=5, scoring='accuracy')
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=2021,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [2, 4, 6, 8],
                         '

In [17]:
# Parameter combination with the best cross-validated accuracy in the search.
grid_dt.best_params_

{'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 2}

In [19]:
# Second, finer search: the coarse grid ([2, 4, 6, 8] for every parameter)
# is replaced by unit-step ranges so that neighbouring values are tried too.
params = dict(
    max_depth=[5, 6, 7],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[2, 3],
)
grid_dt = GridSearchCV(estimator=dtc, param_grid=params, cv=5, scoring='accuracy')
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=2021,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [5, 6, 7], 'min_samples_leaf': [1, 2, 3]

In [20]:
# Best parameter combination found by the finer search.
grid_dt.best_params_

{'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 2}

In [21]:
# Keep the best estimator selected by the grid search for evaluation.
best_clf = grid_dt.best_estimator_

### 4. 예측 및 평가

In [22]:
# Evaluate the tuned tree on the held-out test set.
best_clf.score(X_test, y_test)

0.8212290502793296

### 타이타닉 엉터리 분류기

In [23]:
# Survival rate per sex: group the target by the encoded `sex` column and
# average it (the mean of a 0/1 column is the fraction of survivors).
df['survived'].groupby(df['sex']).mean()

sex
0    0.742038
1    0.188908
Name: survived, dtype: float64

In [30]:
from sklearn.base import BaseEstimator
class MyDummyClassifier(BaseEstimator):
    """Rule-based baseline: predict survival from the sex column alone.

    A row is predicted as survived (1) when its sex feature equals the
    code for "female", and as not survived (0) otherwise. Nothing is
    learned from the training data.

    Parameters
    ----------
    sex_column : int, default=1
        Index of the sex feature in the columns of X.
    female_code : int, default=0
        Encoded value of the sex feature that stands for "female".
    """

    def __init__(self, sex_column=1, female_code=0):
        self.sex_column = sex_column
        self.female_code = female_code

    def fit(self, X, y):
        """Do nothing and return ``self``.

        Returning the estimator (instead of None) follows the scikit-learn
        estimator convention and allows call chaining.
        """
        return self

    def predict(self, X):
        """Return a float array of shape (n_samples, 1) holding 0/1 predictions.

        X may be any 2-D array-like; it is converted with ``np.asarray``.
        """
        X = np.asarray(X)
        pred = np.zeros((X.shape[0], 1))
        # Guard the empty case so a zero-length input still yields an empty
        # (0, 1) result without indexing into X.
        if X.shape[0]:
            # Vectorized rule: mark every row whose sex code means "female".
            pred[X[:, self.sex_column] == self.female_code, 0] = 1
        return pred

In [31]:
# Instantiate the rule-based baseline. fit() is called only to follow the
# usual estimator workflow; this classifier learns nothing from the data.
my_clf = MyDummyClassifier()
my_clf.fit(X_train, y_train)

In [32]:
# Predict survival for the test passengers with the sex-only rule.
pred = my_clf.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score
# Accuracy of the sex-only baseline on the test set, for comparison with the
# tuned decision tree.
accuracy_score(y_test, pred)

0.7877094972067039