# 타이타닉 생존률 예측 (로지스틱회귀, SVM, KNN, Random Forest, GaussianNB) 


In [179]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

<br>

## 1. Preparing dataset (2번부터 실습 진행)

In [3]:
# Load the Titanic passenger table from a CSV file in the working directory.
data_df = pd.read_csv('titanic.csv')
# Preview the first three rows.
data_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


## Data info

- **PassengerId** : Unique ID of passenger
- **Survived** : 0 = No, 1 = Yes
- **Pclass** : Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- **SibSp** : # of siblings & spouses aboard the Titanic
- **Parch** : # of parents / children aboard the Titanic
- **Ticket** : Ticket number
- **Cabin** : Cabin number
- **Embarked** : Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [182]:
# Target labels. The double brackets keep this a one-column DataFrame
# rather than a Series.
y_data = data_df[['Survived']]
# y_data.head(3)

In [183]:
# Remove the target column from data_df in place, then take an independent
# copy of the remaining columns as the feature table.
data_df.drop('Survived', axis=1, inplace=True)
x_data = data_df.copy()
# x_data.head(3)

<br>

## 2. Feature engineering & Feature selection

#### 시도해볼 수 있는 전략들

- 불필요한 열이나 예측에 방해가 되는 열은 아예 지우기 (ex. PassengerId)
- 결측치 채우기 
- Text로 되어있는 Category(Factor)는 숫자로 바꿔주기 (ex. Male/Female -> 0/1)
- 실수 범위를 구간 범위로 바꿔주기 
- 필요한 경우 기존 열을 바탕으로 새로운 열을 계산해 추가하기

In [184]:
# Show each feature column's dtype and non-null count.
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


In [185]:
data_df.isnull().sum() # Count the missing values in each column

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [186]:
# Encode sex as 0 / 1
def character_to_num(data):
    """Return 0 when ``data`` equals 'male' and 1 for every other value."""
    return 0 if data == 'male' else 1

In [187]:
# Replace each text label in the Sex column with its numeric code.
x_data['Sex'] = x_data['Sex'].map(character_to_num)

In [188]:
# Mean age within each ticket class (1, 2, 3); these are used to fill the
# missing Age values class by class.
mean_age_P1, mean_age_P2, mean_age_P3 = (
    x_data.loc[x_data['Pclass'] == pclass, 'Age'].mean() for pclass in (1, 2, 3)
)

In [189]:
# Fill every missing Age with the mean age of the passenger's ticket class,
# truncated to a whole number.
for pclass, class_mean in ((1, mean_age_P1), (2, mean_age_P2), (3, mean_age_P3)):
    needs_fill = (x_data['Pclass'] == pclass) & x_data['Age'].isnull()
    x_data.loc[needs_fill, 'Age'] = int(class_mean)

In [190]:
x_data['Age'].isnull().sum() # Count remaining missing Age values (0 once the fill is done)

0

In [191]:
# Convert the continuous age into five ordinal bands:
# <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4.
# np.select takes the first matching condition, so each upper bound only
# applies to ages that missed the bands before it. Missing ages stay NaN.
age = x_data['Age']
x_data['Age'] = np.select(
    [age <= 16, age <= 32, age <= 48, age <= 64, age > 64],
    [0.0, 1.0, 2.0, 3.0, 4.0],
    default=np.nan,
)

In [192]:
# Convert the continuous fare into five ordinal bands:
# <=7.854 -> 0, (7.854, 10.5] -> 1, (10.5, 21.679] -> 2,
# (21.679, 39.688] -> 3, >39.688 -> 4.
# np.select takes the first matching condition, so each upper bound only
# applies to fares that missed the bands before it. Missing fares stay NaN.
fare = x_data['Fare']
x_data['Fare'] = np.select(
    [fare <= 7.854, fare <= 10.5, fare <= 21.679, fare <= 39.688, fare > 39.688],
    [0.0, 1.0, 2.0, 3.0, 4.0],
    default=np.nan,
)

In [193]:
# Extract the title from each name: the run of letters that follows a space
# and is terminated by a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that "\." is not treated as an (invalid)
# string escape, and expand=False makes the single capture group come back
# as a Series regardless of the pandas version's default.
x_data['Title'] = x_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [194]:
# Preview the first three rows, now including the new Title column.
x_data.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,3,"Braund, Mr. Owen Harris",0,1.0,1,0,A/5 21171,0.0,,S,Mr
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2.0,1,0,PC 17599,4.0,C85,C,Mrs
2,3,3,"Heikkinen, Miss. Laina",1,1.0,0,0,STON/O2. 3101282,1.0,,S,Miss


In [195]:
# Count passengers per title, split by the encoded Sex column
# (0 = 'male', 1 = anything else).
pd.crosstab(x_data['Title'], x_data['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,1,0
Col,2,0
Countess,0,1
Don,1,0
Dr,6,1
Jonkheer,1,0
Lady,0,1
Major,2,0
Master,40,0
Miss,0,182


In [196]:
# Collapse the uncommon titles into 'Other' and fold the variant spellings
# into their common form, using one replacement table.
rare_titles = ['Capt', 'Col', 'Countess', 'Don', 'Dona', 'Dr', 'Jonkheer',
               'Lady', 'Major', 'Rev', 'Sir']
title_map = {title: 'Other' for title in rare_titles}
title_map.update({'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss'})
x_data['Title'] = x_data['Title'].replace(title_map)

# Show how many passengers fall under each remaining title.
x_data['Title'].value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
Other      23
Name: Title, dtype: int64

In [197]:
# Count each Embarked value, including the missing entries.
x_data['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [198]:
# Fill the missing ports with 'S', on the assumption that those passengers
# most likely embarked there as well.
x_data.loc[x_data['Embarked'].isnull(), 'Embarked'] = 'S'

In [199]:
x_data['Embarked'].isnull().sum() # Count remaining missing Embarked values (0 once the fill is done)


0

In [200]:
# New column: relatives travelling with the passenger
# (siblings/spouses plus parents/children).
x_data['FamilySize'] = x_data['SibSp'].add(x_data['Parch'])

In [201]:
# Preview the table with the new FamilySize column.
x_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize
0,1,3,"Braund, Mr. Owen Harris",0,1.0,1,0,A/5 21171,0.0,,S,Mr,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2.0,1,0,PC 17599,4.0,C85,C,Mrs,1
2,3,3,"Heikkinen, Miss. Laina",1,1.0,0,0,STON/O2. 3101282,1.0,,S,Miss,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2.0,1,0,113803,4.0,C123,S,Mrs,1
4,5,3,"Allen, Mr. William Henry",0,2.0,0,0,373450,1.0,,S,Mr,0


In [202]:
# Drop the columns that are not used as model features.
unused_columns = ['Name', 'PassengerId', 'Ticket', 'SibSp', 'Parch', 'Cabin']
x_data = x_data.drop(labels=unused_columns, axis='columns')

In [203]:
# Preview the remaining feature columns.
x_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize
0,3,0,1.0,0.0,S,Mr,1
1,1,1,2.0,4.0,C,Mrs,1
2,3,1,1.0,1.0,S,Miss,0
3,1,1,2.0,4.0,S,Mrs,1
4,3,0,2.0,1.0,S,Mr,0


In [204]:
# Check dtypes and non-null counts before casting the binned columns.
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass        891 non-null int64
Sex           891 non-null int64
Age           891 non-null float64
Fare          891 non-null float64
Embarked      891 non-null object
Title         891 non-null object
FamilySize    891 non-null int64
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB


In [205]:
# Store the binned Age and Fare codes as integers instead of floats.
for binned_column in ('Age', 'Fare'):
    x_data[binned_column] = x_data[binned_column].astype('int64')

In [206]:
# Confirm that Age and Fare are now integer columns.
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass        891 non-null int64
Sex           891 non-null int64
Age           891 non-null int64
Fare          891 non-null int64
Embarked      891 non-null object
Title         891 non-null object
FamilySize    891 non-null int64
dtypes: int64(5), object(2)
memory usage: 48.8+ KB


In [207]:
# One-hot encode the categorical variables. At this point Embarked and Title
# are the only object-dtype columns, so they are the ones expanded into
# indicator columns; the numeric columns are kept unchanged.
x_data = pd.get_dummies(x_data)


<br>

## 2. Train - Test split (비율을 7:3 으로 유지해주시고, seed는 0을 적용해주세요)

In [208]:
from sklearn import model_selection
# Hold out 30% of the rows as the test set; random_state=0 fixes the shuffle
# so the same split is produced on every run.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x_data,y_data, test_size=0.3, random_state=0)

In [209]:
# Print the (rows, columns) shape of each split, features first.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(623, 13)
(268, 13)
(623, 1)
(268, 1)


<br>

## 3. Create model instance variable (동시에 여러 모델을 다른 이름으로 만들 수 있습니다.)

In [213]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [223]:
# Compare several models with the same fit / evaluate routine
def train_and_test(model):
    """Fit ``model`` on the training split and print its test accuracy.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The predictions for ``x_test``, so that callers binding the result
        (e.g. ``log_pred = train_and_test(...)``) receive the predictions
        instead of ``None``.
    """
    # y_train is built in this file as a one-column DataFrame; flatten it to
    # the 1-D label vector that fit() expects.
    model.fit(x_train, np.ravel(y_train))
    predictions = model.predict(x_test)

    # accuracy_score is documented as (y_true, y_pred).
    print('Accuracy: ', accuracy_score(y_test, predictions))
    return predictions

In [224]:
# Accuracy scores recorded by randomforest(), one per call, in call order
alist = []


def randomforest(model):
    """Fit ``model``, score it on the test split and record the accuracy.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``, and appends the accuracy (rounded to 4 decimals) to the
    module-level list ``alist``.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The rounded accuracy that was appended to ``alist``.
    """
    # y_train is built in this file as a one-column DataFrame; flatten it to
    # the 1-D label vector that fit() expects.
    model.fit(x_train, np.ravel(y_train))
    # accuracy_score is documented as (y_true, y_pred).
    score = round(accuracy_score(y_test, model.predict(x_test)), 4)
    alist.append(score)
    return score
    
   
    

In [225]:
# Fit each classifier through train_and_test, which prints its test accuracy.
# Logistic Regression
log_pred = train_and_test(LogisticRegression())
# SVM
svm_pred = train_and_test(SVC())
# k-nearest neighbors (k = 4)
knn_pred_4 = train_and_test(KNeighborsClassifier(n_neighbors = 4))
# Random Forest (100 trees)
rf_pred = train_and_test(RandomForestClassifier(n_estimators=100))
# Naive Bayes
nb_pred = train_and_test(GaussianNB())

Accuracy:  0.8246268656716418
Accuracy:  0.8246268656716418
Accuracy:  0.8171641791044776
Accuracy:  0.835820895522388
Accuracy:  0.7723880597014925


In [226]:
# Find the n_estimators value (1..140) that gives the random forest its best
# test accuracy.
# NOTE(review): no random_state is set on the forest, so the reported optimum
# can differ between runs; the test split is also what drives the selection.

# Start from an empty score list so that re-running this cell cannot mix in
# scores from an earlier run (which would shift the reported index).
alist.clear()
for n_trees in range(1, 141):
    randomforest(RandomForestClassifier(n_estimators=n_trees))

best_accuracy = max(alist)
print("max_accuracy :{} ".format(best_accuracy))
print('\n')
# alist[k] holds the score for n_estimators = k + 1; index() returns the
# first, i.e. smallest, setting that reached the best score.
print("max_accuracy_n_estimators :{}".format(alist.index(best_accuracy) + 1))


max_accuracy :0.8507 


max_accuracy_n_estimators :36
