## 1. 문제 정의
탑승객의 특징(성별, 나이, 객실 등급 등)으로부터 생존 여부(Survived)를 예측하는 이진 분류 문제

## 2. 데이터 읽어오기

### 캐글에서 받게 되는 파일의 구성
- train.csv - 예측 모델을 만들기 위해 사용하는 학습셋
- test.csv - 예측 모델을 이용하여 예측할 탑승객 정보가 담긴 테스트셋
- sample_submission.csv - 제출 형식의 예시 파일로, 제출용 csv 파일을 만들 때 틀로 사용

In [32]:
import pandas as pd

# Load the Kaggle train/test CSVs, using the first column as the row index.
train = pd.read_csv('data/train.csv', index_col = 0)
test = pd.read_csv('data/test.csv', index_col = 0)

## 3. 데이터 확인

In [33]:
# Preview the last rows of the training set.
train.tail()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## 4. Feature engineering

### 숫자가 아닌 데이터를 숫자로 맵핑

In [34]:
# Keep both frames in one list so every encoding is applied to train and test alike.
train_test_data = [train, test]
def map_col(col, mapping_info, datasets=None):
    """Re-encode column ``col`` in place using ``mapping_info``.

    Args:
        col: Name of the column to re-encode.
        mapping_info: Dict from original value to encoded value.
        datasets: Iterable of DataFrames to modify. Defaults to the
            module-level ``train_test_data`` list, so the original
            two-argument call keeps working.

    Note:
        ``Series.map`` turns every value that is not a key of
        ``mapping_info`` into NaN. Missing values therefore stay missing,
        and applying the same mapping a second time (e.g. re-running the
        cell) blanks the column unless the encoded values are keys too.
    """
    if datasets is None:
        datasets = train_test_data
    for frame in datasets:
        frame[col] = frame[col].map(mapping_info)

In [35]:
# Encode the categorical columns as integers in both train and test.
map_col('Sex', {"male": 0, "female": 1})
# Rows whose Embarked value is missing stay NaN after the mapping.
map_col('Embarked', {"S": 0, "C": 1, "Q": 2})

In [36]:
# Pairwise correlation of the numeric columns only. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where corr() no longer silently
# drops text columns such as Name/Ticket/Cabin and raises instead.
train.select_dtypes(include='number').corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307,0.108669
Pclass,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495,0.043835
Sex,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333,0.118593
Age,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067,0.012186
SibSp,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651,-0.060606
Parch,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225,-0.07932
Fare,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0,0.063462
Embarked,0.108669,0.043835,0.118593,0.012186,-0.060606,-0.07932,0.063462,1.0


In [37]:
# Count the missing values in each column of the raw training set.
train.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

### 불필요한 feature 제거

In [38]:
# Name, Ticket and Cabin are not used as model inputs. drop() returns new
# frames, so `train` and `test` themselves are left untouched.
train_data = train.drop(columns=['Name', 'Ticket', 'Cabin'])
test_data = test.drop(columns=['Name', 'Ticket', 'Cabin'])
train_data.tail()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
887,0,2,0,27.0,0,0,13.0,0.0
888,1,1,1,19.0,0,0,30.0,0.0
889,0,3,1,,1,2,23.45,0.0
890,1,1,0,26.0,0,0,30.0,1.0
891,0,3,0,32.0,0,0,7.75,2.0


### 결측값 처리하기
- train: 결측값이 있는 행은 제외
- test: 결측값은 해당 열의 평균값으로 대체

In [8]:
# Missing values per column before any rows are removed.
train_data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [39]:
# Keep only the training rows that have no missing value in any column,
# then confirm that nothing is missing any more.
train_data = train_data.dropna(how='any')
train_data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [40]:
# Missing values per column in the test features.
test_data.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [41]:
# Fill the gaps in the test features with each column's own mean.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and leaves
# test_data unchanged once pandas Copy-on-Write is enabled.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())
test_data.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

### 종속 변수와 독립 변수 설정

In [42]:
# Split off the label: pop() returns the Survived column and removes it from
# train_data in place, leaving only the feature columns.
target = train_data.pop('Survived')

## 5. 모델 적용

In [44]:
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import numpy as np

In [45]:
# Size of the hold-out split: one tenth of the training rows, rounded down.
len_test = len(train_data) // 10
print(len_test)
def getScore(train_data, clf, labels=None, n_test=None):
    """Return the hold-out accuracy of ``clf``.

    The last ``n_test`` rows are held out for validation and the classifier
    is fitted on all rows before them. The rows are not shuffled.

    Args:
        train_data: Feature rows, sliceable by position.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        labels: Labels aligned row-for-row with ``train_data``. Defaults to
            the module-level ``target``.
        n_test: Number of trailing rows to hold out. Defaults to the
            module-level ``len_test``.

    Returns:
        Fraction of held-out rows whose prediction equals the label.

    Raises:
        ValueError: If ``n_test`` is not strictly between 0 and the number
            of rows.
    """
    if labels is None:
        labels = target
    if n_test is None:
        n_test = len_test
    # With n_test == 0, [-0:] would select every row for validation and
    # [:-0] would leave nothing to fit on, so reject that case explicitly.
    if not 0 < n_test < len(train_data):
        raise ValueError(
            f"n_test must be between 1 and {len(train_data) - 1}, got {n_test}"
        )
    fit_rows, eval_rows = train_data[:-n_test], train_data[-n_test:]
    fit_labels, eval_labels = labels[:-n_test], labels[-n_test:]
    clf.fit(fit_rows, fit_labels)
    hits = clf.predict(eval_rows) == np.asarray(eval_labels)
    return hits.sum() / n_test

71


In [46]:
# Hold-out accuracy of a k-nearest-neighbours classifier with k = 13.
getScore(train_data, KNeighborsClassifier(n_neighbors = 13))

0.7605633802816901

In [47]:
# Hold-out accuracy of each candidate model, one row per classifier.
# NOTE(review): the decision tree and the random forest are not seeded here,
# so their scores may change from run to run.
scores = pd.DataFrame(
    {
        'score': [
            getScore(train_data, KNeighborsClassifier(n_neighbors=13)),
            getScore(train_data, DecisionTreeClassifier()),
            getScore(train_data, RandomForestClassifier(n_estimators=13)),
            getScore(train_data, GaussianNB()),
            getScore(train_data, SVC(gamma='auto')),
        ],
    },
    index=['KNN', 'DT', 'RF', 'NB', 'SVM'],
)
scores

Unnamed: 0,score
KNN,0.760563
DT,0.830986
RF,0.887324
NB,0.816901
SVM,0.802817


In [48]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 10-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

def getScore(train_data, clf):
    """Return the per-fold accuracy of ``clf`` under cross-validation.

    The features in ``train_data`` are scored against the module-level
    ``target`` labels using the module-level ``k_fold`` splitter, in a
    single process.

    Returns:
        The array produced by ``cross_val_score``: one accuracy per fold.
    """
    return cross_val_score(
        clf,
        train_data,
        target,
        scoring='accuracy',
        cv=k_fold,
        n_jobs=1,
    )

In [49]:
# Per-fold cross-validation accuracy of the k = 13 nearest-neighbours model.
getScore(train_data, KNeighborsClassifier(n_neighbors = 13))

array([0.66666667, 0.65277778, 0.5915493 , 0.70422535, 0.5915493 ,
       0.71830986, 0.67605634, 0.76056338, 0.71830986, 0.70422535])

In [54]:
# Cross-validation accuracy: one row per model, one column per fold.
scores = pd.DataFrame(
    [
        getScore(train_data, KNeighborsClassifier(n_neighbors=13)),
        getScore(train_data, DecisionTreeClassifier(random_state=0)),
        getScore(train_data, RandomForestClassifier(n_estimators=13, random_state=0)),
        getScore(train_data, GaussianNB()),
        getScore(train_data, SVC(gamma='auto')),
    ],
    index=['KNN', 'DT', 'RF', 'NB', 'SVM'],
)
# Mean accuracy across the folds, as a whole-number percentage.
scores['Mean'] = scores.mean(axis=1).mul(100).round().astype('int')
scores

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Mean
KNN,0.666667,0.652778,0.591549,0.704225,0.591549,0.71831,0.676056,0.760563,0.71831,0.704225,68
DT,0.791667,0.777778,0.605634,0.760563,0.746479,0.788732,0.676056,0.774648,0.788732,0.816901,75
RF,0.722222,0.805556,0.661972,0.802817,0.774648,0.859155,0.746479,0.802817,0.802817,0.816901,78
NB,0.694444,0.819444,0.661972,0.816901,0.816901,0.830986,0.760563,0.830986,0.760563,0.802817,78
SVM,0.666667,0.638889,0.605634,0.746479,0.633803,0.746479,0.690141,0.746479,0.732394,0.676056,69


In [55]:
# Compare seeded random forests of increasing size; the rows follow the
# order of the tuple below (5, 9, 13, 17 and 21 trees).
scores = pd.DataFrame([
    getScore(train_data, RandomForestClassifier(n_estimators=n_trees, random_state=0))
    for n_trees in (5, 9, 13, 17, 21)
])
# Mean accuracy across the folds, as a whole-number percentage.
scores['Mean'] = scores.mean(axis=1).mul(100).round().astype('int')
scores

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Mean
0,0.75,0.805556,0.690141,0.788732,0.788732,0.84507,0.760563,0.830986,0.802817,0.830986,79
1,0.75,0.819444,0.661972,0.788732,0.774648,0.873239,0.760563,0.816901,0.802817,0.802817,79
2,0.722222,0.805556,0.661972,0.802817,0.774648,0.859155,0.746479,0.802817,0.802817,0.816901,78
3,0.736111,0.819444,0.676056,0.830986,0.774648,0.859155,0.774648,0.802817,0.816901,0.802817,79
4,0.777778,0.819444,0.661972,0.859155,0.774648,0.859155,0.788732,0.816901,0.802817,0.788732,79


In [57]:
def getPredict(train_data, test_data, clf, labels=None):
    """Fit ``clf`` on the training features and predict the test rows.

    Args:
        train_data: Training features.
        test_data: Features to predict.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        labels: Training labels. Defaults to the module-level ``target``,
            so the original three-argument call keeps working.

    Returns:
        Whatever ``clf.predict(test_data)`` returns.
    """
    if labels is None:
        labels = target
    clf.fit(train_data, labels)
    return clf.predict(test_data)

# Fit every candidate on the full training set and collect its test
# predictions, one column per model. The tree-based models are seeded with
# random_state=0, as in the cross-validation cells, so that the resulting
# submission is reproducible.
prediction = pd.DataFrame({
    'KNN': getPredict(train_data, test_data, KNeighborsClassifier(n_neighbors=13)),
    'DT': getPredict(train_data, test_data, DecisionTreeClassifier(random_state=0)),
    'RF': getPredict(train_data, test_data,
                     RandomForestClassifier(n_estimators=13, random_state=0)),
    'NB': getPredict(train_data, test_data, GaussianNB()),
    'SVM': getPredict(train_data, test_data, SVC(gamma='auto')),
})
prediction

Unnamed: 0,KNN,DT,RF,NB,SVM
0,0,0,0,0,0
1,0,0,0,1,0
2,0,1,0,0,1
3,0,1,1,0,0
4,0,0,1,1,0
5,0,0,0,0,0
6,0,0,0,1,0
7,1,0,0,0,1
8,0,1,1,1,0
9,1,0,0,0,1


In [21]:
# Majority vote: a row is labelled 1 when its columns sum to more than 2,
# i.e. when at least three models predict survival. Multiplying by 1 turns
# the booleans into 0/1.
prediction['Survived'] = prediction.sum(axis=1).gt(2) * 1

In [22]:
# Show the per-model predictions next to the voted Survived column.
prediction

Unnamed: 0,KNN,DT,RF,NB,SVM,Survived
0,0,0,0,0,0,0
1,0,0,0,1,0,0
2,0,1,0,0,1,0
3,0,1,1,0,0,0
4,0,0,1,1,0,0
5,0,0,0,0,0,0
6,0,0,0,1,0,0
7,1,0,0,0,1,0
8,0,1,1,1,0,1
9,1,0,0,0,1,0


In [28]:
# Reuse the sample submission's layout and fill in the voted predictions;
# the values are matched to the rows by index.
submission = pd.read_csv("data/sample_submission.csv").assign(
    Survived=prediction['Survived']
)
submission.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,0


In [29]:
# Write the submission file without the row index.
submission.to_csv("submission.csv", index=False)