In [73]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
import re
import json

# https://matplotlib.org/3.1.1/tutorials/introductory/customizing.html
sns.set(rc={"axes.facecolor":"white", "axes.edgecolor":"black"})

### 오차행렬 confusion matrix
- 파이썬 라이브러리를 활용한 머신러닝 p.361
- 핸즈온 머신러닝 p.129

In [2]:
train = pd.read_csv("./res/titanic/train.csv")
test = pd.read_csv("./res/titanic/test.csv")

In [3]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
train.isnull().sum() # 각 feature의 NaN 개수 확인

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [14]:
train["Age_mean"] = train["Age"]
train["Age_mean"].fillna(train["Age"].mean(),inplace=True) # inplace=True: 데이터에 assign
test["Age_mean"] = test["Age"]
test["Age_mean"].fillna(train["Age"].mean(),inplace=True)

In [15]:
train["Age_mean"].isnull().sum()

0

In [16]:
print(train["Age_mean"].mean()) #29.69
print(train["Age"].mean()) #29.69

29.699117647058763
29.69911764705882


In [17]:
train["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [18]:
# 성별 one-hot encoding을 위해 True/False로 변환
train["Gender"] = train["Sex"] == "female"
test["Gender"] = test["Sex"] == "female"

In [19]:
train["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [20]:
train["Embarked"].isnull().sum() #2개가 NaN으로 출력

2

In [21]:
test['Embarked'].isnull().sum()

0

In [22]:
# S:100 , C:010, Q:001 → one-hot encodng 형식
train["Embarked_S"] = (train["Embarked"] == "S")
train["Embarked_C"] = (train["Embarked"] == "C")
train["Embarked_Q"] = (train["Embarked"] == "Q")

In [23]:
train["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [24]:
train["Embarked_S"].sum()

644

In [26]:
# one-hot encodng 형식으로 볼 수 있음
train[["Embarked_S","Embarked_C","Embarked_Q"]].head()

Unnamed: 0,Embarked_S,Embarked_C,Embarked_Q
0,True,False,False
1,False,True,False
2,True,False,False
3,True,False,False
4,True,False,False


In [27]:
test["Embarked_S"] = (test["Embarked"] == "S")
test["Embarked_C"] = (test["Embarked"] == "C")
test["Embarked_Q"] = (test["Embarked"] == "Q")

### 가족 수의 따라 가족 분류
대가족(5인 이상):L , 소가족(1인) :S, 중가족(2~4):M
* SibSp: 형제와 사촌의 수
* Parch: 부모와 자녀의 수 

In [28]:
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1 # 자기자신 포함
train["FamilySize"].value_counts()

1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: FamilySize, dtype: int64

In [30]:
train["Family"] = train["FamilySize"]

In [31]:
train.loc[train["FamilySize"] == 1,"Family"] = "S" #fmailysize=1과 같은 family 열을 참조
train.loc[(train["FamilySize"] > 1) & (train["FamilySize"] < 5), "Family"] = "M"
train.loc[train["FamilySize"] > 4,"Family"] = "L"

In [33]:
train[["Family", "FamilySize"]].head(10)

Unnamed: 0,Family,FamilySize
0,M,2
1,M,2
2,S,1
3,M,2
4,S,1
5,S,1
6,S,1
7,L,5
8,M,3
9,M,2


In [34]:
train["Family_S"] = train["Family"] == "S"
train["Family_M"] = train["Family"] == "M"
train["Family_L"] = train["Family"] == "L"

In [37]:
train[["Family_S", "Family_M", "Family_L", "Family", "FamilySize"]].head()

Unnamed: 0,Family_S,Family_M,Family_L,Family,FamilySize
0,False,True,False,M,2
1,False,True,False,M,2
2,True,False,False,S,1
3,False,True,False,M,2
4,True,False,False,S,1


In [38]:
test["FamilySize"] = test["SibSp"] + test["Parch"] + 1
test["FamilySize"].value_counts()

1     253
2      74
3      57
4      14
5       7
11      4
7       4
6       3
8       2
Name: FamilySize, dtype: int64

In [39]:
test["Family"] = test["FamilySize"]

In [53]:
test.loc[test["FamilySize"] == 1, "Family"] = "S" # fmailysize = 1과 같은 family 열 참조
test.loc[(test["FamilySize"] > 1) & (test["FamilySize"] < 5),"Family"] = "M"
test.loc[test["FamilySize"] > 4, "Family"] = "L"
test[["Family", "FamilySize"]].head(10)

Unnamed: 0,Family,FamilySize
0,S,1
1,M,2
2,S,1
3,S,1
4,M,3
5,S,1
6,S,1
7,M,3
8,S,1
9,M,3


In [54]:
test["Family_S"] = test["Family"] == "S"
test["Family_M"] = test["Family"] == "M"
test["Family_L"] = test["Family" ] == "L"
test[["Family_S", "Family_M", "Family_L"]]

Unnamed: 0,Family_S,Family_M,Family_L
0,True,False,False
1,False,True,False
2,True,False,False
3,True,False,False
4,False,True,False
5,True,False,False
6,True,False,False
7,False,True,False
8,True,False,False
9,False,True,False


In [55]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Age_mean,Gender,Embarked_S,Embarked_C,Embarked_Q,FamilySize,Family,Family_S,Family_M,Family_L
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,22.0,False,True,False,False,2,M,False,True,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,38.0,True,False,True,False,2,M,False,True,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,26.0,True,True,False,False,1,S,True,False,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,35.0,True,True,False,False,2,M,False,True,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,35.0,False,True,False,False,1,S,True,False,False


In [56]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Age_mean,Gender,Embarked_S,Embarked_C,Embarked_Q,FamilySize,Family,Family_S,Family_M,Family_L
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,...,34.5,False,False,False,True,1,S,True,False,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,...,47.0,True,True,False,False,2,M,False,True,False
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,...,62.0,False,False,False,True,1,S,True,False,False
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,...,27.0,False,True,False,False,1,S,True,False,False
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,...,22.0,True,True,False,False,3,M,False,True,False


In [57]:
train["Pclass"].dtypes

CategoricalDtype(categories=[1, 2, 3], ordered=False)

In [58]:
train["Pclass"] = train["Pclass"].astype("category")
train["Pclass"].dtypes

CategoricalDtype(categories=[1, 2, 3], ordered=False)

In [59]:
fn = ["Gender", "Age_mean",
      "Embarked_S", "Embarked_C", "Embarked_Q",
      "Family_S", "Family_M", "Family_L"]

X_train = train[fn]
X_train.head()

Unnamed: 0,Gender,Age_mean,Embarked_S,Embarked_C,Embarked_Q,Family_S,Family_M,Family_L
0,False,22.0,True,False,False,False,True,False
1,True,38.0,False,True,False,False,True,False
2,True,26.0,True,False,False,True,False,False
3,True,35.0,True,False,False,False,True,False
4,False,35.0,True,False,False,True,False,False


In [60]:
y_label = train["Survived"]
X_test = test[fn]
X_test.head()

Unnamed: 0,Gender,Age_mean,Embarked_S,Embarked_C,Embarked_Q,Family_S,Family_M,Family_L
0,False,34.5,False,False,True,True,False,False
1,True,47.0,True,False,False,False,True,False
2,False,62.0,False,False,True,True,False,False
3,False,27.0,True,False,False,True,False,False
4,True,22.0,True,False,False,False,True,False


## Desicion Tree 모델링

In [61]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=3) # 깊이를 3까지 지정
model# DecisionTreeClassifier 속성, criterion은 주로 'gini'선택

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [62]:
model.fit(X_train,y_label) # 나무 한그루 생성. 랜덤포레스트를 쓰면 나무를 여러개 만들 수 있다

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## 계층별 k-겹 교차검증
(파이썬 라이브러리를 활용한 머신러닝 p.328 참고)

In [63]:
from sklearn.model_selection import KFold, cross_val_score
kfold = KFold(n_splits=10, shuffle=True, random_state=2019) #n_split=3, shuffle=False, random_state=None, 
# random_State는 임의로 설정
kfold

KFold(n_splits=10, random_state=2019, shuffle=True)

In [75]:
# 일반적으로 데이터는 X_train처럼 대문자로 쓰고, y는 대문자 또는 소문자를 씀
# scoring 하는 방법은 몇가지 더 있는데 보통 accuracy를 씀
type(cross_val_score(model, X_train, y_label, cv=kfold, scoring="accuracy"))

numpy.ndarray

In [67]:
score = cross_val_score(model, X_train, y_label, cv=kfold)
print(score) # train 데이터를 890개를 10 계층으로 약 90개씩 나누어서 각 계층에 해당하는 정확도 출력 
round(np.mean(score)*100, 2) # 총 데이터의 정확도

[0.85555556 0.80898876 0.84269663 0.84269663 0.80898876 0.85393258
 0.74157303 0.80898876 0.83146067 0.79775281]


81.93

In [68]:
prediction = model.predict(X_test) # test 데이터 survived 예측 값
prediction.shape

(418,)

In [69]:
test["Survived"] = prediction
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_mean', 'Gender',
       'Embarked_S', 'Embarked_C', 'Embarked_Q', 'FamilySize', 'Family',
       'Family_S', 'Family_M', 'Family_L', 'Survived'],
      dtype='object')

In [86]:
submission = test[['PassengerId','Survived']]
submission.to_csv("./res/titanic/submit.csv", index=False)