In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so try the new name first and
# fall back to the legacy one on older installations.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set(font_scale=2.5)

import missingno as msno

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [4]:
df_train = pd.read_csv("../input/titanic/train.csv")
df_test = pd.read_csv("../input/titanic/test.csv")

In [152]:
# Preview only the first rows; rendering the whole test frame bloats the output.
df_test.head()

In [151]:
df_test.values

In [5]:
df_train.head()

In [6]:
df_train.shape

In [7]:
df_train.describe()

In [8]:
df_test.describe()

In [9]:
df_train.columns

In [10]:
# Report the percentage of missing values in every training column.
train_null_counts = df_train.isnull().sum()
train_row_count = df_train.shape[0]
for column_name, n_missing in train_null_counts.items():
    print('columns: {:>10}\t Percent of NaN value: {:.2f}%'.format(column_name, 100 * (n_missing / train_row_count)))

In [11]:
# Report the percentage of missing values in every test column.
test_null_counts = df_test.isnull().sum()
test_row_count = df_test.shape[0]
for column_name, n_missing in test_null_counts.items():
    print('columns: {:>10}\t Percent of NaN value: {:.2f}%'.format(column_name, 100 * (n_missing / test_row_count)))

In [12]:
# Missing-value matrix of the training set. `color` is an RGB triple;
# blank (white) gaps in a column mark null entries.
msno.matrix(
    df=df_train.iloc[:, :],
    figsize=(8, 8),
    color=(0.8, 0.5, 0.2),
)

In [13]:
# iloc 공부하기!
df_train.iloc[:, 1]

In [14]:
msno.bar(df=df_train.iloc[:,:], figsize=(8,8), color=(0.8, 0.5, 0.2))

In [15]:
                   # 1행, 2열
# Left panel: share of each Survived class as a pie chart.
# Right panel: raw counts per class.
f, ax = plt.subplots(1, 2, figsize=(18, 8))  # 1 row, 2 columns

df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')

# Name the column with x=...: recent seaborn releases treat the first
# positional argument as `data`, so the positional form fails there.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()

In [16]:
df_train['Survived'].value_counts()

In [17]:
# Series 는 항상 plot을 가지고 있다.
type(df_train['Survived'].value_counts())

In [18]:
# df_train['Survived'].value_counts().plot()   같음
# plt.plot(df_train['Survived'].value_counts()) 

In [19]:
df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', shadow=True)

In [20]:
df_train.head()

In [21]:
df_train.shape

## EDA (Exploratory Data Analysis) : 탐색적 자료 분석, 자료 시각화
### 2.1 Pclass

In [22]:
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).count()

In [23]:
# count, sum(합계), mean(평균), mean 여기에서 생존률
df_train[['Pclass', 'Survived']].groupby(['Pclass']).mean()

In [24]:
# color map scheme 검색하여 여러 색을 사용할 수 있음
# https://matplotlib.org/stable/tutorials/colors/colormaps.html
pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True).style.background_gradient(cmap='Blues')

In [25]:
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).count()

In [26]:
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).mean().sort_values(by='Survived', ascending=False)

In [27]:
type(df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).mean().sort_values(by='Survived', ascending=False))

In [28]:
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot()

In [29]:
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False).plot()

In [30]:
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar()

In [31]:
# Left panel: passenger count per class. Right panel: the same counts split
# by survival outcome.
y_position = 1.02  # lift the titles slightly above the axes
f, ax = plt.subplots(1, 2, figsize=(18,8))
df_train['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of passengers By Pclass', y=y_position)  # plain value counts
ax[0].set_ylabel('Count')
# `hue` colours the bars by survival outcome. The column is passed as x=...
# because recent seaborn releases reject a positional column name.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead', y=y_position)
plt.show()

## 2.2 Sex

In [32]:
# Left panel: mean survival rate per sex. Right panel: survivor / non-survivor
# counts per sex.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
# Column passed as x=...: recent seaborn releases reject a positional column name.
sns.countplot(x='Sex', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Sex : Survived vs Dead')
plt.show()

In [33]:
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean()

In [34]:
pd.crosstab(df_train['Sex'], df_train['Survived'], margins=True).style.background_gradient(cmap='summer_r')

### 2.2 Both Sex and Pclass

In [35]:
# Survival rate per class, one line per sex.
# `factorplot` was renamed to `catplot` (and `size` to `height`) in seaborn 0.9
# and has since been removed; kind='point' reproduces factorplot's default plot.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=df_train, kind='point', height=6, aspect=1.5)

- Ladies first
- Money brings survival?

In [36]:
# Survival rate per sex, one facet column per passenger class.
# `factorplot`/`size` are the pre-0.9 names of `catplot`/`height`.
# NOTE(review): `saturation=5` was dropped; it is a bar/box colour option and
# appears to have no effect on a point plot - confirm the figure is unchanged.
sns.catplot(x='Sex', y='Survived', col='Pclass', data=df_train, kind='point', height=9, aspect=1)

In [37]:
# Survival rate per sex, one line per passenger class.
# `factorplot`/`size` are the pre-0.9 names of `catplot`/`height`.
# NOTE(review): `saturation=5` was dropped; it is a bar/box colour option and
# appears to have no effect on a point plot - confirm the figure is unchanged.
sns.catplot(x='Sex', y='Survived', hue='Pclass', data=df_train, kind='point', height=9, aspect=1)

### 2.3 Age

In [38]:
print('제일 나이 많은 탑승객 : {:.1f} years'.format(df_train['Age'].max()))
print('제일 어린 탑승객 : {:.1f} years'.format(df_train['Age'].min()))
print('탑승객 평균 나이 : {:.1f} years'.format(df_train['Age'].mean()))

In [39]:
fig, ax = plt.subplots(1, 1, figsize=(9,5))
sns.kdeplot(df_train[df_train['Survived'] == 1]['Age'], ax=ax)
sns.kdeplot(df_train[df_train['Survived'] == 0]['Age'], ax=ax)
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

In [40]:
df_train[df_train['Survived'] == 1]['Age'].hist()

In [41]:
sns.kdeplot(df_train[df_train['Survived'] == 1]['Age'])

In [42]:
df_train[df_train['Survived'] == 1]['Age'].hist()

인덱싱 방법

In [43]:
df_train.iloc[2,:]

In [44]:
for row in df_train.iterrows():
    break

row

In [45]:
df_train[df_train['Survived'] == 1]

도화지를 준비하는 법!

1. f = plt.figure(figsize=(10,10))

2. f,ax = plt.subplots(1,1, figsize=(10,10))

3. plt.figure(figsize=(10,10))


In [46]:
f, ax = plt.subplots(1,1, figsize=(5,5))
a = np.arange(100)
b = np.sin(a)

ax.plot(b)

In [47]:
plt.subplots(1,1, figsize=(5,5))
a = np.arange(100)
b = np.sin(a)

plt.plot(b)

In [48]:
# Overlay the Age density estimate of each passenger class on one figure.
plt.figure(figsize=(8, 6))
for pclass in (1, 2, 3):
    df_train['Age'][df_train['Pclass'] == pclass].plot(kind='kde')

plt.xlabel('Age')
plt.title('Age Distribution within classes')
plt.legend(['1st Class', '2nd Class', '3rd Class'])

# With the object-oriented API the labels are set on the Axes instead
# (set_xlabel('name') rather than plt.xlabel), for example:
# fig, ax = plt.subplots(1,1, figsize=(5,5))
# a = np.arange(100)
# b = np.sin(a)
# ax.plot(b)
# ax.set_xlabel('sdf')

In [49]:
fig, ax = plt.subplots(1, 1, figsize=(9,5))
sns.kdeplot(df_train[(df_train['Survived'] == 0) & (df_train['Pclass'] == 1)]['Age'], ax=ax)
sns.kdeplot(df_train[(df_train['Survived'] == 1) & (df_train['Pclass'] == 1)]['Age'], ax=ax)
plt.legend(['Survived == 0', 'Survived == 1'])
plt.title('1st Class')
plt.show()

In [50]:
# Age density of 2nd-class passengers, non-survivors vs. survivors.
fig, ax = plt.subplots(1, 1, figsize=(9,5))
sns.kdeplot(df_train[(df_train['Survived'] == 0) & (df_train['Pclass'] == 2)]['Age'], ax=ax)
sns.kdeplot(df_train[(df_train['Survived'] == 1) & (df_train['Pclass'] == 2)]['Age'], ax=ax)
plt.legend(['Survived == 0', 'Survived == 1'])
plt.title('2nd Class')  # was misspelled '2rd Class'
plt.show()

In [51]:
fig, ax = plt.subplots(1, 1, figsize=(9,5))
sns.kdeplot(df_train[(df_train['Survived'] == 0) & (df_train['Pclass'] == 3)]['Age'], ax=ax)
sns.kdeplot(df_train[(df_train['Survived'] == 1) & (df_train['Pclass'] == 3)]['Age'], ax=ax)
plt.legend(['Survived == 0', 'Survived == 1'])
plt.title('3rd Class')
plt.show()

In [52]:
plt.figure(figsize=(8,6))
df_train['Age'][(df_train['Pclass'] == 1) & (df_train['Survived'] == 0)].plot(kind='hist')
# df_train['Age'][(df_train['Pclass'] == 1) & (df_train['Survived'] == 1)].plot(kind='hist')

plt.xlabel('Age')
plt.title('Age Distribution within classes')
# plt.legend(['1st Class', '2nd Class', '3rd Class'])

In [53]:
# For every upper bound i in 1..79, compute the survival rate among
# passengers whose Age is below i, then plot the rate against the bound.
change_age_range_survival_ratio = []

for i in range(1, 80):
    survived_below_i = df_train.loc[df_train['Age'] < i, 'Survived']
    change_age_range_survival_ratio.append(survived_below_i.sum() / len(survived_below_i))

plt.figure(figsize=(7, 7))
plt.plot(change_age_range_survival_ratio)
plt.title('Survival rate change depending on range of Age', y=1.02)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(0~x)')
plt.show()

In [54]:
i = 10
df_train[df_train['Age'] < i]['Survived'].sum() / len(df_train[df_train['Age'] < i]['Survived'])

In [55]:
df_train[df_train['Age'] < i]

In [56]:
df_train[df_train['Age'] < i]['Survived']

In [57]:
df_train[df_train['Age'] < i]['Survived'].sum()

### Pclass, Sex, Age

In [58]:
# Split violins of Age by survival outcome: per passenger class (left) and
# per sex (right). scale='count' sizes each violin by its number of observations.
f, ax = plt.subplots(1, 2, figsize=(18,8))
# x/y are passed by keyword: recent seaborn releases reject positional column names.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))

sns.violinplot(x='Sex', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))
plt.show()

In [59]:
# With split=False and scale='area' every violin is drawn with the same area.
f, ax = plt.subplots(1, 2, figsize=(18,8))
# x/y are passed by keyword: recent seaborn releases reject positional column names.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=df_train, scale='area', split=False, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))

### Embarked

In [60]:
f, ax = plt.subplots(1, 1, figsize=(7,7))
df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar(ax=ax)

In [61]:
df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=True).mean().sort_values(by='Survived', ascending=True)

In [62]:
df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=True).mean().sort_index()

In [63]:
# 2x2 grid of Embarked count plots: overall, by sex, by survival, by class.
# The column is always passed as x=...: recent seaborn releases reject a
# positional column name.
f, ax = plt.subplots(2,2,figsize=(20, 15))
sns.countplot(x='Embarked', data=df_train, ax=ax[0,0])
ax[0, 0].set_title('(1) No. of Passengers Boarded')

sns.countplot(x='Embarked', hue='Sex', data=df_train, ax=ax[0,1])
ax[0,1].set_title('(2) Male-Female split for embarked')

sns.countplot(x='Embarked', hue='Survived', data=df_train, ax=ax[1,0])
ax[1,0].set_title('(3) Embarked vs Survived')

sns.countplot(x='Embarked', hue='Pclass', data=df_train, ax=ax[1,1])
ax[1,1].set_title('(4) Embarked vs Pclass')

# Adjust the spacing between the subplots.
# https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplots_adjust.html
plt.subplots_adjust(wspace=0.2, hspace=0.5)

plt.show()

### Family- SibSp + Parch

In [64]:
# Series support element-wise arithmetic, so the new feature is a plain sum:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

In [65]:
df_train['FamilySize']

In [66]:
print('Maximum size of Family:', df_train['FamilySize'].max())
print('Minimum size of Family:', df_train['FamilySize'].min())

In [67]:
# Three views of FamilySize: overall counts, counts split by survival, and
# mean survival rate per family size (sorted from highest to lowest).
f, ax=plt.subplots(1,3, figsize=(40,10))
# Column passed as x=...: recent seaborn releases reject a positional column name.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('(1) No. of Passengers Boarded', y=1.02)

sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on Familysize', y=1.02)

df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on FamilySize', y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

### Fare

In [68]:
fig, ax = plt.subplots(1,1, figsize=(8,8))
g = sns.distplot(df_train['Fare'], color='b', label='skewness:{:.2f}'.format(df_train['Fare'].skew()), ax=ax)
g = g.legend(loc='best')

In [69]:
# Log-transform Fare to reduce its skew (non-positive fares map to 0).
# The identical transform is applied to the test set; otherwise the model is
# trained on log-fares but asked to predict from raw fares.
# A missing test fare is imputed first, using 10 (the value this notebook
# uses for that purpose), because NaN fails the `> 0` check and would
# otherwise silently become 0.
# NOTE: this cell is not idempotent; re-running it applies the log twice.
df_test['Fare'] = df_test['Fare'].fillna(10)
for fare_frame in (df_train, df_test):
    fare_frame['Fare'] = fare_frame['Fare'].map(lambda i: np.log(i) if i > 0 else 0)

In [70]:
fig, ax = plt.subplots(1,1, figsize=(8,8))
g = sns.distplot(df_train['Fare'], color='b', label='skewness:{:.2f}'.format(df_train['Fare'].skew()), ax=ax)
g = g.legend(loc='best')

In [71]:
df_train['Ticket'].value_counts()

 ### 널값처리

In [72]:
df_train['Age'].isnull().sum()

In [73]:
df_train['Name']

In [74]:
df_train.Name

In [75]:
# Name 열의 형식을 모두 str로
df_train['Name'].str

In [76]:
# Capture the run of letters immediately before a '.' in each name (the title).
# A raw string keeps '\.' from being parsed as an invalid string escape.
df_train['Name'].str.extract(r'([A-Za-z]+)\.')

In [77]:
# Store the title captured from each name (letters directly before a '.') as 'Initial'.
# - The raw string keeps '\.' from being parsed as an invalid string escape.
# - expand=False returns a Series for the single capture group, instead of
#   assigning a one-column DataFrame to the new column.
df_train['Initial'] = df_train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df_test['Initial'] = df_test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [78]:
df_train.head()

In [79]:
df_test.head()

In [80]:
pd.crosstab(df_train['Initial'], df_train['Sex']).T.style.background_gradient(cmap='summer_r')

In [81]:
# Collapse the rare titles into the groups Miss / Mr / Mrs / Other.
# 'Dona' previously mapped to ' Mrs' (leading space), a value that no later
# step recognises, so those rows ended up with a missing Initial.
title_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs', 'Dona': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}

# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: that form acts on an intermediate object and is not
# guaranteed to update the frame (pandas copy-on-write).
df_train['Initial'] = df_train['Initial'].replace(title_groups)
df_test['Initial'] = df_test['Initial'].replace(title_groups)

In [82]:
# Per-title means of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where averaging the remaining string columns raises.
df_train.groupby('Initial').mean(numeric_only=True)

In [83]:
df_train.groupby('Initial')['Survived'].mean().plot.bar()

In [84]:
df_all = pd.concat([df_train, df_test])
df_all

In [85]:
df_all.reset_index()

In [86]:
# drop 인덱스 없애기
df_all.reset_index(drop=True)

In [87]:
# Per-title means over train + test. numeric_only=True is required on
# pandas >= 2.0, where averaging the remaining string columns raises.
df_all.groupby('Initial').mean(numeric_only=True)

In [88]:
df_train.loc[1:3, :]

In [89]:
df_train.loc[df_train['Survived'] == 1]

In [90]:
df_train['Age'].isnull()

In [91]:
(df_train['Age'].isnull()) & (df_train['Initial'] == 'Mr')

In [92]:
# Per-title means over train + test, shown again as the reference for the age
# imputation below. numeric_only=True is required on pandas >= 2.0, where
# averaging the remaining string columns raises.
df_all.groupby('Initial').mean(numeric_only=True)

In [93]:
                                                                            # 컬럼 지정
# Fill every missing Age with a fixed value chosen per title group.
# .loc[row_mask, 'Age'] selects the rows to fill and names the target column.
age_by_initial = {'Mr': 33, 'Mrs': 37, 'Master': 5, 'Miss': 22, 'Other': 45}

for frame in (df_train, df_test):
    for initial, fill_age in age_by_initial.items():
        missing_age_for_title = frame['Age'].isnull() & (frame['Initial'] == initial)
        frame.loc[missing_age_for_title, 'Age'] = fill_age

In [94]:
df_train['Age'].isnull().sum()

In [95]:
df_test['Age'].isnull().sum()

In [96]:
# Fill missing embarkation ports with 'S'. The result is assigned back because
# fillna(..., inplace=True) on a column selection acts on an intermediate
# object and is not guaranteed to update the frame (pandas copy-on-write).
df_train['Embarked'] = df_train['Embarked'].fillna('S')

In [97]:
df_train['Embarked'].isnull().sum()

In [98]:
df_train['Age_cat'] = 0

In [99]:
df_train.head()

 #### Age 하드코딩을 통한 카테고리

In [100]:
# Hard-coded decade bins for Age:
#   <10 -> 0, [10, 20) -> 1, ..., [60, 70) -> 6, >=70 -> 7
train_age = df_train['Age']
df_train.loc[train_age < 10, 'Age_cat'] = 0
for age_cat, lower in enumerate(range(10, 70, 10), start=1):
    df_train.loc[(lower <= train_age) & (train_age < lower + 10), 'Age_cat'] = age_cat
df_train.loc[70 <= train_age, 'Age_cat'] = 7

In [101]:
# Bin the test-set ages into the same decade categories as the training set.
# The column is created up front, as is done for df_train, so it starts as an
# integer column; creating it implicitly through .loc starts it out NaN-filled
# (float), which makes the two frames disagree on dtype.
df_test['Age_cat'] = 0
df_test.loc[df_test['Age'] < 10, 'Age_cat'] = 0
df_test.loc[(10 <= df_test['Age']) & (df_test['Age'] < 20), 'Age_cat'] = 1
df_test.loc[(20 <= df_test['Age']) & (df_test['Age'] < 30), 'Age_cat'] = 2
df_test.loc[(30 <= df_test['Age']) & (df_test['Age'] < 40), 'Age_cat'] = 3
df_test.loc[(40 <= df_test['Age']) & (df_test['Age'] < 50), 'Age_cat'] = 4
df_test.loc[(50 <= df_test['Age']) & (df_test['Age'] < 60), 'Age_cat'] = 5
df_test.loc[(60 <= df_test['Age']) & (df_test['Age'] < 70), 'Age_cat'] = 6
df_test.loc[(70 <= df_test['Age']), 'Age_cat'] = 7

In [102]:
df_train.head()

In [103]:
df_test.head()

#### Age , apply를 이용한 카테고리화

In [104]:
def category_age(x):
    """Return the decade bucket (0-7) for an age.

    Ages below 10 map to 0, [10, 20) to 1, and so on up to [60, 70) -> 6.
    Anything that is not below 70 (including NaN, which fails every `<`
    comparison) maps to 7.
    """
    for bucket, upper_bound in enumerate((10, 20, 30, 40, 50, 60, 70)):
        if x < upper_bound:
            return bucket
    return 7

In [105]:
# 단순 반복형태면 함수 형태로 하는 것이 좋음
df_train['Age_cat2'] = df_train['Age'].apply(category_age)

In [106]:
# all : 모든것이 True이면 True 반환
# any : 둘 중 하나라도 True이면 True 반환, 둘 중 하나라도 True가 없다면 False 반환
(df_train['Age_cat'] == df_train['Age_cat2']).all()

In [107]:
df_train.drop(['Age', 'Age_cat2'], axis=1, inplace=True)
df_test.drop(['Age'], axis=1, inplace=True)

In [108]:
df_train.Initial.unique()

### 모든 데이터를 수치화

In [109]:
# Encode the title as an integer code.
# The values could be overwritten one at a time, e.g.
#   df_train.loc[df_train['Initial'] == 'Master', 'Initial'] = 1
# but a single lookup dict passed to .map() is simpler.
initial_codes = {'Master':0, 'Miss':1, 'Mr':2, 'Mrs':3, 'Other':4}
df_train['Initial'] = df_train['Initial'].map(initial_codes)
df_test['Initial'] = df_test['Initial'].map(initial_codes)

In [110]:
# feature 에 어떤 값이 있는지 확인

# 1, type : nparray
df_train.Embarked.unique()

# 2, type : Series
df_train['Embarked'].value_counts()

In [111]:
df_train['Embarked'].value_counts()

In [112]:
df_train['Embarked'] = df_train['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
df_test['Embarked'] = df_test['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

In [113]:
df_train.head()

In [114]:
# df_train.Embarked.isnull().sum()
df_train.Embarked.isnull().any()
# >> null 데이터가 하나도 없다 (True가 없다)

In [115]:
df_train['Sex'].unique()

In [116]:
df_train['Sex'] = df_train['Sex'].map({'female' : 0, 'male': 1})
df_test['Sex'] = df_test['Sex'].map({'female' : 0, 'male': 1})

In [117]:
heatmap_data = df_train[['Survived', 'Pclass', 'Sex', 'Fare', 'Embarked', 'FamilySize', 'Initial', 'Age_cat']]

In [118]:
# Annotated heatmap of the pairwise Pearson correlations between the selected features.
colormap = plt.cm.BuGn
plt.figure(figsize=(12,10))
plt.title('Pearson Correlation of Features', y=1.05, size=15)  # title typo fixed
sns.heatmap(heatmap_data.astype(float).corr(), linewidths=0.1, vmax=1.0,
           square=True, cmap=colormap, linecolor='white', annot=True, annot_kws={'size':16}, fmt=".2f")

In [119]:
# 원핫 인코딩, 해당 컬럼을 없앰
df_train = pd.get_dummies(df_train, columns=['Initial'], prefix='Initial')
df_test = pd.get_dummies(df_test, columns=['Initial'], prefix='Initial')

In [120]:
 df_train.head()

In [121]:
df_test.head()

In [122]:
# One-hot encode Embarked in both frames; get_dummies replaces the source
# column with one indicator column per category.
# Note: when a feature has more than ~100 categories, one-hot encoding can
# become inefficient.
df_train, df_test = [
    pd.get_dummies(frame, columns=['Embarked'], prefix='Embarked')
    for frame in (df_train, df_test)
]

In [123]:
df_train.head()

In [124]:
# Remove the columns that are not used as model features (in place, on both frames).
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
for frame in (df_train, df_test):
    frame.drop(unused_columns, axis=1, inplace=True)

In [153]:
df_test.head()

In [154]:
df_test.isnull().sum()

In [125]:
df_train.head()

### 모델 예측

In [126]:
# binary-classification 문제

from sklearn.ensemble import RandomForestClassifier # 결정트리 기반 모델
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [157]:
df_test.isnull().sum()

In [158]:
# Impute any missing test-set Fare with the fixed value 10.
# NOTE(review): df_train['Fare'] is log-transformed earlier in the notebook;
# confirm that 10 is on the same scale as the rest of df_test['Fare'].
df_test.loc[(df_test['Fare'].isnull()), 'Fare'] = 10

In [159]:
X_train = df_train.drop('Survived', axis=1).values
target_label = df_train['Survived'].values
X_test = df_test.values

In [160]:
X_test

In [161]:
# target label 이 있는 지도학습 (Supervised learning)
X_tr, X_vid, y_tr, y_vid = train_test_split(X_train, target_label, test_size=0.3, random_state=2018)

In [162]:
X_tr.shape

In [163]:
# Seed the forest so the fitted model, its validation accuracy and the feature
# importances are reproducible between runs (same seed as the train/validation split).
model = RandomForestClassifier(random_state=2018)
model.fit(X_tr, y_tr)

In [164]:
prediction = model.predict(X_vid)

In [165]:
prediction

In [166]:
print('총 {}명 중 {:.2f}% 정확도로 생존 맞춤'.format(y_vid.shape[0], 100 * metrics.accuracy_score(prediction, y_vid)))

In [167]:
'정확도 {:.2f}'.format((prediction == y_vid).sum() / prediction.shape[0])

### Feature importance
- 학습된 모델은 feature importance 를 가지게 되는데, 이것을 확인하여 만든 모델이 어떤 feature 에 영향을 많이 받는지 확인할 수 있다

In [168]:
# 학습 시키면, 자동으로 생성됨. feature 의 순서대로
model.feature_importances_

In [169]:
df_train.head()

In [170]:
df_test.head()

In [171]:
from pandas import Series

In [172]:
feature_importance = model.feature_importances_
# Label each importance with the columns the model was actually trained on
# (df_train without the target), rather than relying on df_test happening to
# have identical column names and order.
Series_feat_imp = Series(feature_importance, index=df_train.drop('Survived', axis=1).columns)

In [173]:
# 아래의 feature_importance 를 확인하여, 영향력이 미미한 feature 를 뺄 수도 있음.
plt.figure(figsize=(8,8))
Series_feat_imp.sort_values(ascending=True).plot.barh()
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()

In [174]:
submission = pd.read_csv('../input/titanic/gender_submission.csv')

In [175]:
submission.head()

In [176]:
prediction = model.predict(X_test)

In [149]:
X_test.shape

In [177]:
submission['Survived'] = prediction

In [178]:
submission.to_csv('./my_fist_submission.csv', index=False)