In [2]:
# warningsを無視する
import warnings
warnings.filterwarnings('ignore')

# 5.1.1 欠損値の処理

In [3]:
import numpy as np
import pandas as pd

In [5]:
# Read the train, test and gender_submission CSV files from ./input into DataFrames.
df_train, df_test, df_gender_submission = map(
    pd.read_csv,
    ("./input/train.csv", "./input/test.csv", "./input/gender_submission.csv"),
)

In [6]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

## 5.1.1.1 年齢 (Age)の補完

In [7]:
# Compute the mean of the Age column in the training data.
df_train.loc[:, 'Age'].mean()

29.69911764705882

In [8]:
# Replace missing Age values with 30 (the training mean computed above is about 29.7)
# in both the training and the test data.
for passengers in (df_train, df_test):
    passengers['Age'] = passengers['Age'].fillna(30)

## 5.1.1.2 乗船した港 (Embarked)の補完

In [9]:
# Show the training rows whose Embarked value is missing.
df_train.loc[df_train['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [10]:
# List the training rows that share ticket '113572', the ticket held by the two
# passengers whose Embarked value is missing.
df_train.loc[df_train['Ticket'].eq('113572')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [11]:
# Check whether any passenger in the test data holds the same ticket '113572'.
df_test.loc[df_test['Ticket'].eq('113572')]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [12]:
# Fill Embarked with 'C' for passengers 62 and 830 (the two rows where it was missing),
# then display those rows to confirm the assignment.
is_target_passenger = df_train['PassengerId'].isin([62, 830])
df_train.loc[is_target_passenger, 'Embarked'] = 'C'
df_train.loc[is_target_passenger]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,C
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,C


## 5.1.1.3 運賃 (Fare)の補完

In [13]:
# Show the mean Fare for each Pclass in the training data.
df_train.groupby('Pclass')[['Fare']].mean()

Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
1,84.154687
2,20.662183
3,13.67555


In [14]:
# Show the test rows whose Fare value is missing.
df_test.loc[df_test['Fare'].isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [15]:
# Passenger 1044 is the only test row without a Fare and travels in Pclass 3, so the
# gap is filled with 13.675550, the mean Pclass-3 fare of the training data shown above.
# The row is then displayed to confirm the assignment.
is_passenger_1044 = df_test['PassengerId'] == 1044
df_test.loc[is_passenger_1044, 'Fare'] = 13.675550
df_test.loc[is_passenger_1044]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,13.67555,,S


In [16]:
# Print the per-column count of missing values for the training data and then for the
# test data, separated by a dashed line.
train_missing = df_train.isnull().sum()
test_missing = df_test.isnull().sum()
print('--df_trainの欠損値--', train_missing, '-'*10, '--df_testの欠損値--', test_missing, sep='\n')

--df_trainの欠損値--
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64
----------
--df_testの欠損値--
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


# 5.1.2 カテゴリ変数への変換

## 5.1.2.1 Sex (性別)の変換

In [17]:
# Lookup table that encodes Sex as an integer: male -> 0, female -> 1.
genders = {'male': 0, 'female': 1}
# Apply the lookup to the Sex column of both the training and the test data.
for passengers in (df_train, df_test):
    passengers['Sex'] = passengers['Sex'].map(genders)

## 5.1.2.2 Embarked (乗船した港)の変換

In [18]:
# One-hot encode Embarked: the column is replaced by one indicator column per port
# (e.g. Embarked_C in the training data).
# The dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version. pandas 2.0 changed the get_dummies default from uint8 to bool, which would
# otherwise produce True/False instead of the 0/1 values recorded in this notebook.
df_train = pd.get_dummies(df_train, columns=['Embarked'], dtype='uint8')
df_test = pd.get_dummies(df_test, columns=['Embarked'], dtype='uint8')

# 5.1.3 不要な列の削除

In [19]:
# Remove the Name, Cabin and Ticket columns from both frames; they are not used as features.
dropped_columns = ['Name', 'Cabin', 'Ticket']
df_train = df_train.drop(columns=dropped_columns)
df_test = df_test.drop(columns=dropped_columns)

In [20]:
# Preview the first five rows of the preprocessed training data.
df_train.iloc[:5]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,0,22.0,1,0,7.25,0,0,1
1,2,1,1,1,38.0,1,0,71.2833,1,0,0
2,3,1,3,1,26.0,0,0,7.925,0,0,1
3,4,1,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,0,35.0,0,0,8.05,0,0,1


# 5.2 識別器に学習させて予測

In [21]:
# Target vector: the Survived column of the training data.
Y_train = df_train['Survived']
# Feature matrices: every column except the passenger identifier (and, for the
# training data, the target itself).
X_train = df_train.drop(columns=["PassengerId", "Survived"])
X_test = df_test.drop(columns='PassengerId').copy()

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Build a random forest with a fixed seed and fit it to predict Y_train from X_train.
forest = RandomForestClassifier(random_state=1).fit(X_train, Y_train)

# Accuracy, as a percentage rounded to two decimals, measured on the same data the
# model was fitted on (so this is the training accuracy, not a held-out estimate).
acc_log = round(100 * forest.score(X_train, Y_train), 2)
print(acc_log, '%')

96.07 %


In [28]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [29]:
# Instantiate 3-fold cross-validation.
kf = KFold(n_splits=3)

# kf.split(X_train, Y_train) yields, for each of the 3 folds, the positional indices of
# the rows to fit on and the rows to hold out for evaluation.
# Note: the loop rebinds the notebook-level name `forest` on every iteration.
for train_index, test_index in kf.split(X_train, Y_train):
    X_cv_train = X_train.iloc[train_index]
    X_cv_test = X_train.iloc[test_index]
    y_cv_train = Y_train.iloc[train_index]
    y_cv_test = Y_train.iloc[test_index]
    forest = RandomForestClassifier(random_state=1)
    forest.fit(X_cv_train, y_cv_train)  # fit on the training part of this fold
    predictions = forest.predict(X_cv_test)  # predict the held-out part
    # Print the fold accuracy as a percentage, scoring the predictions computed above
    # rather than running predict() a second time on the same rows.
    print(round(accuracy_score(y_cv_test, predictions) * 100, 2))

77.1
80.13
77.78


In [30]:
# Refit the forest on the full training data and predict survival for the test data.
forest = RandomForestClassifier(random_state=1).fit(X_train, Y_train)
Y_prediction = forest.predict(X_test)

# Pair each test PassengerId with its prediction and write the result to
# submission.csv without the index column.
submission_columns = {'PassengerId': df_test['PassengerId'], 'Survived': Y_prediction}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
