In [29]:
#ライブラリの読み込み

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
from numpy import nan
import seaborn as sns

#データの読み込み
df = pd.read_csv('train_taitanic.csv')

In [4]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [31]:
# Focus on Sex, Age and Pclass: discard the other explanatory columns
df_0 = df.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket',
             'Fare', 'Cabin', 'Embarked'],
)

In [9]:
# Count the missing values in each column
df_0.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
dtype: int64

In [8]:
# Show the mean age for each sex
df_0.groupby('Sex')['Age'].mean()

Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

In [14]:
# Fill the missing ages with the overall mean age, rounded to a whole year.
# The mean is taken from the 'Age' column only: DataFrame.mean() on the whole
# frame would also try to average the string column 'Sex', which raises a
# TypeError on pandas >= 2.0 (and a FutureWarning on pandas 1.x).
# NOTE(review): the mean is computed on the full data set, i.e. before the
# train/test split, so the test rows influence the imputed value.
df_1 = df_0.fillna({'Age': round(df_0['Age'].mean())})

In [16]:
# Add a numeric 'Female' indicator column: 1 for female, 0 for male
df_1['Female'] = df_1['Sex'].map(dict(male=0, female=1))

In [27]:
# One-hot encode Pclass into Class_1 / Class_2 / Class_3 indicator columns.
# dtype=int keeps the indicators numeric (0/1): pandas >= 2.0 would otherwise
# return bool columns, and mixing bool with the numeric feature columns turns
# the feature matrix taken later with `.values` into an object array.
# Class_3 is dropped as the reference level to avoid multicollinearity
# (the three indicator columns always sum to 1).
pclass_d = (
    pd.get_dummies(df_1['Pclass'], prefix='Class', dtype=int)
    .drop(columns=['Class_3'])
)

# Append the Class_1 and Class_2 columns to the data
df_2 = df_1.join(pclass_d)

# Remove the original Pclass and Sex columns
df_f = df_2.drop(columns=['Pclass', 'Sex'])

In [70]:
from sklearn.model_selection import train_test_split

# Target: the Survived column as a flat 1-D array
y = df_f['Survived'].values
# Explanatory variables: every remaining column
X = df_f.drop(columns=['Survived']).values

# Split the data, holding out 30% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [71]:
# Fit a logistic-regression classifier on the training split
from sklearn.linear_model import LogisticRegression
# The solver is pinned to 'liblinear', the one shown in the recorded output of
# this cell. scikit-learn 0.22 changed the default to 'lbfgs', so without the
# pin a re-run on a newer version would train a different model.
lr = LogisticRegression(C=1.0, solver='liblinear')
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [72]:
# Print the mean accuracy (what `score` returns for a classifier) on the
# training split, then on the test split
print(lr.score(X_train, y_train),
      lr.score(X_test, y_test),
      sep='\n')

0.797752808989
0.798507462687


In [73]:
from sklearn.svm import SVC

# Train a support-vector classifier with a linear kernel
svc = SVC(kernel='linear')
svc.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [74]:
# Print the mean accuracy (what `score` returns for a classifier) on the
# training split, then on the test split
print(svc.score(X_train, y_train),
      svc.score(X_test, y_test),
      sep='\n')

0.786516853933
0.787313432836


In [100]:
# Train a kernel SVM (RBF kernel) with hand-picked C and gamma
rbf_svm = SVC(C=100, gamma=0.01, kernel='rbf')
rbf_svm.fit(X=X_train, y=y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [101]:
# Print the mean accuracy (what `score` returns for a classifier) on the
# training split, then on the test split
print(rbf_svm.score(X_train, y_train),
      rbf_svm.score(X_test, y_test),
      sep='\n')

0.821829855538
0.817164179104


In [80]:
# Grid search over C and gamma for the RBF-kernel SVM
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Candidate hyper-parameter values
param_grid = {
    'C': [0.1, 1.0, 10, 100, 1000, 10000],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
}

# Cross-validation scheme: 5 stratified folds, shuffled with a fixed seed
kf_5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Build the search object and run it on the training split
rbf_svm1 = GridSearchCV(estimator=SVC(kernel='rbf'),
                        param_grid=param_grid,
                        cv=kf_5)
rbf_svm1.fit(X=X_train, y=y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1.0, 10, 100, 1000, 10000], 'gamma': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [81]:
# Show the C / gamma combination selected by the grid search
rbf_svm1.best_params_

{'C': 10000, 'gamma': 0.001}

In [103]:
# Print the mean accuracy of the grid-searched model on the training split,
# then on the test split
print(rbf_svm1.score(X_train, y_train),
      rbf_svm1.score(X_test, y_test),
      sep='\n')

0.813804173355
0.798507462687
