## かめさん講座　機械学習入門
#### 34.Grid Search

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

In [2]:
# Load the Titanic dataset and discard every row that has a missing value.
df = sns.load_dataset('titanic').dropna()
# Features: every column except the target 'survived' and 'alive'
# (NOTE(review): 'alive' presumably mirrors the target -- confirm), with
# categorical columns one-hot encoded and the first level of each dropped.
X = pd.get_dummies(df.drop(columns=['survived', 'alive']), drop_first=True)
# Target: the 'survived' column.
y = df['survived']

In [5]:
# Prepare the model: a support vector classifier with a fixed random seed.
model = SVC(
    random_state=0,
)

In [6]:
# Hyperparameter search space, given as a dict:
# two kernels, and C taken from the powers of two 2**-2 .. 2**2.
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [2**exponent for exponent in range(-2, 3)],
}

In [8]:
# Cross-validation splitter: stratified 5-fold, repeated 3 times, seeded.
cv = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=0,
)

In [11]:
# Create the GridSearchCV object and run the search.
# Every combination in param_grid is scored by accuracy using the cv splitter.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
)
grid_search.fit(X, y)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=0),
             estimator=SVC(random_state=0),
             param_grid={'C': [0.25, 0.5, 1, 2, 4],
                         'kernel': ['linear', 'rbf']},
             scoring='accuracy')

In [12]:
# Show the best hyperparameter combination followed by its best score.
print(
    grid_search.best_params_,
    grid_search.best_score_,
)

{'C': 0.25, 'kernel': 'linear'} 0.7618118118118118


### Pipelineなし

In [15]:
# Split into training and test data (30% held out for testing, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [16]:
# Standardization: learn the scaling statistics from the training data only,
# then apply them to that same training data.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [17]:
# Instantiate the SVM (probability=True is set so that predict_proba can be used).
model = SVC(
    probability=True,
    random_state=0,
)

In [18]:
# Train the model on the standardized training data.
model.fit(X=X_train_scaled, y=y_train)

SVC(probability=True, random_state=0)

In [19]:
# Predict on the test data.
# The test set is transformed with the scaler already fitted on the training
# data; note that y_pred holds predict_proba output, not class labels.
X_test_scaled = scaler.transform(X=X_test)
y_pred = model.predict_proba(X=X_test_scaled)

### Pipelineあり

In [22]:
# Same scaler + SVM workflow, expressed as a single Pipeline object.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', SVC(probability=True, random_state=0)),
    ]
)
# fit() returns the fitted pipeline itself, so prediction can be chained on it.
y_pred_p = pipeline.fit(X_train, y_train).predict_proba(X_test)

In [23]:
# Check that the pipeline yields the same probabilities as the manual workflow.
# `a.all() == b.all()` would only compare two scalars (whether every element of
# each array is truthy), which is True for almost any pair of arrays.
# Compare element-wise instead; allclose tolerates floating-point rounding.
np.allclose(y_pred_p, y_pred)

True

### PipelineとGridSearchCVを組み合わせる

In [24]:
# Create the pipeline object: standardization followed by an SVM classifier.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', SVC(random_state=0)),
    ]
)

In [25]:
# Create the parameter dict for the pipeline.
# Keys use the '<step name>__<parameter>' form so they address the 'model' step.
svm_param_grid = {
    'model__kernel': ['linear', 'rbf'],
    'model__C': [2**exponent for exponent in range(-2, 3)],
}

In [26]:
# Run GridSearchCV on the pipeline.
# Splitter: stratified 10-fold, repeated 3 times, seeded.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=0,
)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=svm_param_grid,
    scoring='accuracy',
    cv=cv,
)
grid_search.fit(X, y)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=0),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', SVC(random_state=0))]),
             param_grid={'model__C': [0.25, 0.5, 1, 2, 4],
                         'model__kernel': ['linear', 'rbf']},
             scoring='accuracy')