# 通し課題模範解答 分類編 DAY 4
- kaggle の kickstarter project に関して，成功・失敗を予測するモデルを作成する
    - https://www.kaggle.com/kemical/kickstarter-projects?select=ks-projects-201801.csv
- DAY 4 では，以下を行う
    - k-近傍法の利用
    - ハイパーパラメータチューニング
        - ランダムサーチ

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('../data/df_classification.csv', index_col='ID')
df.head()

Unnamed: 0_level_0,period,log_usd_goal,n_words,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,main_category_Fashion,main_category_Film & Video,main_category_Food,...,currency_GBP,currency_HKD,currency_JPY,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD,state_successful
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000002330,58,3.185811,6,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1000003930,59,4.477121,8,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1000004038,44,4.653213,3,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1000007540,29,3.69897,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1000014025,34,4.69897,3,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1


In [3]:
X = df.drop(columns='state_successful')
y = df['state_successful']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [4]:
std = StandardScaler()
X_train.loc[:, ['log_usd_goal', 'period']] = std.fit_transform(X_train.loc[:, ['log_usd_goal', 'period']])
X_test.loc[:, ['log_usd_goal', 'period']] = std.transform(X_test.loc[:, ['log_usd_goal', 'period']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


##  k-近傍法の利用
以下をチューニングしつつ学習．探索に30分ほどかかるので注意する
- n_neighbors: 近傍に含める訓練データ点の数

In [5]:
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

In [6]:
parameters = {'n_neighbors': range(3, 21, 2)}

knc = KNeighborsClassifier()
grid = GridSearchCV(knc, param_grid=parameters, 
                         cv=kf,
                         n_jobs=-1,
                         scoring='accuracy', 
                         verbose=3) 

grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  5.3min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=1234, shuffle=True),
             estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': range(3, 21, 2)}, scoring='accuracy',
             verbose=3)

In [7]:
y_pred = grid.predict(X_test)

acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_1 = f1_score(y_test, y_pred)

print(f'正解率: {acc:.3}')
print(f'Precision: {precision:.3}')
print(f'Recall: {recall:.3}')
print(f'F1: {f_1:.3}')

正解率: 0.65
Precision: 0.584
Recall: 0.464
F1: 0.517


In [8]:
grid.best_estimator_

KNeighborsClassifier(n_neighbors=19)

性能は決定木などに比肩するが，推論に時間がかかる

## ハイパーパラメータチューニング
- グリッドサーチ: Day2 および Day3 で利用したので割愛
- ランダムサーチ: 本項で扱う
- ベイズ最適化: 発展編で扱う

### ランダムサーチ
Day3までで性能が良かったアダブーストについて，以下を探索しながら学習．1時間ほど掛かる可能性がある
- base_estimator: 弱学習器の種類．ここでは，最大探索深さを変えながら学習
- learning_rate: ブースティングの学習率（新たに作成されるモデルの寄与度の縮退率）
- n_estimators: 弱学習器の数

In [9]:
from scipy.stats import uniform

In [10]:
params = {"base_estimator": [DecisionTreeClassifier(max_depth=x) for x in range(3, 8)],
          "learning_rate": uniform(loc=0.0, scale=1.0),
          "n_estimators": range(100, 300)}

abc = AdaBoostClassifier(random_state=1234)
grid = RandomizedSearchCV(estimator=abc,
                          param_distributions=params, 
                          cv=kf, 
                          n_iter=20,
                          scoring='accuracy',
                          random_state=1234,
                          n_jobs=-1,
                          verbose=3)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 44.9min finished


RandomizedSearchCV(cv=KFold(n_splits=5, random_state=1234, shuffle=True),
                   estimator=AdaBoostClassifier(random_state=1234), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'base_estimator': [DecisionTreeClassifier(max_depth=3),
                                                           DecisionTreeClassifier(max_depth=4),
                                                           DecisionTreeClassifier(max_depth=5),
                                                           DecisionTreeClassifier(max_depth=6),
                                                           DecisionTreeClassifier(max_depth=7)],
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff91a3ec1d0>,
                                        'n_estimators': range(100, 300)},
                   random_state=1234, scoring='accuracy', verbose=3)

In [11]:
y_pred = grid.predict(X_test)

acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_1 = f1_score(y_test, y_pred)

print(f'正解率: {acc:.3}')
print(f'Precision: {precision:.3}')
print(f'Recall: {recall:.3}')
print(f'F1: {f_1:.3}')

正解率: 0.671
Precision: 0.617
Recall: 0.487
F1: 0.545


In [12]:
grid.best_estimator_

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                   learning_rate=0.18428708381381365, n_estimators=284,
                   random_state=1234)

- Day3 に比べて性能が良くなった