# 通し課題模範解答 回帰編 DAY 4
- kaggle の kickstarter project に関して，usd_pledged_real を予測するモデルを作成する
    - https://www.kaggle.com/kemical/kickstarter-projects?select=ks-projects-201801.csv
- DAY 4 では，以下を行う
    - k-近傍法の利用
    - ハイパーパラメータチューニング
        - ランダムサーチ

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
# Load the preprocessed regression table, using the project ID column as the index.
df = pd.read_csv(
    '../data/df_regression.csv',
    index_col='ID',
)
# Preview the first five rows (the default for head()).
df.head(5)

Unnamed: 0_level_0,usd_pledged_real,usd_goal_real,period,log_usd_goal,log_usd_pledged,n_words,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,...,currency_EUR,currency_GBP,currency_HKD,currency_JPY,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000002330,0.0,1533.95,58,3.185811,-5.0,6,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1000003930,2421.0,30000.0,59,4.477121,3.383995,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1000004038,220.0,45000.0,44,4.653213,2.342423,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1000007540,1.0,5000.0,29,3.69897,4e-06,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1000011046,1283.0,19500.0,55,4.290035,3.108227,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Feature matrix: everything except the raw target and its log10 transform,
# both of which would leak the answer into the model.
X = df.drop(['log_usd_pledged', 'usd_pledged_real'], axis=1)

# The models are trained on the log10-transformed pledged amount.
log_y = df['log_usd_pledged']

# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, log_y_train, log_y_test = train_test_split(
    X,
    log_y,
    test_size=0.3,
    random_state=1234,
)

In [4]:
# Standardize the two continuous features. The scaler is fitted on the
# training split only, so no test-set statistics leak into training.
scale_cols = ['log_usd_goal', 'period']
std = StandardScaler()

# The frames returned by train_test_split are flagged by pandas as derived
# from X, so assigning into them raises SettingWithCopyWarning. Take explicit
# copies to make them independent frames before modifying them.
X_train = X_train.copy()
X_test = X_test.copy()

# Replace the columns wholesale (rather than writing through .loc) so the
# integer `period` column is cleanly swapped for its float standardized values.
X_train[scale_cols] = std.fit_transform(X_train[scale_cols])
X_test[scale_cols] = std.transform(X_test[scale_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


##  k-近傍法の利用
以下をチューニングしつつ学習．並列化しない場合，探索に30分ほどかかるので注意する
- n_neighbors: 近傍に含める訓練データ点の数

In [5]:
# 5-fold cross-validation splitter shared by every hyperparameter search below.
# Rows are shuffled before splitting; the seed keeps the folds reproducible.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1234,
)

In [6]:
# Candidate neighbourhood sizes: the odd values 3, 5, ..., 19.
parameters = dict(n_neighbors=range(3, 21, 2))

# NOTE(review): X still contains the raw usd_goal_real column and n_words
# unscaled (only log_usd_goal and period were standardized), so the distances
# used by k-NN are likely dominated by usd_goal_real -- consider dropping or
# scaling it.
knr = KNeighborsRegressor()

# Exhaustive search over the candidates, scored by (negated) MSE on the
# shared 5-fold splitter and parallelized over all available cores.
grid = GridSearchCV(
    estimator=knr,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=3,
)

grid.fit(X_train, log_y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.9min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=1234, shuffle=True),
             estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'n_neighbors': range(3, 21, 2)},
             scoring='neg_mean_squared_error', verbose=3)

In [7]:
# Predict on the held-out split with the search object (it delegates to the
# best estimator, which is refit on the full training data by default).
log_y_pred = grid.predict(X_test)

# Errors measured directly on the log10-transformed target.
mae_log = mean_absolute_error(log_y_test, log_y_pred)
mse_log = mean_squared_error(log_y_test, log_y_pred)
rmse_log = np.sqrt(mse_log)

# Errors after undoing the log10 transform (10**y), i.e. in the original
# pledged-amount scale.
# NOTE(review): the log target appears to carry a small offset (a pledged
# amount of 0 maps to -5.0), so 10**y is off by about 1e-5 -- confirm it is
# negligible for the intended use.
mae = mean_absolute_error(10**log_y_test, 10**log_y_pred)
mse = mean_squared_error(10**log_y_test, 10**log_y_pred)
rmse = np.sqrt(mse)

# Report every metric with three significant digits, one per line.
print(
    f'対数領域 MAE: {mae_log:.3}',
    f'対数領域 MSE: {mse_log:.3}',
    f'対数領域 RMSE: {rmse_log:.3}',
    f'MAE: {mae:.3}',
    f'MSE: {mse:.3}',
    f'RMSE: {rmse:.3}',
    sep='\n',
)

対数領域 MAE: 2.07
対数領域 MSE: 8.26
対数領域 RMSE: 2.87
MAE: 8.63e+03
MSE: 5.07e+09
RMSE: 7.12e+04


In [8]:
# Display the k-NN estimator selected by the grid search (the configuration
# with the best cross-validated score).
grid.best_estimator_

KNeighborsRegressor(n_neighbors=19)

性能は決定木などに比肩するが，推論に時間がかかる

## ハイパーパラメータチューニング
- グリッドサーチ: Day2 および Day3 で利用したので割愛
- ランダムサーチ: 本項で扱う
- ベイズ最適化: 発展編で扱う

### ランダムサーチ
Day3までで性能が良かったアダブーストについて，以下を探索しながら学習．CPUのコア数によっては1時間ほど掛かる可能性がある
- base_estimator: 弱学習器の種類．ここでは，決定木の最大深さ（max_depth）を変えながら学習
- learning_rate: ブースティングの学習率（新たに追加される弱学習器の寄与を縮小する係数）
- n_estimators: 弱学習器の数

In [9]:
from scipy.stats import uniform

In [10]:
# Search space for the randomized search:
#   base_estimator - decision-tree weak learners with max_depth 3..7
#   learning_rate  - sampled uniformly from [0, 1]
#   n_estimators   - any integer from 100 to 299
# NOTE(review): scikit-learn >= 1.2 renames AdaBoostRegressor's
# `base_estimator` parameter to `estimator`; this cell uses the older name,
# matching the version that produced the recorded output.
params = dict(
    base_estimator=[DecisionTreeRegressor(max_depth=depth) for depth in range(3, 8)],
    learning_rate=uniform(loc=0.0, scale=1.0),
    n_estimators=range(100, 300),
)

abr = AdaBoostRegressor(random_state=1234)

# Draw 20 random configurations and score each by (negated) MSE on the shared
# 5-fold splitter; the seed makes the sampled configurations reproducible.
# Note that this rebinds `grid`, replacing the earlier k-NN search object.
grid = RandomizedSearchCV(
    estimator=abr,
    param_distributions=params,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=kf,
    random_state=1234,
    n_jobs=-1,
    verbose=3,
)

grid.fit(X_train, log_y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 40.4min finished


RandomizedSearchCV(cv=KFold(n_splits=5, random_state=1234, shuffle=True),
                   estimator=AdaBoostRegressor(random_state=1234), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'base_estimator': [DecisionTreeRegressor(max_depth=3),
                                                           DecisionTreeRegressor(max_depth=4),
                                                           DecisionTreeRegressor(max_depth=5),
                                                           DecisionTreeRegressor(max_depth=6),
                                                           DecisionTreeRegressor(max_depth=7)],
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff24539a0f0>,
                                        'n_estimators': range(100, 300)},
                   random_state=1234, scoring='neg_mean_squared_error',
                   verbose=3)

In [11]:
# Predict on the held-out split with the randomized-search object (it
# delegates to the best estimator, refit on the full training data by default).
log_y_pred = grid.predict(X_test)

# Errors measured directly on the log10-transformed target.
mae_log = mean_absolute_error(log_y_test, log_y_pred)
mse_log = mean_squared_error(log_y_test, log_y_pred)
rmse_log = np.sqrt(mse_log)

# Errors after undoing the log10 transform (10**y), i.e. in the original
# pledged-amount scale.
mae = mean_absolute_error(10**log_y_test, 10**log_y_pred)
mse = mean_squared_error(10**log_y_test, 10**log_y_pred)
rmse = np.sqrt(mse)

# Report every metric with three significant digits, one per line.
print(
    f'対数領域 MAE: {mae_log:.3}',
    f'対数領域 MSE: {mse_log:.3}',
    f'対数領域 RMSE: {rmse_log:.3}',
    f'MAE: {mae:.3}',
    f'MSE: {mse:.3}',
    f'RMSE: {rmse:.3}',
    sep='\n',
)

対数領域 MAE: 2.24
対数領域 MSE: 8.17
対数領域 RMSE: 2.86
MAE: 8.72e+03
MSE: 5.09e+09
RMSE: 7.13e+04


In [12]:
# Display the AdaBoost configuration selected by the randomized search (the
# sampled candidate with the best cross-validated score).
grid.best_estimator_

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=5),
                  learning_rate=0.01290753111984666, n_estimators=169,
                  random_state=1234)

- Day3 に比べて性能が良くなった