<a href="https://colab.research.google.com/github/dongcheol-noh/dongcheol-noh.github.io/blob/master/_%EB%94%A5%EB%9F%AC%EB%8B%9D%EA%B3%B5%EB%B6%80/%ED%8A%B9%EA%B0%95%202%EC%9D%BC%EC%B0%A8%20hyperparameter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameter Tuning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.model_selection import cross_val_score, train_test_split

In [3]:
!curl -L https://goo.gl/s8qSL5 -o bike_train.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   139    0   139    0     0    387      0 --:--:-- --:--:-- --:--:--     0
100   340  100   340    0     0    384      0 --:--:-- --:--:-- --:--:--   384
100  643k  100  643k    0     0   534k      0  0:00:01  0:00:01 --:--:--  534k


In [4]:
df = pd.read_csv('bike_train.csv', parse_dates=['datetime'])
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [5]:
df.dtypes

datetime      datetime64[ns]
season                 int64
holiday                int64
workingday             int64
weather                int64
temp                 float64
atemp                float64
humidity               int64
windspeed            float64
casual                 int64
registered             int64
count                  int64
dtype: object

In [6]:
df['datetime'].dt.year[0], df['datetime'].dt.month[0], df['datetime'].dt.day[0]

(2011, 1, 1)

In [7]:
df['datetime'].dt.hour[0], df['datetime'].dt.minute[0], df['datetime'].dt.second[0]

(0, 0, 0)

In [8]:
df['datetime'].dt.dayofweek[0]

5

In [9]:
d_month = df['datetime'].dt.month
d_hour = df['datetime'].dt.hour
d_dayofweek = df['datetime'].dt.dayofweek

In [10]:
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [11]:
df['d_month'] = d_month
df['d_hour'] = d_hour
df['d_dayofweek'] = d_dayofweek

In [12]:
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,d_month,d_hour,d_dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,5


In [13]:
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'd_month', 'd_hour', 'd_dayofweek'],
      dtype='object')

In [14]:
features = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'd_month', 'd_hour', 'd_dayofweek']

In [15]:
X, y = df[features], df['count']

In [16]:
X.shape, y.shape

((10886, 11), (10886,))

In [17]:
# simple prediction model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8753695925933203

In [18]:
# different models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LinearRegression()
model.fit(X_train, y_train)
print("Linear model: ", model.score(X_test, y_test))

model = DecisionTreeRegressor(max_depth=7)
model.fit(X_train, y_train)
print("Decision Tree model: ", model.score(X_test, y_test))

model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)
print("Random forest: ", model.score(X_test, y_test))

Linear model:  0.34848531176937914
Decision Tree model:  0.6933429965956265
Random forest:  0.8709169352606945


# hyperparameter selection

In [19]:
# Grid search
n_estimators = 30
max_depth_list = [10, 20, 30]
max_feature_list = [0.3, 0.5, 0.9, 1.0]
hp_lists = []

for i in max_depth_list:
  for j in max_feature_list:
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth = i,
                                  max_features = j,
                                  random_state = 20,
                                  n_jobs = -1)
    score = cross_val_score(model, X_train, y_train, cv=5).mean()
    hp_lists.append({'score:': score,
                     'max_depth': i,
                     'max_features': j})
hp_lists

[{'max_depth': 10, 'max_features': 0.3, 'score:': 0.7331674155511309},
 {'max_depth': 10, 'max_features': 0.5, 'score:': 0.8100583401498691},
 {'max_depth': 10, 'max_features': 0.9, 'score:': 0.841838903997331},
 {'max_depth': 10, 'max_features': 1.0, 'score:': 0.8430634181187013},
 {'max_depth': 20, 'max_features': 0.3, 'score:': 0.8211156387537555},
 {'max_depth': 20, 'max_features': 0.5, 'score:': 0.8567929454618868},
 {'max_depth': 20, 'max_features': 0.9, 'score:': 0.8623243623252879},
 {'max_depth': 20, 'max_features': 1.0, 'score:': 0.859501511798961},
 {'max_depth': 30, 'max_features': 0.3, 'score:': 0.823911198100145},
 {'max_depth': 30, 'max_features': 0.5, 'score:': 0.8570122360589953},
 {'max_depth': 30, 'max_features': 0.9, 'score:': 0.8614291653532529},
 {'max_depth': 30, 'max_features': 1.0, 'score:': 0.8597158680693227}]

In [20]:
# random search
hp_lists = []
num_epoch = 20

for i in range(num_epoch):
  max_depth = np.random.randint(low=5, high=50)
  max_feature = np.random.uniform(low=0.3, high=1.0)
  model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth = max_depth,
                                  max_features = max_feature,
                                  random_state = 20,
                                  n_jobs = -1)
  score = cross_val_score(model, X_train, y_train, cv=5).mean()
  hp_lists.append({'score': score,
                   'max_depth': max_depth,
                   'max_features': max_feature})
# hp_lists
pd.DataFrame(hp_lists).sort_values(by='score', ascending=False)

Unnamed: 0,score,max_depth,max_features
14,0.86267,48,0.655549
8,0.862502,33,0.640058
0,0.86246,22,0.864588
7,0.861384,49,0.75184
3,0.861384,41,0.77271
18,0.861342,29,0.91739
19,0.861301,17,0.767234
6,0.86128,40,0.968712
4,0.861193,22,0.978788
5,0.861071,45,0.821779


In [None]:
# fine tuning
hp_lists = []
num_epoch = 20

for i in range(num_epoch):
  max_depth = np.random.randint(low=16, high=35)
  max_feature = np.random.uniform(low=0.62, high=0.9)
  model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth = max_depth,
                                  max_features = max_feature,
                                  random_state = 20,
                                  n_jobs = -1)
  score = cross_val_score(model, X_train, y_train, cv=5).mean()
  hp_lists.append({'score': score,
                   'max_depth': max_depth,
                   'max_features': max_feature})
# hp_lists
pd.DataFrame(hp_lists).sort_values(by='score', ascending=False)

Unnamed: 0,score,max_depth,max_features
10,0.866048,29,0.68366
4,0.865936,33,0.70138
14,0.865882,18,0.683804
0,0.865852,34,0.69674
17,0.865146,21,0.63061
6,0.865062,26,0.655465
16,0.864711,33,0.630021
9,0.864576,20,0.68847
13,0.863966,32,0.73009
2,0.863964,26,0.799777


In [21]:
# final model decision
model = RandomForestRegressor(n_estimators=30,
                                  max_depth = 33,
                                  max_features = 0.77,
                                  random_state = 20,
                                  n_jobs = -1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8694654240566186

In [23]:
# most significant features

model.feature_importances_

array([0.01786057, 0.00245217, 0.04820499, 0.01992506, 0.08423349,
       0.07568042, 0.06426407, 0.02900431, 0.04055397, 0.56380071,
       0.05402024])

In [24]:
imp_df = pd.DataFrame({'features': features, 'importance': model.feature_importances_})
imp_df.sort_values(by='importance', ascending=False)

Unnamed: 0,features,importance
9,d_hour,0.563801
4,temp,0.084233
5,atemp,0.07568
6,humidity,0.064264
10,d_dayofweek,0.05402
2,workingday,0.048205
8,d_month,0.040554
7,windspeed,0.029004
3,weather,0.019925
0,season,0.017861


# Use Library functions

In [25]:
from sklearn.model_selection import GridSearchCV

params = [{"max_depth": [10, 20, 30],
           "max_features": [0.3, 0.5, 0.9, 1.0]}]

clf = GridSearchCV(RandomForestRegressor(), params, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_depth': [10, 20, 30],
                          'max_features': [0.3, 0.5, 0.9, 1.0]}])

In [26]:
best_val = clf.best_estimator_
best_score = clf.best_score_
print(best_val,best_score )
print("final score:", clf.score(X_test, y_test))

RandomForestRegressor(max_depth=30, max_features=0.9) 0.8659733172917919
final score: 0.8742959642643284


In [27]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, 11)]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}
rf = RandomizedSearchCV(RandomForestRegressor(), random_grid, cv=5)
rf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]})

In [28]:
rf.best_params_, rf.best_estimator_, rf.best_score_

({'max_depth': 90, 'max_features': 'auto', 'n_estimators': 1600},
 RandomForestRegressor(max_depth=90, n_estimators=1600),
 0.8631563541926706)

In [29]:
print("Final test score:", rf.score(X_test, y_test))

Final test score: 0.8713046024357773
