# Task2 20minute version: main algorithm
import needed library

In [2]:
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import pickle
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor



Loading data

In [3]:
# loading training set
training = pd.read_csv('training2.csv')
del training['time']

# loading submission set
submission = pd.read_csv('submission2.csv')
submission_ori = submission.copy()

# prepare columns for X and y
target = 'volume'
predictors = [x for x in training.columns if x not in [target]]

# prepare training data
X = training[predictors]
y = training[target]

# prepare submission data
X_submission = submission[predictors] # submission predictors

# initialize model
xgb_model = xgb.XGBRegressor(
    learning_rate = 0.1,
    n_estimators = 300,
    max_depth = 3,
    min_child_weight = 3,
    gamma = 0,
    subsample = 0.8,
    colsample_bytree = 0.8,
    nthread = 4,
    scale_pos_weight = 1,
    seed = 27)

#  Start Grid Searching

## Step 1: 尋找 最佳的 max_depth 和 min_child_weight
介紹各參數可以解決的問題：
* max_depth
   * 顧名思義為各子分類器（決策樹）的深度。
   * 過深的決策樹很容易造成 overfitting。
* min_child_weight
    * 決定在 training 的過程中，各子節點的權重不能過小。
    * 這樣可以避免分類器分類過細，導致 overfitting。

In [4]:
param_test1 = {
    'max_depth':list(range(3,9,2)),
    'min_child_weight':list(range(3,9,2))  
}

gsearch1 = GridSearchCV(
    estimator = xgb_model,
    param_grid = param_test1,
    n_jobs = 4,
    iid = False,
    cv = 5)

gsearch1.fit(X, y)
gsearch1.grid_scores_



[mean: 0.78757, std: 0.12031, params: {'max_depth': 3, 'min_child_weight': 3},
 mean: 0.78502, std: 0.11804, params: {'max_depth': 3, 'min_child_weight': 5},
 mean: 0.78472, std: 0.11681, params: {'max_depth': 3, 'min_child_weight': 7},
 mean: 0.78173, std: 0.14079, params: {'max_depth': 5, 'min_child_weight': 3},
 mean: 0.79199, std: 0.13853, params: {'max_depth': 5, 'min_child_weight': 5},
 mean: 0.79335, std: 0.13251, params: {'max_depth': 5, 'min_child_weight': 7},
 mean: 0.78328, std: 0.13538, params: {'max_depth': 7, 'min_child_weight': 3},
 mean: 0.78370, std: 0.13696, params: {'max_depth': 7, 'min_child_weight': 5},
 mean: 0.78024, std: 0.13203, params: {'max_depth': 7, 'min_child_weight': 7}]

印出最佳的參數組合：

In [5]:
gsearch1.best_params_

{'max_depth': 5, 'min_child_weight': 7}

經過跳要搜尋找到了最佳的 max_depth 為 5，和最佳的 min_child_weight 為 7。  
因此決定再尋找 5 和 7 周圍的數字有沒有比 5 和 7 更好的結果。

In [6]:
# replace the best model to xgb_model
xgb_model = gsearch1.best_estimator_

param_test2 = {
    'max_depth':[4, 5, 6],
    'min_child_weight':[6, 7, 8] 
}

gsearch2 = GridSearchCV(
    estimator = xgb_model,
    param_grid = param_test2,
    n_jobs = 4,
    iid = False,
    cv = 5)

gsearch2.fit(X, y)
gsearch2.grid_scores_



[mean: 0.79135, std: 0.13370, params: {'max_depth': 4, 'min_child_weight': 6},
 mean: 0.78342, std: 0.13316, params: {'max_depth': 4, 'min_child_weight': 7},
 mean: 0.78246, std: 0.12870, params: {'max_depth': 4, 'min_child_weight': 8},
 mean: 0.79298, std: 0.13531, params: {'max_depth': 5, 'min_child_weight': 6},
 mean: 0.79335, std: 0.13251, params: {'max_depth': 5, 'min_child_weight': 7},
 mean: 0.79869, std: 0.13838, params: {'max_depth': 5, 'min_child_weight': 8},
 mean: 0.78558, std: 0.13316, params: {'max_depth': 6, 'min_child_weight': 6},
 mean: 0.78580, std: 0.13274, params: {'max_depth': 6, 'min_child_weight': 7},
 mean: 0.78684, std: 0.12891, params: {'max_depth': 6, 'min_child_weight': 8}]

印出最佳的參數組合：

In [34]:
print('best_params:', gsearch2.best_params_, 'best_score:', gsearch2.best_score_)

best_params: {'max_depth': 5, 'min_child_weight': 8} best_score: 0.798687323315


最佳的結果變為：  
best max_depth: 5  
best min_child_weight: 8

## Step2: 尋找 最佳的 gamma
介紹參數 gamma 可以解決的問題：  
* 再決定該節點要不要分離前，確定分離後的 loss 的值下降超過一定的閥值，才會分離該節點。  
* 該閥值就是 gamma
* 若 gamma 越大，模型越保守

In [8]:
# replace the best model to xgb_model
xgb_model = gsearch2.best_estimator_

param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}

gsearch3 = GridSearchCV(
    estimator = xgb_model,
    param_grid = param_test3,
    n_jobs = 4,
    iid = False,
    cv = 5)

gsearch3.fit(X, y)
gsearch3.grid_scores_



[mean: 0.79869, std: 0.13838, params: {'gamma': 0.0},
 mean: 0.80018, std: 0.13888, params: {'gamma': 0.1},
 mean: 0.80018, std: 0.13888, params: {'gamma': 0.2},
 mean: 0.80007, std: 0.13988, params: {'gamma': 0.3},
 mean: 0.79925, std: 0.13919, params: {'gamma': 0.4}]

In [35]:
print('best_params:', gsearch3.best_params_, 'best_score:', gsearch3.best_score_)

best_params: {'gamma': 0.2} best_score: 0.800175985937


最佳的 gamma 為 0.2

## Step3: 尋找 最佳的 subsample 與 colsample_bytree

In [20]:
xgb_model = gsearch3.best_estimator_
param_test4 = {
     'subsample':[i/10.0 for i in range(6,10)],
     'colsample_bytree':[i/10.0 for i in range(6,10)]
}

gsearch4 = GridSearchCV(
    estimator = xgb_model,
    param_grid = param_test4,
    n_jobs = 4,
    iid = False,
    cv = 5)

gsearch4.fit(X, y)
gsearch4.grid_scores_



[mean: 0.77298, std: 0.12439, params: {'subsample': 0.6, 'colsample_bytree': 0.6},
 mean: 0.77024, std: 0.12527, params: {'subsample': 0.7, 'colsample_bytree': 0.6},
 mean: 0.76759, std: 0.12628, params: {'subsample': 0.8, 'colsample_bytree': 0.6},
 mean: 0.76875, std: 0.12589, params: {'subsample': 0.9, 'colsample_bytree': 0.6},
 mean: 0.79095, std: 0.12686, params: {'subsample': 0.6, 'colsample_bytree': 0.7},
 mean: 0.77255, std: 0.13071, params: {'subsample': 0.7, 'colsample_bytree': 0.7},
 mean: 0.77246, std: 0.13144, params: {'subsample': 0.8, 'colsample_bytree': 0.7},
 mean: 0.78414, std: 0.13547, params: {'subsample': 0.9, 'colsample_bytree': 0.7},
 mean: 0.78864, std: 0.13231, params: {'subsample': 0.6, 'colsample_bytree': 0.8},
 mean: 0.78652, std: 0.13695, params: {'subsample': 0.7, 'colsample_bytree': 0.8},
 mean: 0.80018, std: 0.13888, params: {'subsample': 0.8, 'colsample_bytree': 0.8},
 mean: 0.79542, std: 0.13317, params: {'subsample': 0.9, 'colsample_bytree': 0.8},
 mea

In [21]:
gsearch4.best_params_

{'colsample_bytree': 0.9, 'subsample': 0.6}

In [22]:
xgb_model = gsearch4.best_estimator_
param_test5 = {
     'subsample':[i/100.0 for i in range(55,70,5)],
     'colsample_bytree':[i/100.0 for i in range(85,100,5)]
}

gsearch5 = GridSearchCV(
    estimator = xgb_model,
    param_grid = param_test5,
    n_jobs = 4,
    iid = False,
    cv = 5)

gsearch5.fit(X, y)
gsearch5.grid_scores_



[mean: 0.80168, std: 0.12935, params: {'subsample': 0.55, 'colsample_bytree': 0.85},
 mean: 0.80305, std: 0.13202, params: {'subsample': 0.6, 'colsample_bytree': 0.85},
 mean: 0.80431, std: 0.13145, params: {'subsample': 0.65, 'colsample_bytree': 0.85},
 mean: 0.80168, std: 0.12935, params: {'subsample': 0.55, 'colsample_bytree': 0.9},
 mean: 0.80305, std: 0.13202, params: {'subsample': 0.6, 'colsample_bytree': 0.9},
 mean: 0.80431, std: 0.13145, params: {'subsample': 0.65, 'colsample_bytree': 0.9},
 mean: 0.80411, std: 0.13513, params: {'subsample': 0.55, 'colsample_bytree': 0.95},
 mean: 0.80360, std: 0.13423, params: {'subsample': 0.6, 'colsample_bytree': 0.95},
 mean: 0.79812, std: 0.13926, params: {'subsample': 0.65, 'colsample_bytree': 0.95}]

In [23]:
gsearch5.best_params_

{'colsample_bytree': 0.85, 'subsample': 0.65}

In [24]:
xgb_model = gsearch5.best_estimator_
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

gsearch6 = GridSearchCV(
    estimator = xgb_model,
    param_grid = param_test6,
    n_jobs = 4,
    iid = False,
    cv = 5)

gsearch6.fit(X, y)
gsearch6.grid_scores_



[mean: 0.80431, std: 0.13145, params: {'reg_alpha': 1e-05},
 mean: 0.80354, std: 0.13294, params: {'reg_alpha': 0.01},
 mean: 0.79486, std: 0.13738, params: {'reg_alpha': 0.1},
 mean: 0.79919, std: 0.13241, params: {'reg_alpha': 1},
 mean: 0.80145, std: 0.12908, params: {'reg_alpha': 100}]

In [31]:
xgb_model = gsearch6.best_estimator_.fit(X, y)

y_submission = xgb_model.predict(X_submission)

In [32]:
submission_ori[target] = y_submission
submission[target] = submission_ori[target]
submission_ori = submission_ori[submission_ori.columns[0:4]]
submission_ori.to_csv('submission2_result.csv', index=False)

In [33]:
submission

Unnamed: 0,tollgate_id,time_window,direction,volume,pressure,sea_pressure,wind_direction,wind_speed,temperature,rel_humidity,precipitation,week,hour,minute
0,1.0,"[2016-10-25 08:00:00,2016-10-25 08:20:00)",0.0,54.137436,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,0.0
1,1.0,"[2016-10-25 08:20:00,2016-10-25 08:40:00)",0.0,54.762512,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,20.0
2,1.0,"[2016-10-25 08:40:00,2016-10-25 09:00:00)",0.0,51.962181,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,40.0
3,1.0,"[2016-10-25 09:00:00,2016-10-25 09:20:00)",0.0,55.969429,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,9.0,0.0
4,1.0,"[2016-10-25 09:20:00,2016-10-25 09:40:00)",0.0,55.869850,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,9.0,20.0
5,1.0,"[2016-10-25 09:40:00,2016-10-25 10:00:00)",0.0,51.448524,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,9.0,40.0
6,1.0,"[2016-10-25 08:00:00,2016-10-25 08:20:00)",1.0,107.641487,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,0.0
7,1.0,"[2016-10-25 08:20:00,2016-10-25 08:40:00)",1.0,109.236069,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,20.0
8,1.0,"[2016-10-25 08:40:00,2016-10-25 09:00:00)",1.0,107.610008,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,8.0,40.0
9,1.0,"[2016-10-25 09:00:00,2016-10-25 09:20:00)",1.0,111.711182,1011.3,1016.3,7.0,3.8,18.0,81.0,0.0,1.0,9.0,0.0
