###### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
% matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

###### Reading in the data to model

In [3]:
X_test = pd.read_csv('../Data/test_x.csv')
X = pd.read_csv('../Data/train_x.csv')
y = pd.read_csv('../Data/train_y.csv', header = None)
print(X_test.shape)
print(X.shape)
print(y.shape)

(1459, 104)
(1446, 104)
(1446, 1)


###### Gradient Boosting (book, p. 198)

In [122]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

In [97]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, np.ravel(y))

gbrt = GradientBoostingRegressor(max_depth=3, n_estimators=3000, random_state=42, learning_rate = 0.02,
                                min_samples_leaf = 15, min_samples_split=10)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=3, n_estimators=bst_n_estimators, random_state=42, learning_rate = 0.02)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.02, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=1250, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [98]:
bst_n_estimators  # 270 (OUT OF 300)

1250

In [99]:
min(errors) # 0.014635885043588544

0.014989687644554045

In [100]:
min(errors)**0.5

0.12243237988601727

In [108]:
boost_model = GradientBoostingRegressor(random_state=42, learning_rate = 0.02, n_estimators = 3000)  # 

In [109]:
from sklearn import model_selection
grid_para_boost = [
    {'max_depth': [2, 7, 9],
     'min_samples_split': [10, 12, 14],
     'max_features': [6, 10]}
]

In [110]:
grid_search_boost = model_selection.GridSearchCV(boost_model, grid_para_boost, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
grid_search_boost.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.02, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3000, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'max_depth': [2, 7, 9], 'min_samples_split': [10, 12, 14], 'max_features': [6, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [111]:
grid_search_boost.best_params_

{'max_depth': 2, 'max_features': 10, 'min_samples_split': 14}

In [112]:
grid_search_boost.score(X_train, y_train)

-0.0055996425091935764

In [117]:
X_train, X_val, y_train, y_val = train_test_split(X, np.ravel(y))

gbrt = GradientBoostingRegressor(max_depth = 2, max_features = 10, min_samples_split = 14,
    random_state=42, learning_rate = 0.02, n_estimators = 100000, verbose = 1)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)**0.5
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

# gbrt_best = GradientBoostingRegressor()
# gbrt_best.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.1562            0.00s
         2           0.1547            0.00s
         3           0.1533            0.00s
         4           0.1518            0.00s
         5           0.1504            5.21m
         6           0.1485            4.34m
         7           0.1477            3.72m
         8           0.1463            3.26m
         9           0.1447            2.89m
        10           0.1427            2.60m
        20           0.1250            2.60m
        30           0.1100            1.74m
        40           0.0991            1.95m
        50           0.0891            1.56m
        60           0.0806            1.74m
        70           0.0732            1.49m
        80           0.0673            1.30m
        90           0.0623            1.45m
       100           0.0576            1.30m
       200           0.0329            1.15m
       300           0.0233           56.45s
       40

In [118]:
bst_n_estimators  # 270 (OUT OF 300)

6706

In [119]:
min(errors)

0.11978486920254075