# Gradient Boosting Model
This notebook aims to build an XGBoost model with parameters selected by GridSearchCV. This file was executed on AWS.

In [1]:
# Import packages
from xgboost import XGBRegressor # pip install xgboost
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

## Importing of Files

In [2]:
# Import dataset
df = pd.read_csv('data/features_df.csv')

In [3]:
# Split the data into training and test sets (30% held out for testing)
y = df.loc[:, ['viewCount']] # dependent variable
X = df.loc[:, ['titleLen', 'subscriberCount', 'avgViewCount', 'humanCount', 'HOW TO & STYLE', 'SPORTS', 'TRAVEL', 'Negative', 'titleINTJ']] # selected independant variables

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=123)

## Performing Grid Search

In [4]:
XGBModel = XGBRegressor()

In [5]:
# Setting parameters for Grid Search
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

In [6]:
# Run Grid Search
grid_result = GridSearchCV(XGBModel, parameters, cv=2, n_jobs=5, verbose=True)
grid_result.fit(X_train, y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  18 out of  18 | elapsed: 19.8min finished




GridSearchCV(cv=2, error_score=nan,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estima...
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=5,
             param_grid={'colsample_bytree': [0.7],
                         'learning_rate': [0.03, 0

In [7]:
# Print Best Results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.073854 using {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'subsample': 0.7}


In [8]:
# Print All Results
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

## Building the Model

In [9]:
XGBModel = XGBRegressor(nthread=4, objective='reg:linear', learning_rate=0.03, max_depth=5, min_child_weight=4, subsample=0.7, colsample_bytree=0.7, n_estimators=500)

In [10]:
XGBModel.fit(X_train, y_train, verbose=0) #set verbose=1 to see training progress



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=5,
             min_child_weight=4, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,
             objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.7, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [11]:
# Get the mean absolute error on the validation data :
XGBpredictions = XGBModel.predict(X_test)

## Getting Best Metrics

In [12]:
MAE = mean_absolute_error(y_test, XGBpredictions)
print('XGBoost validation MAE = ', MAE)

XGBoost validation MAE =  2363008.4038565527


In [13]:
MSE = mean_squared_error(y_test, XGBpredictions)
print('XGBoost validation MSE = ', MSE)

XGBoost validation MSE =  890408446899621.0


In [14]:
import math
print('XGBoost validation RMSE = ', math.sqrt(MSE))

XGBoost validation RMSE =  29839712.580713995
