# XGBoost Steps
#### 1) Import and split
#### 2) Set and fit the model
#### 3) Predict
#### 4) Model tuning
#### 5) Find the best params, set and fit the model again, find the final RMSE.

In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Import and split 

In [3]:
# Load the Hitters data; `hit` keeps the raw frame, `df` is a copy without
# any rows that contain missing values.
hit = pd.read_csv("../input/hittlers/Hitters.csv")
df = hit.copy().dropna()

# One-hot encode the three categorical columns; Salary is the target.
categorical_cols = ['League', 'Division', 'NewLeague']
dms = pd.get_dummies(df[categorical_cols])
y = df["Salary"]

# Predictors: every remaining column cast to float64, plus one indicator
# column for each categorical feature.
X_ = df.drop(columns=['Salary'] + categorical_cols).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# Hold out 25% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [4]:
#Set and fit the model

In [5]:
!pip install xgboost

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [6]:
import xgboost as xgb

In [7]:
# XGBoost provides its own data container, DMatrix, which can be used for
# better performance. `label` takes the dependent (target) variable.
DM_train = xgb.DMatrix(X_train, label=y_train)
DM_test = xgb.DMatrix(X_test, label=y_test)

In [8]:
from xgboost import XGBRegressor

In [9]:
# Fit the scikit-learn style estimator with default hyperparameters on the
# familiar pandas train split instead of the DMatrix objects.
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train);  # `;` keeps the cell output-free

In [10]:
#Prediction

In [11]:
# Predictions of the default (untuned) model on the held-out test split.
y_pred = xgb_model.predict(X_test)

In [12]:
# Test RMSE of the default model. Arguments follow scikit-learn's
# (y_true, y_pred) order, matching the tuned-model evaluation further down;
# the value is unchanged because squared error is symmetric.
np.sqrt(mean_squared_error(y_test, y_pred))

355.4651481224188

In [13]:
#Model Tuning

In [14]:
# Display the fitted default model; its repr lists the hyperparameter values
# in use before tuning.
xgb_model

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [15]:
# Important hyperparameters to tune:
#   booster
#   colsample_bytree
#   learning_rate    : lower values help to avoid overfitting
#   max_depth
#   n_estimators

In [16]:
# Candidate values for the grid search: 5 * 4 * 5 * 3 = 300 combinations.
xgb_grid = dict(
    colsample_bytree=[0.4, 0.5, 0.6, 0.9, 1],
    n_estimators=[100, 200, 500, 1000],
    max_depth=[2, 3, 4, 5, 6],
    learning_rate=[0.1, 0.01, 0.5],
)

In [17]:
# Exhaustive search over xgb_grid with 10-fold cross-validation, running on
# all available cores (n_jobs=-1) with verbose progress output.
xgb_cv = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
xgb_cv.fit(X_train, y_train)

Fitting 10 folds for each of 300 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 654 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done 1220 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 1950 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2606 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  2.7min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster=None,
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0, gpu_id=-1,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.300000012, max_delta_step=0,
                                    max_depth=6, min_child_weight=1,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    random_state=0, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method=None, validate_parameters=False,
                                    verbosity=None),
             iid='deprecated', n_jobs=-1,
   

In [18]:
# Hyperparameter combination selected by the fitted grid search `xgb_cv`.
xgb_cv.best_params_

{'colsample_bytree': 0.6,
 'learning_rate': 0.1,
 'max_depth': 2,
 'n_estimators': 1000}

In [19]:
# Final tuned model

In [21]:
# Build the tuned model directly from the grid-search result instead of
# re-typing the values by hand, so it cannot drift out of sync if the grid or
# the data changes. In the recorded run best_params_ was
# colsample_bytree=0.6, learning_rate=0.1, max_depth=2, n_estimators=1000.
xgb_tuned = XGBRegressor(**xgb_cv.best_params_)

In [22]:
# Train the tuned model on the training split; fit() returns the estimator
# itself, so rebinding the name is unnecessary.
xgb_tuned.fit(X_train, y_train);  # `;` keeps the cell output-free

In [23]:
# Predictions of the tuned model on the held-out test split
# (replaces the earlier y_pred of the default model).
y_pred = xgb_tuned.predict(X_test)

In [24]:
# Test RMSE of the tuned model: square root of the mean squared error.
tuned_mse = mean_squared_error(y_test, y_pred)
np.sqrt(tuned_mse)

355.0512895885908

In [25]:
# We found 413 for KNN,
#          367 for SVR,
#          363 for Artificial Neural Network,
#          376 for CART,
#          349 for Bagged Trees,
#          350 for Random Forest,
#          344 for GBM,
# and now  355 for XGBoost.

# Among these models, GBM is the best one for the "hitters" data set so far.