# XGBoost

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR

In [5]:
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
dms = pd.get_dummies(df[['League','Division','NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary','League','Division','NewLeague'],axis=1).astype('float64')
X = pd.concat([X_,dms[['League_N','Division_W','NewLeague_N']]],axis=1)
X_train,X_test,y_train,y_test = train_test_split(X,
                                                y,
                                                test_size=0.25,
                                                random_state=42)

### Model & tahmin

In [7]:
import xgboost

In [8]:
from xgboost import XGBRegressor

In [9]:
xgb = XGBRegressor().fit(X_train, y_train)

In [12]:
y_pred = xgb.predict(X_test)

In [13]:
np.sqrt(mean_squared_error(y_test, y_pred))

355.46515176059927

In [14]:
xgb_params = {"learning_rate":[0.1,0.01,0.05],
             "max_depth":[2,3,4,5,8],
             "n_estimators":[100,200,500,1000],
             "colsample_bytree":[0.4,0.7,1]}

In [15]:
xgb_cv_model = GridSearchCV(xgb,xgb_params, cv=10,n_jobs = -1,verbose = 2).fit(X_train,y_train)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:  3.6min finished


In [16]:
xgb_cv_model.best_params_

{'colsample_bytree': 0.4,
 'learning_rate': 0.1,
 'max_depth': 2,
 'n_estimators': 1000}

In [17]:
xgb_tuned= XGBRegressor(colsample_bytree=0.7,learning_rate=0.1,max_depth=2,n_estimators=1000).fit(X_train, y_train)

In [18]:
y_pred=xgb_tuned.predict(X_test)

In [19]:
np.sqrt(mean_squared_error(y_test,y_pred))

355.3982548080249