In [1]:
# https://towardsdatascience.com/beginners-guide-to-xgboost-for-classification-problems-50f75aac5390
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from config import db_password
import numpy as np

In [2]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/nba_champs"
engine = create_engine(db_string)

In [3]:
stats_df = pd.read_sql("select * from \"stats\"", db_string);
stats_df.head()

Unnamed: 0,index,Position,Season,Tm,G,W,L,Win_pct,MOV,SOS,...,eFG_pct,TS_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TS_pct,Opp_TOV_pct,Opp_ORB_pct,Opp_FT_FGA
0,0,0,2020-21,WAS,72,34,38,0.472,-1.83,-0.01,...,0.531,0.569,12.3,21.3,0.221,0.539,0.576,12.5,22.4,0.217
1,1,0,2020-21,UTA,72,52,20,0.722,9.25,-0.29,...,0.563,0.597,12.7,24.5,0.195,0.507,0.537,10.3,20.7,0.159
2,2,0,2020-21,TOR,72,27,45,0.375,-0.47,-0.07,...,0.529,0.567,11.9,20.8,0.196,0.543,0.584,14.4,23.7,0.234
3,3,0,2020-21,SAS,72,33,39,0.458,-1.74,0.15,...,0.517,0.554,10.2,20.0,0.192,0.541,0.57,11.8,22.7,0.174
4,4,0,2020-21,SAC,72,31,41,0.431,-3.68,0.23,...,0.549,0.578,12.0,21.3,0.185,0.557,0.591,12.1,25.0,0.199


In [6]:
# Pull out Season, TM and Team_Name into DF
name_df = stats_df[['Season', 'Tm',]]
name_df.head()

Unnamed: 0,Season,Tm
0,2020-21,WAS
1,2020-21,UTA
2,2020-21,TOR
3,2020-21,SAS
4,2020-21,SAC


In [7]:
# Drop LG, Season, TM, Team_Name, G, W, L columns
stats_df.drop(['Season','Tm', 'G', 'W', 'L'], axis=1, inplace=True)

In [8]:
# Remove Position target from features data
y = stats_df.Position
X = stats_df.drop(columns=["Position"])

In [23]:
import xgboost as xgb

xgb_cl = xgb.XGBClassifier()

print(type(xgb_cl))

<class 'xgboost.sklearn.XGBClassifier'>


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [25]:
from sklearn.metrics import accuracy_score

# Init classifier
xgb_cl = xgb.XGBClassifier()

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)

# Score
accuracy_score(y_test, preds)





0.9829059829059829

In [26]:

param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

In [28]:
from sklearn.model_selection import GridSearchCV

# Init classifier
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Init Grid Search
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")

# Fit
_ = grid_cv.fit(X, y)





In [29]:
grid_cv.best_score_

0.9295321990670828

In [30]:
grid_cv.best_params_

{'colsample_bytree': 0.5,
 'gamma': 1,
 'learning_rate': 0.05,
 'max_depth': 4,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 0.8}

In [31]:
# Insert the new fixed values to the grid
param_grid["scale_pos_weight"] = [3]
param_grid["subsample"] = [0.8]
param_grid["colsample_bytree"] = [0.5]

# Give new value ranges to other params
param_grid["gamma"] = [3, 5, 7]
param_grid["max_depth"] = [9, 15, 20]
param_grid["reg_lambda"] = [10, 30, 50]
param_grid["learning_rate"] = [0.3, 0.5, 0.7, 1]

In [32]:
grid_cv_2 = GridSearchCV(xgb_cl, param_grid, 
                         cv=3, scoring="roc_auc", n_jobs=-1)

_ = grid_cv_2.fit(X, y)

grid_cv_2.best_score_





0.9338353636028055

In [33]:
grid_cv.best_params_

{'colsample_bytree': 0.5,
 'gamma': 1,
 'learning_rate': 0.05,
 'max_depth': 4,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 0.8}

In [35]:
final_cl = xgb.XGBClassifier(
    grid_cv.best_params_,
    objective="binary:logistic",
    colsample_bytree=0.5,
    subsample=0.8
)



In [36]:
from sklearn.metrics import roc_auc_score

_ = final_cl.fit(X_train, y_train)

preds = final_cl.predict(X_test)

XGBoostError: [18:41:33] C:\Users\Administrator\workspace\xgboost-win64_release_1.5.0\src\objective\objective.cc:26: Unknown objective function: `{'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.05, 'max_depth': 4, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 0.8}`
Objective candidate: survival:aft
Objective candidate: binary:hinge
Objective candidate: multi:softmax
Objective candidate: multi:softprob
Objective candidate: rank:pairwise
Objective candidate: rank:ndcg
Objective candidate: rank:map
Objective candidate: reg:pseudohubererror
Objective candidate: binary:logistic
Objective candidate: binary:logitraw
Objective candidate: reg:linear
Objective candidate: count:poisson
Objective candidate: survival:cox
Objective candidate: reg:gamma
Objective candidate: reg:tweedie
Objective candidate: reg:squarederror
Objective candidate: reg:squaredlogerror
Objective candidate: reg:logistic
