#### import packages and data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [4]:
# import data
dataset = pd.read_csv('/content/drive/MyDrive/colab/yearly_player_average.csv')


# hyperpara trainset
hyper = dataset.copy()

#### feature and target selection

In [10]:
# features
hyper = hyper.dropna(axis=0) # remove nas
X = hyper.drop(columns=["Name", "launch_angle", "launch_speed", "sz_top", "sz_bot"])
y = hyper['WAR']
X = X.drop(columns=['WAR'])


#### Train Test Split | ML import

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
import bayes_opt
from sklearn.metrics import mean_squared_error

In [12]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.2, random_state=26)

### HyperParamater Tuning

In [13]:
def bay_opt(X, y, parameters, n_iter=10, int_points = 20, random_state=26):
    def xgb_evaluate(
        learning_rate,
        max_depth,
        min_child_weight,
        subsample,
        colsample_bytree,
        early_stopping_rounds,
        max_leaves
    ):
        params = {
            'learning_rate': learning_rate,
            'max_depth': int(max_depth),
            'min_child_weight': min_child_weight,
            'subsample': subsample,
            'max_leaves': int(max_leaves),
            'early_stopping_rounds': early_stopping_rounds,
            'colsample_bytree': colsample_bytree,
            'n_estimators': 50000,
            'objective': 'reg:squarederror',
            'n_jobs': -1,
            'random_state': random_state
        }

        model = xgb.XGBRegressor(**params)
        model.fit(X, y, eval_set=[(test_x, test_y)])
        predictions = model.predict(test_x)
        return -mean_squared_error(test_y, predictions)

    optimizer = bayes_opt.BayesianOptimization(
        f=xgb_evaluate,
        pbounds=parameters,
        random_state=random_state,
    )

    optimizer.maximize(n_iter=n_iter, init_points=int_points)

    return optimizer.max['params']

Bay Opt for hyperparameter tuning

In [None]:
params = {
    'learning_rate': (0.05, 0.3),
    'max_leaves': (1, 40),
    'min_child_weight': (1, 10),
    'subsample': (0.3, 0.9),
    'colsample_bytree': (0.4, 1),
    'early_stopping_rounds': (10, 60),
    'max_depth': (0, 20)
}

best_params = bay_opt(train_x, train_y, parameters=params, int_points=200, n_iter=300)
print(best_params)

Grid Search to Narrow

In [17]:
def grid_search(X, y, params):

    model = xgb.XGBRegressor( n_jobs = -1, n_estimators = 50000)
    grid = GridSearchCV(model, params, cv=3, scoring='neg_mean_squared_error', verbose=1)
    grid.fit(X, y, eval_set=[(test_x, test_y)])

    print("Best parameters found:", grid.best_params_)
    print("Best score found:", grid.best_score_)

    best_estimator = grid.best_estimator_
    feature_importances = best_estimator.feature_importances_

    return best_estimator, feature_importances

colsample_bytree': np.float64(0.9141958521468958), 'early_stopping_rounds': np.float64(39.29889539595385), 'learning_rate': np.float64(0.25320973561082527), 'max_depth': np.float64(14.1139413585835), 'max_leaves': np.float64(6.249346556661566), 'min_child_weight': np.float64(5.275566672767666), 'subsample': np.float64(0.5550436959682293)}

{'colsample_bytree': 0.9, 'early_stopping_rounds': 35, 'learning_rate': 0.2, 'max_depth': 12, 'max_leaves': 5, 'min_child_weight': 6, 'subsample': 0.55}

found para from grid search, slowly converged

In [None]:
grid_para = {
    'learning_rate': [0.1, 0.15, 0.2],
    'max_leaves': [3, 4, 5],
    'min_child_weight': [6, 7, 8],
    'subsample': [0.55],
    'colsample_bytree': [0.85, 0.9],
    'early_stopping_rounds': [25, 30, 35],
    'max_depth': [12, 14, 16]
}
grid_search(X=train_x, y = train_y, params=grid_para)

Fitting 3 folds for each of 486 candidates, totalling 1458 fits
[0]	validation_0-rmse:1.04651
[1]	validation_0-rmse:1.00561
[2]	validation_0-rmse:0.96345
[3]	validation_0-rmse:0.92941
[4]	validation_0-rmse:0.91435
[5]	validation_0-rmse:0.88308
[6]	validation_0-rmse:0.85341
[7]	validation_0-rmse:0.83239
[8]	validation_0-rmse:0.80844
[9]	validation_0-rmse:0.79209
[10]	validation_0-rmse:0.77743
[11]	validation_0-rmse:0.76289
[12]	validation_0-rmse:0.74784
[13]	validation_0-rmse:0.73502
[14]	validation_0-rmse:0.72692
[15]	validation_0-rmse:0.71898
[16]	validation_0-rmse:0.71231
[17]	validation_0-rmse:0.70505
[18]	validation_0-rmse:0.69818
[19]	validation_0-rmse:0.69323
[20]	validation_0-rmse:0.68861
[21]	validation_0-rmse:0.67549
[22]	validation_0-rmse:0.67225
[23]	validation_0-rmse:0.67115
[24]	validation_0-rmse:0.66369
[25]	validation_0-rmse:0.66197
[26]	validation_0-rmse:0.66039
[27]	validation_0-rmse:0.65336
[28]	validation_0-rmse:0.65129
[29]	validation_0-rmse:0.65251
[30]	validation_