# XGBoost

### Import the libraries

In [60]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [61]:
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from scipy import stats

### Read the data

In [62]:
# read the data
df = pd.read_csv("../experiments/ruby.csv", index_col=0)
df.columns = df.columns.str.replace(' ', '')

##### Prepare the data

In [63]:
# delete unnamed column
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# convert all columns of DataFrame
df = df.dropna()
# find the objects columns
cols = df.columns[df.dtypes.eq('object')]
# convert to numeric these columns
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

#### Create X features and Y target

In [64]:
# split data into X and y
array = df.values
X = array[:,0:46]
Y = array[:,46]

In [65]:
# split data into train and test sets
seed = 7
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

#### Create the model

#### R2 Score

In [66]:
params = {
    # Parameters that we are going to tune.
    'max_depth': [6],
    'min_child_weight': [1],
    'subsample': [1],
    'colsample_bytree': [1]
}
kfold = 5

In [67]:
xg = xgb.XGBRegressor()
best_xgb = GridSearchCV(
    xg, param_grid=params, cv=10, verbose=0, n_jobs=-1)

scores = cross_val_score(best_xgb, X, Y, scoring='r2', cv=kfold) 

In [68]:
scores

array([-0.63268688,  0.74029762,  0.75481951,  0.81571358,  0.58315963])

In [69]:
sum = 0.86 + 0.95 + 0.98 + 0.99 + 0.17
mean = (sum / 5)
print(mean)

0.79


In [70]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() *2))

Accuracy: 0.45 (+/- 1.10)


#### MAE

In [9]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [11]:
import numpy as np
# "Learn" the mean from the training data
mean_train = np.mean(y_train)

In [12]:
baseline_predictions = np.ones(y_test.shape) * mean_train
mae_baseline = mean_absolute_error(y_test, baseline_predictions)

In [29]:
print("Baseline MAE is {:.2f}".format(mae_baseline))

Baseline MAE is 11.32


In [45]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:linear',
}

In [46]:
params['eval_metric'] = "mae"
num_boost_round = 999

In [32]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:21.9961
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:15.5291
[2]	Test-mae:11.0729
[3]	Test-mae:8.0883
[4]	Test-mae:6.02956
[5]	Test-mae:4.62596
[6]	Test-mae:3.69566
[7]	Test-mae:3.05892
[8]	Test-mae:2.65432
[9]	Test-mae:2.36501
[10]	Test-mae:2.19882
[11]	Test-mae:2.0594
[12]	Test-mae:1.99554
[13]	Test-mae:1.90427
[14]	Test-mae:1.84084
[15]	Test-mae:1.80882
[16]	Test-mae:1.75398
[17]	Test-mae:1.70516
[18]	Test-mae:1.69106
[19]	Test-mae:1.66982
[20]	Test-mae:1.64115
[21]	Test-mae:1.61912
[22]	Test-mae:1.61346
[23]	Test-mae:1.59977
[24]	Test-mae:1.57969
[25]	Test-mae:1.56969
[26]	Test-mae:1.56022
[27]	Test-mae:1.55343
[28]	Test-mae:1.54727
[29]	Test-mae:1.53705
[30]	Test-mae:1.53646
[31]	Test-mae:1.53069
[32]	Test-mae:1.52992
[33]	Test-mae:1.51572
[34]	Test-mae:1.50281
[35]	Test-mae:1.5004
[36]	Test-mae:1.49608
[37]	Test-mae:1.49092
[38]	Test-mae:1.48836
[39]	Test-mae:1.48849
[40]	Test-mae:1.48338
[41]	Test-mae:1.47378
[42]	Test-mae:1.46627
[43]	Test-m

In [33]:
print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

Best MAE: 1.30 with 233 rounds


In [41]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)
cv_results



Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,24.051739,0.009429,24.052896,0.033189
1,17.002750,0.006622,17.005990,0.021970
2,12.100714,0.004754,12.108271,0.015183
3,8.735601,0.003514,8.749071,0.012778
4,6.463018,0.003869,6.486533,0.012728
...,...,...,...,...
61,2.556326,0.023903,2.817835,0.029482
62,2.555093,0.024378,2.817767,0.029309
63,2.553809,0.024583,2.817898,0.029183
64,2.551488,0.024177,2.817016,0.029567


In [36]:
cv_results['test-rmse-mean'].min()

2.8197356

#### XGBoost tuning

In [37]:
# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [39]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=9, min_child_weight=5
	MAE 2.8186404 for 97 rounds
CV with max_depth=9, min_child_weight=6
	MAE 2.8226608 for 140 rounds
CV with max_depth=9, min_child_weight=7
	MAE 2.8214748 for 129 rounds
CV with max_depth=10, min_child_weight=5
	MAE 2.8136282 for 79 rounds
CV with max_depth=10, min_child_weight=6
	MAE 2.8158504 for 93 rounds
CV with max_depth=10, min_child_weight=7
	MAE 2.8169528 for 109 rounds
CV with max_depth=11, min_child_weight=5
	MAE 2.8128825999999996 for 47 rounds
CV with max_depth=11, min_child_weight=6
	MAE 2.8171797999999995 for 53 rounds
CV with max_depth=11, min_child_weight=7
	MAE 2.8168526 for 65 rounds
Best params: 11, 5, MAE: 2.8128825999999996


In [23]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [24]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 1.2708274 for 37 rounds
CV with subsample=1.0, colsample=0.9
	MAE 1.2720232 for 26 rounds
CV with subsample=1.0, colsample=0.8
	MAE 1.2751796 for 66 rounds
CV with subsample=1.0, colsample=0.7
	MAE 1.2833498 for 79 rounds
CV with subsample=0.9, colsample=1.0
	MAE 1.2734075999999999 for 41 rounds
CV with subsample=0.9, colsample=0.9
	MAE 1.2772919999999999 for 49 rounds
CV with subsample=0.9, colsample=0.8
	MAE 1.2812618 for 29 rounds
CV with subsample=0.9, colsample=0.7
	MAE 1.2843122 for 51 rounds
CV with subsample=0.8, colsample=1.0
	MAE 1.2789802000000001 for 42 rounds
CV with subsample=0.8, colsample=0.9
	MAE 1.2818581999999998 for 25 rounds
CV with subsample=0.8, colsample=0.8
	MAE 1.2847616 for 29 rounds
CV with subsample=0.8, colsample=0.7
	MAE 1.2926218 for 30 rounds
CV with subsample=0.7, colsample=1.0
	MAE 1.2854138000000002 for 24 rounds
CV with subsample=0.7, colsample=0.9
	MAE 1.2858954 for 33 rounds
CV with subsample=0.7, colsampl

In [None]:
%time
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
        params, dtrain, num_boost_round=num_boost_round,
        seed=42, nfold=5, metrics=['mae'],
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.58 µs
CV with eta=0.3
	MAE 1.2963634 for 29 rounds

CV with eta=0.2
	MAE 1.2830489999999999 for 52 rounds

CV with eta=0.1
	MAE 1.2678966 for 177 rounds

CV with eta=0.05
	MAE 1.2600076000000002 for 346 rounds

CV with eta=0.01
	MAE 1.2592168 for 998 rounds

CV with eta=0.005
