In [None]:
import sys
!{sys.executable} -m pip install numpy pandas sklearn --user

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn as sk

In [None]:
# Read in the csv data
train_data = pd.read_csv("train_final.csv")
test_data = pd.read_csv("test_final.csv")
# Example the contents
print(train_data.shape)

In [None]:
X = train_data.loc[:, "Y":"Y"].values
Y = train_data.loc[:, "f1":"f24"].values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33, random_state=69)

In [None]:
dmTrain = xgb.DMatrix(X_train, label=Y_train)
dmTest = xgb.DMatrix(X_test, label=Y_test)

In [None]:
from sklearn.metrics import mean_absolute_error
mean_train = np.mean(Y_train)
base_predict = np.ones(Y_test.shape) * mean_train
mae_base = mean_absolute_error(Y_test, base_predict)

In [28]:
params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'silent': 1,
}
params['eval_metric'] = "mae"
boost_rounds = 999

model = xgb.train(
    params,
    dmTrain,
    num_boost_round=boost_rounds,
    evals=[(dmTest, "Test")],
    early_stopping_rounds=10,
    verbose_eval=False
)

print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-mae:46888.8
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:49909.4
[2]	Test-mae:52166.4
[3]	Test-mae:53786.2
[4]	Test-mae:54952.2
[5]	Test-mae:55778.1
[6]	Test-mae:56363.9
[7]	Test-mae:56775.4
[8]	Test-mae:57064.9
[9]	Test-mae:57268.4
[10]	Test-mae:57410.9
Stopping. Best iteration:
[0]	Test-mae:46888.8

Best MAE: 46888.76 with 1 rounds


In [None]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(0, 10)
    for min_child_weight in range(0, 10)
]

min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    cv_results = xgb.cv(
        params,
        dmTrain,
        num_boost_round=boost_rounds,
        seed=69,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=1000,
        verbose_eval=1000
    )
    
    mean_mae = cv_results['test-mae-mean'].min()
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth, min_child_weight)
        
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
# Parameters subsample and colsample_bytree