In [2]:
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot

In [3]:
# Read the training table from disk, then expose it as a plain NumPy array
# so that the following cells can slice it by column position.
data = read_csv("train.csv")
dataset = data.to_numpy()

In [4]:
# Positional split: the first 94 columns (indices 0-93) form the feature
# matrix and the column at index 94 is the target vector.
# NOTE(review): column 0 is used as a feature. If it turns out to be a row
# identifier rather than a measurement it could leak the target -- confirm
# against the train.csv schema before trusting the scores.
X, y = dataset[:, :94], dataset[:, 94]

In [5]:
# Convert the raw target values into integer class codes; the encoded vector
# is what gets passed to the grid search as the training labels.
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(y)

In [6]:
# Cross-validated sweep over the XGBoost `subsample` setting.

# Candidate values to try; note that 0.9 is not part of the grid.
subsample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = {"subsample": subsample}

# Shuffled, stratified 10-fold split with a fixed seed so the folds repeat.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# Every other XGBoost hyperparameter is left at the library default.
model = XGBClassifier()

# Candidates are scored with negated log loss (so larger is better);
# n_jobs=-1 requests all available cores for the search.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    n_jobs=-1,
    cv=kfold,
)
grid_result = grid_search.fit(X, label_encoded_y)

In [7]:
# Report the grid-search outcome: the best candidate, the mean/std CV score of
# every candidate, and an error-bar plot written to subsample.png.
# The search was scored with "neg_log_loss", so every score here is a negated
# log-loss value: closer to zero is better.
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Take the x-axis values from the fitted results rather than the notebook-level
# `subsample` list, so the plot cannot drift out of sync if that list is edited
# after the search has already run.
subsample_values = [param['subsample'] for param in params]

# Draw on a dedicated figure instead of pyplot's implicit current figure:
# re-running this cell would otherwise keep stacking error bars onto the same
# axes, and the figure would never be released.
fig, ax = pyplot.subplots()
ax.errorbar(subsample_values, means, yerr=stds)
# The plotted quantity is the negated log loss, so label it as such.
ax.set_title("XGBoost subsample vs Negative Log Loss")
ax.set_xlabel('subsample')
ax.set_ylabel('Negative Log Loss')
fig.savefig('subsample.png')
pyplot.close(fig)

Best: -0.000467 using {'subsample': 0.4}
-0.001142 (0.000244) with: {'subsample': 0.1}
-0.000646 (0.000199) with: {'subsample': 0.2}
-0.000514 (0.000214) with: {'subsample': 0.3}
-0.000467 (0.000267) with: {'subsample': 0.4}
-0.000490 (0.000377) with: {'subsample': 0.5}
-0.000530 (0.000502) with: {'subsample': 0.6}
-0.000556 (0.000563) with: {'subsample': 0.7}
-0.000631 (0.000706) with: {'subsample': 0.8}
-0.000926 (0.001211) with: {'subsample': 1.0}
