In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import log_loss, make_scorer
import xgboost as xgb

# Helper for choosing n_estimators via cross-validated early stopping.
# Note (from the original author): if learning_rate, reg_alpha, reg_lambda are not set,
# xgboost will use default values from
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
# (e.g. eta = 0.3 instead of 0.1); if they are set, eta, alpha, lambda will be overwritten.
def modelfit(model, data, label, cv=5, early_stopping_rounds=50, num_class=5):
    """Run stratified xgboost cross-validation and print the final result row.

    Parameters
    ----------
    model : estimator exposing ``get_params()``; its parameters (including
        ``n_estimators``, used as the maximum number of boosting rounds) are
        forwarded to ``xgb.cv``.
    data, label : training features and labels, wrapped in an ``xgb.DMatrix``.
    cv : number of folds passed as ``nfold``.
    early_stopping_rounds : passed through to ``xgb.cv``.
    num_class : number of target classes injected into the parameter dict.
        Defaults to 5, the value previously hard-coded here.

    Returns
    -------
    None. The last row of the CV result table is printed. Per the original
    author's note, the best n_estimators is that row's ``Name`` + 1.
    """
    params = model.get_params()
    # get_params() carries no class count, so it is added explicitly for xgb.cv.
    params["num_class"] = num_class
    dtrain = xgb.DMatrix(data, label)
    cvresult = xgb.cv(params, dtrain, num_boost_round=params["n_estimators"],
                      nfold=cv, stratified=True,
                      metrics="mlogloss", early_stopping_rounds=early_stopping_rounds, verbose_eval=50)
    print(cvresult.iloc[-1])
    
def modelsubmit(model, train_data, train_label, test_data, label_list,
                sample_path="input/sample_submission.csv",
                submit_path="submit/xgb_grid.csv"):
    """Fit ``model`` on the training set and write class probabilities to a CSV.

    Parameters
    ----------
    model : estimator with ``fit(X, y, eval_metric=...)`` and ``predict_proba(X)``.
    train_data, train_label : data the model is fitted on.
    test_data : data whose class probabilities are predicted.
    label_list : column names in the sample file that receive the predicted
        probabilities (one column per predicted class).
    sample_path : CSV used as the template for the submission. Defaults to the
        path that was previously hard-coded.
    submit_path : destination CSV. Defaults to the path that was previously
        hard-coded; pass a different value to avoid overwriting an earlier
        submission.

    Returns
    -------
    None. The submission file is written as a side effect.
    """
    model.fit(train_data, train_label, eval_metric="mlogloss")
    test_predict = model.predict_proba(test_data)
    sample = pd.read_csv(sample_path)
    # Overwrite the template's label columns with the predicted probabilities.
    sample[label_list] = test_predict
    sample.to_csv(submit_path, index=False)
    
# Scorer used by every grid search below: log loss computed on predicted
# probabilities. greater_is_better=False is why the reported scores are negative.
scorer = make_scorer(
    log_loss,
    needs_proba=True,
    greater_is_better=False,
)

In [2]:
# Run the project's preprocessing step before the data is loaded.
# NOTE(review): presumably this writes "preprocessed_data.pkl", which the next
# cell reads — confirm in model_xgb.
from model_xgb import preprocess_data
preprocess_data()

In [3]:
# Load the four pickled objects in the order they are read here:
# train_data, train_label, test_data, label_list.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on a file produced by this project.
with open("preprocessed_data.pkl", "rb") as pkl_file:
    train_data, train_label, test_data, label_list = (
        pickle.load(pkl_file) for _ in range(4)
    )

In [4]:
# Estimate the number of boosting rounds for the baseline settings at
# learning_rate=0.1. n_estimators=1000 is only the upper bound handed to
# modelfit, which uses early stopping.
xgb1 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
modelfit(xgb1, train_data, train_label)

[0]	train-mlogloss:1.50834+0.00259039	test-mlogloss:1.50932+0.00264449
[50]	train-mlogloss:0.745173+0.00139901	test-mlogloss:0.78491+0.00458113
[100]	train-mlogloss:0.695665+0.00146069	test-mlogloss:0.76512+0.00512111
[150]	train-mlogloss:0.670507+0.00182044	test-mlogloss:0.761591+0.00581021
[200]	train-mlogloss:0.651371+0.00153139	test-mlogloss:0.760524+0.00589777
[250]	train-mlogloss:0.63475+0.00183945	test-mlogloss:0.76061+0.0059674
test-mlogloss-mean     0.760205
test-mlogloss-std      0.005967
train-mlogloss-mean    0.643486
train-mlogloss-std     0.001678
Name: 223, dtype: float64


In [5]:
# Coarse grid over max_depth and min_child_weight. n_estimators is fixed at 224,
# i.e. the last CV row (Name: 223) from the previous cell plus one.
param_grid = [
    {
        "max_depth": [3, 5, 7, 9],
        "min_child_weight": [1, 3, 5],
    }
]
xgb2 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=224,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
gs2 = GridSearchCV(
    estimator=xgb2,
    param_grid=param_grid,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs2.fit(train_data, train_label)
gs2.grid_scores_, gs2.best_params_, gs2.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] max_depth=3, min_child_weight=1 .................................
[CV] ....... max_depth=3, min_child_weight=1, score=-0.787328 -  20.1s
[CV] max_depth=3, min_child_weight=1 .................................
[CV] ....... max_depth=3, min_child_weight=1, score=-0.765830 -  19.9s
[CV] max_depth=3, min_child_weight=1 .................................
[CV] ....... max_depth=3, min_child_weight=1, score=-0.777115 -  24.9s
[CV] max_depth=3, min_child_weight=1 .................................
[CV] ....... max_depth=3, min_child_weight=1, score=-0.781507 -  20.7s
[CV] max_depth=3, min_child_weight=1 .................................
[CV] ....... max_depth=3, min_child_weight=1, score=-0.763768 -  19.4s
[CV] max_depth=3, min_child_weight=3 .................................
[CV] ....... max_depth=3, min_child_weight=3, score=-0.785794 -  20.7s
[CV] max_depth=3, min_child_weight=3 .................................
[CV] ....... max

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed: 16.1min


[CV] ....... max_depth=7, min_child_weight=1, score=-0.740879 -  47.4s
[CV] max_depth=7, min_child_weight=1 .................................
[CV] ....... max_depth=7, min_child_weight=1, score=-0.759578 -  46.3s
[CV] max_depth=7, min_child_weight=1 .................................
[CV] ....... max_depth=7, min_child_weight=1, score=-0.759197 -  46.3s
[CV] max_depth=7, min_child_weight=1 .................................
[CV] ....... max_depth=7, min_child_weight=1, score=-0.742745 -  47.3s
[CV] max_depth=7, min_child_weight=3 .................................
[CV] ....... max_depth=7, min_child_weight=3, score=-0.766182 -  47.6s
[CV] max_depth=7, min_child_weight=3 .................................
[CV] ....... max_depth=7, min_child_weight=3, score=-0.740891 -  44.6s
[CV] max_depth=7, min_child_weight=3 .................................
[CV] ....... max_depth=7, min_child_weight=3, score=-0.758391 - 1.2min
[CV] max_depth=7, min_child_weight=3 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 46.3min finished


([mean: -0.77511, std: 0.00904, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: -0.77469, std: 0.00887, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: -0.77435, std: 0.00844, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: -0.75886, std: 0.00934, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: -0.75861, std: 0.00974, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: -0.75784, std: 0.00967, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: -0.75358, std: 0.00988, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: -0.75275, std: 0.00963, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: -0.75274, std: 0.01010, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: -0.75492, std: 0.00982, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: -0.75379, std: 0.00986, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: -0.75239, std: 0.01034, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 9, 'min_chil

In [6]:
# Second pass over max_depth and min_child_weight: extend the grid upward from
# the first pass's best point (max_depth=9, min_child_weight=5).
param_grid = [
    {
        "max_depth": [9, 11, 13],
        "min_child_weight": [5, 7],
    }
]
xgb3 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=224,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
gs3 = GridSearchCV(
    estimator=xgb3,
    param_grid=param_grid,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs3.fit(train_data, train_label)
gs3.grid_scores_, gs3.best_params_, gs3.best_score_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_depth=9, min_child_weight=5 .................................
[CV] ....... max_depth=9, min_child_weight=5, score=-0.765112 - 1.7min
[CV] max_depth=9, min_child_weight=5 .................................
[CV] ....... max_depth=9, min_child_weight=5, score=-0.737848 - 2.1min
[CV] max_depth=9, min_child_weight=5 .................................
[CV] ....... max_depth=9, min_child_weight=5, score=-0.758802 -  56.5s
[CV] max_depth=9, min_child_weight=5 .................................
[CV] ....... max_depth=9, min_child_weight=5, score=-0.757481 - 1.0min
[CV] max_depth=9, min_child_weight=5 .................................
[CV] ....... max_depth=9, min_child_weight=5, score=-0.742702 -  56.1s
[CV] max_depth=9, min_child_weight=7 .................................
[CV] ....... max_depth=9, min_child_weight=7, score=-0.765371 -  54.8s
[CV] max_depth=9, min_child_weight=7 .................................
[CV] ....... max_

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 42.0min finished


([mean: -0.75239, std: 0.01034, params: {'max_depth': 9, 'min_child_weight': 5},
  mean: -0.75231, std: 0.01009, params: {'max_depth': 9, 'min_child_weight': 7},
  mean: -0.75654, std: 0.01158, params: {'max_depth': 11, 'min_child_weight': 5},
  mean: -0.75614, std: 0.00944, params: {'max_depth': 11, 'min_child_weight': 7},
  mean: -0.76205, std: 0.01171, params: {'max_depth': 13, 'min_child_weight': 5},
  mean: -0.76077, std: 0.01068, params: {'max_depth': 13, 'min_child_weight': 7}],
 {'max_depth': 9, 'min_child_weight': 7},
 -0.75230698686101383)

In [None]:
# Third pass over max_depth and min_child_weight: a finer grid around the
# second pass's best point (max_depth=9, min_child_weight=7).
param_grid = [
    {
        "max_depth": [8, 9, 10],
        "min_child_weight": [7, 9],
    }
]
xgb4 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=224,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
gs4 = GridSearchCV(
    estimator=xgb4,
    param_grid=param_grid,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs4.fit(train_data, train_label)
gs4.grid_scores_, gs4.best_params_, gs4.best_score_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_depth=8, min_child_weight=7 .................................
[CV] ....... max_depth=8, min_child_weight=7, score=-0.765518 -  43.9s
[CV] max_depth=8, min_child_weight=7 .................................
[CV] ....... max_depth=8, min_child_weight=7, score=-0.737837 -  49.1s
[CV] max_depth=8, min_child_weight=7 .................................
[CV] ....... max_depth=8, min_child_weight=7, score=-0.757609 -  54.1s
[CV] max_depth=8, min_child_weight=7 .................................
[CV] ....... max_depth=8, min_child_weight=7, score=-0.756022 -  58.4s
[CV] max_depth=8, min_child_weight=7 .................................
[CV] ....... max_depth=8, min_child_weight=7, score=-0.743046 -  54.7s
[CV] max_depth=8, min_child_weight=9 .................................
[CV] ....... max_depth=8, min_child_weight=9, score=-0.766313 -  53.6s
[CV] max_depth=8, min_child_weight=9 .................................
[CV] ....... max_

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 31.9min finished


([mean: -0.75201, std: 0.01011, params: {'max_depth': 8, 'min_child_weight': 7},
  mean: -0.75301, std: 0.01013, params: {'max_depth': 8, 'min_child_weight': 9},
  mean: -0.75231, std: 0.01009, params: {'max_depth': 9, 'min_child_weight': 7},
  mean: -0.75330, std: 0.00965, params: {'max_depth': 9, 'min_child_weight': 9},
  mean: -0.75385, std: 0.01067, params: {'max_depth': 10, 'min_child_weight': 7},
  mean: -0.75458, std: 0.01079, params: {'max_depth': 10, 'min_child_weight': 9}],
 {'max_depth': 8, 'min_child_weight': 7},
 -0.75200706884951218)

In [None]:
# Final pass: hold max_depth at 8 (best in the third pass) and compare
# min_child_weight values 6, 7 and 8.
param_grid = [
    {
        "max_depth": [8],
        "min_child_weight": [6, 7, 8],
    }
]
xgb5 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=224,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
gs5 = GridSearchCV(
    estimator=xgb5,
    param_grid=param_grid,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs5.fit(train_data, train_label)
gs5.grid_scores_, gs5.best_params_, gs5.best_score_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] max_depth=8, min_child_weight=6 .................................


In [None]:
# Grid search over gamma (0 to 0.5) with max_depth=8 and min_child_weight=6
# held fixed.
param_grid = [
    {"gamma": [0, 0.1, 0.2, 0.3, 0.4, 0.5]},
]
xgb6 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=224,
    max_depth=8,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
gs6 = GridSearchCV(
    estimator=xgb6,
    param_grid=param_grid,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs6.fit(train_data, train_label)
gs6.grid_scores_, gs6.best_params_, gs6.best_score_

In [None]:
# Grid search over gamma, second pass: move the range up to 0.5 through 1.
param_grid = [
    {"gamma": [0.5, 0.6, 0.7, 0.8, 0.9, 1]},
]
xgb7 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=224,
    max_depth=8,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
gs7 = GridSearchCV(
    estimator=xgb7,
    param_grid=param_grid,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs7.fit(train_data, train_label)
gs7.grid_scores_, gs7.best_params_, gs7.best_score_

In [None]:
# Grid search over gamma, third pass: coarse integer steps 1, 2, 3.
# Note: this rebinds xgb7 and gs7, replacing the objects from the previous cell.
param_grid = [
    {"gamma": [1, 2, 3]},
]
xgb7 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=224,
    max_depth=8,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
gs7 = GridSearchCV(
    estimator=xgb7,
    param_grid=param_grid,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs7.fit(train_data, train_label)
gs7.grid_scores_, gs7.best_params_, gs7.best_score_

In [None]:
# Grid search over gamma, fourth pass: compare 1.7 against 1.8.
# Note: this rebinds xgb7 and gs7 again; the grid lives in param_grid7.
param_grid7 = [
    {"gamma": [1.7, 1.8]},
]
xgb7 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=224,
    max_depth=8,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
gs7 = GridSearchCV(
    estimator=xgb7,
    param_grid=param_grid7,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs7.fit(train_data, train_label)
gs7.grid_scores_, gs7.best_params_, gs7.best_score_

In [None]:
# Re-estimate the number of boosting rounds now that max_depth=8,
# min_child_weight=6 and gamma=1.7 are fixed (learning_rate still 0.1).
xgb8 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=6,
    gamma=1.7,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
)
modelfit(xgb8, train_data, train_label)

In [None]:
# Fit the model with the tuned tree parameters and n_estimators=150, then write
# a submission file.
xgb9 = xgb.XGBClassifier(learning_rate=0.1, n_estimators=150,
                         max_depth=8, min_child_weight=6,
                         gamma=1.7,
                         subsample=0.8, colsample_bytree=0.8,
                         reg_alpha=0, reg_lambda=1,
                         objective="multi:softprob")
# Fixed: the helper defined in the first cell is modelsubmit; the previous
# spelling "modelsummit" raised NameError.
modelsubmit(xgb9, train_data, train_label, test_data, label_list)

In [None]:
# Grid search over subsample and colsample_bytree (0.6 to 0.9 each). Both are
# left out of the estimator's constructor because the grid supplies them.
param_grid10 = [
    {
        "subsample": [0.6, 0.7, 0.8, 0.9],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9],
    }
]
xgb10 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=150,
    max_depth=8,
    min_child_weight=6,
    gamma=1.7,
    reg_alpha=0,
    reg_lambda=1,
)
gs10 = GridSearchCV(
    estimator=xgb10,
    param_grid=param_grid10,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs10.fit(train_data, train_label)
gs10.grid_scores_, gs10.best_params_, gs10.best_score_

In [None]:
# Grid search over subsample and colsample_bytree, second round: finer steps
# (0.75, 0.8, 0.85) for both parameters.
param_grid11 = [
    {
        "subsample": [0.75, 0.8, 0.85],
        "colsample_bytree": [0.75, 0.8, 0.85],
    }
]
xgb11 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=150,
    max_depth=8,
    min_child_weight=6,
    gamma=1.7,
    reg_alpha=0,
    reg_lambda=1,
)
gs11 = GridSearchCV(
    estimator=xgb11,
    param_grid=param_grid11,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs11.fit(train_data, train_label)
gs11.grid_scores_, gs11.best_params_, gs11.best_score_

In [None]:
# Grid search over reg_alpha, coarse pass across several orders of magnitude.
# subsample=0.8 and colsample_bytree=0.75 are now fixed.
param_grid12 = [
    {"reg_alpha": [1e-5, 0.01, 0.1, 1, 100]},
]
xgb12 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=150,
    max_depth=8,
    min_child_weight=6,
    gamma=1.7,
    subsample=0.8,
    colsample_bytree=0.75,
    reg_lambda=1,
)
gs12 = GridSearchCV(
    estimator=xgb12,
    param_grid=param_grid12,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs12.fit(train_data, train_label)
gs12.grid_scores_, gs12.best_params_, gs12.best_score_

In [None]:
# Grid search over reg_alpha, second pass: finer values between 0.001 and 0.05.
param_grid13 = [
    {"reg_alpha": [0.001, 0.005, 0.01, 0.05]},
]
xgb13 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=150,
    max_depth=8,
    min_child_weight=6,
    gamma=1.7,
    subsample=0.8,
    colsample_bytree=0.75,
    reg_lambda=1,
)
gs13 = GridSearchCV(
    estimator=xgb13,
    param_grid=param_grid13,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs13.fit(train_data, train_label)
gs13.grid_scores_, gs13.best_params_, gs13.best_score_

In [None]:
# Grid search over reg_lambda, coarse pass across several orders of magnitude.
# reg_alpha is now fixed at 0.005.
param_grid14 = [
    {"reg_lambda": [1e-5, 0.01, 0.1, 1, 100]},
]
xgb14 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=150,
    max_depth=8,
    min_child_weight=6,
    gamma=1.7,
    subsample=0.8,
    colsample_bytree=0.75,
    reg_alpha=0.005,
)
gs14 = GridSearchCV(
    estimator=xgb14,
    param_grid=param_grid14,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs14.fit(train_data, train_label)
gs14.grid_scores_, gs14.best_params_, gs14.best_score_

In [None]:
# Grid search over reg_lambda, second pass: compare 0.5, 1 and 5.
param_grid15 = [
    {"reg_lambda": [0.5, 1, 5]},
]
xgb15 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.1,
    n_estimators=150,
    max_depth=8,
    min_child_weight=6,
    gamma=1.7,
    subsample=0.8,
    colsample_bytree=0.75,
    reg_alpha=0.005,
)
gs15 = GridSearchCV(
    estimator=xgb15,
    param_grid=param_grid15,
    scoring=scorer,
    fit_params={"eval_metric": "mlogloss"},
    cv=5,
    n_jobs=1,
    verbose=3,
)
gs15.fit(train_data, train_label)
gs15.grid_scores_, gs15.best_params_, gs15.best_score_

In [None]:
# Estimate the number of boosting rounds for the fully tuned parameter set at
# the lower learning_rate=0.01, allowing up to 5000 rounds.
# Note: this rebinds xgb8, replacing the learning_rate=0.1 model defined earlier.
xgb8 = xgb.XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=8,
    min_child_weight=6,
    gamma=1.7,
    subsample=0.8,
    colsample_bytree=0.75,
    reg_alpha=0.005,
    reg_lambda=1,
)
modelfit(xgb8, train_data, train_label)

In [None]:
# Final model: the tuned parameters at learning_rate=0.01 with n_estimators=1818.
# Fit it and write the submission file.
xgbfinal = xgb.XGBClassifier(learning_rate=0.01, n_estimators=1818,
                         max_depth=8, min_child_weight=6,
                         gamma=1.7,
                         subsample=0.8, colsample_bytree=0.75,
                         reg_alpha=0.005, reg_lambda=1,
                         objective="multi:softprob")
# Fixed: the helper defined in the first cell is modelsubmit; the previous
# spelling "modelsumit" raised NameError.
modelsubmit(xgbfinal, train_data, train_label, test_data, label_list)