In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

In [2]:
FEATURE_PATH = './feature/'
RESULT_PATH = './result/'

In [3]:
# define the number of iterations
ROUND = 1000

In [4]:
# configuration the parameters of xgboost
xgb_params = {'objective':'multi:softprob',
              'num_class': 8,
              'eta': 0.04,
              'max_depth':6,
              'subsample':0.9,
              'colsample_bytree': 0.7,
              'lambda': 2,
              'alpha': 2,
              'gamma': 1,
              'scale_pos_weight': 20,
              'eval_metric': 'mlogloss',
              'silent': 0,
              'seed': 149}

In [5]:

def xgb_train(X_train, X_val, y_train, y_val, test, num_round):
    # multi-class model
    dtrain = xgb.DMatrix(X_train, y_train)
    dval = xgb.DMatrix(X_val, y_val)
    dtest = xgb.DMatrix(test.drop(['file_id'], axis=1))
    watchlist = [(dtrain, 'train'), (dval, 'val')]
    model = xgb.train(xgb_params, dtrain, num_round, evals=watchlist, early_stopping_rounds=100, verbose_eval=100)
    p_val = pd.DataFrame(model.predict(dval, ntree_limit=model.best_iteration), index=X_val.index)
    p_test = pd.DataFrame(model.predict(dtest, ntree_limit=model.best_iteration), index=test.index)
    return (model, p_val, p_test)

run

In [6]:
# load feature v1
train_1 = pd.read_csv(FEATURE_PATH + 'train_base_features_v1.csv')
test_1 = pd.read_csv(FEATURE_PATH + 'test_base_features_v1.csv')

In [7]:
# load feature v2
train_2 = pd.read_csv(FEATURE_PATH + 'train_base_features_v2.csv')
test_2 = pd.read_csv(FEATURE_PATH + 'test_base_features_v2.csv')

In [8]:
interaction_feat = train_2.columns[train_2.columns.isin(test_2.columns.values)].values
train_2 = train_2[interaction_feat]
test_2 = test_2[interaction_feat]

In [9]:
 # merge all features
train = train_1.merge(train_2, on=['file_id'], how='left')
test = test_1.merge(test_2, on=['file_id'], how='left')

In [10]:
# train data prepare
X = train.drop(['file_id', 'label'], axis=1)
y = train['label']

In [11]:
# add one_vs_rest prob
extra_feat_val = pd.read_csv(FEATURE_PATH + 'tr_lr_oof_prob.csv')
extra_feat_test = pd.read_csv(FEATURE_PATH + 'te_lr_oof_prob.csv')
prob_list = ['prob' + str(i) for i in range(1)]
X_extra = pd.concat([X, extra_feat_val[prob_list]], axis=1)
test_extra = pd.concat([test, extra_feat_test[prob_list]], axis=1)
print("Loading complete")

Loading complete


In [12]:
# multi-class model training
logloss_rlt = []
p_val_all = pd.DataFrame()
# 8 catagories
p_test_all = pd.DataFrame(np.zeros((test.shape[0], 8)))
skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True)

In [13]:
 # start 5-fold CV
for fold_i, (tr_index, val_index) in enumerate(skf.split(X, y)):
    print('FOLD -', fold_i, ' Start...')
    # Prepare train, val dataset
    X_train, X_val = X_extra.iloc[tr_index, :], X_extra.iloc[val_index, :]
    y_train, y_val = y[tr_index], y[val_index]
    # Train model

    model, p_val, p_test = xgb_train(X_train, X_val, y_train, y_val, test_extra, ROUND)
    # Evaluate Model and Concatenate Val-Prediction
    m_log_loss = log_loss(y_val, p_val)
    print('----------------log_loss : ', m_log_loss, ' ---------------------')
    logloss_rlt = logloss_rlt + [m_log_loss]
    truth_prob_df = pd.concat([y_val, p_val], axis=1)
    p_val_all = pd.concat([p_val_all, truth_prob_df], axis=0)
    # Predict Test Dataset
    p_test_all = p_test_all + 0.2 * p_test

FOLD - 0  Start...


  if getattr(data, 'base', None) is not None and \


[0]	train-mlogloss:1.95485	val-mlogloss:1.95782
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.233092	val-mlogloss:0.34075
[200]	train-mlogloss:0.143921	val-mlogloss:0.291105
[300]	train-mlogloss:0.119687	val-mlogloss:0.286073
[400]	train-mlogloss:0.110559	val-mlogloss:0.2853
Stopping. Best iteration:
[389]	train-mlogloss:0.111148	val-mlogloss:0.285177

----------------log_loss :  0.2852046909523883  ---------------------
FOLD - 1  Start...


  if getattr(data, 'base', None) is not None and \


[0]	train-mlogloss:1.95542	val-mlogloss:1.95779
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.233416	val-mlogloss:0.3397
[200]	train-mlogloss:0.143979	val-mlogloss:0.298327
[300]	train-mlogloss:0.119003	val-mlogloss:0.29475
[400]	train-mlogloss:0.109526	val-mlogloss:0.294334
Stopping. Best iteration:
[342]	train-mlogloss:0.114018	val-mlogloss:0.294158

----------------log_loss :  0.29417411262482324  ---------------------
FOLD - 2  Start...


  if getattr(data, 'base', None) is not None and \


[0]	train-mlogloss:1.96065	val-mlogloss:1.96397
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.237949	val-mlogloss:0.327613
[200]	train-mlogloss:0.148979	val-mlogloss:0.277338
[300]	train-mlogloss:0.123847	val-mlogloss:0.272238
[400]	train-mlogloss:0.113911	val-mlogloss:0.271128
[500]	train-mlogloss:0.108479	val-mlogloss:0.271248
Stopping. Best iteration:
[443]	train-mlogloss:0.110958	val-mlogloss:0.271018

----------------log_loss :  0.2710207086031947  ---------------------
FOLD - 3  Start...


  if getattr(data, 'base', None) is not None and \


[0]	train-mlogloss:1.96057	val-mlogloss:1.96449
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.233083	val-mlogloss:0.344684
[200]	train-mlogloss:0.14366	val-mlogloss:0.298305
[300]	train-mlogloss:0.118976	val-mlogloss:0.293352
[400]	train-mlogloss:0.109688	val-mlogloss:0.292324
[500]	train-mlogloss:0.104523	val-mlogloss:0.292122
Stopping. Best iteration:
[468]	train-mlogloss:0.105844	val-mlogloss:0.291964

----------------log_loss :  0.2920076645576843  ---------------------
FOLD - 4  Start...


  if getattr(data, 'base', None) is not None and \


[0]	train-mlogloss:1.95563	val-mlogloss:1.95807
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.236609	val-mlogloss:0.321286
[200]	train-mlogloss:0.147186	val-mlogloss:0.270817
[300]	train-mlogloss:0.122082	val-mlogloss:0.265498
[400]	train-mlogloss:0.112506	val-mlogloss:0.264794
[500]	train-mlogloss:0.1079	val-mlogloss:0.264828
Stopping. Best iteration:
[404]	train-mlogloss:0.112273	val-mlogloss:0.264727

----------------log_loss :  0.26475537604415317  ---------------------


In [15]:
 # generate submit file
rlt = pd.concat([test['file_id'], p_test_all], axis=1)
prob_list = ['prob' + str(i) for i in range(8)]
rlt.columns = ['file_id'] + prob_list
rlt.to_csv(RESULT_PATH + 'submit.csv', index=None)