Permalink
Fetching contributors…
Cannot retrieve contributors at this time
executable file 38 lines (34 sloc) 1.41 KB
#!/usr/bin/python
import numpy as np
import xgboost as xgb
### load data and do training
def _label_to_int(raw):
    """Return 1 when the raw label field is 's', otherwise 0.

    np.loadtxt passes converters either bytes or str depending on the NumPy
    version and the `encoding` argument, so both spellings are accepted.
    Comparing only against the bytes form would silently map every label
    to 0 when str values are passed.
    """
    return int(raw in ('s', b's'))


# columns of training.csv as used below: column 0 is not used,
# columns 1-30 are the features, column 31 is the weight, column 32 the label
train = np.loadtxt('./data/training.csv', delimiter=',', skiprows=1,
                   converters={32: _label_to_int})
label = train[:, 32]
data = train[:, 1:31]
weight = train[:, 31]
# -999.0 is declared as the missing-value marker for the feature matrix
dtrain = xgb.DMatrix(data, label=label, missing=-999.0, weight=weight)
param = {'max_depth':6, 'eta':0.1, 'silent':1, 'objective':'binary:logitraw', 'nthread':4}
num_round = 120
print ('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as an example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    """Preprocess one cross-validation fold.

    Sets ``param['scale_pos_weight']`` to the ratio of negative (label 0) to
    positive (label 1) examples in ``dtrain``, then rescales the weights of
    both matrices so that each of them sums to the combined weight of the
    two (train total + test total).

    Args:
        dtrain: training matrix of the fold; must provide ``get_label``,
            ``get_weight`` and ``set_weight``.
        dtest: evaluation matrix of the fold; must provide ``get_weight``
            and ``set_weight``.
        param: parameter dict; it is updated in place.

    Returns:
        The tuple ``(dtrain, dtest, param)`` -- the same objects that were
        passed in, after modification.
    """
    label = dtrain.get_label()
    # negative-to-positive ratio of the training part of this fold
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param['scale_pos_weight'] = ratio

    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    # Sum each weight array once, vectorised, accumulating in float64; the
    # Python built-in sum() would walk the arrays element by element.
    train_total = float(np.sum(wtrain, dtype=np.float64))
    test_total = float(np.sum(wtest, dtype=np.float64))
    sum_weight = train_total + test_total

    # scale each part up so that it sums to the combined total
    wtrain *= sum_weight / train_total
    wtest *= sum_weight / test_total
    dtrain.set_weight(wtrain)
    dtest.set_weight(wtest)
    return (dtrain, dtest, param)
# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
# metrics is given as a list (not a set) so that the order of the
# evaluation metrics is the same on every run
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics=['ams@0.15', 'auc'], seed=0, fpreproc=fpreproc)