
Commit

Merge pull request #7 from jrings/master
Compatibility with both Python 2(.7) and 3
tqchen committed May 19, 2014
2 parents fce56ba + c5b345b commit 35caa9c
Showing 12 changed files with 93 additions and 67 deletions.
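
The changes below repeat a small set of Python 2/3 compatibility idioms. As a quick reference, here is a minimal sketch (not part of the commit) that exercises each of them and runs unchanged on Python 2.7 and 3:

from __future__ import print_function  # optional on 2.7; makes print() behave as in 3

print('hello')                # print statement -> print() function
for i in range(3):            # xrange() is gone in 3; range() exists in both
    print(i)

param = {'max_depth': 6, 'eta': 0.1}
plst = list(param.items()) + [('eval_metric', 'ams@0.15')]  # items() is a lazy view in 3
for k, v in sorted(param.items()):  # iteritems() was removed; items() works in both
    print('%s=%s' % (k, v))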
4 changes: 2 additions & 2 deletions demo/binary_classification/mapfeat.py
@@ -24,7 +24,7 @@ def loadfmap( fname ):
     return fmap, nmap

 def write_nmap( fo, nmap ):
-    for i in xrange( len(nmap) ):
+    for i in range( len(nmap) ):
         fo.write('%d\t%s\ti\n' % (i, nmap[i]) )

 # start here
@@ -41,7 +41,7 @@ def write_nmap( fo, nmap ):
     else:
         assert arr[0] == 'e'
         fo.write('0')
-    for i in xrange( 1,len(arr) ):
+    for i in range( 1,len(arr) ):
         fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
     fo.write('\n')

2 changes: 1 addition & 1 deletion demo/binary_classification/mknfold.py
@@ -3,7 +3,7 @@
 import random

 if len(sys.argv) < 2:
-    print 'Usage:<filename> <k> [nfold = 5]'
+    print ('Usage:<filename> <k> [nfold = 5]')
     exit(0)

 random.seed( 10 )
24 changes: 15 additions & 9 deletions demo/kaggle-higgs/higgs-numpy.py
@@ -1,9 +1,15 @@
 #!/usr/bin/python
 # this is the example script to use xgboost to train
+import inspect
+import os
 import sys
 import numpy as np
 # add path of xgboost python module
-sys.path.append('../../python/')
+code_path = os.path.join(
+    os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../python")
+
+sys.path.append(code_path)
+
 import xgboost as xgb

 test_size = 550000
@@ -12,19 +18,19 @@
 dpath = 'data'

 # load in training data, directly use numpy
-dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
-print 'finish loading from csv '
+dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
+print ('finish loading from csv ')

 label = dtrain[:,32]
 data = dtrain[:,1:31]
 # rescale weight to make it same as test set
 weight = dtrain[:,31] * float(test_size) / len(label)

-sum_wpos = sum( weight[i] for i in xrange(len(label)) if label[i] == 1.0 )
-sum_wneg = sum( weight[i] for i in xrange(len(label)) if label[i] == 0.0 )
+sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
+sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )

 # print weight statistics
-print 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos )
+print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))

 # construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
 xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
@@ -43,14 +49,14 @@
 param['nthread'] = 16

 # you can directly throw param in, though we want to watch multiple metrics here
-plst = param.items()+[('eval_metric', 'ams@0.15')]
+plst = list(param.items())+[('eval_metric', 'ams@0.15')]

 watchlist = [ (xgmat,'train') ]
 # boost 120 tres
 num_round = 120
-print 'loading data end, start to boost trees'
+print ('loading data end, start to boost trees')
 bst = xgb.train( plst, xgmat, num_round, watchlist );
 # save out model
 bst.save_model('higgs.model')

-print 'finish training'
+print ('finish training')
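
Note: two of the changes above are worth unpacking. The inspect/os.path logic resolves the xgboost module path relative to the script's own file instead of the current working directory, so the demo also works when launched from elsewhere. The converter now compares against 's'.encode('utf-8') because, with the numpy releases of this era, loadtxt hands raw fields to converters as bytes under Python 3. A sketch of the behavior (not part of the diff):

from __future__ import print_function
conv = lambda x: int(x == 's'.encode('utf-8'))  # 's'.encode('utf-8') is b's'
print(conv(b's'), conv(b'b'))  # -> 1 0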
6 changes: 3 additions & 3 deletions demo/kaggle-higgs/higgs-pred.py
@@ -19,13 +19,13 @@
 data = dtest[:,1:31]
 idx = dtest[:,0]

-print 'finish loading from csv '
+print ('finish loading from csv ')
 xgmat = xgb.DMatrix( data, missing = -999.0 )
 bst = xgb.Booster({'nthread':16})
 bst.load_model( modelfile )
 ypred = bst.predict( xgmat )

-res = [ ( int(idx[i]), ypred[i] ) for i in xrange(len(ypred)) ]
+res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]

 rorder = {}
 for k, v in sorted( res, key = lambda x:-x[1] ):
@@ -47,7 +47,7 @@
     ntot += 1
 fo.close()

-print 'finished writing into prediction file'
+print ('finished writing into prediction file')



14 changes: 12 additions & 2 deletions demo/kaggle-higgs/run.sh
@@ -1,4 +1,14 @@
 #!/bin/bash

-python higgs-numpy.py
-python higgs-pred.py
+python -u higgs-numpy.py
+ret=$?
+if [[ $ret != 0 ]]; then
+    echo "ERROR in higgs-numpy.py"
+    exit $ret
+fi
+python -u higgs-pred.py
+ret=$?
+if [[ $ret != 0 ]]; then
+    echo "ERROR in higgs-pred.py"
+    exit $ret
+fi
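
Note: the reworked script runs each step with python -u, which disables stdout buffering so training progress appears in the log immediately, and checks $?, the exit status of the previous command, so a failed training step aborts the run instead of letting the prediction step proceed without a model.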
20 changes: 10 additions & 10 deletions demo/kaggle-higgs/speedtest.py
@@ -14,18 +14,18 @@

 # load in training data, directly use numpy
 dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
-print 'finish loading from csv '
+print ('finish loading from csv ')

 label = dtrain[:,32]
 data = dtrain[:,1:31]
 # rescale weight to make it same as test set
 weight = dtrain[:,31] * float(test_size) / len(label)

-sum_wpos = sum( weight[i] for i in xrange(len(label)) if label[i] == 1.0 )
-sum_wneg = sum( weight[i] for i in xrange(len(label)) if label[i] == 0.0 )
+sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
+sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )

 # print weight statistics
-print 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos )
+print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))

 # construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
 xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
@@ -47,20 +47,20 @@
 watchlist = [ (xgmat,'train') ]
 # boost 10 tres
 num_round = 10
-print 'loading data end, start to boost trees'
-print "training GBM from sklearn"
+print ('loading data end, start to boost trees')
+print ("training GBM from sklearn")
 tmp = time.time()
 gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
 gbm.fit(data, label)
-print "sklearn.GBM costs: %s seconds" % str(time.time() - tmp)
+print ("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
 #raw_input()
-print "training xgboost"
+print ("training xgboost")
 threads = [1, 2, 4, 16]
 for i in threads:
     param['nthread'] = i
     tmp = time.time()
     plst = param.items()+[('eval_metric', 'ams@0.15')]
     bst = xgb.train( plst, xgmat, num_round, watchlist );
-    print "XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp))
+    print ("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))

-print 'finish training'
+print ('finish training')
2 changes: 1 addition & 1 deletion demo/multiclass_classification/train.py
@@ -37,6 +37,6 @@
 # get prediction
 pred = bst.predict( xg_test );

-print 'predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in xrange(len(test_Y))) / float(len(test_Y)) )
+print ('predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))
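
Note: the float() wrapper matters in both interpreters: in Python 2, / between integers truncates, while in Python 3 it is already true division. A standalone sketch of the error computation with made-up labels:

pred = [0, 1, 2, 1]    # hypothetical predicted classes
test_Y = [0, 2, 2, 1]  # hypothetical true labels
err = sum(int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y))
print('predicting, classification error=%f' % err)  # 0.250000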


26 changes: 13 additions & 13 deletions demo/rank/trans_data.py
@@ -2,18 +2,18 @@

 def save_data(group_data,output_feature,output_group):
     if len(group_data) == 0:
-        return
+        return

     output_group.write(str(len(group_data))+"\n")
     for data in group_data:
         # only include nonzero features
         feats = [ p for p in data[2:] if float(p.split(':')[1]) != 0.0 ]
-        output_feature.write(data[0] + " " + " ".join(feats) + "\n")
+        output_feature.write(data[0] + " " + " ".join(feats) + "\n")

 if __name__ == "__main__":
     if len(sys.argv) != 4:
-        print "Usage: python trans_data.py [Ranksvm Format Input] [Output Feature File] [Output Group File]"
-        sys.exit(0)
+        print ("Usage: python trans_data.py [Ranksvm Format Input] [Output Feature File] [Output Group File]")
+        sys.exit(0)

     fi = open(sys.argv[1])
     output_feature = open(sys.argv[2],"w")
@@ -22,16 +22,16 @@ def save_data(group_data,output_feature,output_group):
     group_data = []
     group = ""
     for line in fi:
-        if not line:
-            break
-        if "#" in line:
-            line = line[:line.index("#")]
+        if not line:
+            break
+        if "#" in line:
+            line = line[:line.index("#")]
         splits = line.strip().split(" ")
-        if splits[1] != group:
-            save_data(group_data,output_feature,output_group)
-            group_data = []
-        group = splits[1]
-        group_data.append(splits)
+        if splits[1] != group:
+            save_data(group_data,output_feature,output_group)
+            group_data = []
+        group = splits[1]
+        group_data.append(splits)

     save_data(group_data,output_feature,output_group)

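Note: most of the changed line pairs in this file are textually identical; the diff appears to be a whitespace normalization done alongside the print() fix. That is relevant to the PR's goal: Python 3 raises a TabError on blocks that mix tabs and spaces, which Python 2 silently accepted.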
6 changes: 3 additions & 3 deletions demo/regression/mapfeat.py
@@ -7,7 +7,7 @@
 for l in open( 'machine.data' ):
     arr = l.split(',')
     fo.write(arr[8])
-    for i in xrange( 0,6 ):
+    for i in range( 0,6 ):
         fo.write( ' %d:%s' %(i,arr[i+2]) )

     if arr[0] not in fmap:
@@ -24,9 +24,9 @@
 # list from machine.names
 names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];

-for i in xrange(0,6):
+for i in range(0,6):
     fo.write( '%d\t%s\tint\n' % (i, names[i+1]))

-for v, k in sorted( fmap.iteritems(), key = lambda x:x[1] ):
+for v, k in sorted( fmap.items(), key = lambda x:x[1] ):
     fo.write( '%d\tvendor=%s\ti\n' % (k, v))
 fo.close()
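
Note: dict.iteritems() no longer exists in Python 3; dict.items() exists in both (a list in 2, a view in 3), and either can be passed to sorted(), which materializes its input anyway. A small sketch with a made-up feature map:

fmap = {'ibm': 2, 'amdahl': 1}  # hypothetical vendor -> feature id map
for v, k in sorted(fmap.items(), key=lambda x: x[1]):
    print('%d\tvendor=%s\ti' % (k, v))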
2 changes: 1 addition & 1 deletion demo/regression/mknfold.py
@@ -3,7 +3,7 @@
 import random

 if len(sys.argv) < 2:
-    print 'Usage:<filename> <k> [nfold = 5]'
+    print ('Usage:<filename> <k> [nfold = 5]')
     exit(0)

 random.seed( 10 )
10 changes: 5 additions & 5 deletions python/example/demo.py
@@ -22,7 +22,7 @@
 # this is prediction
 preds = bst.predict( dtest )
 labels = dtest.get_label()
-print 'error=%f' % ( sum(1 for i in xrange(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds)))
+print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
 bst.save_model('0001.model')
 # dump model
 bst.dump_model('dump.raw.txt')
@@ -32,7 +32,7 @@
 ###
 # build dmatrix in python iteratively
 #
-print 'start running example of build DMatrix in python'
+print ('start running example of build DMatrix in python')
 dtrain = xgb.DMatrix()
 labels = []
 for l in open('agaricus.txt.train'):
@@ -50,7 +50,7 @@

 ###
 # build dmatrix from scipy.sparse
-print 'start running example of build DMatrix from scipy.sparse'
+print ('start running example of build DMatrix from scipy.sparse')
 labels = []
 row = []; col = []; dat = []
 i = 0
@@ -68,7 +68,7 @@
 evallist = [(dtest,'eval'), (dtrain,'train')]
 bst = xgb.train( param, dtrain, num_round, evallist )

-print 'start running example of build DMatrix from numpy array'
+print ('start running example of build DMatrix from numpy array')
 # NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
 npymat = csr.todense()
 dtrain = xgb.DMatrix( npymat )
@@ -79,7 +79,7 @@
 ###
 # advanced: cutomsized loss function, set loss_type to 0, so that predict get untransformed score
 #
-print 'start running example to used cutomized objective function'
+print ('start running example to used cutomized objective function')

 # note: set loss_type properly, loss_type=2 means the prediction will get logistic transformed
 # in most case, we may want to set loss_type = 0, to get untransformed score to compute gradient
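Note: for context on the customized-objective section above, a custom objective for xgb.train is a function returning the per-instance gradient and hessian of the loss with respect to the raw scores. A sketch of a logistic-loss objective (illustrative; how it is passed to train has varied across xgboost versions):

import numpy as np

def logregobj(preds, dtrain):
    # preds are untransformed margin scores when the objective is customized
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # logistic transform
    grad = preds - labels                 # first-order gradient
    hess = preds * (1.0 - preds)          # second-order derivative
    return grad, hess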
