In [1]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.model_selection import train_test_split

In [2]:
# Load the processed feature tables. dataset1 and dataset2 carry a `label`
# column and are combined for training; dataset3 is the set to predict.
dataset1 = pd.read_csv('../input/ProcessDataSet1.csv')
dataset2 = pd.read_csv('../input/ProcessDataSet2.csv')
dataset3 = pd.read_csv('../input/ProcessDataSet3.csv')

# Recode label -1 as 0. The result is assigned back to the column instead of
# calling `.replace(..., inplace=True)` on `dataset.label`: that chained
# in-place form acts on an intermediate Series and does not update the frame
# when pandas copy-on-write is active.
dataset1['label'] = dataset1['label'].replace(-1, 0)
dataset2['label'] = dataset2['label'].replace(-1, 0)

# De-duplicate after recoding, so rows that only differed by -1 vs 0 collapse.
dataset1 = dataset1.drop_duplicates()
dataset2 = dataset2.drop_duplicates()
dataset12 = pd.concat([dataset1, dataset2], axis=0)
dataset12_y = dataset12['label']
# Identifier, target and day-gap columns are excluded from the feature matrix.
dataset12_x = dataset12.drop(
    columns=['user_id', 'label', 'day_gap_before', 'coupon_id', 'day_gap_after']
)

dataset3 = dataset3.drop_duplicates()
# Explicit copy: later cells add a `label` column to this frame, and writing
# into a bare slice of dataset3 raises SettingWithCopyWarning.
dataset3_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset3_x = dataset3.drop(
    columns=['user_id', 'coupon_id', 'date_received', 'day_gap_before', 'day_gap_after']
)

# xgboost's native containers for training and prediction.
dataTrain = xgb.DMatrix(dataset12_x, label=dataset12_y)
dataTest = xgb.DMatrix(dataset3_x)

In [3]:
# Performance evaluation function
def myauc(test):
    """Return the mean per-coupon ROC AUC of the predictions in ``test``.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain the columns ``coupon_id``, ``label`` and ``pred``.

    Returns
    -------
    float
        Unweighted mean of the AUC values of every coupon whose ``label``
        column takes exactly two distinct values. ``nan`` when no coupon
        qualifies (including an empty frame).
    """
    aucs = []
    # A scalar grouper is used because the group key itself is not needed.
    for _, coupon_df in test.groupby('coupon_id'):
        # Skip coupons whose labels do not take exactly two distinct values.
        if len(coupon_df['label'].unique()) != 2:
            continue
        fpr, tpr, _thresholds = roc_curve(coupon_df['label'], coupon_df['pred'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    if not aucs:
        # np.average([]) would also give nan, but only via a RuntimeWarning;
        # return it explicitly so the "nothing to score" case is silent.
        return np.float64(np.nan)
    return np.average(aucs)

In [4]:
# Booster configuration passed to xgboost: tree booster, pairwise ranking
# objective, AUC as the reported metric.
params = {
    'booster': 'gbtree',
    'objective': 'rank:pairwise',
    'eval_metric': 'auc',
    'gamma': 0.1,
    'min_child_weight': 1.1,
    'max_depth': 5,
    'lambda': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'eta': 0.01,
    'tree_method': 'exact',
    'seed': 0,
    'nthread': 12,
}

# Only the training matrix is monitored, so the logged AUC is a training score.
watchlist = [(dataTrain, 'train')]
model = xgb.train(
    params,
    dataTrain,
    num_boost_round=3500,
    evals=watchlist,
)
# Persist the booster so later cells can reload it without retraining.
model.save_model('../output/xgbmodel')

[0]	train-auc:0.83147
[1]	train-auc:0.84042
[2]	train-auc:0.84503
[3]	train-auc:0.84583
[4]	train-auc:0.84725
[5]	train-auc:0.84931
[6]	train-auc:0.84903
[7]	train-auc:0.84980
[8]	train-auc:0.85019
[9]	train-auc:0.85091
[10]	train-auc:0.85113
[11]	train-auc:0.85132
[12]	train-auc:0.85111
[13]	train-auc:0.85144
[14]	train-auc:0.85142
[15]	train-auc:0.85192
[16]	train-auc:0.85247
[17]	train-auc:0.85236
[18]	train-auc:0.85234
[19]	train-auc:0.85265
[20]	train-auc:0.85246
[21]	train-auc:0.85249
[22]	train-auc:0.85251
[23]	train-auc:0.85225
[24]	train-auc:0.85195
[25]	train-auc:0.85216
[26]	train-auc:0.85235
[27]	train-auc:0.85250
[28]	train-auc:0.85263
[29]	train-auc:0.85306
[30]	train-auc:0.85281
[31]	train-auc:0.85292
[32]	train-auc:0.85303
[33]	train-auc:0.85322
[34]	train-auc:0.85334
[35]	train-auc:0.85358
[36]	train-auc:0.85360
[37]	train-auc:0.85355
[38]	train-auc:0.85372
[39]	train-auc:0.85384
[40]	train-auc:0.85396
[41]	train-auc:0.85399
[42]	train-auc:0.85440
[43]	train-auc:0.8544

In [11]:
# Reload the booster from disk rather than relying on the in-memory `model`.
model = xgb.Booster(params)
model.load_model('../output/xgbmodel')

# Predict the test set. Work on an explicit copy so that re-running this cell
# neither mutates `dataset3_preds` nor raises SettingWithCopyWarning.
dataset3_preds1 = dataset3_preds.copy()
dataset3_preds1['label'] = model.predict(dataTest)

# Rescale the scores to [0, 1]. MinMaxScaler requires 2-D input, so the column
# is reshaped to (n, 1) first (the original author's code passed the 1-D column
# directly, which is an error), then flattened back before assignment.
scaled_label = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
    np.array(dataset3_preds1['label']).reshape(-1, 1)
)
dataset3_preds1['label'] = scaled_label.ravel()

dataset3_preds1 = dataset3_preds1.sort_values(by=['coupon_id', 'label'])
# Written without index or header row.
dataset3_preds1.to_csv("../output/xgb_preds.csv", index=False, header=False)
print(dataset3_preds1.describe())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


            user_id      coupon_id  date_received          label
count  1.128030e+05  112803.000000   1.128030e+05  112803.000000
mean   3.684618e+06    9064.658006   2.016072e+07       0.345811
std    2.126358e+06    4147.283515   9.017693e+00       0.134631
min    2.090000e+02       3.000000   2.016070e+07       0.000000
25%    1.843824e+06    5035.000000   2.016071e+07       0.260864
50%    3.683073e+06    9983.000000   2.016072e+07       0.325157
75%    5.525176e+06   13602.000000   2.016072e+07       0.413362
max    7.361024e+06   14045.000000   2.016073e+07       1.000000


In [13]:
# Reload the saved booster and score it with the per-coupon AUC metric.
# Note: dataset12_x is the same data dataTrain was built from, so the printed
# value is a training-set score.
model = xgb.Booster()
model.load_model('../output/xgbmodel')

temp = dataset12[['coupon_id', 'label']].assign(
    pred=model.predict(xgb.DMatrix(dataset12_x))
)
# Rescale predictions to [0, 1]; the scaler expects a 2-D (n, 1) array.
scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
temp['pred'] = scaler.fit_transform(temp['pred'].to_numpy().reshape(-1, 1))
print(myauc(temp))

0.7625061811748071


In [14]:
# Same booster configuration as the first training run.
params = {
    'booster': 'gbtree',
    'objective': 'rank:pairwise',
    'eval_metric': 'auc',
    'gamma': 0.1,
    'min_child_weight': 1.1,
    'max_depth': 5,
    'lambda': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'eta': 0.01,
    'tree_method': 'exact',
    'seed': 0,
    'nthread': 12,
}

# 5-fold cross-validation, stopping once test AUC has not improved for 50
# rounds. Early stopping and per-round logging are requested through xgb.cv's
# own arguments instead of the deprecated xgb.callback.early_stop /
# xgb.callback.print_evaluation helpers.
cvresult = xgb.cv(
    params,
    dataTrain,
    num_boost_round=20000,
    nfold=5,
    metrics='auc',
    seed=0,
    early_stopping_rounds=50,
    verbose_eval=True,
    show_stdv=False,
)
# With early stopping the result is truncated to best_iteration + 1 rows, so
# the row count is already the number of boosting rounds to train; subtracting
# one would drop the best round.
num_round_best = cvresult.shape[0]
print('Best round num: ', num_round_best)

# Retrain on the full training matrix with the CV-selected number of rounds.
watchlist = [(dataTrain, 'train')]
model1 = xgb.train(params, dataTrain, num_boost_round=num_round_best, evals=watchlist)



[0]	train-auc:0.83140	test-auc:0.83040
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
[1]	train-auc:0.83991	test-auc:0.83952
[2]	train-auc:0.84306	test-auc:0.84282
[3]	train-auc:0.84600	test-auc:0.84589
[4]	train-auc:0.84753	test-auc:0.84722
[5]	train-auc:0.84951	test-auc:0.84894
[6]	train-auc:0.85010	test-auc:0.84964
[7]	train-auc:0.85021	test-auc:0.84972
[8]	train-auc:0.85083	test-auc:0.85028
[9]	train-auc:0.85126	test-auc:0.85071
[10]	train-auc:0.85181	test-auc:0.85109
[11]	train-auc:0.85168	test-auc:0.85105
[12]	train-auc:0.85210	test-auc:0.85132
[13]	train-auc:0.85251	test-auc:0.85170
[14]	train-auc:0.85273	test-auc:0.85184
[15]	train-auc:0.85295	test-auc:0.85203
[16]	train-auc:0.85294	test-auc:0.85198
[17]	train-auc:0.85306	test-auc:0.85216
[18]	train-auc:0.85356	test-auc:0.85267
[19]	train-auc:0.85349	test-auc:0.85259
[20]	train-auc:0.85361	test-auc:0.85276
[21]	train-auc:0.85365	test-a

XGBoostError: [13:50:49] /Users/runner/miniforge3/conda-bld/xgboost_1588600962499/work/dmlc-core/src/io/local_filesys.cc:209: Check failed: allow_null:  LocalFileSystem::Open "..output/xgbmodel1": No such file or directory
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000001a172ec11e dmlc::LogMessageFatal::~LogMessageFatal() + 110
  [bt] (1) 2   libxgboost.dylib                    0x0000001a1749c1ef dmlc::io::LocalFileSystem::Open(dmlc::io::URI const&, char const*, bool) + 1039
  [bt] (2) 3   libxgboost.dylib                    0x0000001a174805fc dmlc::Stream::Create(char const*, char const*, bool) + 76
  [bt] (3) 4   libxgboost.dylib                    0x0000001a172e731a XGBoosterSaveModel + 122
  [bt] (4) 5   libffi.6.dylib                      0x0000000105b0ba8c ffi_call_unix64 + 76
  [bt] (5) 6   ???                                 0x00007ffeeacd8920 0x0 + 140732837759264



In [15]:
# Persist the booster trained with the cross-validated round count, then print
# a marker line so the end of training is visible in the cell output.
model1.save_model('../output/xgbmodel1')
print('------------------------train done------------------------------')

------------------------train done------------------------------


In [18]:
# Export the booster's per-feature scores as CSV, highest score first.
# NOTE(review): the scores come from `model`, while the output file name ends
# in "1" like `model1` - confirm which booster is intended.
feature_score = sorted(
    model.get_fscore().items(),
    key=lambda item: item[1],
    reverse=True,  # sort by score, descending
)

# One "feature,score" line per entry.
fs = ["{0},{1}\n".format(name, score) for name, score in feature_score]

with open('../output/xgb_feature_score1.csv', 'w') as f:
    f.write("feature,score\n")
    f.writelines(fs)