In [1]:
import sys
import os
import random
import collections
import numpy as np
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error#均方误差

In [2]:
# Column layout of a raw training row: column 0 holds the label, followed by
# 13 integer features and then 26 categorical features.
_NUM_INTEGER_COLUMNS = 13
_NUM_CATEGORICAL_COLUMNS = 26
continous_features = range(1, 1 + _NUM_INTEGER_COLUMNS)
categorial_features = range(continous_features.stop,
                            continous_features.stop + _NUM_CATEGORICAL_COLUMNS)

# Upper clip point of each integer feature, in column order. Each clip point
# is derived from the 95% quantile of all values observed for that feature.
continous_clip = [
    20, 600, 100, 50, 64000, 500, 100,
    50, 500, 10, 10, 10, 50,
]

class ContinuousFeatureGenerator:
    """
    Normalize the integer features to [0, 1] by min-max normalization.

    ``build`` scans a tab-separated data file and records, for every tracked
    feature, the smallest and largest value seen after clipping. ``gen`` then
    rescales one raw value with those statistics.
    """
    def __init__(self, num_feature):
        """
        Args:
            num_feature: number of integer features to track.
        """
        self.num_feature = num_feature
        # Sentinels: the first value observed by build() replaces both.
        self.min = [sys.maxsize] * num_feature
        self.max = [-sys.maxsize] * num_feature

    def build(self, datafile, continous_features, clip_points=None):
        """
        Collect the per-feature minimum and maximum from ``datafile``.

        Args:
            datafile: path of a text file with one tab-separated record per line.
            continous_features: indexable of column positions; entry ``i`` is
                the column of feature ``i`` within a record.
            clip_points: optional upper clip point per feature. When omitted,
                the module-level ``continous_clip`` list is used.
        """
        if clip_points is None:
            clip_points = continous_clip
        with open(datafile, 'r') as f:
            for line in f:
                features = line.rstrip('\n').split('\t')
                for i in range(self.num_feature):
                    raw = features[continous_features[i]]
                    if raw == '':
                        # Missing values do not contribute to the statistics.
                        continue
                    val = min(int(raw), clip_points[i])
                    self.min[i] = min(self.min[i], val)
                    self.max[i] = max(self.max[i], val)

    def gen(self, idx, val):
        """
        Normalize one raw value of feature ``idx``.

        Args:
            idx: feature position (0-based) within the tracked features.
            val: raw field text; the empty string marks a missing value.

        Returns:
            A float in [0, 1]. Missing values map to 0.0, and so does every
            value of a feature whose observed range is empty or degenerate.
        """
        if val == '':
            return 0.0
        lo, hi = self.min[idx], self.max[idx]
        if hi <= lo:
            # Constant feature (or build() saw no value): avoid dividing by zero.
            return 0.0
        # build() clipped values before recording max, so max never exceeds the
        # clip point; clamping to [min, max] therefore applies the same clip
        # here and keeps the result inside the documented [0, 1] range.
        val = min(max(float(val), lo), hi)
        return (val - lo) / (hi - lo)

class CategoryDictGenerator:
    """
    Generate dictionary for each of the categorical features.

    After ``build``, ``self.dicts[i]`` maps every kept token of feature ``i``
    to a positive integer index and the reserved key ``'<unk>'`` to 0.
    """
    def __init__(self, num_feature):
        """
        Args:
            num_feature: number of categorical features to track.
        """
        self.num_feature = num_feature
        # One token -> count table per feature; build() turns each of them
        # into a token -> index table.
        self.dicts = [collections.defaultdict(int) for _ in range(num_feature)]

    def build(self, datafile, categorial_features, cutoff=0):
        """
        Build the token -> index dictionary of every categorical feature.

        1. Count how often each token occurs per feature (empty fields are
           ignored).
        2. Per feature, keep the tokens whose count is at least ``cutoff`` and
           number them from 1, most frequent first (ties broken by token).

        Args:
            datafile: path of a text file with one tab-separated record per line.
            categorial_features: indexable of column positions; entry ``i`` is
                the column of feature ``i`` within a record.
            cutoff: minimum occurrence count for a token to get its own index.
        """
        with open(datafile, 'r') as f:
            for line in f:
                features = line.rstrip('\n').split('\t')
                for i in range(self.num_feature):
                    token = features[categorial_features[i]]
                    if token != '':
                        self.dicts[i][token] += 1
        for i in range(self.num_feature):
            kept = [item for item in self.dicts[i].items() if item[1] >= cutoff]
            kept.sort(key=lambda item: (-item[1], item[0]))
            # Enumerating the kept items directly also covers a feature with no
            # token reaching the cutoff; the result is then just {'<unk>': 0}.
            vocab = {token: rank
                     for rank, (token, _count) in enumerate(kept, start=1)}
            vocab['<unk>'] = 0
            self.dicts[i] = vocab

    def gen(self, idx, key):
        """
        Look up the index of token ``key`` for feature ``idx``.

        Returns:
            The index assigned by ``build``, or 0 (the ``'<unk>'`` index) when
            the token is not in the dictionary, e.g. because it occurred fewer
            than ``cutoff`` times.
        """
        vocab = self.dicts[idx]
        return vocab.get(key, vocab['<unk>'])

    def dicts_sizes(self):
        """
        Return the dictionary size of every feature as a list.

        Each size is ``len(self.dicts[i])``; after ``build`` that count
        includes the ``'<unk>'`` entry.
        """
        return [len(vocab) for vocab in self.dicts]

In [5]:
def _format_normalized(val):
    """Render a normalized value with at most 6 decimals, trailing zeros dropped."""
    return "{0:.6f}".format(val).rstrip('0').rstrip('.')


def preprocess(datadir, outdir):
    """
    Encode ``train.txt`` and ``test.txt`` from ``datadir`` into ``outdir``.

    The 13 integer features are min-max normalized with statistics collected
    from ``train.txt``. The 26 categorical features are replaced by integer
    indices from dictionaries built on ``train.txt`` (tokens seen fewer than
    200 times share index 0).

    Files written to ``outdir``:
      * ``train.txt`` / ``valid.txt``: comma-separated; normalized integer
        features, then categorical indices shifted by a per-feature offset so
        that all features share one index space, then the label.
      * ``train_lgb.txt`` / ``valid_lgb.txt``: tab-separated; label first,
        then normalized integer features, then the per-feature categorical
        indices without offset.
      * ``test.txt`` / ``test_lgb.txt``: same two layouts without a label.
        Test records are read with every column shifted one to the left,
        i.e. they are expected not to contain a label column.

    Roughly 90% of the training records go to the train files and 10% to the
    validation files; the split is seeded and therefore repeatable.

    Args:
        datadir: directory containing the raw ``train.txt`` and ``test.txt``.
        outdir: existing directory that receives the encoded files.

    Returns:
        List with the dictionary size of each categorical feature.

    Raises:
        ValueError: if ``outdir`` is the same directory as ``datadir``.
    """
    # The outputs reuse the names train.txt / test.txt; writing them next to
    # the inputs would truncate the raw data before it has been read.
    if os.path.realpath(datadir) == os.path.realpath(outdir):
        raise ValueError("outdir must differ from datadir: the encoded "
                         "train.txt/test.txt would overwrite the raw files")

    train_path = os.path.join(datadir, 'train.txt')

    # Integer-feature statistics.
    idFeatures = ContinuousFeatureGenerator(len(continous_features))
    idFeatures.build(train_path, continous_features)

    # Categorical-feature dictionaries.
    sparseFeatures = CategoryDictGenerator(len(categorial_features))
    sparseFeatures.build(train_path, categorial_features, cutoff=200)

    sparseFeatures_sizes = sparseFeatures.dicts_sizes()
    # Offset of feature i = summed dictionary sizes of features 0..i-1.
    categorial_feature_offset = [0]
    for size in sparseFeatures_sizes[:-1]:
        categorial_feature_offset.append(categorial_feature_offset[-1] + size)

    def encode(features, shift):
        """Encode one split record; ``shift`` is 1 when the label column is absent."""
        dense = [
            _format_normalized(idFeatures.gen(i, features[col - shift]))
            for i, col in enumerate(continous_features)
        ]
        local_ids = [
            sparseFeatures.gen(i, features[col - shift])
            for i, col in enumerate(categorial_features)
        ]
        global_ids = [
            str(local_id + categorial_feature_offset[i])
            for i, local_id in enumerate(local_ids)
        ]
        return dense, global_ids, [str(local_id) for local_id in local_ids]

    random.seed(0)

    # 90% of the data are used for training, and 10% of the data are used
    # for validation.
    with open(os.path.join(outdir, 'train.txt'), 'w') as out_train, \
            open(os.path.join(outdir, 'valid.txt'), 'w') as out_valid, \
            open(os.path.join(outdir, 'train_lgb.txt'), 'w') as train_lgb, \
            open(os.path.join(outdir, 'valid_lgb.txt'), 'w') as valid_lgb, \
            open(train_path, 'r') as f:
        for line in f:
            features = line.rstrip('\n').split('\t')
            dense, global_ids, local_ids = encode(features, 0)
            label = features[0]

            csv_row = ','.join(
                [','.join(dense), ','.join(global_ids), label]) + '\n'
            # The label is written as one field; joining its characters with
            # tabs would split a label longer than one character.
            lgb_row = (label + '\t' + '\t'.join(dense) + '\t'
                       + '\t'.join(local_ids) + '\n')

            if random.randint(0, 9999) % 10 != 0:
                out_train.write(csv_row)
                train_lgb.write(lgb_row)
            else:
                out_valid.write(csv_row)
                valid_lgb.write(lgb_row)

    with open(os.path.join(outdir, 'test.txt'), 'w') as out, \
            open(os.path.join(outdir, 'test_lgb.txt'), 'w') as test_lgb, \
            open(os.path.join(datadir, 'test.txt'), 'r') as f:
        for line in f:
            features = line.rstrip('\n').split('\t')
            dense, global_ids, local_ids = encode(features, 1)

            out.write(','.join([','.join(dense), ','.join(global_ids)]) + '\n')
            test_lgb.write('\t'.join(dense) + '\t'
                           + '\t'.join(local_ids) + '\n')

    return sparseFeatures_sizes

In [3]:
# Input directory (raw train.txt / test.txt) and output directory (encoded files).
inputdir = "/media/data/cuixuange/Criteo_dataset/rawData"
outputdir = "/media/data/cuixuange/Criteo_dataset/xgboost_data"
# preprocess() opens its output files inside outputdir, so the directory
# has to exist before the call.
os.makedirs(outputdir, exist_ok=True)
dict_sizes = preprocess(inputdir, outputdir)

NameError: name 'preprocess' is not defined

In [4]:
########## test
# Show the first record of the two generated training files.
#   train.txt:     comma-separated, label in the last field.
#   train_lgb.txt: tab-separated, label first, then integer features, then
#                  categorical features.
# A 0 among the integer features can come from an empty field, and a 0 among
# the per-feature categorical indices marks a token below the cutoff.
for preview_name in ("/train.txt", "/train_lgb.txt"):
    with open(outputdir + preview_name, "r") as f:
        first_line = next(f, None)
        if first_line is not None:
            print(first_line)

# for category_feature_nums in dict_sizes:
#     print(category_feature_nums)

0.05,0.004983,0.05,0,0.021594,0.008,0.15,0.04,0.362,0.125,0.2,0,0.04,2,86,363,737,1229,1262,1268,2246,2291,2774,2977,3929,4314,5210,5549,6055,6468,6504,7064,7250,7252,7646,7656,7668,8139,8170,0

0	0.05	0.004983	0.05	0	0.021594	0.008	0.15	0.04	0.362	0.125	0.2	0	0.04	2	16	0	0	1	1	0	3	1	481	0	0	0	3	317	0	1	27	1	2	0	0	2	0	2	0



In [11]:
import xgboost as xgb
# xgboost accepts two text input formats: 1. csv  2. libsvm

# Tab-separated files without header; column 0 is the label.
df_train = pd.read_csv("/media/data/cuixuange/Criteo_dataset/xgboost_data/train_lgb.txt", header=None, sep="\t")
df_valid = pd.read_csv("/media/data/cuixuange/Criteo_dataset/xgboost_data/valid_lgb.txt", header=None, sep="\t")

# Number of boosting rounds, passed to xgb.train as num_boost_round.
iter_num=32
params = {
#         'task': 'train',    # LightGBM option; not an xgboost parameter
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        # With several metrics, the last one (logloss) drives early stopping.
        'eval_metric': ['rmse', 'auc', 'logloss'],

        'max_depth': 5,    # a depth-5 tree has at most 2**5 = 32 leaves
#         'num_trees': 32,    # no such parameter; the number of rounds is set via num_boost_round
#         'max_leaf_nodes': 30,   # scikit-learn name; not an xgboost parameter
        'eta': 0.05,
        'colsample_bytree': 0.9,
        'subsample': 0.8,
#         'bagging_freq': 5,   # LightGBM option; xgboost has no equivalent entry here
        'verbosity': 0,
        'tree_method':"exact"
#         "tree_method":"gpu_exact"  # gpu_hist: the GPU build of xgboost is not installed yet
}


In [12]:
# xgboost trains on DMatrix objects. Column 0 of each frame is the label and
# every remaining column is a feature.
y_train = df_train[0].values
y_valid = df_valid[0].values
X_train = df_train.drop(columns=0).values
X_valid = df_valid.drop(columns=0).values
print(X_train.shape)
print(X_train[0])

# The categorical features are already integer-coded. Open question from the
# author: should they be one-hot encoded so the integers do not imply an order?
# Author's note: decision trees split on a feature's distribution rather than
# on the magnitude of its values.
# Names: I1..I13 for the integer features, C1..C26 for the categorical ones.
feature_names = (["I%d" % k for k in range(1, 14)]
                 + ["C%d" % k for k in range(1, 27)])
xgb_train = xgb.DMatrix(data=X_train, label=y_train, feature_names=feature_names)
xgb_eval = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=feature_names)

# Both sets are evaluated after every round; the per-round metric values are
# collected in evals_result_dicts.
watchlist = [(xgb_train, 'train'), (xgb_eval, 'val')]
evals_result_dicts = {}
gbm = xgb.train(
    params=params,
    dtrain=xgb_train,
    evals=watchlist,
    num_boost_round=iter_num,  # number of rounds == number of base learners
    evals_result=evals_result_dicts,
    early_stopping_rounds=5,
)

(899991, 39)
[5.0000e-02 4.9830e-03 5.0000e-02 0.0000e+00 2.1594e-02 8.0000e-03
 1.5000e-01 4.0000e-02 3.6200e-01 1.2500e-01 2.0000e-01 0.0000e+00
 4.0000e-02 2.0000e+00 1.6000e+01 0.0000e+00 0.0000e+00 1.0000e+00
 1.0000e+00 0.0000e+00 3.0000e+00 1.0000e+00 4.8100e+02 0.0000e+00
 0.0000e+00 0.0000e+00 3.0000e+00 3.1700e+02 0.0000e+00 1.0000e+00
 2.7000e+01 1.0000e+00 2.0000e+00 0.0000e+00 0.0000e+00 2.0000e+00
 0.0000e+00 2.0000e+00 0.0000e+00]
[0]	train-rmse:0.491988	train-auc:0.709839	train-logloss:0.677292	val-rmse:0.492058	val-auc:0.70533	val-logloss:0.677379
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 5 rounds.
[1]	train-rmse:0.484696	train-auc:0.714723	train-logloss:0.662941	val-rmse:0.484832	val-auc:0.709599	val-logloss:0.66323
[2]	train-rmse:0.477924	train-auc:0.715683	train-logloss:0.649871	val-rmse:0.478136	val-auc:0.710607	val-logloss:0.650247
[3]	train-rmse:0.471719	train-auc:0.7166

In [14]:
# Save the trained booster to a file.
model_path = '/media/data/cuixuange/Criteo_dataset/model/xgboost_model.txt'
gbm.save_model(model_path)

# Predict on the validation DMatrix, limited to the trees of the best round.
# NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer xgboost
# releases in favour of iteration_range / best_iteration; confirm against the
# installed version before upgrading.
best_limit = gbm.best_ntree_limit
y_pred = gbm.predict(xgb_eval, ntree_limit=best_limit)

# Evaluate: root mean squared error between the validation labels and the
# predictions (y_valid / xgb_eval hold the validation split, not a test set).
rmse = mean_squared_error(y_valid, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)


The rmse of prediction is: 0.4119949802317371


In [18]:
"""
查看每一个特征的重要程度
"""
# Importance of each feature as reported by the booster, measured by gain.
gain_by_feature = gbm.get_score(importance_type='gain')
print(gain_by_feature)


def ret_feat_impt(gbm):
    """
    Rank the features of a trained booster by their share of the total gain.

    The previous commented-out draft called ``feature_importance`` and
    ``feature_name`` and reshaped the result of ``get_score``; ``get_score``
    returns a plain dict of feature name -> gain (see the printed output of
    this cell), so the ranking is computed from that dict directly.

    Args:
        gbm: object exposing ``get_score(importance_type='gain')`` that
            returns a mapping of feature name to gain.

    Returns:
        List of ``(feature_name, gain_share)`` tuples sorted by descending
        share; the shares sum to 1 when the total gain is non-zero. Features
        absent from ``get_score`` are not listed.
    """
    gain_by_feature = gbm.get_score(importance_type='gain')
    total_gain = sum(gain_by_feature.values())
    ranked = [
        (name, gain / total_gain if total_gain else 0.0)
        for name, gain in gain_by_feature.items()
    ]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked

{'I11': 1428.951213182143, 'I1': 1409.1892370469443, 'I8': 305.8931632657471, 'I7': 1527.7021724249996, 'C9': 186.76292754444447, 'C23': 96.59272583699997, 'I6': 1002.7175735968285, 'I13': 303.08045438034947, 'C7': 59.80811584782608, 'I5': 136.02166216776703, 'C25': 80.3793927576087, 'C2': 60.36467494705882, 'C14': 95.71104377362636, 'I3': 145.98547949431946, 'C18': 52.745481670588234, 'C24': 61.54558010909091, 'C17': 110.71008439327733, 'I9': 170.2615961915315, 'I4': 184.16804658260872, 'I12': 63.84120720194445, 'C20': 63.126253089189184, 'I2': 25.425209624375, 'C11': 17.231822235000003, 'C4': 63.41161501294118, 'C6': 50.92944414090909, 'C12': 50.33730708, 'C13': 36.8681209, 'C16': 71.09249264, 'C3': 21.186661903333334, 'C26': 48.87206676625001, 'C22': 65.01264411818181, 'I10': 91.436251775, 'C21': 49.1305542, 'C15': 77.31409185833333, 'C1': 6.6306057}


In [19]:
"""GBDT 模型参数的保存"""
import pickle

# Dump the trees of the trained booster as JSON text.
# (iter_num limits training to 32 rounds, so the dump holds at most 32 trees.)
gbm.dump_model(fout="/media/data/cuixuange/Criteo_dataset/model/xgboost_dump.parameter",dump_format="json")
# Pickle the Booster object itself, so it can later be restored without
# creating a model object and calling load_model.
# The file is opened in a with-block so the handle is flushed and closed.
with open('/media/data/cuixuange/Criteo_dataset/model/xgboost_dump.pickle', 'wb') as pickle_file:
    pickle.dump(gbm, pickle_file)

In [20]:
# Tree limit recorded on the booster for its best round; the same value is
# passed as ntree_limit when predicting.
best_tree_count = gbm.best_ntree_limit
print(best_tree_count)

32
