In [1]:
import sys
import pickle
import numpy as np
import datetime as dt
import xgboost as xgb
from datetime import datetime

In [2]:
# Load the label encoders, i.e. the mapping between the encoded feature
# values and their original raw values.
with open("../data/les.pickle", "rb") as les_file:
    les = pickle.load(les_file)

In [3]:
# Load the raw cost configuration records.
with open("../data/config.pickle", "rb") as f:
    config = pickle.load(f)

# store_code string as it appears in the config -> integer location id
# ("all" maps to 0). Built once instead of once per record.
location_map = {"all": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5}

# item2config[item_id][location] == [A, B], where (see `evaluate`) A weights
# under-prediction and B weights over-prediction.
item2config = {}
for record in config:
    # "a_b" holds the two coefficients joined by an underscore: "<A>_<B>".
    a_b_parts = record["a_b"].split('_')
    A = float(a_b_parts[0])
    B = float(a_b_parts[1])
    item_id = int(record["item_id"])
    location = location_map[record["store_code"]]
    # setdefault keeps the first [A, B] seen for a given (item, location);
    # later duplicates are ignored.
    item2config.setdefault(item_id, {}).setdefault(location, [A, B])

In [4]:
class XGBoost(object):
    """Thin wrapper that trains an xgboost booster when constructed.

    Parameters of ``__init__``:
        X_train, y_train: training features / targets (passed to ``xgb.DMatrix``).
        X_val, y_val: validation features / targets, used as the eval set.
        cost_infos: sequence of six per-row arrays, in this order:
            [train_avg, train_less, train_more, val_avg, val_less, val_more].
            "less" is the cost weight applied when the prediction is below
            the target, "more" when it is above.
        use_custom_loss: when True, train with the asymmetric quadratic
            objective and the asymmetric evaluation metric defined below.
            Defaults to False, which keeps xgboost's default objective and
            metric (the behaviour callers had before this flag existed).

    The trained booster is stored in ``self.bst``.
    """

    def normalize(self, y):
        """Return ``log(y + 1) / max_log_y``. Not used by training itself."""
        return np.log(y + 1) / self.max_log_y

    def denormalize(self, prob):
        """Inverse of ``normalize``: ``exp(prob * max_log_y) - 1``."""
        return np.exp(prob * self.max_log_y) - 1

    def __init__(self, X_train, y_train, X_val, y_val, cost_infos,
                 use_custom_loss=False):
        # Largest log(y + 1) over both sets; scaling constant for normalize().
        self.max_log_y = max(np.max(np.log(y_train + 1)),
                             np.max(np.log(y_val + 1)))

        self.train_avg_cost = cost_infos[0]
        self.train_less_cost = cost_infos[1]
        self.train_more_cost = cost_infos[2]
        self.val_avg_cost = cost_infos[3]
        self.val_less_cost = cost_infos[4]
        self.val_more_cost = cost_infos[5]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        evallist = [(dval, 'eval')]
        param = {'nthread': -1,
                 'max_depth': 7,
                 'eta': 0.01,
                 'silent': 1,
                 'colsample_bytree': 0.7,
                 'subsample': 0.7}
        num_round = 100

        if use_custom_loss:
            # NOTE(review): `feval` is the keyword used by the xgboost
            # releases that accept 'silent'; newer releases name it
            # `custom_metric` -- confirm against the installed version.
            self.bst = xgb.train(param, dtrain, num_round, evallist,
                                 obj=self._asymmetric_objective,
                                 feval=self._asymmetric_eval)
        else:
            self.bst = xgb.train(param, dtrain, num_round, evallist)

    def _asymmetric_objective(self, y_pred, dtrain):
        """Gradient and hessian of the asymmetric quadratic loss.

        Loss(y_true, y_pred) is
            0.5 * B * (y_pred - y_true) ** 2   when y_pred >= y_true
            0.5 * A * (y_pred - y_true) ** 2   when y_pred <  y_true
        with B = train_more_cost and A = train_less_cost per row, so the
        first and second derivatives are taken piecewise.

        Returns:
            (grad, hess): two arrays with one entry per training row.
        """
        y_true = dtrain.get_label()
        y_delta = y_pred - y_true
        over_predicted = y_delta >= 0
        row_cost = np.where(over_predicted,
                            self.train_more_cost,
                            self.train_less_cost)
        grad = row_cost * y_delta
        hess = row_cost
        return grad, hess

    def _asymmetric_eval(self, preds, dval):
        """Asymmetric linear cost on the validation set, as ('error', cost).

        This per-day cost will be larger than the final evaluation, because
        here every single day is scored on its own whereas the final
        evaluation scores the 14-day total; it is only meant for watching
        how the loss evolves during training.
        """
        labels = dval.get_label()
        less_diff = labels - preds
        more_diff = preds - labels
        less_indexs = less_diff > 0
        more_indexs = more_diff > 0
        cost = 0
        cost += np.sum(less_diff[less_indexs] * self.val_less_cost[less_indexs])
        cost += np.sum(more_diff[more_indexs] * self.val_more_cost[more_indexs])
        return 'error', cost

    def predict(self, feature):
        """Return the booster's predictions for the rows of ``feature``."""
        dtest = xgb.DMatrix(feature)
        return self.bst.predict(dtest)

In [5]:
# Sum the per-row predictions (and targets) over the 14-day horizon.
def getPredictResultOfTwoWeek(X_val, y_val, y_predict, les):
    """Aggregate per-row values into per-(item, location) totals.

    Column 0 of ``X_val`` is decoded with ``les[0]`` into item ids and
    column 1 with ``les[1]`` into locations; both are cast to ``int``.

    Returns:
        (item_predict_dict, item_target_dict): two nested dicts of the form
        ``{item_id: {location: total}}`` holding the summed ``y_predict``
        and the summed ``y_val`` respectively.
    """
    decoded_items = les[0].inverse_transform(X_val[:, 0])
    decoded_locations = les[1].inverse_transform(X_val[:, 1])

    item_predict_dict = {}
    item_target_dict = {}
    for row, predicted in enumerate(y_predict):
        item_key = int(decoded_items[row])
        location_key = int(decoded_locations[row])
        target = y_val[row]

        predicted_by_location = item_predict_dict.setdefault(item_key, {})
        target_by_location = item_target_dict.setdefault(item_key, {})
        predicted_by_location[location_key] = predicted_by_location.get(location_key, 0) + predicted
        target_by_location[location_key] = target_by_location.get(location_key, 0) + target
    return item_predict_dict, item_target_dict

In [6]:
# Evaluation.
def evaluate(X_val, y_val, y_predict, config, les):
    """Return the total asymmetric cost of the aggregated predictions.

    Predictions and targets are first summed per (item, location) by
    ``getPredictResultOfTwoWeek``. Each pair then contributes
    ``A * max(target - predicted, 0) + B * max(predicted - target, 0)``.

    Note: the ``config`` argument is not read; the [A, B] coefficients come
    from the module-level ``item2config`` mapping.
    """
    predicted_totals, target_totals = getPredictResultOfTwoWeek(X_val, y_val, y_predict, les)
    total_cost = 0
    for item_id, predicted_by_location in predicted_totals.items():
        for location, predicted in predicted_by_location.items():
            target = target_totals[item_id][location]
            coefficients = item2config[int(item_id)][location]
            A = coefficients[0]
            B = coefficients[1]
            shortfall = max(target - predicted, 0)
            surplus = max(predicted - target, 0)
            total_cost += A * shortfall + B * surplus
    return total_cost

In [7]:
# Load the pickled train / validation (features, targets) pairs and the test
# features. `with` guarantees each file handle is closed, even if
# unpickling raises.
with open("../data/train_xy.pickle", "rb") as train_file:
    (X_train, y_train) = pickle.load(train_file)
with open("../data/val_xy.pickle", "rb") as val_file:
    (X_val, y_val) = pickle.load(val_file)
with open("../data/test_x.pickle", "rb") as test_file:
    X_test = pickle.load(test_file)

In [67]:
# Simple feature selection: keep columns 0..15 plus columns 16..29.
selected_feature_index = list(range(30))
X_train = X_train[:, selected_feature_index]
X_val = X_val[:, selected_feature_index]
X_test = X_test[:, selected_feature_index]

In [8]:
# Per-row cost weights for the training set: cost of predicting too little
# (A), too much (B), and their average.
train_avg_cost = []
train_less_cost = []
train_more_cost = []
item_id_list = les[0].inverse_transform(X_train[:, 0])
location_list = les[1].inverse_transform(X_train[:, 1])
for i in range(len(X_train)):
    # Cast the decoded keys to int, as evaluate() and
    # getPredictResultOfTwoWeek() do, so they match item2config's int keys.
    item_id = int(item_id_list[i])
    store_code = int(location_list[i])
    A, B = item2config[item_id][store_code]
    train_less_cost.append(A)
    train_more_cost.append(B)
    train_avg_cost.append(0.5 * (A + B))

In [9]:
# Per-row cost weights for the validation set: cost of predicting too little
# (A), too much (B), and their average.
val_avg_cost = []
val_less_cost = []
val_more_cost = []
item_id_list = les[0].inverse_transform(X_val[:, 0])
location_list = les[1].inverse_transform(X_val[:, 1])
for i in range(len(X_val)):
    # Cast the decoded keys to int, as evaluate() and
    # getPredictResultOfTwoWeek() do, so they match item2config's int keys.
    item_id = int(item_id_list[i])
    store_code = int(location_list[i])
    A, B = item2config[item_id][store_code]
    val_less_cost.append(A)
    val_more_cost.append(B)
    val_avg_cost.append(0.5 * (A + B))

In [10]:
# Turn every per-row cost list into a numpy array, then bundle them in the
# order the XGBoost wrapper expects:
# [train_avg, train_less, train_more, val_avg, val_less, val_more].
(train_avg_cost, train_less_cost, train_more_cost,
 val_avg_cost, val_less_cost, val_more_cost) = map(
    np.array,
    (train_avg_cost, train_less_cost, train_more_cost,
     val_avg_cost, val_less_cost, val_more_cost))
cost_infos = [
    train_avg_cost,
    train_less_cost,
    train_more_cost,
    val_avg_cost,
    val_less_cost,
    val_more_cost,
]

In [None]:
# Train the booster; training happens inside the constructor.
model = XGBoost(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    cost_infos=cost_infos,
)

[0]	eval-rmse:11.317424
[1]	eval-rmse:11.221126


In [69]:
# This prediction uses the real values of the 14 days preceding each X_val
# row as features.
y_predict = model.predict(feature=X_val)

In [70]:
# Total cost when predicting from real features.
print(evaluate(X_val, y_val, y_predict, config, les))

726636.861351


In [71]:
# Predict one day at a time, feeding each day's predictions back in as
# features for the following day.
recent_njhs_col = 16   # first of the 14 consecutive columns (16..29) that are shifted each day
predict_size = 5778    # number of rows in one day's slice
forecast_days = 14     # number of consecutive one-day slices to predict

# Work on a copy so X_val keeps its real feature values: re-running the
# "predict by real" cell after this one must not see predicted features.
# Columns 0 and 1 (the ones later cells decode) are never modified.
X_val_rolling = X_val.copy()

y_predict = []
for day in range(forecast_days):
    today = slice(day * predict_size, (day + 1) * predict_size)
    X_val_oneday = X_val_rolling[today]
    y_pre = model.predict(X_val_oneday)
    y_predict.extend(y_pre)
    if day + 1 == forecast_days:
        break  # last day: there is no following day to prepare

    tomorrow = slice((day + 1) * predict_size, (day + 2) * predict_size)
    # Columns 14 and 15 are always taken from the first day's rows.
    X_val_rolling[tomorrow, 14] = X_val_rolling[0:predict_size, 14]
    X_val_rolling[tomorrow, 15] = X_val_rolling[0:predict_size, 15]
    # Shift the 13 newest values one column to the right...
    X_val_rolling[tomorrow, recent_njhs_col + 1:recent_njhs_col + 14] = \
        X_val_rolling[today, recent_njhs_col:recent_njhs_col + 13]
    # ...and put today's prediction in the first column of the window.
    X_val_rolling[tomorrow, recent_njhs_col] = y_pre

In [72]:
# Total cost when predicting from fed-back predictions.
print(evaluate(X_val, y_val, y_predict, config, les))

1871911.62489


In [None]:
# predict by real      predict by predict       online test
#   665300.141584           1642929.52312           1511180
# ---------------------------------------------------------
#   761987.420657           1890359.84909               ???
#   748644.131440           1779823.92518               ???
#   701252.662177           1688810.22229               ???

In [46]:
# Predict one day at a time on the final test set, feeding each day's
# predictions back in as features for the following day.
recent_njhs_col = 16   # first of the 14 consecutive columns (16..29) that are shifted each day
predict_size = 5778    # number of rows in one day's slice
forecast_days = 14     # number of consecutive one-day slices to predict

# Work on a copy so X_test itself is not overwritten with predicted features.
# Columns 0 and 1 (the ones later cells decode) are never modified.
X_test_rolling = X_test.copy()

y_predict = []
for day in range(forecast_days):
    today = slice(day * predict_size, (day + 1) * predict_size)
    X_test_oneday = X_test_rolling[today]
    y_pre = model.predict(X_test_oneday)
    y_predict.extend(y_pre)
    if day + 1 == forecast_days:
        break  # last day: there is no following day to prepare

    tomorrow = slice((day + 1) * predict_size, (day + 2) * predict_size)
    # Columns 14 and 15 are always taken from the first day's rows.
    X_test_rolling[tomorrow, 14] = X_test_rolling[0:predict_size, 14]
    X_test_rolling[tomorrow, 15] = X_test_rolling[0:predict_size, 15]
    # Shift the 13 newest values one column to the right...
    X_test_rolling[tomorrow, recent_njhs_col + 1:recent_njhs_col + 14] = \
        X_test_rolling[today, recent_njhs_col:recent_njhs_col + 13]
    # ...and put today's prediction in the first column of the window.
    X_test_rolling[tomorrow, recent_njhs_col] = y_pre

In [47]:
# Aggregate the 14 days of test predictions per (item, location). There are
# no targets for the test set, so the predictions are passed as y_val too.
item_predict_dict, item_target_dict = getPredictResultOfTwoWeek(
    X_val=X_test,
    y_val=y_predict,
    y_predict=y_predict,
    les=les,
)

In [48]:
# Write the final submission file: one "item_id,store_code,prediction" row
# per (item, location), terminated by CRLF. Location 0 is written as "all".
# `with` closes the file even if a write raises.
with open("../data/result.csv", "wb") as outfile:
    for item_id in item_predict_dict.keys():
        for location in item_predict_dict[item_id].keys():
            y = item_predict_dict[item_id][location]
            store_code = "all" if location == 0 else str(int(location))
            res = [str(int(item_id)), store_code, str(y)]
            outfile.write(",".join(res) + "\r\n")