<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#导入相关的包" data-toc-modified-id="导入相关的包-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>导入相关的包</a></span></li><li><span><a href="#导入数据" data-toc-modified-id="导入数据-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>导入数据</a></span></li><li><span><a href="#初步处理" data-toc-modified-id="初步处理-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>初步处理</a></span></li><li><span><a href="#模型参数" data-toc-modified-id="模型参数-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>模型参数</a></span></li><li><span><a href="#交叉验证训练" data-toc-modified-id="交叉验证训练-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>交叉验证训练</a></span></li><li><span><a href="#获取提交结果" data-toc-modified-id="获取提交结果-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>获取提交结果</a></span></li></ul></div>

# 导入相关的包

In [2]:
import time
import numpy as np
import pandas as pd
from dateutil.parser import parse
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
%matplotlib inline

# 导入数据

In [4]:
# Read the raw train/test tables (GBK-encoded) and the pre-computed feature tables.
trains = pd.read_csv('../raw_data/d_train.csv', encoding="gbk")
tests = pd.read_csv("../raw_data/d_test_A.csv", encoding="gbk")
fea_train = pd.read_csv("../raw_data/fea_train.csv")
fea_test = pd.read_csv("../raw_data/fea_test.csv")

# Discard training rows whose age (年龄) is below 20 or above 85, or whose
# blood sugar (血糖) exceeds 15. Rows with a missing value in either column
# fail every comparison and are therefore kept.
is_outlier = (trains["年龄"] < 20) | (trains["年龄"] > 85) | (trains["血糖"] > 15)
trains = trains[~is_outlier]

# Attach the extra features to each table by id; the left join keeps every raw row.
trains = trains.merge(fea_train, how="left", on="id")
tests = tests.merge(fea_test, how="left", on="id")

# 初步处理

In [10]:
def make_feat(train,test):
    """Encode the raw columns shared by the train and test frames in one pass.

    The two frames are stacked so both receive identical encodings:

    * ``性别`` (sex) is mapped to 1 for ``男`` and 0 for ``女``; any other
      value becomes NaN.
    * ``体检日期`` (exam date) is replaced by the signed number of whole days
      between that date and 2017-10-09.

    The stacked frame is then split back by row position rather than by id
    membership, so an id that occurs in both inputs can no longer move rows
    from one split into the other.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain ``性别`` and ``体检日期``.
    test : pandas.DataFrame
        Test rows with the same two columns. Columns present in only one of
        the frames are filled with NaN in the other.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_feat, test_feat)`` with the same row counts and row order as
        the inputs. Both are independent copies.
    """
    n_train = len(train)
    data = pd.concat([train, test])
    data['性别'] = data['性别'].map({'男': 1, '女': 0})
    reference_day = pd.Timestamp('2017-10-09')
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - reference_day).dt.days
    # Positional split: the first n_train rows are exactly the training rows.
    train_feat = data.iloc[:n_train].copy()
    test_feat = data.iloc[n_train:].copy()
    return train_feat, test_feat
# Custom validation metric: half of the mean squared error
def evalerror(pred,df):
    """Return the halved mean squared error of ``pred`` against the labels of ``df``.

    ``df`` must expose ``get_label()``; presumably it is the ``xgb.DMatrix``
    that ``xgb.train`` hands to its ``feval`` callback. The result is the pair
    ``('mse', value)``; note that ``value`` is 0.5 * MSE, not the plain MSE.
    """
    half_mse = 0.5 * mean_squared_error(df.get_label(), pred)
    return 'mse', half_mse

# 模型参数

In [8]:
# Booster settings passed as the first argument of xgb.train() in the
# cross-validation cell. Meanings in the trailing comments follow the public
# XGBoost parameter documentation.
# NOTE(review): newer XGBoost releases name this objective "reg:squarederror"
# and replace "silent" with "verbosity" — confirm against the installed version.
# NOTE(review): "verbose" is not a parameter I can find in the XGBoost docs;
# it is probably ignored — confirm before relying on it.
params = {
    "objective":"reg:linear",   # squared-error regression
    "eta":0.015,                # learning rate (step-size shrinkage)
    "min_child_weight":7,       # minimum sum of instance weight in a child
    "subsample":0.8,            # fraction of rows sampled per tree
    "colsample_bytree":0.8,     # fraction of columns sampled per tree
    "lambda":0.1,               # L2 regularization on weights
    "seed":42,                  # random seed
    "silent":1,
    "verbose":0,
    "max_depth":6,              # maximum tree depth
    "alpha":0,                  # L1 regularization on weights
    "gamma":0.3                 # minimum loss reduction required to split
}

# 交叉验证训练

In [11]:
# Encode both tables, then use every column as a feature except the targets and the id.
train, test = make_feat(trains, tests)
non_features = ("血糖", "blood_sugar", "id", "blood_sugar_log")
predictors = [col for col in train.columns if col not in non_features]

scores = []
t0 = time.time()

# One out-of-fold prediction per training row, and one test-prediction column per fold.
n_splits = 10
train_preds = np.zeros(len(train))
test_preds = np.zeros((len(test), n_splits))
feat_imp = pd.DataFrame()

kf = KFold(len(train), n_folds=n_splits, shuffle=True, random_state=1024)
xgb_test = xgb.DMatrix(test[predictors])

for i, (train_index, test_index) in enumerate(kf):
    print('第{}次训练...'.format(i))

    # Fit on the training part of the fold; the held-out part drives early stopping.
    train_feat1, train_feat2 = train.iloc[train_index], train.iloc[test_index]
    xgb_train1 = xgb.DMatrix(train_feat1[predictors], label=train_feat1["血糖"])
    xgb_train2 = xgb.DMatrix(train_feat2[predictors], label=train_feat2["血糖"])
    watchlist = [(xgb_train2, 'val')]

    xgb_model = xgb.train(
        params,
        xgb_train1,
        num_boost_round=3000,
        evals=watchlist,
        feval=evalerror,
        early_stopping_rounds=50,
        verbose_eval=100,
    )

    # Predict with the tree count of the best validation round.
    best_limit = xgb_model.best_ntree_limit
    train_preds[test_index] += xgb_model.predict(xgb_train2, ntree_limit=best_limit)
    test_preds[:, i] = xgb_model.predict(xgb_test, ntree_limit=best_limit)

第0次训练...
[0]	val-mse:12.9675
Will train until val-mse hasn't improved in 50 rounds.
[100]	val-mse:1.14491
[200]	val-mse:0.563343
[300]	val-mse:0.532045
[400]	val-mse:0.527955
[500]	val-mse:0.526679
Stopping. Best iteration:
[469]	val-mse:0.525522

第1次训练...
[0]	val-mse:13.8394
Will train until val-mse hasn't improved in 50 rounds.
[100]	val-mse:1.41955
[200]	val-mse:0.777474
[300]	val-mse:0.734567
[400]	val-mse:0.729968
[500]	val-mse:0.727167
Stopping. Best iteration:
[499]	val-mse:0.726645

第2次训练...
[0]	val-mse:13.4655
Will train until val-mse hasn't improved in 50 rounds.
[100]	val-mse:1.32368
[200]	val-mse:0.68773
[300]	val-mse:0.640367
[400]	val-mse:0.633986
[500]	val-mse:0.632091
[600]	val-mse:0.63163
Stopping. Best iteration:
[637]	val-mse:0.629294

第3次训练...
[0]	val-mse:13.3822
Will train until val-mse hasn't improved in 50 rounds.
[100]	val-mse:1.37573
[200]	val-mse:0.774106
[300]	val-mse:0.741249
[400]	val-mse:0.73636
Stopping. Best iteration:
[374]	val-mse:0.736081

第4次训练...
[0

# 获取提交结果

In [12]:
# Offline score: half the MSE of the out-of-fold predictions against the true blood sugar.
offline_score = 0.5 * mean_squared_error(train['血糖'], train_preds)
print('线下得分：    {}'.format(offline_score))

# Wall-clock seconds since t0 was set in the cross-validation cell.
elapsed = time.time() - t0
print('CV训练用时{}秒'.format(elapsed))

# Final test prediction = mean of the per-fold test predictions.
fold_mean = test_preds.mean(axis=1)
submission = pd.DataFrame({'pred': fold_mean})
print(submission.describe())
# Writing the submission file is left disabled:
# submission.to_csv(r'./submission/sub_xgb_9_2_h.csv',header=False,index=False)

线下得分：    0.6130645724666798
CV训练用时114.94473528862秒
              pred
count  1000.000000
mean      5.669386
std       0.560775
min       4.808103
25%       5.260629
50%       5.546157
75%       5.946548
max       9.558611
