<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#导入相关的包" data-toc-modified-id="导入相关的包-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>导入相关的包</a></span></li><li><span><a href="#读入相关数据" data-toc-modified-id="读入相关数据-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>读入相关数据</a></span></li><li><span><a href="#初步处理" data-toc-modified-id="初步处理-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>初步处理</a></span></li><li><span><a href="#模型参数" data-toc-modified-id="模型参数-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>模型参数</a></span></li><li><span><a href="#交叉验证训练" data-toc-modified-id="交叉验证训练-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>交叉验证训练</a></span></li><li><span><a href="#形成提交结果" data-toc-modified-id="形成提交结果-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>形成提交结果</a></span></li></ul></div>

# 导入相关的包

In [1]:
import time
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from dateutil.parser import parse
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
%matplotlib inline



# 读入相关数据

In [3]:
# Load the raw train/test tables (GBK-encoded) and the pre-computed feature tables.
trains = pd.read_csv('../raw_data/d_train.csv', encoding="gbk")
tests = pd.read_csv("../raw_data/d_test_A.csv", encoding="gbk")
fea_train = pd.read_csv("../raw_data/fea_train.csv")
fea_test = pd.read_csv("../raw_data/fea_test.csv")

# Remove training rows whose age is below 20 or above 85, or whose blood
# sugar exceeds 15. Rows with missing values compare as False and are kept.
outlier_rows = (
    (trains["年龄"] < 20)
    | (trains["年龄"] > 85)
    | (trains["血糖"] > 15)
)
trains = trains.loc[~outlier_rows]

# Attach the extra feature columns to each table, matching rows on "id".
trains = trains.merge(fea_train, how="left", on="id")
tests = tests.merge(fea_test, how="left", on="id")

# 初步处理

In [4]:
def make_feat(train,test):
    """Apply the shared feature encoding to the train and test tables.

    Both frames are stacked so the encoding is applied once, then split back
    by row position. Splitting by position (rather than by id membership)
    keeps every input row in exactly one output, even if an id happens to
    occur in both frames.

    Args:
        train: DataFrame containing at least the '性别' and '体检日期' columns.
        test: DataFrame with the same feature columns as ``train``.

    Returns:
        Tuple ``(train_feat, test_feat)`` with the same number of rows, in the
        same order, as ``train`` and ``test``. In both frames:
          * '性别' is mapped to 1 for '男' and 0 for '女'; any other value
            becomes NaN.
          * '体检日期' is replaced by the integer number of days relative to
            2017-10-09.
        Missing values are left as they are (no imputation is performed).
    """
    n_train = len(train)
    data = pd.concat([train,test])
    data['性别'] = data['性别'].map({'男': 1,'女': 0})
    # Express the examination date as a day offset from a fixed reference date.
    reference_date = pd.Timestamp('2017-10-09')
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - reference_date).dt.days
    # The first n_train rows of the stacked frame are the training rows.
    # Copies are returned so callers can modify the results independently.
    train_feat = data.iloc[:n_train].copy()
    test_feat = data.iloc[n_train:].copy()
    return train_feat,test_feat
def evalerror(pred,df):
    """Custom evaluation function for ``lgb.train``: half the mean squared error.

    Args:
        pred: Predicted values for the rows of ``df``.
        df: Object exposing ``get_label()``; this is the validation
            ``lgb.Dataset`` when the function is passed as ``feval``.

    Returns:
        Tuple ``('mse', score, False)``: the metric name, the value
        ``0.5 * MSE(label, pred)``, and the is-higher-better flag (False,
        i.e. lower is better).
    """
    # get_label() may hand back a pandas Series or a plain numpy array
    # depending on the LightGBM version; np.asarray accepts both, whereas
    # `.values` only exists on pandas objects.
    label = np.asarray(df.get_label())
    score = mean_squared_error(label,pred) * 0.5
    return ('mse',score,False)

# 模型参数

In [5]:
# LightGBM training parameters passed to lgb.train for every fold.
params = {
    'learning_rate': 0.015,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 32,
    'min_data_in_leaf': 25,
    # NOTE(review): LightGBM documents that bagging is only enabled when
    # bagging_freq is non-zero; no bagging_freq is set here, so this value is
    # presumably inactive. Left unchanged to keep results comparable - confirm.
    'bagging_fraction':0.8,
    # 'sub_feature' is an alias of 'feature_fraction'. Both were set to 0.7;
    # only the canonical name is kept to avoid passing the same parameter twice.
    "feature_fraction":0.7,
    'min_sum_hessian_in_leaf': 1,
    'verbose': -1,
    "max_depth":6,
    "max_bin":150,
    "lambda_l2":0.3
    }

# 交叉验证训练

In [None]:
%%time
train, test = make_feat(trains, tests)
# Every column except the target and identifier columns is used as a feature.
predictors = [f for f in list(train.columns) if f not in ["血糖","blood_sugar","id","blood_sugar_log"]]

# Number of folds; it sizes both the splitter and the per-fold test-prediction
# matrix, so it is defined once to keep the two in agreement.
N_FOLDS = 10

scores = []  # per-fold validation score (0.5 * MSE), one entry per fold
t0 = time.time()
train_preds = np.zeros(train.shape[0])             # out-of-fold predictions
test_preds = np.zeros((test.shape[0], N_FOLDS))    # one column per fold model
fold_importances = []                              # one Series per fold

# KFold comes from the legacy sklearn.cross_validation module imported above:
# it takes the sample count and n_folds, and is iterated directly to yield
# (train positions, validation positions) pairs.
kf = KFold(len(train),n_folds=N_FOLDS,shuffle=True,random_state=1024)
for i,(train_index,test_index) in enumerate(kf):
    print('第{}次训练...'.format(i))
    train_feat1 = train.iloc[train_index]   # rows used to fit this fold
    train_feat2 = train.iloc[test_index]    # held-out rows for validation
    lgb_train1 = lgb.Dataset(train_feat1[predictors],train_feat1['血糖'])
    lgb_train2 = lgb.Dataset(train_feat2[predictors],train_feat2['血糖'])
    gbm = lgb.train(params,
                    lgb_train1,
                    num_boost_round=3000,
                    valid_sets=lgb_train2,
                    verbose_eval=100,
                    feval=evalerror,
                    early_stopping_rounds=50)
    # Keep the importances indexed by feature name and labelled with the fold
    # number; they are combined once after the loop.
    fold_importances.append(pd.Series(gbm.feature_importance(),index=predictors,name=i))
    # Each training row is held out in exactly one fold, so plain assignment
    # fills the out-of-fold prediction vector.
    valid_pred = gbm.predict(train_feat2[predictors],num_iteration=gbm.best_iteration)
    train_preds[test_index] = valid_pred
    scores.append(mean_squared_error(train_feat2['血糖'],valid_pred) * 0.5)
    test_preds[:,i] = gbm.predict(test[predictors],num_iteration=gbm.best_iteration)

# Feature-importance table: one row per feature, one column per fold.
feat_imp = pd.concat(fold_importances,axis=1)

第0次训练...
Train until valid scores didn't improve in 50 rounds.
[100]	valid_0's l2: 1.12587	valid_0's mse: 0.562937
[200]	valid_0's l2: 1.08168	valid_0's mse: 0.540841
[300]	valid_0's l2: 1.07363	valid_0's mse: 0.536813
Early stopping, best iteration is:
[275]	valid_0's l2: 1.07244	valid_0's mse: 0.536221
第1次训练...
Train until valid scores didn't improve in 50 rounds.
[100]	valid_0's l2: 1.58713	valid_0's mse: 0.793566
[200]	valid_0's l2: 1.52794	valid_0's mse: 0.763969
[300]	valid_0's l2: 1.49794	valid_0's mse: 0.748969
[400]	valid_0's l2: 1.4813	valid_0's mse: 0.740651
[500]	valid_0's l2: 1.47456	valid_0's mse: 0.737279
Early stopping, best iteration is:
[494]	valid_0's l2: 1.47407	valid_0's mse: 0.737033
第2次训练...
Train until valid scores didn't improve in 50 rounds.
[100]	valid_0's l2: 1.37083	valid_0's mse: 0.685414
[200]	valid_0's l2: 1.30031	valid_0's mse: 0.650157
[300]	valid_0's l2: 1.26857	valid_0's mse: 0.634284
[400]	valid_0's l2: 1.25506	valid_0's mse: 0.627531
Early stopping

# 形成提交结果

In [9]:
# Offline score: half the MSE between the true labels and the out-of-fold
# predictions collected during cross-validation.
offline_score = 0.5 * mean_squared_error(train['血糖'], train_preds)
print('线下得分：    {}'.format(offline_score))

# The submitted prediction for each test row is the mean of the per-fold
# model predictions (one column of test_preds per fold).
submission = pd.DataFrame({'pred': np.mean(test_preds, axis=1)})
print(submission.describe())
# Writing the submission file is currently disabled:
# submission.to_csv(r'./submission/sub_lgb_9_1_c.csv',header=False,index=False)

线下得分：    0.6145393093157447
CV训练用时49.51361036300659秒
              pred
count  1000.000000
mean      5.677924
std       0.533952
min       4.852984
25%       5.269824
50%       5.561731
75%       5.938684
max       8.459070
