In [1]:
import time
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from dateutil.parser import parse
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error



In [3]:
data_path = 'datas/'

train = pd.read_csv(data_path+'train.csv',encoding='gb2312')
test = pd.read_csv(data_path+'testA.csv',encoding='gb2312')

def make_feat(train,test):
    train_id = train.id.values.copy()
    test_id = test.id.values.copy()
    data = pd.concat([train,test])

    data['性别'] = data['性别'].map({'男':1,'女':0})
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - parse('2017-10-09')).dt.days
 
    data.fillna(data.median(axis=0))

    train_feat = data[data.id.isin(train_id)]
    test_feat = data[data.id.isin(test_id)]

    return train_feat,test_feat



train_feat,test_feat = make_feat(train,test)

predictors = [f for f in test_feat.columns if f not in ['血糖']]


def evalerror(pred, df):
    label = df.get_label().values.copy()
    score = mean_squared_error(label,pred)*0.5
    return ('mse',score,False)

In [6]:
train_feat

Unnamed: 0,*r-谷氨酰基转换酶,*丙氨酸氨基转换酶,*天门冬氨酸氨基转换酶,*总蛋白,*球蛋白,*碱性磷酸酶,id,中性粒细胞%,乙肝e抗体,乙肝e抗原,...,红细胞平均血红蛋白量,红细胞计数,肌酐,血小板体积分布宽度,血小板平均体积,血小板比积,血小板计数,血糖,血红蛋白,高密度脂蛋白胆固醇
0,20.23,23.10,24.96,76.88,27.28,99.59,1,54.1,,,...,31.9,5.21,77.25,17.4,9.9,0.164,166.0,6.06,166.1,1.37
1,79.00,36.25,24.57,79.43,31.67,67.21,2,52.0,,,...,29.9,5.21,87.12,10.3,9.2,0.260,277.0,5.39,156.0,0.93
2,38.17,15.23,20.82,86.23,38.23,63.69,3,48.1,1.37,0.01,...,31.3,4.76,78.19,16.6,8.3,0.199,241.0,5.59,148.8,1.64
3,20.22,10.59,14.99,70.98,26.96,74.08,4,41.7,,,...,31.9,4.29,61.46,10.8,10.3,0.260,252.0,4.30,137.0,1.43
4,22.72,14.78,20.07,78.05,36.22,75.79,5,56.6,,,...,20.6,5.15,,14.0,11.1,0.350,316.0,5.42,106.0,1.27
5,23.35,22.59,23.72,76.46,30.61,81.23,6,42.9,1.22,0.01,...,30.4,4.64,66.66,17.0,8.5,0.211,249.0,5.97,141.3,1.81
6,65.42,25.53,24.97,80.82,34.42,109.03,7,52.9,,,...,28.5,6.03,95.98,13.3,10.8,0.270,246.0,5.11,172.0,1.33
7,25.15,40.03,37.32,74.17,32.54,88.49,8,52.8,,,...,32.2,4.90,92.16,13.0,10.5,0.300,282.0,5.94,158.0,1.28
8,67.09,39.17,21.70,76.12,26.81,102.91,9,53.1,2.23,0.03,...,33.0,4.94,78.57,11.4,9.8,0.270,275.0,5.66,163.0,1.51
9,25.75,15.79,18.89,75.94,29.30,78.21,10,65.6,2.45,0.02,...,29.9,4.81,86.88,12.6,10.7,0.270,247.0,5.48,144.0,1.55


In [4]:
predictors

['*r-谷氨酰基转换酶',
 '*丙氨酸氨基转换酶',
 '*天门冬氨酸氨基转换酶',
 '*总蛋白',
 '*球蛋白',
 '*碱性磷酸酶',
 'id',
 '中性粒细胞%',
 '乙肝e抗体',
 '乙肝e抗原',
 '乙肝核心抗体',
 '乙肝表面抗体',
 '乙肝表面抗原',
 '低密度脂蛋白胆固醇',
 '体检日期',
 '单核细胞%',
 '嗜碱细胞%',
 '嗜酸细胞%',
 '尿素',
 '尿酸',
 '年龄',
 '性别',
 '总胆固醇',
 '淋巴细胞%',
 '甘油三酯',
 '白球比例',
 '白细胞计数',
 '白蛋白',
 '红细胞体积分布宽度',
 '红细胞压积',
 '红细胞平均体积',
 '红细胞平均血红蛋白浓度',
 '红细胞平均血红蛋白量',
 '红细胞计数',
 '肌酐',
 '血小板体积分布宽度',
 '血小板平均体积',
 '血小板比积',
 '血小板计数',
 '血红蛋白',
 '高密度脂蛋白胆固醇']

In [3]:

print('开始训练...')
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'sub_feature': 0.7,
    'num_leaves': 60,
    'colsample_bytree': 0.7,
    'feature_fraction': 0.7,
    'min_data': 100,
    'min_hessian': 1,
    'verbose': -1,
}

print('开始CV 5折训练...')
scores = []

train_preds = np.zeros(train_feat.shape[0])
test_preds = np.zeros((test_feat.shape[0], 5))
kf = KFold(len(train_feat), n_folds = 5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf):
    print('第{}次训练...'.format(i))
    train_feat1 = train_feat.iloc[train_index]
    train_feat2 = train_feat.iloc[test_index]
    lgb_train1 = lgb.Dataset(train_feat1[predictors], train_feat1['血糖'],categorical_feature=['性别'])
    lgb_train2 = lgb.Dataset(train_feat2[predictors], train_feat2['血糖'])
    gbm = lgb.train(params,
                    lgb_train1,
                    num_boost_round=3000,
                    valid_sets=lgb_train2,
                    verbose_eval=100,
                    feval=evalerror,
                    early_stopping_rounds=100)
    feat_imp = pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False)
    train_preds[test_index] += gbm.predict(train_feat2[predictors])
    test_preds[:,i] = gbm.predict(test_feat[predictors])
print('线下得分：    {}'.format(mean_squared_error(train_feat['血糖'],train_preds)*0.5))


submission = pd.DataFrame({'pred':test_preds.mean(axis=1)})
submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),header=None,
                  index=False, float_format='%.4f')

开始训练...
开始CV 5折训练...
第0次训练...
Training until validation scores don't improve for 100 rounds.




[100]	valid_0's l2: 3.05809	valid_0's mse: 1.52904
[200]	valid_0's l2: 2.93688	valid_0's mse: 1.46844
[300]	valid_0's l2: 2.87788	valid_0's mse: 1.43894
[400]	valid_0's l2: 2.84625	valid_0's mse: 1.42313
[500]	valid_0's l2: 2.83225	valid_0's mse: 1.41612
[600]	valid_0's l2: 2.82329	valid_0's mse: 1.41164
[700]	valid_0's l2: 2.81796	valid_0's mse: 1.40898
[800]	valid_0's l2: 2.81931	valid_0's mse: 1.40966
Early stopping, best iteration is:
[722]	valid_0's l2: 2.81754	valid_0's mse: 1.40877
第1次训练...
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 2.03955	valid_0's mse: 1.01978
[200]	valid_0's l2: 1.95976	valid_0's mse: 0.979882
[300]	valid_0's l2: 1.91954	valid_0's mse: 0.959772
[400]	valid_0's l2: 1.89954	valid_0's mse: 0.949768
[500]	valid_0's l2: 1.88848	valid_0's mse: 0.94424
[600]	valid_0's l2: 1.87978	valid_0's mse: 0.93989
[700]	valid_0's l2: 1.87526	valid_0's mse: 0.93763
[800]	valid_0's l2: 1.87567	valid_0's mse: 0.937836
Early stopping, best i