In [171]:
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from dateutil.parser import parse
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error



data_path = 'datas/'



train = pd.read_csv(data_path+'train_contain38.csv',encoding='gb2312')
#train = pd.read_csv(data_path+'train_nocontain38.csv',encoding='gb2312')
test = pd.read_csv(data_path+'testA.csv',encoding='gb2312')

def make_feat(train,test):

    data = pd.concat([train,test])

    data['性别'] = data['性别'].map({'男':1,'女':0})
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - parse('2017-09-09')).dt.days
    
    
    data["年龄分段"] = np.ceil(data["年龄"] / 10)
    data["年龄分段"].where(data["年龄分段"] <7, 7.0, inplace=True)
    data["年龄分段"].where(data["年龄分段"] >3, 3.0, inplace=True)

    #data.fillna(data.mean(axis=0),inplace=True)
    data.drop(['id'],axis=1,inplace=True)
    train_feat = data[:5642]
    
    
#     train_feat["体检日期"]
    
    cache1=pd.DataFrame()
    for item in train_feat["体检日期"].unique():
        median=train_feat[train_feat["体检日期"]==item].median(axis=0)
        item_df=train_feat[train_feat["体检日期"]==item].copy()
        item_df.fillna(-999,inplace=True)
        cache1=pd.concat([cache1,item_df])
    cache1.fillna(cache1.median(axis=0),inplace=True)
    
    
    test_feat = data[5642:]
    cache2=pd.DataFrame()
    for item in test_feat["体检日期"].unique():
        median=test_feat[test_feat["体检日期"]==item].median(axis=0)
        item_df=test_feat[test_feat["体检日期"]==item].copy()
        item_df.fillna(-999,inplace=True)
        cache2=pd.concat([cache2,item_df])
    cache2.fillna(cache2.median(axis=0),inplace=True)
    
    
    return cache1,cache2

def evalerror(pred, df):
    label = df.get_label().values.copy()
    score = mean_squared_error(label,pred)*0.5
    return ('0.5mse',score,False)

In [180]:
train_feat,test_feat = make_feat(train,test)
#predictors = [f for f in test_feat.columns if f not in ['血糖']]
categorical_features=['性别','年龄','体检日期']

In [181]:
# corr_matrix=train_feat.corr()
# relative_df=corr_matrix["血糖"]
# predictor=relative_df[relative_df.values>0.0].index.tolist()
predictors = [f for f in train_feat.columns if f not in ['血糖']]

In [178]:
from sklearn.model_selection import GridSearchCV

In [None]:
lgb_train1 = lgb.Dataset(train_feat[predictors], train_feat['血糖'],categorical_feature=categorical_features)
lgb_train2 = lgb.Dataset(train_feat[predictors], train_feat['血糖'],categorical_feature=categorical_features)
gbm = lgb.train(params,
                    lgb_train1,
                    num_boost_round=3000,
                    valid_sets=lgb_train2,
                    verbose_eval=100,
                    feval=evalerror,
                    early_stopping_rounds=500)

In [None]:
model=GridSearchCV(gbm,parameters)

In [183]:
print('开始训练...')
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'mse',
    'sub_feature': 0.7,
    'num_leaves': 60,
    'colsample_bytree': 0.7,
    'feature_fraction': 0.7,
    'min_data': 100,
    'min_hessian': 1,
    'verbose': -1,
}

train_preds_copy=np.zeros((train_feat.shape[0],5))
train_preds = np.zeros(train_feat.shape[0])
test_preds = np.zeros((test_feat.shape[0], 5))
kf = KFold(len(train_feat), n_folds = 5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf):
    print('第{}次训练...'.format(i))
    
    
    train_feat1 = train_feat.iloc[train_index]
    train_feat2 = train_feat.iloc[test_index]
    
    
    lgb_train1 = lgb.Dataset(train_feat1[predictors], train_feat1['血糖'],categorical_feature=categorical_features)
    lgb_train2 = lgb.Dataset(train_feat2[predictors], train_feat2['血糖'],categorical_feature=categorical_features)
    gbm = lgb.train(params,
                    lgb_train1,
                    num_boost_round=3000,
                    valid_sets=lgb_train2,
                    verbose_eval=100,
                    feval=evalerror,
                    early_stopping_rounds=500)
    feat_imp = pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False)
    
    train_preds[test_index] += gbm.predict(train_feat2[predictors])
    
    test_preds[:,i] = gbm.predict(test_feat[predictors])
    train_preds_copy[:,i]=gbm.predict(train_feat[predictors])
    
print('线下得分：{}'.format(mean_squared_error(train_feat['血糖'],train_preds)*0.5))


开始训练...
第0次训练...




Training until validation scores don't improve for 500 rounds.
[100]	valid_0's l2: 2.01956	valid_0's 0.5mse: 1.00978
[200]	valid_0's l2: 1.94163	valid_0's 0.5mse: 0.970817
[300]	valid_0's l2: 1.91445	valid_0's 0.5mse: 0.957227
[400]	valid_0's l2: 1.90384	valid_0's 0.5mse: 0.951921
[500]	valid_0's l2: 1.90204	valid_0's 0.5mse: 0.95102
[600]	valid_0's l2: 1.90594	valid_0's 0.5mse: 0.952969
[700]	valid_0's l2: 1.91168	valid_0's 0.5mse: 0.955842
[800]	valid_0's l2: 1.91688	valid_0's 0.5mse: 0.95844
[900]	valid_0's l2: 1.92464	valid_0's 0.5mse: 0.962318
Early stopping, best iteration is:
[468]	valid_0's l2: 1.90111	valid_0's 0.5mse: 0.950557
第1次训练...
Training until validation scores don't improve for 500 rounds.
[100]	valid_0's l2: 1.89349	valid_0's 0.5mse: 0.946747
[200]	valid_0's l2: 1.82592	valid_0's 0.5mse: 0.912962
[300]	valid_0's l2: 1.80973	valid_0's 0.5mse: 0.904864
[400]	valid_0's l2: 1.80564	valid_0's 0.5mse: 0.902818
[500]	valid_0's l2: 1.8067	valid_0's 0.5mse: 0.903349
[600]	val

In [184]:
# train_predict=pd.DataFrame({'train':train_preds})
submission = pd.DataFrame({'pred':test_preds.mean(axis=1)})
submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),header=None,
                  index=False, float_format='%.4f')

In [185]:
submission.describe()

Unnamed: 0,pred
count,1000.0
mean,5.773664
std,0.571403
min,4.779484
25%,5.327164
50%,5.667671
75%,6.130102
max,8.401238


In [41]:
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()

In [168]:
lin_reg.fit(train_feat[predictors],train_feat["血糖"].values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [169]:
lin_reg.predict(test_feat)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [43]:
from sklearn.model_selection  import cross_val_score

In [44]:
np.mean(cross_val_score(lin_reg,train_preds_copy,train_feat["血糖"].values,cv=10,scoring="neg_mean_squared_error"))

-0.8300305673762193

In [20]:
final_predict=lin_reg.predict(test_preds)

In [30]:
test_preds[968]

array([9.18947329, 7.85101792, 7.82053412, 7.2681858 , 7.68962494])

In [23]:
final_predict.min()

3.9581510156214748

In [166]:
submission = pd.DataFrame({'pred':test_preds.mean(axis=1)})
submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),header=None,
                  index=False, float_format='%.4f')

In [167]:
submission.describe()

Unnamed: 0,pred
count,1000.0
mean,5.702297
std,0.60319
min,4.704549
25%,5.232757
50%,5.576749
75%,6.059049
max,8.373323
