In [1]:
import numpy as np
import pandas as pd
import jieba
import time
import warnings

warnings.filterwarnings('ignore')

In [2]:
feature_data = pd.read_csv('../tmp/data_fmt_all.csv', encoding='utf-8')
feature_data.head(3)

Unnamed: 0,vid,004997,0101,0102,0104,0105,0106,0107,0108,0109,...,Y19001,Y19004,Y29001,Y29002,Y29003,Y29004,Y29005,Y29021,Y29058,Y79001
0,000330ad1f424114719b7525f400660b,,双侧甲状腺大小形态正常，包膜光整，实质回声均匀，光点稍粗，未见明显异常回声。CDFI：血流显...,甲状腺彩超（含颈部淋巴细胞）未发现明显异常$前列腺:前列腺稍大膀胱、双侧输尿管未发现明显异常...,,,,,,,...,,,,,,,,,,
1,000381f0069cbf7537e6aac8923034ae,,,左肾、右肾、膀胱、双侧输尿管未发现明显异常$子宫:绝经后子宫左附件、右附件未发现明显异常$肝...,,,,,,,...,,,,,,,,,,
2,0003848ebd8d8163603760d53d975693,,"双侧乳腺腺体层增厚,内部结构紊乱,回声不均，未见异常血流。$双侧甲状腺大小形态正常，包膜尚完...","左肾、右肾、膀胱、双侧输尿管未发现明显异常$双侧乳腺小叶增生$甲状腺双叶回声不均匀,建议甲功...",,,,,,,...,,,,,,,,,,


In [3]:
# 眼部检测：13
columns = feature_data.columns
eye_feats = [x for x in columns if x[0:2] == '13']
print(eye_feats)

['1301', '1302', '1303', '1304', '1305', '1308', '131', '1313', '1314', '1315', '1316', '1319', '1320', '1321', '1322', '1325', '1326', '1328', '1329', '1330', '1331', '1332', '1333', '1334', '1335', '1336', '1337', '1338', '134', '1340', '1341', '1343', '1345', '1346', '1349', '1350', '1353', '1354', '1356', '1357', '1358', '1359', '1362', '1363', '137', '139', '1391']


In [4]:
feature_data.drop(eye_feats, axis=1, inplace=True)

In [5]:
feature_data.drop(['30001'], axis=1, inplace=True) # 血型

In [6]:
brain_feats = [x for x in columns if x[0:2] == 'A3']
brain_feats

['A301', 'A302', 'A39001', 'A39002', 'A39003']

In [7]:
feature_data.drop(brain_feats, axis=1, inplace=True)

In [8]:
# 标本已退检 -> np.nan
start = time.time()
feature_data.replace(to_replace='标本已退检', value=np.nan, regex=True, inplace=True)
feature_data.replace(to_replace='弃查', value=np.nan, regex=True, inplace=True)
feature_data.replace(to_replace='未查', value=np.nan, regex=True, inplace=True)
feature_data.replace(to_replace='详见报告', value=np.nan, regex=True, inplace=True)
feature_data.replace(to_replace='详见检验单', value=np.nan, regex=True, inplace=True)
feature_data.replace(to_replace='详见纸质报告', value=np.nan, regex=True, inplace=True)
feature_data.replace(to_replace='详见报告单', value=np.nan, regex=True, inplace=True)
end = time.time()
print('time used: ', end - start)

time used:  728.2200708389282


In [9]:
feature_data.dropna(axis=1, how='all', inplace=True)

In [10]:
obj_type_data_info = feature_data.select_dtypes(include=['object']).describe().T.assign(missing_pct=feature_data.apply(lambda x: (len(x) - x.count()) / float(len(x))))
obj_type_data_info = obj_type_data_info.sort_values(by='missing_pct')
obj_type_data_info.head()

Unnamed: 0,count,unique,top,freq,missing_pct
vid,57298,57298,4d2968bf27508cd193e9c139c9633a93,1,0.0
1814,57001,5938,15,1426,0.005183
0102,56461,46421,肝、胆、胰、脾、左肾、右肾未发现明显异常,297,0.014608
2302,56225,36,健康,53917,0.018727
190,56170,6816,63,516,0.019687


In [11]:
start = time.time()
for feat in obj_type_data_info.index:
    if feat != 'vid':
        feature_data[feat].replace(to_replace='<|>', value='', regex=True, inplace=True)
        try:
            feature_data[feat] = feature_data[feat].astype(float)
        except:
            pass
end = time.time()

print('Time used: ', end - start)

Time used:  243.02489233016968


In [12]:
obj_type_data_info_missing = obj_type_data_info[obj_type_data_info['missing_pct'] > 0.995]
obj_type_feat_missing = obj_type_data_info_missing.index
print(len(obj_type_feat_missing))
obj_type_feat_missing[0:5]

1255


Index(['669035', '300025', 'Q99024', '799009', '069023'], dtype='object')

In [13]:
feature_data.drop(obj_type_data_info_missing.index, axis=1, inplace=True)

In [14]:
obj_type_data_info_new = feature_data.select_dtypes(include=['object']).describe().T.assign(missing_pct=feature_data.apply(lambda x: (len(x) - x.count()) / float(len(x))))
obj_type_data_info_new = obj_type_data_info_new.sort_values(by=['missing_pct'])
obj_type_data_info_new

Unnamed: 0,count,unique,top,freq,missing_pct
vid,57298,57298,4d2968bf27508cd193e9c139c9633a93,1,0.000000
1814,57001,5938,15,1426,0.005183
0102,56461,46421,肝、胆、胰、脾、左肾、右肾未发现明显异常,297,0.014608
2302,56225,36,健康,53917,0.018727
0113,55968,8879,肝脏大小、形态正常，包膜光整，肝内血管走行较清晰，回声均匀。,10765,0.023212
0114,55956,6059,胆囊大小、形态正常，囊壁光整，囊腔内透声好，胆总管无扩张。,30782,0.023421
0116,55844,703,脾脏大小、形态正常，包膜光整，回声均匀。,22850,0.025376
191,55617,15056,320,160,0.029338
1001,55286,3965,正常心电图,13151,0.035115
2404,55137,948,170,811,0.037715


In [15]:
obj_feat = [x for x in obj_type_data_info_new.index]
len(obj_feat)

379

In [16]:
abnormal_num_feat1 = ['1814', '191', '2403', '2404', '1815', '10004', '192', '1117', '1115', '314', '183', '10003',
                      '31', '38' , '312', '32', '313', '2333', '2372', '2406', '2420', '155']
for feat in abnormal_num_feat1:
    feature_data[feat] = feature_data[feat].apply(lambda x: x if (isinstance(x, float) or isinstance(x, int)) else x.strip().split('$')[0])
    feature_data[feat].replace(to_replace='降脂后复查', value=999, regex=True, inplace=True)
    feature_data[feat].replace(to_replace='未见', value=np.nan, regex=True, inplace=True)
    feature_data[feat].replace(to_replace='CLT1D', value=np.nan, regex=True, inplace=True) # 1815
    feature_data[feat].replace(to_replace='4.14.', value='4.14', regex=True, inplace=True)  # 10004
    feature_data[feat].replace(to_replace='.3.70', value='3.70', regex=True, inplace=True) # 10004
    feature_data[feat].replace(to_replace='16.7.07', value='16.70', regex=True, inplace=True) # 192
    feature_data[feat].replace(to_replace='12.01.', value='12.01', regex=True, inplace=True) # 192
    feature_data[feat].replace(to_replace='77..21', value='77.21', regex=True, inplace=True) # 183
    feature_data[feat].replace(to_replace='5.10.', value='5.10', regex=True, inplace=True) # 31
    feature_data[feat].replace(to_replace='8.53.', value='8.53', regex=True, inplace=True) # 38
    feature_data[feat].replace(to_replace='-', value='.', regex=True, inplace=True)  # 312
    feature_data[feat].replace(to_replace='\+\+', value=999, regex=True, inplace=True) # 312
    feature_data[feat].replace(to_replace='2.1.', value='2.1', regex=True, inplace=True) # 
    feature_data[feat].replace(to_replace=' 脂血', value='', regex=True, inplace=True) # 313

    try:
        feature_data[feat] = feature_data[feat].astype(float)
    except:
        pass

In [17]:
feature_data['0104'].replace(to_replace='心内各结构未见明显异常', value=np.nan, regex=True, inplace=True)
feature_data['0104'] = feature_data['0104'].astype(float)

In [18]:
feature_data['1106'].replace(to_replace='1.81.', value=1.81, regex=True, inplace=True)
feature_data['1106'].replace(to_replace='1..75', value=1.75, regex=True, inplace=True)
feature_data['1106'] = feature_data['1106'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['1106'] = feature_data['1106'].astype(float)

In [19]:
feature_data['129'].replace(to_replace='0.81.', value=0.81, regex=True, inplace=True)
feature_data['129'] = feature_data['129'].astype(float)

In [20]:
feature_data['155'].replace(to_replace='(μIU/ml)', value='', regex=True, inplace=True)
feature_data['155'].replace(to_replace='\(\)', value='', regex=True, inplace=True)
feature_data['155'].replace(to_replace='S', value='', regex=True, inplace=True)
feature_data['155'].replace(to_replace=' ', value=np.nan, inplace=True)
feature_data['155'].replace(to_replace='', value=np.nan, inplace=True)
feature_data['155'] = feature_data['155'].astype(float)

In [21]:
feature_data['157'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['157'] = feature_data['157'].astype(float)

In [22]:
feature_data['164'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['164'].replace(to_replace='正常', value=np.nan, inplace=True)
feature_data['164'].replace(to_replace='＜', value='', regex=True, inplace=True)
feature_data['164'] = feature_data['164'].astype(float)

In [23]:
feature_data['166'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['166'].replace(to_replace='正常', value=np.nan, inplace=True)
feature_data['166'].replace(to_replace='见报告单', value=np.nan, inplace=True)
feature_data['166'].replace(to_replace='＜', value='', regex=True, inplace=True)
feature_data['166'] = feature_data['166'].astype(float)

In [24]:
feature_data['1127'] = feature_data['1127'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['1127'] = feature_data['1127'].astype(float)

In [25]:
feature_data['1171'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['1171'].replace(to_replace='＜', value='', regex=True, inplace=True)
feature_data['1171'].replace(to_replace='1..27', value=1.27, regex=True, inplace=True)
feature_data['1171'] = feature_data['1171'].astype(float)

In [26]:
feature_data['21A004'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['21A004'] = feature_data['21A004'].astype(float)

In [27]:
feature_data['2163'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['2163'] = feature_data['2163'].astype(float)

In [28]:
feature_data['2168'] = feature_data['2168'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['2168'] = feature_data['2168'].astype(float)

In [29]:
feature_data['2247'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['2247'].replace(to_replace='阳性', value=999, inplace=True)
feature_data['2247'].replace(to_replace='弱阳性', value=499, inplace=True)
feature_data['2247'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['2247'].replace(to_replace='+', value=999, inplace=True)
feature_data['2247'] = feature_data['2247'].astype(float)

In [30]:
feature_data['2371'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['2371'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['2371'].replace(to_replace='+', value=999, inplace=True)
feature_data['2371'] = feature_data['2371'].astype(float)

In [31]:
feature_data['2420'] = feature_data['2420'].astype(float)

In [32]:
feature_data['2421'] = feature_data['2421'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['2421'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['2421'] = feature_data['2421'].astype(float)

In [33]:
feature_data['2424'].replace(to_replace='光感', value=np.nan, inplace=True)
feature_data['2424'] = feature_data['2424'].astype(float)

In [34]:
feature_data['2425'].replace(to_replace='光感', value=np.nan, inplace=True)
feature_data['2425'].replace(to_replace='指数', value=np.nan, inplace=True)
feature_data['2425'] = feature_data['2425'].astype(float)

In [35]:
feature_data['2413'].replace(to_replace='\(理想指数：75以上\)', value='', regex=True, inplace=True)
feature_data['2413'] = feature_data['2413'].astype(float)

In [36]:
feature_data['269003'].replace(to_replace='14.4.', value=14.4, inplace=True)
feature_data['269003'].replace(to_replace='----', value=np.nan, regex=True, inplace=True)
feature_data['269003'] = feature_data['269003'].astype(float)

In [37]:
feature_data['269004'].replace(to_replace='----', value=np.nan, inplace=True)
feature_data['269004'] = feature_data['269004'].astype(float)

In [38]:
feature_data['269005'].replace(to_replace='----', value=np.nan, inplace=True)
feature_data['269005'] = feature_data['269005'].astype(float)

In [39]:
feature_data['269006'].replace(to_replace='126.0.', value=126.0, inplace=True)
feature_data['269006'] = feature_data['269006'].astype(float)

In [40]:
feature_data['269012'].replace(to_replace='116.0.', value=116.0, inplace=True)
feature_data['269012'] = feature_data['269012'].astype(float)

In [41]:
feature_data['269013'].replace(to_replace='未见', value=np.nan, regex=True, inplace=True)
feature_data['269013'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['269013'].replace(to_replace='-', value='.', regex=True, inplace=True)
feature_data['269013'] = feature_data['269013'].astype(float)  

In [42]:
feature_data['269011'] = feature_data['269011'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['269011'] = feature_data['269011'].astype(float)

In [43]:
feature_data['279034'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['279034'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['279034'].replace(to_replace='+', value=999, inplace=True)
#feature_data['279034'].replace(to_replace='.', value=np.nan, regex=True, inplace=True)
feature_data['279034'] = feature_data['279034'].astype(float)

In [44]:
feature_data['312'].replace(to_replace='\+', value='', regex=True, inplace=True)
feature_data['312'] = feature_data['312'].astype(float)

In [45]:
feature_data['339127'].replace(to_replace='0..52', value=0.52, inplace=True)
feature_data['339127'] = feature_data['339127'].astype(float)

In [46]:
feature_data['339131'].replace(to_replace='未做', value=np.nan, inplace=True)
feature_data['339131'] = feature_data['339131'].astype(float)

In [47]:
feature_data['360'].replace(to_replace='.', value=np.nan, inplace=True)
feature_data['360'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['360'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['360'].replace(to_replace='阳性(+)', value=499, inplace=True)
feature_data['360'].replace(to_replace='+', value=499, inplace=True)
feature_data['360'].replace(to_replace='+-', value=299, inplace=True)
feature_data['360'].replace(to_replace='++', value=699, inplace=True)
feature_data['360'].replace(to_replace='+++', value=999, inplace=True)
feature_data['360'].replace(to_replace='1+', value=499, inplace=True)
feature_data['360'].replace(to_replace='2+', value=699, inplace=True)
feature_data['360'].replace(to_replace='3+', value=999, inplace=True)
feature_data['360'].replace(to_replace='-', value='.', regex=True, inplace=True)
feature_data['360'] = feature_data['360'].astype(float)

In [48]:
feature_data['369044'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['369044'].replace(to_replace='-', value='.', regex=True, inplace=True)
feature_data['369044'].replace(to_replace='未见', value=np.nan, inplace=True)
feature_data['369044'].replace(to_replace='\+', value='', regex=True, inplace=True)
feature_data['369044'] = feature_data['369044'].astype(float)

In [49]:
feature_data['3801'].replace(to_replace='正常', value=np.nan, regex=True, inplace=True)
feature_data['3801'] = feature_data['3801'].astype(float)

In [50]:
feature_data['3803'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['3803'] = feature_data['3803'].astype(float)

In [51]:
feature_data['3804'].replace(to_replace='1、7.71', value=17.71, regex=True, inplace=True)
feature_data['3804'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['3804'] = feature_data['3804'].astype(float)

In [52]:
feature_data['3807'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['3807'] = feature_data['3807'].astype(float)

In [53]:
feature_data['3810'].replace(to_replace='s', value='', regex=True, inplace=True)
feature_data['3810'] = feature_data['3810'].astype(float)

In [54]:
feature_data['979012'].replace(to_replace='未见', value=np.nan, regex=True, inplace=True)
feature_data['979012'] = feature_data['979012'].astype(float)

In [55]:
feature_data['300005'].replace(to_replace='未见', value=np.nan, regex=True, inplace=True)
feature_data['300005'].replace(to_replace='偶见', value=np.nan, regex=True, inplace=True)
feature_data['300005'].replace(to_replace='阴性', value=np.nan, regex=True, inplace=True)
feature_data['300005'].replace(to_replace='未检出', value=np.nan, regex=True, inplace=True)
feature_data['300005'].replace(to_replace='满视野', value=999, regex=True, inplace=True)
feature_data['300005'].replace(to_replace='/LP', value='', regex=True, inplace=True)
feature_data['300005'].replace(to_replace='/HP', value='', regex=True, inplace=True)
feature_data['300005'].replace(to_replace='/hp', value='', regex=True, inplace=True)
feature_data['300005'].replace(to_replace='\+', value='', regex=True, inplace=True)
feature_data['300005'].replace(to_replace='++', value=999, inplace=True)
feature_data['300005'].replace(to_replace='少量', value=50, inplace=True)
feature_data['300005'].replace(to_replace='少数', value=50, inplace=True)
feature_data['300005'].replace(to_replace='少许', value=50, inplace=True)
feature_data['300005'].replace(to_replace='阳性(+)', value=999, inplace=True)
feature_data['300005'].replace(to_replace='阳性', value=999, regex=True, inplace=True)
feature_data['300005'].replace(to_replace='脓白1', value=999, regex=True, inplace=True)
feature_data['300005'].replace(to_replace='个', value='', regex=True, inplace=True)
feature_data['300005'].replace(to_replace='个/LP', value='', regex=True, inplace=True)
feature_data['300005'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['300005'].replace(to_replace=' ', value=np.nan, inplace=True)
feature_data['300005'].replace(to_replace='', value=np.nan, inplace=True)
feature_data['300005'].replace(to_replace='２－５', value=2.5, inplace=True)
feature_data['300005'].replace(to_replace='１－３', value=1.30, inplace=True)
feature_data['300005'].replace(to_replace='０－２', value=0.2, inplace=True)
feature_data['300005'].replace(to_replace='Ⅱ', value=2, inplace=True)
feature_data['300005'].replace(to_replace='-', value='.', regex=True, inplace=True)
feature_data['300005'].replace(to_replace='.', value=np.nan, inplace=True)
feature_data['300005'].replace(to_replace='3.20.', value=3.20, inplace=True)
feature_data['300005'].replace(to_replace='12..15', value=12.15, inplace=True)
feature_data['300005'].replace(to_replace='3..4', value=3.4, inplace=True)
feature_data['300005'].replace(to_replace='\(2.4\)', value=2.40, regex=True, inplace=True)
feature_data['300005'].replace(to_replace='\(', value='', regex=True, inplace=True)
feature_data['300005'].replace(to_replace='\)', value='', regex=True, inplace=True)
feature_data['300005'] = feature_data['300005'].astype(float)

In [56]:
feature_data['300021'] = feature_data['300021'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['300021'] = feature_data['300021'].astype(float)

In [57]:
feature_data['300030'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['300030'].replace(to_replace='正常', value=np.nan, inplace=True)
feature_data['300030'] = feature_data['300030'].astype(float)

In [58]:
feature_data['300031'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['300031'].replace(to_replace='正常', value=np.nan, inplace=True)
feature_data['300031'] = feature_data['300031'].astype(float)

In [59]:
feature_data['300032'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['300032'].replace(to_replace='正常', value=np.nan, inplace=True)
feature_data['300032'] = feature_data['300032'].astype(float)

In [60]:
feature_data['300067'] = feature_data['300067'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['300067'] = feature_data['300067'].astype(float)

In [61]:
feature_data['300071'].replace(to_replace='0.00-25.00', value=12.50, regex=True, inplace=True)
feature_data['300071'] = feature_data['300071'].astype(float)

In [62]:
feature_data['300073'] = feature_data['300073'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['300073'] = feature_data['300073'].astype(float)

In [63]:
feature_data['300084'].replace(to_replace='008.70r', value=8.70, inplace=True)
feature_data['300084'].replace(to_replace='001.20r', value=1.20, inplace=True)
feature_data['300084'].replace(to_replace='11.80/.', value=11.80, inplace=True)
feature_data['300084'] = feature_data['300084'].astype(float)

In [64]:
feature_data['300093'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['300093'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['300093'] = feature_data['300093'].astype(float)

In [65]:
feature_data['300099'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['300099'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['300099'].replace(to_replace='\(umol/L\)', value=np.nan, regex=True, inplace=True)
feature_data['300099'] = feature_data['300099'].astype(float)

In [66]:
feature_data['300120'].replace(to_replace='阳性（+）', value=999, inplace=True)
feature_data['300120'].replace(to_replace='﹥', value='', regex=True, inplace=True)
feature_data['300120'] = feature_data['300120'].astype(float)

In [67]:
feature_data['300121'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['300121'] = feature_data['300121'].astype(float)

In [68]:
feature_data['300122'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['300122'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['300122'] = feature_data['300122'].astype(float)

In [69]:
feature_data['300123'].replace(to_replace='阳性', value=999, inplace=True)
feature_data['300123'].replace(to_replace='阳性（+）', value=999, inplace=True)
feature_data['300123'] = feature_data['300123'].astype(float)

In [70]:
feature_data['300124'].replace(to_replace='阳性', value=999, inplace=True)
feature_data['300124'].replace(to_replace='阳性（+）', value=999, inplace=True)
feature_data['300124'] = feature_data['300124'].astype(float)

In [71]:
feature_data['300130'].replace(to_replace='+', value=999, inplace=True)
feature_data['300130'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['300130'] = feature_data['300130'].astype(float)

In [72]:
feature_data['300134'].replace(to_replace='＞', value='', regex=True, inplace=True)
feature_data['300134'].replace(to_replace=',5.00', value=5.00, inplace=True)
feature_data['300134'] = feature_data['300134'].astype(float)

In [73]:
feature_data['300152'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['300152'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['300152'].replace(to_replace='+++', value=999, inplace=True)
feature_data['300152'] = feature_data['300152'].astype(float)

In [74]:
feature_data['459210'].replace(to_replace='15.73.', value=15.73, inplace=True)
feature_data['459210'] = feature_data['459210'].astype(float)

In [75]:
feature_data['459278'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['459278'].replace(to_replace='+', value=999, inplace=True)
feature_data['459278'] = feature_data['459278'].astype(float)

In [76]:
feature_data['1107'] = feature_data['1107'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['1107'] = feature_data['1107'].astype(float)

In [77]:
feature_data['709001'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['709001'] = feature_data['709001'].astype(float)

In [78]:
feature_data['709002'].replace(to_replace='\(U/ml\)', value='', regex=True, inplace=True)
feature_data['709002'] = feature_data['709002'].astype(float)

In [79]:
feature_data['709005'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709005'] = feature_data['709005'].astype(float)

In [80]:
feature_data['709006'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709006'].replace(to_replace='**.*', value=np.nan, inplace=True)
feature_data['709006'] = feature_data['709006'].astype(float)

In [81]:
feature_data['709007'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709007'].replace(to_replace='**.*', value=np.nan, inplace=True)
feature_data['709007'] = feature_data['709007'].astype(float)

In [82]:
feature_data['709008'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709008'] = feature_data['709008'].astype(float)

In [83]:
feature_data['709009'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709009'] = feature_data['709009'].astype(float)

In [84]:
feature_data['709010'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709010'] = feature_data['709010'].astype(float)

In [85]:
feature_data['709011'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709011'].replace(to_replace='***.**', value=np.nan, inplace=True)
feature_data['709011'].replace(to_replace='0.14.', value=0.14, inplace=True)
feature_data['709011'] = feature_data['709011'].astype(float)

In [86]:
feature_data['709012'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709012'].replace(to_replace='***.**', value=np.nan, inplace=True)
feature_data['709012'] = feature_data['709012'].astype(float)

In [87]:
feature_data['709014'].replace(to_replace='0.1.', value=0.1, inplace=True)
feature_data['709014'].replace(to_replace='.0.04', value=0.04, inplace=True)
feature_data['709014'] = feature_data['709014'].astype(float)

In [88]:
feature_data['709015'].replace(to_replace='5.42.', value=5.42, inplace=True)
feature_data['709015'].replace(to_replace='4.42.', value=4.42, inplace=True)
feature_data['709015'] = feature_data['709015'].astype(float)

In [89]:
feature_data['709017'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709017'] = feature_data['709017'].astype(float)

In [90]:
feature_data['709018'].replace(to_replace='80.285.2', value=80.2, inplace=True)
feature_data['709018'] = feature_data['709018'].astype(float)

In [91]:
feature_data['709021'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709021'] = feature_data['709021'].astype(float)

In [92]:
feature_data['709024'].replace(to_replace='.9.8', value=9.8, inplace=True)
feature_data['709024'] = feature_data['709024'].astype(float)

In [93]:
feature_data['709026'].replace(to_replace='0.22.', value=0.22, inplace=True)
feature_data['709026'].replace(to_replace='0.21.', value=0.21, inplace=True)
feature_data['709026'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['709026'] = feature_data['709026'].astype(float)

In [94]:
feature_data['669002'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['669002'] = feature_data['669002'].astype(float)

In [95]:
feature_data['809001'] = feature_data['809001'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['809001'] = feature_data['809001'].astype(float)

In [96]:
feature_data['809014'].replace(to_replace='2.01.', value=2.01, inplace=True)
feature_data['809014'] = feature_data['809014'].astype(float)

In [97]:
feature_data['899001'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['899001'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['899001'] = feature_data['899001'].astype(float)

In [98]:
feature_data['669023'].replace(to_replace='\(ng/mL\)', value='', regex=True, inplace=True)
feature_data['669023'] = feature_data['669023'].astype(float)

In [99]:
feature_data['1112'].replace(to_replace=' %', value='', regex=True, inplace=True)
feature_data['1112'] = feature_data['1112'].astype(float)

In [100]:
feature_data['1474'] = feature_data['1474'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['1474'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['1474'] = feature_data['1474'].astype(float)

In [101]:
feature_data['10009'] = feature_data['10009'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['10009'].replace(to_replace='4.3.', value=4.3, inplace=True)
feature_data['10009'] = feature_data['10009'].astype(float)

In [102]:
feature_data['100012'] = feature_data['100012'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['100012'] = feature_data['100012'].astype(float)

In [103]:
feature_data['100014'].replace(to_replace='20.908.', value=20.908, inplace=True)
feature_data['100014'] = feature_data['100014'].astype(float)

In [104]:
feature_data['2177'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['2177'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['2177'].replace(to_replace='+', value=999, inplace=True)
feature_data['2177'].replace(to_replace='10.0.', value=10.0, inplace=True)
feature_data['2177'] = feature_data['2177'].astype(float)

In [105]:
feature_data['2386'] = feature_data['2386'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['2386'] = feature_data['2386'].astype(float)

In [106]:
feature_data['669009'] = feature_data['669009'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['669009'] = feature_data['669009'].astype(float)

In [107]:
feature_data['809025'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['809025'] = feature_data['809025'].astype(float)

In [108]:
feature_data['2376'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['2376'].replace(to_replace='+', value=999, inplace=True)
feature_data['2376'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['2376'].replace(to_replace='阳性', value=999, inplace=True)
feature_data['2376'] = feature_data['2376'].astype(float)

In [109]:
feature_data['300008'].replace(to_replace='标本已退检', value=np.nan, inplace=True)
feature_data['300008'] = feature_data['300008'].astype(float)

In [110]:
feature_data['300013'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['300013'].replace(to_replace='6.31.0.45', value=6.31, inplace=True)
feature_data['300013'] = feature_data['300013'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['300013'] = feature_data['300013'].astype(float)

In [111]:
feature_data['809021'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['809021'] = feature_data['809021'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['809021'] = feature_data['809021'].astype(float)

In [112]:
feature_data['300048'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['300048'] = feature_data['300048'].astype(float)

In [113]:
feature_data['2409'].replace(to_replace='% \(正常值 12-22%\)', value='', regex=True, inplace=True)
feature_data['2409'].replace(to_replace='% \(正常值 11-21%\)', value='', regex=True, inplace=True)
feature_data['2409'].replace(to_replace='% \(正常值 21-34%\)', value='', regex=True, inplace=True)
feature_data['2409'].replace(to_replace='\(正常值 12-22%\)', value='', regex=True, inplace=True)
feature_data['2409'] = feature_data['2409'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['2409'] = feature_data['2409'].astype(float)

In [114]:
feature_data['2410'] = feature_data['2410'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['2410'].replace(to_replace='3.0.0', value=3.00, inplace=True)
feature_data['2410'] = feature_data['2410'].astype(float)

In [115]:
feature_data['2412'] = feature_data['2412'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['2412'] = feature_data['2412'].astype(float)

In [116]:
feature_data['300001'] = feature_data['300001'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['300001'] = feature_data['300001'].astype(float)

In [117]:
feature_data['669021'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['669021'].replace(to_replace='2.70.', value=2.70, inplace=True)
feature_data['669021'] = feature_data['669021'].astype(float)

In [118]:
feature_data['300014'].replace(to_replace='/', value=np.nan, inplace=True)
feature_data['300014'] = feature_data['300014'].astype(float)

In [119]:
feature_data['300009'].replace(to_replace='.45.21', value=45.21, inplace=True)
feature_data['300009'] = feature_data['300009'].astype(float)

In [120]:
feature_data['669003'].replace(to_replace='--', value=np.nan, inplace=True)
feature_data['669003'] = feature_data['669003'].astype(float)

In [121]:
feature_data['20002'] = feature_data['20002'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['20002'] = feature_data['20002'].astype(float)

In [122]:
feature_data['300074'] = feature_data['300074'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['300074'] = feature_data['300074'].astype(float)

In [123]:
feature_data['809052'].replace(to_replace='32..5', value=32.5, inplace=True)
feature_data['809052'] = feature_data['809052'].astype(float)

In [124]:
feature_data['300076'].replace(to_replace='女性肿瘤指标', value=np.nan, inplace=True)
feature_data['300076'] = feature_data['300076'].astype(float)

In [125]:
feature_data['1873'].replace(to_replace='阴性', value=np.nan, inplace=True)
feature_data['1873'] = feature_data['1873'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['1873'].replace(to_replace='＜', value='', regex=True, inplace=True)
feature_data['1873'].replace(to_replace='+', value=999, inplace=True)
feature_data['1873'] = feature_data['1873'].astype(float)

In [126]:
feature_data['A701'].replace(to_replace='kpa', value='', regex=True, inplace=True)
feature_data['A701'].replace(to_replace='6,3', value=6.3, inplace=True)
feature_data['A701'] = feature_data['A701'].astype(float)

In [127]:
feature_data['A702'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['A702'].replace(to_replace='fo', value=np.nan, inplace=True)
feature_data['A702'] = feature_data['A702'].astype(float)

In [128]:
feature_data['A703'].replace(to_replace='db/m', value='', regex=True, inplace=True)
feature_data['A703'] = feature_data['A703'].astype(float)

In [129]:
feature_data['A704'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['A704'].replace(to_replace='脂肪肝', value=999, inplace=True)
feature_data['A704'] = feature_data['A704'].astype(float)

In [130]:
feature_data['809007'] = feature_data['809007'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['809007'] = feature_data['809007'].astype(float)

In [131]:
feature_data['819007'].replace(to_replace='+-', value=np.nan, inplace=True)
feature_data['819007'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['819007'].replace(to_replace='-', value='.', regex=True, inplace=True)
feature_data['819007'].replace(to_replace='+', value=20, inplace=True)
feature_data['819007'].replace(to_replace='++', value=40, inplace=True)
feature_data['819007'].replace(to_replace='+++', value=60, inplace=True)
#feature_data['819007'].replace(to_replace='.', value=np.nan, inplace=True)
feature_data['819007'].replace(to_replace='未见', value=np.nan, inplace=True)
feature_data['819007'] = feature_data['819007'].astype(float)

In [132]:
feature_data['269026'].replace(to_replace='----', value='',inplace=True)
feature_data['269026'].replace(to_replace=' ', value=np.nan, inplace=True)
feature_data['269026'].replace(to_replace='', value=np.nan, inplace=True)
feature_data['269026'] = feature_data['269026'].astype(float)

In [133]:
feature_data['321'].replace(to_replace='---', value='', inplace=True)
feature_data['321'].replace(to_replace='----', value='', inplace=True)
feature_data['321'].replace(to_replace='-----', value='', inplace=True)
feature_data['321'].replace(to_replace='--', value='', inplace=True)
feature_data['321'].replace(to_replace='', value=np.nan, inplace=True)
feature_data['321'] = feature_data['321'].astype(float)

In [134]:
feature_data['2407'] = feature_data['2407'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['2407'] = feature_data['2407'].astype(float)

In [135]:
feature_data['2986'] = feature_data['2986'].apply(lambda x: x if isinstance(x, float) else x.strip().split('$')[0])
feature_data['2986'] = feature_data['2986'].astype(float)

In [136]:
feature_data['300036'].replace(to_replace='+', value=999, inplace=True)
feature_data['300036'].replace(to_replace='+-', value=499, inplace=True)
feature_data['300036'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['300036'].replace(to_replace='阳性', value=999, regex=True, inplace=True)
feature_data['300036'].replace(to_replace='弱阳', value=666, regex=True, inplace=True)
feature_data['300036'].replace(to_replace='阴性', value=np.nan, regex=True, inplace=True)
feature_data['300036'].replace(to_replace='A', value=np.nan, regex=True, inplace=True)
feature_data['300036'] = feature_data['300036'].astype(float)

In [137]:
feature_data['300078'].replace(to_replace='女性肿瘤指标', value=np.nan, inplace=True)
feature_data['300078'].replace(to_replace='2..99', value=2.99, inplace=True)
feature_data['300078'] = feature_data['300078'].astype(float)

In [138]:
feature_data['979001'].replace(to_replace='-----', value=np.nan, inplace=True)
feature_data['979001'] = feature_data['979001'].astype(float)

In [139]:
feature_data['979002'].replace(to_replace='-----', value=np.nan, inplace=True)
feature_data['979002'] = feature_data['979002'].astype(float)

In [140]:
feature_data['979003'].replace(to_replace='-----', value=np.nan, inplace=True)
feature_data['979003'] = feature_data['979003'].astype(float)

In [141]:
feature_data['979007'].replace(to_replace='320.00.', value=320.00, inplace=True)
feature_data['979007'] = feature_data['979007'].astype(float)

In [142]:
feature_data['989004'].replace(to_replace='%', value='', regex=True, inplace=True)
feature_data['989004'].replace(to_replace='-', value=np.nan, inplace=True)
feature_data['989004'] = feature_data['989004'].astype(float)

In [143]:
# 2228 2229 2230
for feat in ['2228', '2229', '2230', '2231', '2233']:
    feature_data[feat].replace(to_replace='阳性 ', value='', regex=True, inplace=True)
    feature_data[feat].replace(to_replace='重度', value=50, regex=True, inplace=True)
    feature_data[feat].replace(to_replace='阳性', value=20, regex=True, inplace=True)
    feature_data[feat].replace(to_replace='阴性 ', value='', regex=True, inplace=True)
    feature_data[feat].replace(to_replace='阴性', value='', regex=True, inplace=True)
    feature_data[feat].replace(to_replace='极弱阳', value=np.nan, regex=True, inplace=True)
    feature_data[feat].replace(to_replace='可疑', value=np.nan, regex=True, inplace=True)
    feature_data[feat].replace(to_replace='阴性（-）', value=np.nan, regex=True, inplace=True)
    
    feature_data[feat].replace(to_replace='', value=np.nan, inplace=True)
    feature_data[feat].replace(to_replace='+', value=20, inplace=True)
    feature_data[feat].replace(to_replace='+-', value=20, inplace=True)
    feature_data[feat].replace(to_replace='-', value=np.nan, inplace=True)
    feature_data[feat].replace(to_replace='--', value=np.nan, inplace=True)
    feature_data[feat].replace(to_replace='(-)', value=np.nan, inplace=True)
    feature_data[feat].replace(to_replace='（-）', value=np.nan, inplace=True)
    feature_data[feat].replace(to_replace='$（-）', value=np.nan, inplace=True)
    feature_data[feat].replace(to_replace='阳性\(低水平\)', value=np.nan, regex=True, inplace=True)
    feature_data[feat] = feature_data[feat].astype(float)


In [144]:
abnormal_num_feat2 = ['1840', '3193', '2405', '1850', '100006', '2333', '2372', '320']
for feat in abnormal_num_feat2:
    feature_data[feat].replace(to_replace='[^0-9.]+', value=np.nan, regex=True, inplace=True)
    feature_data[feat].replace(to_replace='.', value=np.nan, inplace=True)
    feature_data[feat].replace(to_replace='', value=np.nan,  inplace=True)
    feature_data[feat].replace(to_replace=' ', value=np.nan, inplace=True)
    feature_data[feat].replace(to_replace='  ', value=np.nan, inplace=True)
    feature_data[feat].replace(to_replace='---', value=np.nan, inplace=True)
    try:
        feature_data[feat] = feature_data[feat].astype(float)
    except:
        pass

In [145]:
feature_data['I49002'].replace(to_replace='３＋', value=3.0, inplace=True)
feature_data['I49002'].replace(to_replace='２＋', value=2.0, inplace=True)
feature_data['I49002'] = feature_data['I49002'].apply(lambda x: x if (isinstance(x, float) or isinstance(x, int)) else x.strip().split('$')[0])

feature_data['I49002'].replace(to_replace='+', value=999, inplace=True)
feature_data['I49002'].replace(to_replace='\+', value='', regex=True, inplace=True)
feature_data['I49002'].replace(to_replace='少许', value=np.nan,  inplace=True)
feature_data['I49002'].replace(to_replace='正常', value=np.nan, inplace=True)
feature_data['I49002'].replace(to_replace='/HP', value='', regex=True, inplace=True)
feature_data['I49002'].replace(to_replace=' ', value=np.nan, inplace=True)
feature_data['I49002'].replace(to_replace='', value=np.nan, inplace=True)
feature_data['I49002'].replace(to_replace='  ', value=np.nan, inplace=True)
feature_data['I49002'] = feature_data['I49002'].astype(float)

In [146]:
obj_type_data_info_new = feature_data.select_dtypes(include=['object']).describe().T.assign(missing_pct=feature_data.apply(lambda x: (len(x) - x.count()) / float(len(x))))
obj_type_data_info_new = obj_type_data_info_new.sort_values(by=['missing_pct'])
obj_type_data_info_new

Unnamed: 0,count,unique,top,freq,missing_pct
vid,57298,57298,4d2968bf27508cd193e9c139c9633a93,1,0.000000
0102,56461,46421,肝、胆、胰、脾、左肾、右肾未发现明显异常,297,0.014608
2302,56225,36,健康,53917,0.018727
0113,55968,8879,肝脏大小、形态正常，包膜光整，肝内血管走行较清晰，回声均匀。,10765,0.023212
0114,55956,6059,胆囊大小、形态正常，囊壁光整，囊腔内透声好，胆总管无扩张。,30782,0.023421
0116,55844,703,脾脏大小、形态正常，包膜光整，回声均匀。,22850,0.025376
1001,55286,3965,正常心电图,13151,0.035115
3192,54978,17,-,51450,0.040490
3191,54978,13,-,52461,0.040490
3197,54978,7,-,52355,0.040490


In [147]:
obj_feat = [x for x in obj_type_data_info_new.index]
len(obj_feat)

226

In [148]:
feature_data[obj_feat].to_csv('./obj_feat_data.csv', index=False, encoding='utf-8')

In [149]:
feature_data['0434'].unique()

array(['甲状腺功能亢进（治疗中）', '无', nan, ..., '时有左上腹隐痛', '高血压史（治疗中）, 宫外孕术后, 甲肝史',
       '血压、血脂、血糖、血粘度、血尿酸均偏高, 脂肪肝史'], dtype=object)

# 获取数值型特征

In [150]:
num_type_data_info = feature_data.select_dtypes(include=['float']).describe().T.assign(missing_pct=feature_data.apply(lambda x : (len(x)-x.count())/float(len(x))))
num_type_data_info = num_type_data_info.sort_values(by='missing_pct')
num_type_data_info

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
1814,57001.0,28.420646,24.561303,0.00,16.000,22.00,33.00,1823.18,0.005183
190,56170.0,70.534148,19.006277,6.15,57.000,69.00,81.15,910.18,0.019687
191,55617.0,333.288002,95.077256,2.10,263.490,324.30,393.78,999.00,0.029338
2404,55120.0,165.087577,8.690431,0.00,159.000,165.00,171.00,200.00,0.038012
2403,55119.0,67.713237,243.447769,0.00,57.000,65.60,75.00,57142.00,0.038029
1840,54948.0,6.113317,0.690761,4.50,6.000,6.00,6.50,9.00,0.041014
2405,54845.0,24.729819,91.800656,0.00,21.800,24.20,26.60,21507.00,0.042811
1815,54247.0,23.633019,13.076784,0.30,18.000,21.11,26.00,1502.24,0.053248
1850,52635.0,5.277747,1.332710,2.31,4.630,5.03,5.50,23.01,0.081382
10004,52054.0,4.811626,8.013462,-1799.76,3.940,4.71,5.59,27.90,0.091522


In [151]:
num_type_data_info_missing = num_type_data_info[num_type_data_info['missing_pct'] > 0.99]
num_type_feat_missing = num_type_data_info_missing.index
print(len(num_type_feat_missing))
#num_type_feat_missing = []

845


In [152]:
feature_data.drop(num_type_data_info_missing.index, axis=1, inplace=True)

In [153]:
num_type_data_info_new = feature_data.select_dtypes(include=['float']).describe().T.assign(missing_pct=feature_data.apply(lambda x: (len(x) - x.count()) / float(len(x))))
num_type_data_info_new =  num_type_data_info_new.sort_values(by=['missing_pct'])
num_type_data_info_new

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
1814,57001.0,28.420646,24.561303,0.000,16.0000,22.0000,33.00000,1823.180,0.005183
190,56170.0,70.534148,19.006277,6.150,57.0000,69.0000,81.15000,910.180,0.019687
191,55617.0,333.288002,95.077256,2.100,263.4900,324.3000,393.78000,999.000,0.029338
2404,55120.0,165.087577,8.690431,0.000,159.0000,165.0000,171.00000,200.000,0.038012
2403,55119.0,67.713237,243.447769,0.000,57.0000,65.6000,75.00000,57142.000,0.038029
1840,54948.0,6.113317,0.690761,4.500,6.0000,6.0000,6.50000,9.000,0.041014
2405,54845.0,24.729819,91.800656,0.000,21.8000,24.2000,26.60000,21507.000,0.042811
1815,54247.0,23.633019,13.076784,0.300,18.0000,21.1100,26.00000,1502.240,0.053248
1850,52635.0,5.277747,1.332710,2.310,4.6300,5.0300,5.50000,23.010,0.081382
10004,52054.0,4.811626,8.013462,-1799.760,3.9400,4.7100,5.59000,27.900,0.091522


In [154]:
num_feat = [x for x in num_type_data_info_new.index]

In [155]:
for feat in num_feat:
    feature_data[feat] = feature_data[feat].astype('float')
    feature_data[feat].fillna(-999, inplace=True)
    feature_data[feat].replace(to_replace='[^0-9.]+', value='', regex=True, inplace=True)

In [156]:
all_feat = ['vid'] + num_feat
num_data = feature_data[all_feat]

In [157]:
len(num_data)

57298

# 训练模型

In [158]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [159]:
train = pd.read_csv('../data/meinian_round1_train_20180408.csv', encoding='gbk')
train.head()

Unnamed: 0,vid,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
0,002d1e4859fafd9ded2a2e1e7c839b62,165,100,2.08,1.29,3.24
1,92dd479df5e30ab6a0a1cf85ac53efc3,141,97,2.64,1.36,4.75
2,6bb59d517c4c70f8f50844d24fbd0355,120,80,1.37,1.25,2.66
3,0ebb42adae512906f7e1135da734ea63,100,70,1.27,2.21,1.73
4,ebe7811e919109c42c092abbd98b4ca6,110,80,0.8,1.87,2.21


In [160]:
test = pd.read_csv('../data/meinian_round1_test_b_20180505.csv', encoding='gbk')
test.head()

Unnamed: 0,vid,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
0,001f849a29f618fa98b23289682d7b37,,,,,
1,0029a7c3e6d454b22c9b1d5233defc2b,,,,,
2,002fe4622e31ccc987a770ca02cdc812,,,,,
3,00372d4cde397f592b0004ebb5a25a2f,,,,,
4,003955abc4de39314f4f872351689046,,,,,


In [161]:
train = pd.merge(train, num_data, on='vid', how='left')
train.head()

Unnamed: 0,vid,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白,1814,190,191,2404,...,369108,2250,3802,069004,069003,069002,2156,300057,2161,3302
0,002d1e4859fafd9ded2a2e1e7c839b62,165,100,2.08,1.29,3.24,32.0,55.0,215.0,166.5,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,92dd479df5e30ab6a0a1cf85ac53efc3,141,97,2.64,1.36,4.75,189.0,71.0,376.0,181.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,6bb59d517c4c70f8f50844d24fbd0355,120,80,1.37,1.25,2.66,281.0,78.0,436.0,173.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,15.48,2.87,1.33,-999.0
3,0ebb42adae512906f7e1135da734ea63,100,70,1.27,2.21,1.73,26.0,65.9,205.0,161.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,ebe7811e919109c42c092abbd98b4ca6,110,80,0.8,1.87,2.21,24.0,46.0,236.0,158.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [162]:
test = pd.merge(test, num_data, on='vid', how='left')
test.head()

Unnamed: 0,vid,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白,1814,190,191,2404,...,369108,2250,3802,069004,069003,069002,2156,300057,2161,3302
0,001f849a29f618fa98b23289682d7b37,,,,,,32.0,61.0,296.0,169.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,0029a7c3e6d454b22c9b1d5233defc2b,,,,,,7.84,67.35,432.52,153.5,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,002fe4622e31ccc987a770ca02cdc812,,,,,,43.0,75.0,418.0,173.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,00372d4cde397f592b0004ebb5a25a2f,,,,,,54.5,57.9,263.46,163.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,003955abc4de39314f4f872351689046,,,,,,23.57,58.2,382.0,148.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [163]:
train.to_csv('./train_new.csv', index=False, encoding='utf-8')
test.to_csv('./test_new.csv', index=False, encoding='utf-8')

In [164]:
len(test)

9532

In [165]:
target_feat = ['收缩压', '舒张压', '血清甘油三酯', '血清高密度脂蛋白', '血清低密度脂蛋白']

In [166]:
len(train)

38199

In [167]:
for feat in target_feat:  
    train[feat].replace('未查', np.nan, inplace=True)
    train[feat].replace('弃查', np.nan, inplace=True)
    train[feat].replace(to_replace='[^0-9.]+', value='', regex=True, inplace=True)
    train[feat].replace(to_replace='2.2.8', value='2.28', inplace=True)
    train[feat] = train[feat].astype(float)
    train.dropna(subset=[feat], inplace=True)

In [168]:
features = [feat for feat in train.columns if feat not in target_feat + ['vid']]

In [169]:
train_data = train[features]
test_data = test[features]
X_train, X_test, y_train, y_test = train_test_split(train_data, train[target_feat],
                                                   test_size = 0.15,
                                                   random_state=100)

In [170]:
X_test_pred = pd.DataFrame()

In [171]:
# 定义评价函数
def evaluation(pred, target):
    target = target.get_label()
    bias = np.ones((1, len(pred)))
    log_pred = np.log(pred + bias)
    log_target = np.log(target + bias)
    distance = log_pred - log_target
    distance_sq = np.square(distance)
    loss = np.sum(distance_sq) / float(len(pred))
    return 'loss_value', float(loss)

In [172]:
# 收缩压
sp_model = xgb.XGBRegressor(n_estimators = 1000, max_depth = 5, learning_rate = 0.03)
# sp_model = xgb.XGBRegressor(n_estimators=500, max_depth=5, learning_rate=0.1)
print(sp_model)
# sp_model = xgb.XGBRegressor(n_estimators=1000, 
#                         max_depth=9, 
#                         learning_rate=0.03, 
#                         subsample=0.8, 
#                         silent=True, 
#                         seed=1,
#                         objective='reg:linear', 
#                         reg_alpha=1, 
#                         reg_lambda=1, 
#                         gamma=0, 
#                         missing=None, 
#                         colsample_bytree=0.3,
#                         scale_pos_weight=1, 
#                         min_child_weight=1, 
#                         max_delta_step=0, 
#                         base_socre=0.5)

sp_model.fit(X_train, y_train['收缩压'], eval_metric=evaluation, verbose=True, eval_set=[(X_test, y_test['收缩压'])], 
             early_stopping_rounds=100)
X_test_pred['收缩压'] = sp_model.predict(X_test)

# sp_model.fit(train_data, train['收缩压'])
test['收缩压'] = sp_model.predict(test_data)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
[0]	validation_0-rmse:123.397	validation_0-loss_value:10.0898
Multiple eval metrics have been passed: 'validation_0-loss_value' will be used for early stopping.

Will train until validation_0-loss_value hasn't improved in 100 rounds.
[1]	validation_0-rmse:119.773	validation_0-loss_value:7.02427
[2]	validation_0-rmse:116.261	validation_0-loss_value:5.3659
[3]	validation_0-rmse:112.854	validation_0-loss_value:4.29859
[4]	validation_0-rmse:109.554	validation_0-loss_value:3.5473
[5]	validation_0-rmse:106.354	validation_0-loss_value:2.98769
[6]	validation_0-rmse:103.254	validation_0-loss_value:2.55447
[7]	validati

[120]	validation_0-rmse:17.2975	validation_0-loss_value:0.016926
[121]	validation_0-rmse:17.2768	validation_0-loss_value:0.016896
[122]	validation_0-rmse:17.2567	validation_0-loss_value:0.016867
[123]	validation_0-rmse:17.239	validation_0-loss_value:0.016843
[124]	validation_0-rmse:17.22	validation_0-loss_value:0.016815
[125]	validation_0-rmse:17.2034	validation_0-loss_value:0.016793
[126]	validation_0-rmse:17.1856	validation_0-loss_value:0.016768
[127]	validation_0-rmse:17.1707	validation_0-loss_value:0.016748
[128]	validation_0-rmse:17.1576	validation_0-loss_value:0.016732
[129]	validation_0-rmse:17.1454	validation_0-loss_value:0.016717
[130]	validation_0-rmse:17.1305	validation_0-loss_value:0.016697
[131]	validation_0-rmse:17.1175	validation_0-loss_value:0.016681
[132]	validation_0-rmse:17.1077	validation_0-loss_value:0.01667
[133]	validation_0-rmse:17.0965	validation_0-loss_value:0.016656
[134]	validation_0-rmse:17.0858	validation_0-loss_value:0.016643
[135]	validation_0-rmse:17.07

[247]	validation_0-rmse:16.75	validation_0-loss_value:0.016277
[248]	validation_0-rmse:16.7495	validation_0-loss_value:0.016276
[249]	validation_0-rmse:16.748	validation_0-loss_value:0.016274
[250]	validation_0-rmse:16.7475	validation_0-loss_value:0.016273
[251]	validation_0-rmse:16.7465	validation_0-loss_value:0.016271
[252]	validation_0-rmse:16.7454	validation_0-loss_value:0.01627
[253]	validation_0-rmse:16.7439	validation_0-loss_value:0.016266
[254]	validation_0-rmse:16.7425	validation_0-loss_value:0.016264
[255]	validation_0-rmse:16.7422	validation_0-loss_value:0.016264
[256]	validation_0-rmse:16.741	validation_0-loss_value:0.016261
[257]	validation_0-rmse:16.7399	validation_0-loss_value:0.01626
[258]	validation_0-rmse:16.7398	validation_0-loss_value:0.01626
[259]	validation_0-rmse:16.7379	validation_0-loss_value:0.016256
[260]	validation_0-rmse:16.7358	validation_0-loss_value:0.016252
[261]	validation_0-rmse:16.7347	validation_0-loss_value:0.01625
[262]	validation_0-rmse:16.733	va

[374]	validation_0-rmse:16.6452	validation_0-loss_value:0.016091
[375]	validation_0-rmse:16.6451	validation_0-loss_value:0.016091
[376]	validation_0-rmse:16.6446	validation_0-loss_value:0.01609
[377]	validation_0-rmse:16.6442	validation_0-loss_value:0.016089
[378]	validation_0-rmse:16.643	validation_0-loss_value:0.016087
[379]	validation_0-rmse:16.6416	validation_0-loss_value:0.016084
[380]	validation_0-rmse:16.6406	validation_0-loss_value:0.016082
[381]	validation_0-rmse:16.6402	validation_0-loss_value:0.016081
[382]	validation_0-rmse:16.6387	validation_0-loss_value:0.016079
[383]	validation_0-rmse:16.6381	validation_0-loss_value:0.016078
[384]	validation_0-rmse:16.6382	validation_0-loss_value:0.016078
[385]	validation_0-rmse:16.6382	validation_0-loss_value:0.016078
[386]	validation_0-rmse:16.6376	validation_0-loss_value:0.016078
[387]	validation_0-rmse:16.6368	validation_0-loss_value:0.016076
[388]	validation_0-rmse:16.6372	validation_0-loss_value:0.016076
[389]	validation_0-rmse:16.

[501]	validation_0-rmse:16.5915	validation_0-loss_value:0.015996
[502]	validation_0-rmse:16.5911	validation_0-loss_value:0.015995
[503]	validation_0-rmse:16.5909	validation_0-loss_value:0.015995
[504]	validation_0-rmse:16.5909	validation_0-loss_value:0.015995
[505]	validation_0-rmse:16.5907	validation_0-loss_value:0.015995
[506]	validation_0-rmse:16.5904	validation_0-loss_value:0.015994
[507]	validation_0-rmse:16.5895	validation_0-loss_value:0.015993
[508]	validation_0-rmse:16.5889	validation_0-loss_value:0.015991
[509]	validation_0-rmse:16.5891	validation_0-loss_value:0.015992
[510]	validation_0-rmse:16.5887	validation_0-loss_value:0.015991
[511]	validation_0-rmse:16.5884	validation_0-loss_value:0.015991
[512]	validation_0-rmse:16.5874	validation_0-loss_value:0.015989
[513]	validation_0-rmse:16.5872	validation_0-loss_value:0.015989
[514]	validation_0-rmse:16.5876	validation_0-loss_value:0.01599
[515]	validation_0-rmse:16.5868	validation_0-loss_value:0.015988
[516]	validation_0-rmse:16

[628]	validation_0-rmse:16.5715	validation_0-loss_value:0.015964
[629]	validation_0-rmse:16.5716	validation_0-loss_value:0.015964
[630]	validation_0-rmse:16.5712	validation_0-loss_value:0.015964
[631]	validation_0-rmse:16.5715	validation_0-loss_value:0.015964
[632]	validation_0-rmse:16.5714	validation_0-loss_value:0.015964
[633]	validation_0-rmse:16.5702	validation_0-loss_value:0.015962
[634]	validation_0-rmse:16.5702	validation_0-loss_value:0.015962
[635]	validation_0-rmse:16.5701	validation_0-loss_value:0.015962
[636]	validation_0-rmse:16.5696	validation_0-loss_value:0.015961
[637]	validation_0-rmse:16.5693	validation_0-loss_value:0.015961
[638]	validation_0-rmse:16.5697	validation_0-loss_value:0.015962
[639]	validation_0-rmse:16.5694	validation_0-loss_value:0.015961
[640]	validation_0-rmse:16.5691	validation_0-loss_value:0.01596
[641]	validation_0-rmse:16.5692	validation_0-loss_value:0.01596
[642]	validation_0-rmse:16.569	validation_0-loss_value:0.01596
[643]	validation_0-rmse:16.56

[755]	validation_0-rmse:16.5602	validation_0-loss_value:0.015947
[756]	validation_0-rmse:16.5606	validation_0-loss_value:0.015948
[757]	validation_0-rmse:16.5606	validation_0-loss_value:0.015948
[758]	validation_0-rmse:16.56	validation_0-loss_value:0.015946
[759]	validation_0-rmse:16.5599	validation_0-loss_value:0.015946
[760]	validation_0-rmse:16.5598	validation_0-loss_value:0.015946
[761]	validation_0-rmse:16.5601	validation_0-loss_value:0.015946
[762]	validation_0-rmse:16.5605	validation_0-loss_value:0.015947
[763]	validation_0-rmse:16.5606	validation_0-loss_value:0.015947
[764]	validation_0-rmse:16.561	validation_0-loss_value:0.015948
[765]	validation_0-rmse:16.5617	validation_0-loss_value:0.015949
[766]	validation_0-rmse:16.5616	validation_0-loss_value:0.015949
[767]	validation_0-rmse:16.562	validation_0-loss_value:0.01595
[768]	validation_0-rmse:16.5619	validation_0-loss_value:0.015949
[769]	validation_0-rmse:16.5623	validation_0-loss_value:0.01595
[770]	validation_0-rmse:16.5621

[882]	validation_0-rmse:16.553	validation_0-loss_value:0.015934
[883]	validation_0-rmse:16.5528	validation_0-loss_value:0.015934
[884]	validation_0-rmse:16.5533	validation_0-loss_value:0.015935
[885]	validation_0-rmse:16.5531	validation_0-loss_value:0.015935
[886]	validation_0-rmse:16.553	validation_0-loss_value:0.015935
[887]	validation_0-rmse:16.5535	validation_0-loss_value:0.015936
[888]	validation_0-rmse:16.5534	validation_0-loss_value:0.015936
[889]	validation_0-rmse:16.5536	validation_0-loss_value:0.015936
[890]	validation_0-rmse:16.5537	validation_0-loss_value:0.015936
[891]	validation_0-rmse:16.5537	validation_0-loss_value:0.015936
[892]	validation_0-rmse:16.5537	validation_0-loss_value:0.015936
[893]	validation_0-rmse:16.5535	validation_0-loss_value:0.015936
[894]	validation_0-rmse:16.5531	validation_0-loss_value:0.015935
[895]	validation_0-rmse:16.5529	validation_0-loss_value:0.015935
[896]	validation_0-rmse:16.5529	validation_0-loss_value:0.015935
[897]	validation_0-rmse:16.

In [173]:
# 舒张压
dp_model = xgb.XGBRegressor(n_estimators = 1000, max_depth = 5, learning_rate = 0.02)
print(dp_model)

# dp_model = xgb.XGBRegressor(n_estimators=1000, 
#                        max_depth=9, 
#                        learning_rate=0.03, 
#                        subsample=0.8, 
#                        silent=True, 
#                        seed=1,
#                        objective='reg:linear', 
#                        reg_alpha=1, 
#                        reg_lambda=1,
#                        gamma=0, 
#                        missing=None, 
#                        colsample_bytree=0.3,
#                        scale_pos_weight=1, 
#                        min_child_weight=1, 
#                        max_delta_step=0, 
#                        base_socre=0.5
#                        )

dp_model.fit(X_train, y_train['舒张压'], eval_metric=evaluation, verbose=True, eval_set=[(X_test, y_test['舒张压'])], 
             early_stopping_rounds=100)
X_test_pred['舒张压'] = dp_model.predict(X_test)

# dp_model.fit(train_data, train['舒张压'])
test['舒张压'] = dp_model.predict(test_data)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
[0]	validation_0-rmse:75.9542	validation_0-loss_value:10.4897
Multiple eval metrics have been passed: 'validation_0-loss_value' will be used for early stopping.

Will train until validation_0-loss_value hasn't improved in 100 rounds.
[1]	validation_0-rmse:74.4685	validation_0-loss_value:8.05036
[2]	validation_0-rmse:73.0134	validation_0-loss_value:6.53763
[3]	validation_0-rmse:71.5882	validation_0-loss_value:5.48595
[4]	validation_0-rmse:70.1919	validation_0-loss_value:4.70414
[5]	validation_0-rmse:68.825	validation_0-loss_value:4.09719
[6]	validation_0-rmse:67.4852	validation_0-loss_value:3.61035
[7]	validat

[120]	validation_0-rmse:12.9837	validation_0-loss_value:0.026055
[121]	validation_0-rmse:12.9149	validation_0-loss_value:0.025766
[122]	validation_0-rmse:12.8491	validation_0-loss_value:0.025493
[123]	validation_0-rmse:12.7853	validation_0-loss_value:0.025231
[124]	validation_0-rmse:12.7237	validation_0-loss_value:0.024981
[125]	validation_0-rmse:12.6646	validation_0-loss_value:0.024744
[126]	validation_0-rmse:12.6075	validation_0-loss_value:0.024516
[127]	validation_0-rmse:12.5518	validation_0-loss_value:0.024297
[128]	validation_0-rmse:12.4988	validation_0-loss_value:0.02409
[129]	validation_0-rmse:12.447	validation_0-loss_value:0.023891
[130]	validation_0-rmse:12.397	validation_0-loss_value:0.0237
[131]	validation_0-rmse:12.3491	validation_0-loss_value:0.023519
[132]	validation_0-rmse:12.3023	validation_0-loss_value:0.023343
[133]	validation_0-rmse:12.2579	validation_0-loss_value:0.023179
[134]	validation_0-rmse:12.2146	validation_0-loss_value:0.02302
[135]	validation_0-rmse:12.1733

[247]	validation_0-rmse:11.0296	validation_0-loss_value:0.019606
[248]	validation_0-rmse:11.0283	validation_0-loss_value:0.019604
[249]	validation_0-rmse:11.0273	validation_0-loss_value:0.019603
[250]	validation_0-rmse:11.0261	validation_0-loss_value:0.019602
[251]	validation_0-rmse:11.0249	validation_0-loss_value:0.0196
[252]	validation_0-rmse:11.0231	validation_0-loss_value:0.019597
[253]	validation_0-rmse:11.0217	validation_0-loss_value:0.019594
[254]	validation_0-rmse:11.0188	validation_0-loss_value:0.019587
[255]	validation_0-rmse:11.0179	validation_0-loss_value:0.019586
[256]	validation_0-rmse:11.0173	validation_0-loss_value:0.019586
[257]	validation_0-rmse:11.0161	validation_0-loss_value:0.019585
[258]	validation_0-rmse:11.0146	validation_0-loss_value:0.019583
[259]	validation_0-rmse:11.0126	validation_0-loss_value:0.019578
[260]	validation_0-rmse:11.0112	validation_0-loss_value:0.019575
[261]	validation_0-rmse:11.0097	validation_0-loss_value:0.019572
[262]	validation_0-rmse:11.

[374]	validation_0-rmse:10.9334	validation_0-loss_value:0.019397
[375]	validation_0-rmse:10.9333	validation_0-loss_value:0.019396
[376]	validation_0-rmse:10.9333	validation_0-loss_value:0.019396
[377]	validation_0-rmse:10.933	validation_0-loss_value:0.019396
[378]	validation_0-rmse:10.9328	validation_0-loss_value:0.019395
[379]	validation_0-rmse:10.9327	validation_0-loss_value:0.019395
[380]	validation_0-rmse:10.932	validation_0-loss_value:0.019393
[381]	validation_0-rmse:10.9315	validation_0-loss_value:0.019391
[382]	validation_0-rmse:10.9312	validation_0-loss_value:0.01939
[383]	validation_0-rmse:10.931	validation_0-loss_value:0.01939
[384]	validation_0-rmse:10.9305	validation_0-loss_value:0.019389
[385]	validation_0-rmse:10.9303	validation_0-loss_value:0.019388
[386]	validation_0-rmse:10.9297	validation_0-loss_value:0.019386
[387]	validation_0-rmse:10.9295	validation_0-loss_value:0.019386
[388]	validation_0-rmse:10.9289	validation_0-loss_value:0.019384
[389]	validation_0-rmse:10.928

[501]	validation_0-rmse:11.002	validation_0-loss_value:0.01933
[502]	validation_0-rmse:11.0152	validation_0-loss_value:0.019338
[503]	validation_0-rmse:11.0147	validation_0-loss_value:0.019336
[504]	validation_0-rmse:11.0141	validation_0-loss_value:0.019334
[505]	validation_0-rmse:11.0136	validation_0-loss_value:0.019333
[506]	validation_0-rmse:11.0134	validation_0-loss_value:0.019332
[507]	validation_0-rmse:11.0127	validation_0-loss_value:0.01933
[508]	validation_0-rmse:11.0126	validation_0-loss_value:0.01933
[509]	validation_0-rmse:11.0125	validation_0-loss_value:0.019329
[510]	validation_0-rmse:11.0121	validation_0-loss_value:0.019327
[511]	validation_0-rmse:11.0116	validation_0-loss_value:0.019326
[512]	validation_0-rmse:11.0243	validation_0-loss_value:0.019333
[513]	validation_0-rmse:11.0242	validation_0-loss_value:0.019333
[514]	validation_0-rmse:11.0238	validation_0-loss_value:0.019332
[515]	validation_0-rmse:11.0237	validation_0-loss_value:0.019331
[516]	validation_0-rmse:11.02

In [174]:
# 血清甘油三酯
tg_model = xgb.XGBRegressor(n_estimators=1000, 
                       max_depth=5, 
                       learning_rate=0.02)
# tg_model = xgb.XGBRegressor(n_estimators=1000, 
#                        max_depth=9, 
#                        learning_rate=0.03, 
#                        subsample=0.8, 
#                        silent=True, 
#                        seed=1,
#                        objective='reg:linear', 
#                        reg_alpha=0.5, 
#                        reg_lambda=1, 
#                        gamma=0, 
#                        missing=None, 
#                        colsample_bytree=0.3,
#                        scale_pos_weight=1, 
#                        min_child_weight=1, 
#                        max_delta_step=0, 
#                        base_socre=0.5)

tg_model.fit(X_train, y_train['血清甘油三酯'], eval_metric=evaluation, verbose=True, eval_set=[(X_test, y_test['血清甘油三酯'])], 
             early_stopping_rounds=100)
X_test_pred['血清甘油三酯'] = tg_model.predict(X_test)

# tg_model.fit(train_data, train['血清甘油三酯'])
test['血清甘油三酯'] = tg_model.predict(test_data)

[0]	validation_0-rmse:1.6985	validation_0-loss_value:0.345462
Multiple eval metrics have been passed: 'validation_0-loss_value' will be used for early stopping.

Will train until validation_0-loss_value hasn't improved in 100 rounds.
[1]	validation_0-rmse:1.68027	validation_0-loss_value:0.329709
[2]	validation_0-rmse:1.66257	validation_0-loss_value:0.31505
[3]	validation_0-rmse:1.64539	validation_0-loss_value:0.301422
[4]	validation_0-rmse:1.62874	validation_0-loss_value:0.288727
[5]	validation_0-rmse:1.61264	validation_0-loss_value:0.276914
[6]	validation_0-rmse:1.59701	validation_0-loss_value:0.265853
[7]	validation_0-rmse:1.5818	validation_0-loss_value:0.255473
[8]	validation_0-rmse:1.56703	validation_0-loss_value:0.245752
[9]	validation_0-rmse:1.55289	validation_0-loss_value:0.236679
[10]	validation_0-rmse:1.53912	validation_0-loss_value:0.228142
[11]	validation_0-rmse:1.52582	validation_0-loss_value:0.220132
[12]	validation_0-rmse:1.51269	validation_0-loss_value:0.212599
[13]	vali

[126]	validation_0-rmse:1.11471	validation_0-loss_value:0.083201
[127]	validation_0-rmse:1.11442	validation_0-loss_value:0.083209
[128]	validation_0-rmse:1.11363	validation_0-loss_value:0.08317
[129]	validation_0-rmse:1.11339	validation_0-loss_value:0.083166
[130]	validation_0-rmse:1.11339	validation_0-loss_value:0.083203
[131]	validation_0-rmse:1.11316	validation_0-loss_value:0.083204
[132]	validation_0-rmse:1.11284	validation_0-loss_value:0.083185
[133]	validation_0-rmse:1.11243	validation_0-loss_value:0.08318
[134]	validation_0-rmse:1.11218	validation_0-loss_value:0.083159
[135]	validation_0-rmse:1.11158	validation_0-loss_value:0.083137
[136]	validation_0-rmse:1.11122	validation_0-loss_value:0.083113
[137]	validation_0-rmse:1.11114	validation_0-loss_value:0.083135
[138]	validation_0-rmse:1.11092	validation_0-loss_value:0.083151
[139]	validation_0-rmse:1.11021	validation_0-loss_value:0.083095
[140]	validation_0-rmse:1.11016	validation_0-loss_value:0.083118
[141]	validation_0-rmse:1.1

[253]	validation_0-rmse:1.09639	validation_0-loss_value:0.082705
[254]	validation_0-rmse:1.09616	validation_0-loss_value:0.082697
[255]	validation_0-rmse:1.09616	validation_0-loss_value:0.082689
[256]	validation_0-rmse:1.09608	validation_0-loss_value:0.082681
[257]	validation_0-rmse:1.09595	validation_0-loss_value:0.082658
[258]	validation_0-rmse:1.09591	validation_0-loss_value:0.08265
[259]	validation_0-rmse:1.0958	validation_0-loss_value:0.082627
[260]	validation_0-rmse:1.09583	validation_0-loss_value:0.082631
[261]	validation_0-rmse:1.09576	validation_0-loss_value:0.08262
[262]	validation_0-rmse:1.09574	validation_0-loss_value:0.082621
[263]	validation_0-rmse:1.09573	validation_0-loss_value:0.082628
[264]	validation_0-rmse:1.09561	validation_0-loss_value:0.082604
[265]	validation_0-rmse:1.09569	validation_0-loss_value:0.08261
[266]	validation_0-rmse:1.09584	validation_0-loss_value:0.082619
[267]	validation_0-rmse:1.096	validation_0-loss_value:0.082635
[268]	validation_0-rmse:1.09593

[380]	validation_0-rmse:1.09195	validation_0-loss_value:0.081779
[381]	validation_0-rmse:1.09195	validation_0-loss_value:0.081773
[382]	validation_0-rmse:1.09195	validation_0-loss_value:0.081769
[383]	validation_0-rmse:1.09193	validation_0-loss_value:0.081762
[384]	validation_0-rmse:1.09192	validation_0-loss_value:0.08176
[385]	validation_0-rmse:1.09195	validation_0-loss_value:0.081758
[386]	validation_0-rmse:1.09202	validation_0-loss_value:0.081761
[387]	validation_0-rmse:1.09197	validation_0-loss_value:0.081761
[388]	validation_0-rmse:1.09188	validation_0-loss_value:0.081748
[389]	validation_0-rmse:1.09189	validation_0-loss_value:0.081742
[390]	validation_0-rmse:1.09187	validation_0-loss_value:0.081733
[391]	validation_0-rmse:1.09186	validation_0-loss_value:0.081729
[392]	validation_0-rmse:1.09186	validation_0-loss_value:0.081725
[393]	validation_0-rmse:1.09176	validation_0-loss_value:0.081705
[394]	validation_0-rmse:1.09176	validation_0-loss_value:0.081705
[395]	validation_0-rmse:1.

[507]	validation_0-rmse:1.09129	validation_0-loss_value:0.0815
[508]	validation_0-rmse:1.09126	validation_0-loss_value:0.081496
[509]	validation_0-rmse:1.09125	validation_0-loss_value:0.081495
[510]	validation_0-rmse:1.0913	validation_0-loss_value:0.081497
[511]	validation_0-rmse:1.0913	validation_0-loss_value:0.081499
[512]	validation_0-rmse:1.09124	validation_0-loss_value:0.081489
[513]	validation_0-rmse:1.09122	validation_0-loss_value:0.081486
[514]	validation_0-rmse:1.09122	validation_0-loss_value:0.081481
[515]	validation_0-rmse:1.09123	validation_0-loss_value:0.081485
[516]	validation_0-rmse:1.09121	validation_0-loss_value:0.081481
[517]	validation_0-rmse:1.09123	validation_0-loss_value:0.081486
[518]	validation_0-rmse:1.09123	validation_0-loss_value:0.081488
[519]	validation_0-rmse:1.09112	validation_0-loss_value:0.081475
[520]	validation_0-rmse:1.09117	validation_0-loss_value:0.081477
[521]	validation_0-rmse:1.09117	validation_0-loss_value:0.081472
[522]	validation_0-rmse:1.091

[634]	validation_0-rmse:1.0908	validation_0-loss_value:0.08134
[635]	validation_0-rmse:1.09079	validation_0-loss_value:0.081336
[636]	validation_0-rmse:1.09078	validation_0-loss_value:0.081331
[637]	validation_0-rmse:1.09075	validation_0-loss_value:0.081329
[638]	validation_0-rmse:1.09072	validation_0-loss_value:0.081322
[639]	validation_0-rmse:1.09067	validation_0-loss_value:0.081313
[640]	validation_0-rmse:1.09073	validation_0-loss_value:0.081317
[641]	validation_0-rmse:1.09072	validation_0-loss_value:0.081313
[642]	validation_0-rmse:1.09075	validation_0-loss_value:0.081319
[643]	validation_0-rmse:1.09066	validation_0-loss_value:0.081319
[644]	validation_0-rmse:1.09065	validation_0-loss_value:0.081315
[645]	validation_0-rmse:1.09068	validation_0-loss_value:0.081314
[646]	validation_0-rmse:1.09059	validation_0-loss_value:0.0813
[647]	validation_0-rmse:1.09056	validation_0-loss_value:0.081296
[648]	validation_0-rmse:1.09055	validation_0-loss_value:0.081295
[649]	validation_0-rmse:1.090

[761]	validation_0-rmse:1.09041	validation_0-loss_value:0.081219
[762]	validation_0-rmse:1.09038	validation_0-loss_value:0.081213
[763]	validation_0-rmse:1.09032	validation_0-loss_value:0.081207
[764]	validation_0-rmse:1.09035	validation_0-loss_value:0.081206
[765]	validation_0-rmse:1.0904	validation_0-loss_value:0.081209
[766]	validation_0-rmse:1.0904	validation_0-loss_value:0.081208
[767]	validation_0-rmse:1.09041	validation_0-loss_value:0.081211
[768]	validation_0-rmse:1.09041	validation_0-loss_value:0.081213
[769]	validation_0-rmse:1.09035	validation_0-loss_value:0.081205
[770]	validation_0-rmse:1.0904	validation_0-loss_value:0.081205
[771]	validation_0-rmse:1.09035	validation_0-loss_value:0.081205
[772]	validation_0-rmse:1.09032	validation_0-loss_value:0.081201
[773]	validation_0-rmse:1.09032	validation_0-loss_value:0.081196
[774]	validation_0-rmse:1.09029	validation_0-loss_value:0.081191
[775]	validation_0-rmse:1.09022	validation_0-loss_value:0.081177
[776]	validation_0-rmse:1.09

[888]	validation_0-rmse:1.09011	validation_0-loss_value:0.081175
[889]	validation_0-rmse:1.09009	validation_0-loss_value:0.081172
[890]	validation_0-rmse:1.0901	validation_0-loss_value:0.081173
[891]	validation_0-rmse:1.09009	validation_0-loss_value:0.08117
[892]	validation_0-rmse:1.09006	validation_0-loss_value:0.081166
[893]	validation_0-rmse:1.09	validation_0-loss_value:0.08116
[894]	validation_0-rmse:1.08999	validation_0-loss_value:0.08116
[895]	validation_0-rmse:1.08998	validation_0-loss_value:0.08116
[896]	validation_0-rmse:1.08997	validation_0-loss_value:0.081154
[897]	validation_0-rmse:1.08995	validation_0-loss_value:0.08115
[898]	validation_0-rmse:1.08995	validation_0-loss_value:0.081147
[899]	validation_0-rmse:1.08995	validation_0-loss_value:0.081146
[900]	validation_0-rmse:1.08996	validation_0-loss_value:0.081146
[901]	validation_0-rmse:1.08993	validation_0-loss_value:0.081136
[902]	validation_0-rmse:1.08992	validation_0-loss_value:0.081135
[903]	validation_0-rmse:1.08989	va

In [175]:
# 血清高密度脂蛋白
hdl_model = xgb.XGBRegressor(n_estimators=1200, 
                       max_depth=5, 
                       learning_rate=0.05)
# hdl_model = xgb.XGBRegressor(n_estimators=1000, 
#                        max_depth=9, 
#                        learning_rate=0.03, 
#                        subsample=0.8, 
#                        silent=True, 
#                        seed=1,
#                        objective='reg:linear', 
#                        reg_alpha=0.5, 
#                        reg_lambda=1, 
#                        gamma=0, 
#                        missing=None, 
#                        colsample_bytree=0.3,
#                        scale_pos_weight=1, 
#                        min_child_weight=1, 
#                        max_delta_step=0, 
#                        base_socre=0.5)

hdl_model.fit(X_train, y_train['血清高密度脂蛋白'], eval_metric=evaluation, verbose=True, eval_set=[(X_test, y_test['血清高密度脂蛋白'])], 
              early_stopping_rounds=100)
X_test_pred['血清高密度脂蛋白'] = hdl_model.predict(X_test)

# hdl_model.fit(train_data, train['血清高密度脂蛋白'])
test['血清高密度脂蛋白'] = hdl_model.predict(test_data)

[0]	validation_0-rmse:0.922948	validation_0-loss_value:0.205112
Multiple eval metrics have been passed: 'validation_0-loss_value' will be used for early stopping.

Will train until validation_0-loss_value hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.882289	validation_0-loss_value:0.181828
[2]	validation_0-rmse:0.843902	validation_0-loss_value:0.161653
[3]	validation_0-rmse:0.807696	validation_0-loss_value:0.144129
[4]	validation_0-rmse:0.773524	validation_0-loss_value:0.128854
[5]	validation_0-rmse:0.741266	validation_0-loss_value:0.115496
[6]	validation_0-rmse:0.71088	validation_0-loss_value:0.103808
[7]	validation_0-rmse:0.682298	validation_0-loss_value:0.093566
[8]	validation_0-rmse:0.655428	validation_0-loss_value:0.084573
[9]	validation_0-rmse:0.630184	validation_0-loss_value:0.076662
[10]	validation_0-rmse:0.606447	validation_0-loss_value:0.069685
[11]	validation_0-rmse:0.584188	validation_0-loss_value:0.063534
[12]	validation_0-rmse:0.563355	validation_0-loss_value:0.0

[124]	validation_0-rmse:0.287178	validation_0-loss_value:0.013327
[125]	validation_0-rmse:0.287083	validation_0-loss_value:0.013319
[126]	validation_0-rmse:0.286988	validation_0-loss_value:0.013311
[127]	validation_0-rmse:0.286911	validation_0-loss_value:0.013304
[128]	validation_0-rmse:0.286797	validation_0-loss_value:0.013293
[129]	validation_0-rmse:0.286721	validation_0-loss_value:0.013286
[130]	validation_0-rmse:0.286608	validation_0-loss_value:0.013276
[131]	validation_0-rmse:0.286508	validation_0-loss_value:0.013267
[132]	validation_0-rmse:0.286428	validation_0-loss_value:0.01326
[133]	validation_0-rmse:0.286351	validation_0-loss_value:0.013254
[134]	validation_0-rmse:0.286266	validation_0-loss_value:0.013246
[135]	validation_0-rmse:0.286198	validation_0-loss_value:0.01324
[136]	validation_0-rmse:0.286025	validation_0-loss_value:0.013225
[137]	validation_0-rmse:0.286	validation_0-loss_value:0.013223
[138]	validation_0-rmse:0.285918	validation_0-loss_value:0.013216
[139]	validatio

[249]	validation_0-rmse:0.279267	validation_0-loss_value:0.012627
[250]	validation_0-rmse:0.27925	validation_0-loss_value:0.012626
[251]	validation_0-rmse:0.279233	validation_0-loss_value:0.012624
[252]	validation_0-rmse:0.2792	validation_0-loss_value:0.012621
[253]	validation_0-rmse:0.279125	validation_0-loss_value:0.012614
[254]	validation_0-rmse:0.279121	validation_0-loss_value:0.012615
[255]	validation_0-rmse:0.279101	validation_0-loss_value:0.012612
[256]	validation_0-rmse:0.279058	validation_0-loss_value:0.012608
[257]	validation_0-rmse:0.279038	validation_0-loss_value:0.012606
[258]	validation_0-rmse:0.279012	validation_0-loss_value:0.012604
[259]	validation_0-rmse:0.279024	validation_0-loss_value:0.012604
[260]	validation_0-rmse:0.278966	validation_0-loss_value:0.0126
[261]	validation_0-rmse:0.278935	validation_0-loss_value:0.012597
[262]	validation_0-rmse:0.278922	validation_0-loss_value:0.012596
[263]	validation_0-rmse:0.278881	validation_0-loss_value:0.012592
[264]	validatio

[374]	validation_0-rmse:0.276193	validation_0-loss_value:0.012359
[375]	validation_0-rmse:0.276158	validation_0-loss_value:0.012356
[376]	validation_0-rmse:0.276174	validation_0-loss_value:0.012358
[377]	validation_0-rmse:0.276177	validation_0-loss_value:0.012358
[378]	validation_0-rmse:0.276143	validation_0-loss_value:0.012355
[379]	validation_0-rmse:0.276132	validation_0-loss_value:0.012354
[380]	validation_0-rmse:0.276077	validation_0-loss_value:0.012349
[381]	validation_0-rmse:0.276055	validation_0-loss_value:0.012347
[382]	validation_0-rmse:0.276036	validation_0-loss_value:0.012345
[383]	validation_0-rmse:0.276007	validation_0-loss_value:0.012342
[384]	validation_0-rmse:0.275999	validation_0-loss_value:0.012341
[385]	validation_0-rmse:0.275985	validation_0-loss_value:0.01234
[386]	validation_0-rmse:0.275976	validation_0-loss_value:0.012339
[387]	validation_0-rmse:0.275934	validation_0-loss_value:0.012336
[388]	validation_0-rmse:0.275907	validation_0-loss_value:0.012334
[389]	valid

[499]	validation_0-rmse:0.274835	validation_0-loss_value:0.012244
[500]	validation_0-rmse:0.274833	validation_0-loss_value:0.012244
[501]	validation_0-rmse:0.274792	validation_0-loss_value:0.012241
[502]	validation_0-rmse:0.274793	validation_0-loss_value:0.012241
[503]	validation_0-rmse:0.274809	validation_0-loss_value:0.012242
[504]	validation_0-rmse:0.274795	validation_0-loss_value:0.012241
[505]	validation_0-rmse:0.274805	validation_0-loss_value:0.012242
[506]	validation_0-rmse:0.274791	validation_0-loss_value:0.012241
[507]	validation_0-rmse:0.274777	validation_0-loss_value:0.012239
[508]	validation_0-rmse:0.274772	validation_0-loss_value:0.012239
[509]	validation_0-rmse:0.274773	validation_0-loss_value:0.012239
[510]	validation_0-rmse:0.274763	validation_0-loss_value:0.012238
[511]	validation_0-rmse:0.274761	validation_0-loss_value:0.012238
[512]	validation_0-rmse:0.27474	validation_0-loss_value:0.012236
[513]	validation_0-rmse:0.274749	validation_0-loss_value:0.012237
[514]	valid

[624]	validation_0-rmse:0.273865	validation_0-loss_value:0.012165
[625]	validation_0-rmse:0.273832	validation_0-loss_value:0.012162
[626]	validation_0-rmse:0.273823	validation_0-loss_value:0.012161
[627]	validation_0-rmse:0.273823	validation_0-loss_value:0.012161
[628]	validation_0-rmse:0.273826	validation_0-loss_value:0.012162
[629]	validation_0-rmse:0.273827	validation_0-loss_value:0.012162
[630]	validation_0-rmse:0.273812	validation_0-loss_value:0.01216
[631]	validation_0-rmse:0.273822	validation_0-loss_value:0.012161
[632]	validation_0-rmse:0.273825	validation_0-loss_value:0.012161
[633]	validation_0-rmse:0.273831	validation_0-loss_value:0.012162
[634]	validation_0-rmse:0.273844	validation_0-loss_value:0.012163
[635]	validation_0-rmse:0.273844	validation_0-loss_value:0.012163
[636]	validation_0-rmse:0.273812	validation_0-loss_value:0.01216
[637]	validation_0-rmse:0.273818	validation_0-loss_value:0.012161
[638]	validation_0-rmse:0.273839	validation_0-loss_value:0.012163
[639]	valida

[749]	validation_0-rmse:0.27335	validation_0-loss_value:0.012121
[750]	validation_0-rmse:0.273333	validation_0-loss_value:0.012119
[751]	validation_0-rmse:0.27332	validation_0-loss_value:0.012118
[752]	validation_0-rmse:0.273321	validation_0-loss_value:0.012118
[753]	validation_0-rmse:0.27333	validation_0-loss_value:0.012119
[754]	validation_0-rmse:0.273335	validation_0-loss_value:0.012119
[755]	validation_0-rmse:0.273335	validation_0-loss_value:0.012119
[756]	validation_0-rmse:0.273355	validation_0-loss_value:0.012121
[757]	validation_0-rmse:0.273352	validation_0-loss_value:0.012121
[758]	validation_0-rmse:0.273359	validation_0-loss_value:0.012122
[759]	validation_0-rmse:0.273352	validation_0-loss_value:0.012121
[760]	validation_0-rmse:0.273351	validation_0-loss_value:0.012121
[761]	validation_0-rmse:0.273355	validation_0-loss_value:0.012121
[762]	validation_0-rmse:0.273345	validation_0-loss_value:0.012121
[763]	validation_0-rmse:0.27334	validation_0-loss_value:0.01212
[764]	validatio

[874]	validation_0-rmse:0.272784	validation_0-loss_value:0.012074
[875]	validation_0-rmse:0.27278	validation_0-loss_value:0.012073
[876]	validation_0-rmse:0.272779	validation_0-loss_value:0.012073
[877]	validation_0-rmse:0.272777	validation_0-loss_value:0.012073
[878]	validation_0-rmse:0.272781	validation_0-loss_value:0.012073
[879]	validation_0-rmse:0.272801	validation_0-loss_value:0.012075
[880]	validation_0-rmse:0.27281	validation_0-loss_value:0.012076
[881]	validation_0-rmse:0.272815	validation_0-loss_value:0.012077
[882]	validation_0-rmse:0.27281	validation_0-loss_value:0.012076
[883]	validation_0-rmse:0.272815	validation_0-loss_value:0.012077
[884]	validation_0-rmse:0.272815	validation_0-loss_value:0.012077
[885]	validation_0-rmse:0.272812	validation_0-loss_value:0.012077
[886]	validation_0-rmse:0.272795	validation_0-loss_value:0.012075
[887]	validation_0-rmse:0.272803	validation_0-loss_value:0.012076
[888]	validation_0-rmse:0.272806	validation_0-loss_value:0.012076
[889]	validat

[999]	validation_0-rmse:0.272454	validation_0-loss_value:0.012048
[1000]	validation_0-rmse:0.27245	validation_0-loss_value:0.012048
[1001]	validation_0-rmse:0.272454	validation_0-loss_value:0.012049
[1002]	validation_0-rmse:0.272445	validation_0-loss_value:0.012048
[1003]	validation_0-rmse:0.272452	validation_0-loss_value:0.012048
[1004]	validation_0-rmse:0.272453	validation_0-loss_value:0.012048
[1005]	validation_0-rmse:0.27245	validation_0-loss_value:0.012048
[1006]	validation_0-rmse:0.272451	validation_0-loss_value:0.012048
[1007]	validation_0-rmse:0.272445	validation_0-loss_value:0.012048
[1008]	validation_0-rmse:0.272447	validation_0-loss_value:0.012048
[1009]	validation_0-rmse:0.272412	validation_0-loss_value:0.012045
[1010]	validation_0-rmse:0.272408	validation_0-loss_value:0.012044
[1011]	validation_0-rmse:0.272403	validation_0-loss_value:0.012044
[1012]	validation_0-rmse:0.272408	validation_0-loss_value:0.012044
[1013]	validation_0-rmse:0.272418	validation_0-loss_value:0.01204

[1122]	validation_0-rmse:0.272284	validation_0-loss_value:0.012039
[1123]	validation_0-rmse:0.272283	validation_0-loss_value:0.012039
[1124]	validation_0-rmse:0.27227	validation_0-loss_value:0.012038
[1125]	validation_0-rmse:0.27227	validation_0-loss_value:0.012038
[1126]	validation_0-rmse:0.272273	validation_0-loss_value:0.012038
[1127]	validation_0-rmse:0.272281	validation_0-loss_value:0.012038
[1128]	validation_0-rmse:0.272289	validation_0-loss_value:0.012039
[1129]	validation_0-rmse:0.272291	validation_0-loss_value:0.01204
[1130]	validation_0-rmse:0.272294	validation_0-loss_value:0.01204
[1131]	validation_0-rmse:0.272295	validation_0-loss_value:0.01204
[1132]	validation_0-rmse:0.272296	validation_0-loss_value:0.01204
[1133]	validation_0-rmse:0.272292	validation_0-loss_value:0.01204
[1134]	validation_0-rmse:0.272289	validation_0-loss_value:0.012039
[1135]	validation_0-rmse:0.272259	validation_0-loss_value:0.012037
[1136]	validation_0-rmse:0.272257	validation_0-loss_value:0.012037
[1

In [176]:
# 血清低密度脂蛋白
ldl_model = xgb.XGBRegressor(n_estimators=1200, 
                       max_depth=5, 
                       learning_rate=0.05)
print(ldl_model)
# ldl_model = xgb.XGBRegressor(n_estimators=1000, 
#                        max_depth=9, 
#                        learning_rate=0.03, 
#                        subsample=0.8, 
#                        silent=True, 
#                        seed=1,
#                        objective='reg:linear', 
#                        reg_alpha=0.5, 
#                        reg_lambda=1, 
#                        gamma=0, 
#                        missing=None, 
#                        colsample_bytree=0.3,
#                        scale_pos_weight=1, 
#                        min_child_weight=1, 
#                        max_delta_step=0, 
#                        base_socre=0.5)

ldl_model.fit(X_train, y_train['血清低密度脂蛋白'], eval_metric=evaluation, verbose=True, eval_set=[(X_test, y_test['血清低密度脂蛋白'])], 
              early_stopping_rounds=100)
X_test_pred['血清低密度脂蛋白'] = ldl_model.predict(X_test)

# ldl_model.fit(train_data, train['血清低密度脂蛋白'])
test['血清低密度脂蛋白'] = ldl_model.predict(test_data)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1200,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
[0]	validation_0-rmse:2.32091	validation_0-loss_value:0.727887
Multiple eval metrics have been passed: 'validation_0-loss_value' will be used for early stopping.

Will train until validation_0-loss_value hasn't improved in 100 rounds.
[1]	validation_0-rmse:2.21876	validation_0-loss_value:0.624459
[2]	validation_0-rmse:2.12232	validation_0-loss_value:0.539311
[3]	validation_0-rmse:2.03121	validation_0-loss_value:0.468532
[4]	validation_0-rmse:1.94537	validation_0-loss_value:0.40927
[5]	validation_0-rmse:1.86473	validation_0-loss_value:0.359361
[6]	validation_0-rmse:1.78869	validation_0-loss_value:0.317046
[7]	

[119]	validation_0-rmse:0.727654	validation_0-loss_value:0.037227
[120]	validation_0-rmse:0.727488	validation_0-loss_value:0.037212
[121]	validation_0-rmse:0.727234	validation_0-loss_value:0.037191
[122]	validation_0-rmse:0.727183	validation_0-loss_value:0.037187
[123]	validation_0-rmse:0.726872	validation_0-loss_value:0.037151
[124]	validation_0-rmse:0.726555	validation_0-loss_value:0.037123
[125]	validation_0-rmse:0.72628	validation_0-loss_value:0.037098
[126]	validation_0-rmse:0.726131	validation_0-loss_value:0.037087
[127]	validation_0-rmse:0.725953	validation_0-loss_value:0.03707
[128]	validation_0-rmse:0.725806	validation_0-loss_value:0.037056
[129]	validation_0-rmse:0.725631	validation_0-loss_value:0.03704
[130]	validation_0-rmse:0.72537	validation_0-loss_value:0.037013
[131]	validation_0-rmse:0.72526	validation_0-loss_value:0.037005
[132]	validation_0-rmse:0.724935	validation_0-loss_value:0.03697
[133]	validation_0-rmse:0.724617	validation_0-loss_value:0.036937
[134]	validation

[244]	validation_0-rmse:0.710476	validation_0-loss_value:0.035459
[245]	validation_0-rmse:0.710449	validation_0-loss_value:0.035457
[246]	validation_0-rmse:0.710257	validation_0-loss_value:0.035437
[247]	validation_0-rmse:0.710079	validation_0-loss_value:0.035419
[248]	validation_0-rmse:0.709959	validation_0-loss_value:0.035405
[249]	validation_0-rmse:0.709873	validation_0-loss_value:0.035394
[250]	validation_0-rmse:0.709899	validation_0-loss_value:0.035395
[251]	validation_0-rmse:0.709863	validation_0-loss_value:0.035391
[252]	validation_0-rmse:0.70971	validation_0-loss_value:0.035379
[253]	validation_0-rmse:0.709705	validation_0-loss_value:0.035375
[254]	validation_0-rmse:0.709553	validation_0-loss_value:0.035365
[255]	validation_0-rmse:0.709622	validation_0-loss_value:0.035373
[256]	validation_0-rmse:0.709629	validation_0-loss_value:0.035373
[257]	validation_0-rmse:0.70953	validation_0-loss_value:0.035359
[258]	validation_0-rmse:0.709479	validation_0-loss_value:0.035355
[259]	valida

[369]	validation_0-rmse:0.703604	validation_0-loss_value:0.034742
[370]	validation_0-rmse:0.703601	validation_0-loss_value:0.034741
[371]	validation_0-rmse:0.7036	validation_0-loss_value:0.034741
[372]	validation_0-rmse:0.703577	validation_0-loss_value:0.03474
[373]	validation_0-rmse:0.703574	validation_0-loss_value:0.034742
[374]	validation_0-rmse:0.703623	validation_0-loss_value:0.034749
[375]	validation_0-rmse:0.703582	validation_0-loss_value:0.034746
[376]	validation_0-rmse:0.703497	validation_0-loss_value:0.034736
[377]	validation_0-rmse:0.703495	validation_0-loss_value:0.034735
[378]	validation_0-rmse:0.703459	validation_0-loss_value:0.034731
[379]	validation_0-rmse:0.703389	validation_0-loss_value:0.034725
[380]	validation_0-rmse:0.703347	validation_0-loss_value:0.03472
[381]	validation_0-rmse:0.703355	validation_0-loss_value:0.03472
[382]	validation_0-rmse:0.703292	validation_0-loss_value:0.034714
[383]	validation_0-rmse:0.703283	validation_0-loss_value:0.034714
[384]	validatio

[494]	validation_0-rmse:0.700919	validation_0-loss_value:0.034466
[495]	validation_0-rmse:0.700885	validation_0-loss_value:0.034461
[496]	validation_0-rmse:0.700836	validation_0-loss_value:0.034457
[497]	validation_0-rmse:0.700843	validation_0-loss_value:0.03446
[498]	validation_0-rmse:0.70085	validation_0-loss_value:0.034459
[499]	validation_0-rmse:0.700865	validation_0-loss_value:0.03446
[500]	validation_0-rmse:0.700864	validation_0-loss_value:0.03446
[501]	validation_0-rmse:0.70089	validation_0-loss_value:0.034463
[502]	validation_0-rmse:0.700865	validation_0-loss_value:0.03446
[503]	validation_0-rmse:0.700869	validation_0-loss_value:0.03446
[504]	validation_0-rmse:0.700862	validation_0-loss_value:0.03446
[505]	validation_0-rmse:0.700838	validation_0-loss_value:0.034458
[506]	validation_0-rmse:0.70083	validation_0-loss_value:0.034456
[507]	validation_0-rmse:0.700765	validation_0-loss_value:0.03445
[508]	validation_0-rmse:0.700846	validation_0-loss_value:0.034457
[509]	validation_0-r

[619]	validation_0-rmse:0.699271	validation_0-loss_value:0.03428
[620]	validation_0-rmse:0.699238	validation_0-loss_value:0.034277
[621]	validation_0-rmse:0.699188	validation_0-loss_value:0.034272
[622]	validation_0-rmse:0.699172	validation_0-loss_value:0.034271
[623]	validation_0-rmse:0.699113	validation_0-loss_value:0.034265
[624]	validation_0-rmse:0.699126	validation_0-loss_value:0.034265
[625]	validation_0-rmse:0.69913	validation_0-loss_value:0.034265
[626]	validation_0-rmse:0.699138	validation_0-loss_value:0.034265
[627]	validation_0-rmse:0.699176	validation_0-loss_value:0.034268
[628]	validation_0-rmse:0.699147	validation_0-loss_value:0.034265
[629]	validation_0-rmse:0.699163	validation_0-loss_value:0.034266
[630]	validation_0-rmse:0.699151	validation_0-loss_value:0.034265
[631]	validation_0-rmse:0.699168	validation_0-loss_value:0.034266
[632]	validation_0-rmse:0.699166	validation_0-loss_value:0.034266
[633]	validation_0-rmse:0.699152	validation_0-loss_value:0.034265
[634]	valida

[744]	validation_0-rmse:0.698206	validation_0-loss_value:0.034177
[745]	validation_0-rmse:0.698216	validation_0-loss_value:0.034179
[746]	validation_0-rmse:0.698206	validation_0-loss_value:0.034179
[747]	validation_0-rmse:0.698198	validation_0-loss_value:0.034179
[748]	validation_0-rmse:0.698179	validation_0-loss_value:0.034177
[749]	validation_0-rmse:0.698186	validation_0-loss_value:0.034178
[750]	validation_0-rmse:0.698174	validation_0-loss_value:0.034177
[751]	validation_0-rmse:0.698208	validation_0-loss_value:0.03418
[752]	validation_0-rmse:0.698163	validation_0-loss_value:0.034177
[753]	validation_0-rmse:0.698148	validation_0-loss_value:0.034176
[754]	validation_0-rmse:0.698136	validation_0-loss_value:0.034175
[755]	validation_0-rmse:0.698144	validation_0-loss_value:0.034176
[756]	validation_0-rmse:0.698137	validation_0-loss_value:0.034174
[757]	validation_0-rmse:0.698159	validation_0-loss_value:0.034177
[758]	validation_0-rmse:0.698125	validation_0-loss_value:0.034174
[759]	valid

[869]	validation_0-rmse:0.697508	validation_0-loss_value:0.034087
[870]	validation_0-rmse:0.697494	validation_0-loss_value:0.034086
[871]	validation_0-rmse:0.697477	validation_0-loss_value:0.034084
[872]	validation_0-rmse:0.697455	validation_0-loss_value:0.034083
[873]	validation_0-rmse:0.697458	validation_0-loss_value:0.034082
[874]	validation_0-rmse:0.697438	validation_0-loss_value:0.03408
[875]	validation_0-rmse:0.697446	validation_0-loss_value:0.03408
[876]	validation_0-rmse:0.697437	validation_0-loss_value:0.034079
[877]	validation_0-rmse:0.697455	validation_0-loss_value:0.03408
[878]	validation_0-rmse:0.697502	validation_0-loss_value:0.034083
[879]	validation_0-rmse:0.697459	validation_0-loss_value:0.034078
[880]	validation_0-rmse:0.697413	validation_0-loss_value:0.034074
[881]	validation_0-rmse:0.697455	validation_0-loss_value:0.034077
[882]	validation_0-rmse:0.697461	validation_0-loss_value:0.034077
[883]	validation_0-rmse:0.697386	validation_0-loss_value:0.034071
[884]	validat

[994]	validation_0-rmse:0.696644	validation_0-loss_value:0.033986
[995]	validation_0-rmse:0.696663	validation_0-loss_value:0.033987
[996]	validation_0-rmse:0.696577	validation_0-loss_value:0.033978
[997]	validation_0-rmse:0.696628	validation_0-loss_value:0.033982
[998]	validation_0-rmse:0.696576	validation_0-loss_value:0.033976
[999]	validation_0-rmse:0.696624	validation_0-loss_value:0.03398
[1000]	validation_0-rmse:0.696657	validation_0-loss_value:0.033981
[1001]	validation_0-rmse:0.696684	validation_0-loss_value:0.033983
[1002]	validation_0-rmse:0.696619	validation_0-loss_value:0.033976
[1003]	validation_0-rmse:0.696588	validation_0-loss_value:0.033973
[1004]	validation_0-rmse:0.696595	validation_0-loss_value:0.033972
[1005]	validation_0-rmse:0.696637	validation_0-loss_value:0.033976
[1006]	validation_0-rmse:0.696633	validation_0-loss_value:0.033975
[1007]	validation_0-rmse:0.696657	validation_0-loss_value:0.033977
[1008]	validation_0-rmse:0.696651	validation_0-loss_value:0.033977
[1

[1117]	validation_0-rmse:0.696223	validation_0-loss_value:0.033929
[1118]	validation_0-rmse:0.696198	validation_0-loss_value:0.033926
[1119]	validation_0-rmse:0.696166	validation_0-loss_value:0.033923
[1120]	validation_0-rmse:0.696156	validation_0-loss_value:0.033922
[1121]	validation_0-rmse:0.696152	validation_0-loss_value:0.033922
[1122]	validation_0-rmse:0.696061	validation_0-loss_value:0.033911
[1123]	validation_0-rmse:0.696098	validation_0-loss_value:0.033913
[1124]	validation_0-rmse:0.696086	validation_0-loss_value:0.033911
[1125]	validation_0-rmse:0.696081	validation_0-loss_value:0.033911
[1126]	validation_0-rmse:0.696101	validation_0-loss_value:0.033913
[1127]	validation_0-rmse:0.696084	validation_0-loss_value:0.033911
[1128]	validation_0-rmse:0.695967	validation_0-loss_value:0.033898
[1129]	validation_0-rmse:0.696	validation_0-loss_value:0.033902
[1130]	validation_0-rmse:0.695998	validation_0-loss_value:0.033901
[1131]	validation_0-rmse:0.696011	validation_0-loss_value:0.03390

In [177]:
from math import log1p,pow
# 计算误差损失（评估分数）
def calc_logloss(true_df,pred_df):
    loss_sum=0
    rows=true_df.shape[0]
    for c in true_df.columns:
        #预测结果必须要>0,否则log函数会报错，导致最终提交结果没有分数
        true_df[c]=true_df[c].apply(lambda x:log1p(x)) #+ 1
        pred_df[c]=pred_df[c].apply(lambda x:log1p(x)) #+ 1
        true_df[c+'new']=pred_df[c]-true_df[c]
        true_df[c+'new']=true_df[c+'new'].apply(lambda x:pow(x,2))
        loss_item=(true_df[c+'new'].sum())/rows
        loss_sum+=loss_item
        print('%s的loss：%f'%(c,loss_item))
    print('五项指标平均loss分数：',loss_sum/5)

In [178]:
columns = ['收缩压','舒张压','血清甘油三酯','血清高密度脂蛋白','血清低密度脂蛋白']
#X_test_pred = np.ndarray.round(X_test_pred, 3)#保留三位小数
y_test = pd.DataFrame(y_test, columns=columns)
#y_pred_res = pd.DataFrame(y_pred, columns=columns)
calc_logloss(y_test, X_test_pred)

收缩压的loss：0.003854
舒张压的loss：0.004534
血清甘油三酯的loss：0.026839
血清高密度脂蛋白的loss：0.003925
血清低密度脂蛋白的loss：0.009060
五项指标平均loss分数： 0.009642337874053544


In [179]:
test[target_feat].head()

Unnamed: 0,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
0,128.063919,79.440369,1.713544,1.489995,2.921557
1,127.858917,71.089302,1.254103,1.384393,2.768798
2,134.434799,84.086586,2.493349,1.251173,2.86818
3,129.48587,80.440987,1.745231,1.556447,2.22686
4,125.692802,72.833405,1.824537,1.530622,3.443168


In [180]:
for feat in ['收缩压', '舒张压', '血清甘油三酯', '血清高密度脂蛋白', '血清低密度脂蛋白']:
    test[feat] = test[feat].apply(lambda x: abs(x) if x < 0 else x)

In [181]:
test[['vid'] + target_feat].to_csv('test_new.csv', index=False, header=False, encoding='utf-8', float_format='%.3f')