In [7]:
import pandas as pd

def _load_table(path, columns):
    """Read a headerless tab-separated file and label its columns."""
    table = pd.read_csv(path, header=None, sep='\t')
    table.columns = columns
    return table


def _attach_profiles(invites):
    """Left-join user attributes (on uid) and then question attributes (on qid)."""
    with_users = pd.merge(invites, user_info, how='left', on='uid')
    return pd.merge(with_users, question_info, how='left', on='qid')


# User profiles
user_info = _load_table(
    'data/member_info_0926.txt',
    ['uid', 'gender', 'visit_freq',
     'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e',
     'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c', 'u_multi_cat_d', 'u_multi_cat_e',
     'u_salt_value', 'u_focus_topic', 'u_like_topic'])

# Question metadata
question_info = _load_table(
    'data/question_info_0926.txt',
    ['qid', 'q_start_date', 'q_head_sw', 'q_head_w', 'q_desc_sw', 'q_desc_w', 'q_topic'])

# Training invitations (labelled with `answer`)
train = _attach_profiles(
    _load_table('data/invite_info_0926.txt', ['qid', 'uid', 'i_start_date', 'answer']))

# Evaluation invitations (no label column)
test = _attach_profiles(
    _load_table('data/invite_info_evaluate_0926.txt', ['qid', 'uid', 'i_start_date']))

In [146]:
# Stack train and test so every feature-engineering step is applied to both at once.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; otherwise the test rows
# keep their own 0-based labels and collide with the train rows' labels, which is unsafe
# for any label-aligned operation.
# sort=True orders the union of columns alphabetically; `answer` is NaN for the test rows
# because the test table has no such column.
data = pd.concat([train, test], axis=0, sort=True, ignore_index=True)

In [147]:
# Identifying columns of the test rows, kept for writing the submission file.
# `data` is train followed by test, so every row from position train.shape[0] onwards is a test row.
# .copy() makes this an independent frame: the prediction column assigned to it later then
# does not raise pandas' SettingWithCopyWarning (assignment into a slice of another frame).
result_append = data[['qid', 'uid', 'i_start_date']].iloc[train.shape[0]:].copy()

**时间解析**

In [148]:
def _day_of(stamp):
    """Return the integer after 'D' in the first '-'-separated part of `stamp`."""
    return int(stamp.split('-')[0].split('D')[1])


def _hour_of(stamp):
    """Return the integer after 'H' in the second '-'-separated part of `stamp`."""
    return int(stamp.split('-')[1].split('H')[1])


# Split the invitation ('i') and question ('q') timestamps into day and hour columns.
for prefix in ('i', 'q'):
    stamps = data[prefix + '_start_date']
    data[prefix + '_start_day'] = stamps.apply(_day_of)
    data[prefix + '_start_hour'] = stamps.apply(_hour_of)

**历史回答数据**

In [149]:
# Column layout of the headerless answer file, in file order.
answer_columns = ['aid', 'qid', 'uid', 'a_start_date', 'a_sw', 'a_w', 'good', 'recommend',
                  'round_table', 'picture', 'video', 'num_word', 'num_agree', 'num_cancel',
                  'num_commend', 'num_collect', 'num_thank', 'num_report', 'num_helpless',
                  'num_disagree']
# Columns not used downstream; only the keys (qid, uid) and the num_* counters are kept.
unused_answer_columns = ['aid', 'a_start_date', 'a_sw', 'a_w', 'good', 'recommend',
                         'round_table', 'picture', 'video']

answer_info = pd.read_csv('data/answer_info_0926.txt', header=None, sep='\t')
answer_info.columns = answer_columns
answer_info = answer_info.drop(columns=unused_answer_columns)

In [150]:
# Attach the answer details (word count, agree count, ...) of each invitation.
# Note that fillna(0) is applied to the whole merged frame: invitations without an answer get 0
# in every num_* column, and any other NaN (e.g. `answer` on the test rows) becomes 0 as well.
rows_before_merge = len(data)
data = data.merge(answer_info, on=['qid', 'uid'], how='left').fillna(0)

# A left merge adds rows when the right side repeats a key. Later cells split train/test purely
# by row position (first train.shape[0] rows), so extra rows would silently misalign labels.
assert len(data) == rows_before_merge, (
    'answer_info contains duplicate (qid, uid) pairs: the merge changed the row count '
    'from {} to {}'.format(rows_before_merge, len(data)))

# Cast the counters and the label to integers (possible now that no NaN is left in them).
int_columns = ['num_word', 'num_agree', 'num_cancel','num_commend', 'num_collect', 
               'num_thank', 'num_report', 'num_helpless', 'num_disagree']
data[int_columns] = data[int_columns].astype('int64')
data['answer'] = data['answer'].astype('int64')

**用户历史回答统计特征**

In [151]:
# Load the per-user statistics table (tab-separated; the first row is used as the header).
# It is joined on `uid` in the next cell and supplies the u_total_*_raw columns.
# NOTE(review): how feature/user_stat_raw.txt is generated is not shown in this notebook.
user_stat_raw = pd.read_csv('feature/user_stat_raw.txt', sep='\t')

In [152]:
# Join the per-user totals onto every invitation row.
data = pd.merge(data, user_stat_raw, how='left', on='uid')

# Users absent from user_stat_raw get 0 for every total; all totals are stored as int64.
u_stat_columns = ['u_total_{}_raw'.format(stat) for stat in
                  ('word', 'agree', 'cancel', 'commend', 'collect', 'thank',
                   'report', 'helpless', 'disagree', 'invite', 'answer')]
filled_stats = data[u_stat_columns].fillna(0)
data[u_stat_columns] = filled_stats.astype('int64')

In [160]:
# Guard against label leakage: for every per-user total, subtract the contribution of the
# current row so that u_total_<stat> no longer contains this invitation's own outcome.
for raw_col in u_stat_columns:
    if raw_col == 'u_total_invite_raw':
        continue    # the invitation count gets no adjusted counterpart

    parts = raw_col.split('_')
    stat_name = parts[2]                 # e.g. 'word' from 'u_total_word_raw'
    clean_col = '_'.join(parts[:3])      # e.g. 'u_total_word'

    if raw_col != 'u_total_answer_raw':
        # raw total minus this row's own num_<stat>
        data[clean_col] = data[raw_col] - data['num_' + stat_name]
    else:
        own_answer_removed = data[raw_col] - data['answer']
        # Some invitations are labelled answer == 1 although the user has no row in the
        # answer data, which yields -1 here; map those back to 0.
        data[clean_col] = own_answer_removed.replace(-1, 0)


In [479]:
# data['u_total_answer'] = data['u_total_answer'].replace(-1, 0)

**类别特征 encoding**

In [171]:
from sklearn.preprocessing import LabelEncoder
# Columns replaced in place by integer codes.
class_feat = (['uid', 'qid', 'gender', 'visit_freq']
              + ['u_multi_cat_' + suffix for suffix in 'abcde'])

# One encoder object is reused and refitted for every column.
encoder = LabelEncoder()
for feat in class_feat:
    data[feat] = encoder.fit_transform(data[feat])

**计数特征**

In [172]:
count_source_feats = (['uid', 'qid', 'gender', 'visit_freq']
                      + ['u_2_cat_' + suffix for suffix in 'abcde']
                      + ['u_multi_cat_' + suffix for suffix in 'abcde'])

for feat in count_source_feats:
    col_name = '{}_count'.format(feat)

    # Values that occur only once are collapsed into a single bucket: they are set to -1
    # and the whole column is then shifted by +1, so the bucket ends up as 0.
    occurrences = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[occurrences < 2, feat] = -1
    data[feat] += 1

    # Frequency of each (re-bucketed) value, min-max scaled.
    regrouped = data[feat].map(data[feat].value_counts().astype(int))
    lowest, highest = regrouped.min(), regrouped.max()
    data[col_name] = (regrouped - lowest) / (highest - lowest)

In [174]:
# Display every column currently in `data` (quick check after the feature-engineering cells).
data.columns

Index(['answer', 'gender', 'i_start_date', 'q_desc_sw', 'q_desc_w',
       'q_head_sw', 'q_head_w', 'q_start_date', 'q_topic', 'qid', 'u_2_cat_a',
       'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_focus_topic',
       'u_like_topic', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c',
       'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid', 'visit_freq',
       'i_start_day', 'i_start_hour', 'q_start_day', 'q_start_hour',
       'num_word', 'num_agree', 'num_cancel', 'num_commend', 'num_collect',
       'num_thank', 'num_report', 'num_helpless', 'num_disagree',
       'u_total_word_raw', 'u_total_agree_raw', 'u_total_cancel_raw',
       'u_total_commend_raw', 'u_total_collect_raw', 'u_total_thank_raw',
       'u_total_report_raw', 'u_total_helpless_raw', 'u_total_disagree_raw',
       'u_total_invite_raw', 'u_total_answer_raw', 'u_total_word',
       'u_total_agree', 'u_total_cancel', 'u_total_commend', 'u_total_collect',
       'u_total_thank', 'u_total_report', 'u_t

**选择训练特征**

In [528]:
# Build the list of columns fed to the model. 'answer' (the label) comes first; later cells
# drop it from the feature matrix. Commented-out lines are alternative selections.
used_feat = ['answer']

# question_info
used_feat += ['qid']
# used_feat += ['qid', 'q_desc_sw', 'q_desc_w','q_head_sw', 'q_head_w', 'q_start_date', 'q_topic']


# member_info
used_feat += ['uid','gender','visit_freq','u_2_cat_a','u_2_cat_b','u_2_cat_c','u_2_cat_d',
              'u_2_cat_e','u_multi_cat_a','u_multi_cat_b','u_multi_cat_c','u_multi_cat_d',
              'u_multi_cat_e','u_salt_value']
# used_feat += ['uid','gender','visit_freq','u_2_cat_a','u_2_cat_b','u_2_cat_c','u_2_cat_d',
#               'u_2_cat_e','u_multi_cat_a','u_multi_cat_b','u_multi_cat_c','u_multi_cat_d',
#               'u_multi_cat_e','u_salt_value','u_focus_topic','u_like_topic']
# used_feat += ['gender','visit_freq','u_2_cat_a','u_2_cat_b','u_2_cat_c','u_2_cat_d',
#               'u_2_cat_e','u_multi_cat_a','u_multi_cat_b','u_multi_cat_c','u_multi_cat_d',
#               'u_multi_cat_e','u_salt_value']

# answer_info: not used
# used_feat += ['num_word', 'num_agree', 'num_cancel', 'num_commend',
#               'num_collect', 'num_thank', 'num_report', 'num_helpless', 'num_disagree']

# invite_info: not used
# used_feat += ['i_start_date']

# Count features
used_feat += ['uid_count','qid_count','gender_count', 'visit_freq_count','u_2_cat_a_count', 
              'u_2_cat_b_count', 'u_2_cat_c_count', 'u_2_cat_d_count', 'u_2_cat_e_count',
              'u_multi_cat_a_count','u_multi_cat_b_count','u_multi_cat_c_count',
              'u_multi_cat_d_count','u_multi_cat_e_count']
# used_feat += ['gender_count', 'visit_freq_count','u_2_cat_a_count', 
#               'u_2_cat_b_count', 'u_2_cat_c_count', 'u_2_cat_d_count', 'u_2_cat_e_count',
#               'u_multi_cat_a_count','u_multi_cat_b_count','u_multi_cat_c_count',
#               'u_multi_cat_d_count','u_multi_cat_e_count']
# used_feat += ['u_total_invite_raw']  # overlaps with uid_count

# Parsed time features
used_feat += ['i_start_hour', 'i_start_day', 'q_start_day', 'q_start_hour']
# used_feat += ['q_start_day']


# Leak-adjusted per-user history features (current row's contribution subtracted)
# used_feat += ['u_total_word', 'u_total_agree', 'u_total_cancel','u_total_commend', 
#              'u_total_collect', 'u_total_thank','u_total_report', 'u_total_helpless',
#              'u_total_disagree', 'u_total_answer']
used_feat += ['u_total_word', 'u_total_agree', 'u_total_collect', 'u_total_answer']

# Unadjusted (raw) per-user history features
# used_feat += ['u_total_word_raw', 'u_total_agree_raw', 'u_total_cancel_raw','u_total_commend_raw', 
#               'u_total_collect_raw', 'u_total_thank_raw','u_total_report_raw', 'u_total_helpless_raw', 
#               'u_total_disagree_raw', 'u_total_answer_raw']
# used_feat += ['u_total_word_raw', 'u_total_agree_raw', 'u_total_collect_raw', 'u_total_answer_raw']


# Sanity checks on the selection
t1 = set(data.columns)
t11 = list(data.columns)
assert len(t1) == len(t11)      # `data` has no duplicated column names
print('data:', len(t11))
t2 = set(used_feat)
t22 = list(used_feat)
assert len(t2) == len(t22)      # no feature was selected twice
print('used_feat:', len(t22))

# Every selected feature must exist in `data`; fail here with the offending names rather
# than with a KeyError when the feature matrix is built.
missing_feat = sorted(t2 - t1)
assert not missing_feat, 'used_feat contains columns missing from data: {}'.format(missing_feat)

# Number of columns of `data` that are not used for training.
print('t1-t2:',len(t1 - t2))
# print(t1-t2)

# used_data = data[used_feat]
# print('columns for train:', used_data.columns)
print('columns for train:', used_feat)

data: 73
used_feat: 38
t1-t2: 35
columns for train: ['answer', 'qid', 'uid', 'gender', 'visit_freq', 'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c', 'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid_count', 'qid_count', 'gender_count', 'visit_freq_count', 'u_2_cat_a_count', 'u_2_cat_b_count', 'u_2_cat_c_count', 'u_2_cat_d_count', 'u_2_cat_e_count', 'u_multi_cat_a_count', 'u_multi_cat_b_count', 'u_multi_cat_c_count', 'u_multi_cat_d_count', 'u_multi_cat_e_count', 'i_start_hour', 'i_start_day', 'q_start_day', 'q_start_hour', 'u_total_word', 'u_total_agree', 'u_total_collect', 'u_total_answer']


**划分数据集**

In [529]:
# The validation set must be built the same way as the test set
import copy
def process_eval_feat(used_feat):
    """Return the feature list to use for the validation rows.

    Every leak-adjusted history feature (``u_total_<stat>``) in ``used_feat`` is swapped
    for its ``_raw`` counterpart; all other names are kept, and positions are preserved.
    The input list is not modified.
    """
    raw_backed = ('u_total_word', 'u_total_agree', 'u_total_cancel', 'u_total_commend',
                  'u_total_collect', 'u_total_thank', 'u_total_report', 'u_total_helpless',
                  'u_total_disagree', 'u_total_answer')
    return [name + '_raw' if name in raw_backed else name for name in used_feat]

In [530]:
# Pick out the categorical features among the features in use
def get_cat_feat(used_feat):
    """Return the entries of ``used_feat`` that are categorical columns, in their original order."""
    categorical = (['uid', 'qid', 'gender', 'visit_freq']
                   + ['u_multi_cat_' + suffix for suffix in 'abcde']
                   + ['u_2_cat_' + suffix for suffix in 'abcde'])
    return [name for name in used_feat if name in categorical]

In [531]:
from lightgbm import LGBMClassifier
import lightgbm as lgb

is_eval = True   # if True, the last `days_eval` day(s) of the training rows are held out for validation
days_eval = 1
cat_feat = get_cat_feat(used_feat)

# `data` is train followed by test, so the first n_train rows are the labelled ones.
n_train = train.shape[0]

if not is_eval:
    # No hold-out: train on all labelled rows and report metrics on the training data itself.
    temp = data[used_feat]
    y_train = temp[:n_train]['answer']
    X_train = temp[:n_train].drop(['answer'], axis=1)

    train_set_lgb = lgb.Dataset(X_train, y_train, free_raw_data=False, categorical_feature=cat_feat)
    eval_set_lgb = train_set_lgb

else:
    # Rows whose invitation day is before the cut-off go to training, the rest to validation.
    # NOTE(review): 3867 is presumably the last day present in the training invitations — confirm.
    train_idx = data['i_start_day'][:n_train] < (3867 - days_eval + 1)

    temp = data[used_feat]
    y_train = temp[:n_train][train_idx]['answer']
    X_train = temp[:n_train][train_idx].drop(['answer'], axis=1)

    # Validation rows use the *_raw history features in place of the leak-adjusted ones
    # (see process_eval_feat); column positions are the same as in X_train.
    eval_feature = process_eval_feat(used_feat)
    print('used_feat:', used_feat)
    print('eval_feature:', eval_feature)
    temp = data[eval_feature]
    y_eval = temp[:n_train][~train_idx]['answer']
    X_eval = temp[:n_train][~train_idx].drop(['answer'], axis=1)

    train_set_lgb = lgb.Dataset(X_train, y_train, free_raw_data=False, categorical_feature=cat_feat)
    # The validation Dataset declares the same categorical features as the training Dataset
    # (cat_feat); class_feat omits the u_2_cat_* columns and would disagree with it.
    eval_set_lgb = lgb.Dataset(X_eval, y_eval, free_raw_data=False, categorical_feature=cat_feat, reference=train_set_lgb)

# Feature matrix of the test rows (everything after the labelled rows).
X_test = data[used_feat][n_train:].drop(['answer'], axis=1)

used_feat: ['answer', 'qid', 'uid', 'gender', 'visit_freq', 'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c', 'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid_count', 'qid_count', 'gender_count', 'visit_freq_count', 'u_2_cat_a_count', 'u_2_cat_b_count', 'u_2_cat_c_count', 'u_2_cat_d_count', 'u_2_cat_e_count', 'u_multi_cat_a_count', 'u_multi_cat_b_count', 'u_multi_cat_c_count', 'u_multi_cat_d_count', 'u_multi_cat_e_count', 'i_start_hour', 'i_start_day', 'q_start_day', 'q_start_hour', 'u_total_word', 'u_total_agree', 'u_total_collect', 'u_total_answer']
eval_feature: ['answer', 'qid', 'uid', 'gender', 'visit_freq', 'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c', 'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid_count', 'qid_count', 'gender_count', 'visit_freq_count', 'u_2_cat_a_count', 'u_2_cat_b_count', 'u_2_cat_c_count', 'u_2_cat_d_count', 'u_

In [532]:
# Shapes of the training split followed by the validation split (labels, then features).
for split_part in (y_train, X_train, y_eval, X_eval):
    print(split_part.shape)

(9141216,)
(9141216, 37)
(347946,)
(347946, 37)


**训练**

In [533]:
# LightGBM training parameters (binary classification, evaluated with AUC and log-loss).
params = dict(
    boosting_type='gbdt',
    # tree size and step size
    num_leaves=64,
    learning_rate=0.01,
    # histogram construction
    max_bin=425,
    subsample_for_bin=50000,
    objective='binary',
    # split / leaf constraints
    min_split_gain=0,
    min_child_weight=5,
    min_child_samples=10,
    # row and column sampling
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=1,
    # L1 / L2 regularisation
    reg_alpha=3,
    reg_lambda=5,
    seed=1000,
    n_jobs=-1,
    silent=True,
    metric=['auc', 'binary_logloss'],
)

In [None]:
# Train the booster on train_set_lgb and evaluate on eval_set_lgb after every round;
# the log below reports the two metrics listed in params['metric'].
# NOTE(review): `early_stopping_rounds` is passed as a keyword of lgb.train; LightGBM 4.x replaced
# that keyword with the lgb.early_stopping callback — confirm against the installed version.
gbm = lgb.train(params, 
                train_set_lgb, 
                valid_sets=eval_set_lgb, 
                early_stopping_rounds=10,    # stop once the validation scores have not improved for 10 rounds
                num_boost_round = 2000       # upper bound on the number of boosting rounds
               )

[1]	valid_0's auc: 0.744319	valid_0's binary_logloss: 0.432834
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.74656	valid_0's binary_logloss: 0.431729
[3]	valid_0's auc: 0.746631	valid_0's binary_logloss: 0.430648
[4]	valid_0's auc: 0.747448	valid_0's binary_logloss: 0.429564
[5]	valid_0's auc: 0.747631	valid_0's binary_logloss: 0.42854
[6]	valid_0's auc: 0.748319	valid_0's binary_logloss: 0.427523
[7]	valid_0's auc: 0.748162	valid_0's binary_logloss: 0.426526
[8]	valid_0's auc: 0.748638	valid_0's binary_logloss: 0.42557
[9]	valid_0's auc: 0.749882	valid_0's binary_logloss: 0.424632
[10]	valid_0's auc: 0.749885	valid_0's binary_logloss: 0.42372
[11]	valid_0's auc: 0.750827	valid_0's binary_logloss: 0.422821
[12]	valid_0's auc: 0.750819	valid_0's binary_logloss: 0.42193
[13]	valid_0's auc: 0.751136	valid_0's binary_logloss: 0.421061
[14]	valid_0's auc: 0.751399	valid_0's binary_logloss: 0.420231
[15]	valid_0's auc: 0.752338	valid_0's binary_logloss: 0.

[129]	valid_0's auc: 0.771986	valid_0's binary_logloss: 0.376176
[130]	valid_0's auc: 0.772082	valid_0's binary_logloss: 0.376013
[131]	valid_0's auc: 0.772165	valid_0's binary_logloss: 0.375836
[132]	valid_0's auc: 0.772241	valid_0's binary_logloss: 0.375683
[133]	valid_0's auc: 0.772319	valid_0's binary_logloss: 0.375525
[134]	valid_0's auc: 0.772355	valid_0's binary_logloss: 0.375385
[135]	valid_0's auc: 0.772469	valid_0's binary_logloss: 0.375221
[136]	valid_0's auc: 0.772581	valid_0's binary_logloss: 0.37505
[137]	valid_0's auc: 0.772672	valid_0's binary_logloss: 0.374894
[138]	valid_0's auc: 0.772755	valid_0's binary_logloss: 0.374734
[139]	valid_0's auc: 0.772823	valid_0's binary_logloss: 0.374586
[140]	valid_0's auc: 0.772875	valid_0's binary_logloss: 0.374439
[141]	valid_0's auc: 0.772961	valid_0's binary_logloss: 0.374284
[142]	valid_0's auc: 0.773049	valid_0's binary_logloss: 0.374143
[143]	valid_0's auc: 0.773135	valid_0's binary_logloss: 0.373986
[144]	valid_0's auc: 0.773

[256]	valid_0's auc: 0.780338	valid_0's binary_logloss: 0.363883
[257]	valid_0's auc: 0.780357	valid_0's binary_logloss: 0.36384
[258]	valid_0's auc: 0.780407	valid_0's binary_logloss: 0.363784
[259]	valid_0's auc: 0.780434	valid_0's binary_logloss: 0.36374
[260]	valid_0's auc: 0.780461	valid_0's binary_logloss: 0.363693
[261]	valid_0's auc: 0.780512	valid_0's binary_logloss: 0.363631
[262]	valid_0's auc: 0.780529	valid_0's binary_logloss: 0.363588
[263]	valid_0's auc: 0.780621	valid_0's binary_logloss: 0.36351
[264]	valid_0's auc: 0.780718	valid_0's binary_logloss: 0.363434
[265]	valid_0's auc: 0.780779	valid_0's binary_logloss: 0.363372
[266]	valid_0's auc: 0.780849	valid_0's binary_logloss: 0.363305
[267]	valid_0's auc: 0.780875	valid_0's binary_logloss: 0.363263
[268]	valid_0's auc: 0.780931	valid_0's binary_logloss: 0.363208
[269]	valid_0's auc: 0.780972	valid_0's binary_logloss: 0.363153
[270]	valid_0's auc: 0.781039	valid_0's binary_logloss: 0.36309
[271]	valid_0's auc: 0.781114

[383]	valid_0's auc: 0.786318	valid_0's binary_logloss: 0.358533
[384]	valid_0's auc: 0.786351	valid_0's binary_logloss: 0.358504
[385]	valid_0's auc: 0.78638	valid_0's binary_logloss: 0.358476
[386]	valid_0's auc: 0.78641	valid_0's binary_logloss: 0.358453
[387]	valid_0's auc: 0.786437	valid_0's binary_logloss: 0.358431
[388]	valid_0's auc: 0.786467	valid_0's binary_logloss: 0.358403
[389]	valid_0's auc: 0.7865	valid_0's binary_logloss: 0.358378
[390]	valid_0's auc: 0.786544	valid_0's binary_logloss: 0.358342
[391]	valid_0's auc: 0.78656	valid_0's binary_logloss: 0.358322
[392]	valid_0's auc: 0.786591	valid_0's binary_logloss: 0.358299
[393]	valid_0's auc: 0.786639	valid_0's binary_logloss: 0.358261
[394]	valid_0's auc: 0.786667	valid_0's binary_logloss: 0.358237
[395]	valid_0's auc: 0.786707	valid_0's binary_logloss: 0.358205
[396]	valid_0's auc: 0.786728	valid_0's binary_logloss: 0.358183
[397]	valid_0's auc: 0.786765	valid_0's binary_logloss: 0.358155
[398]	valid_0's auc: 0.786789	

[510]	valid_0's auc: 0.789517	valid_0's binary_logloss: 0.35598
[511]	valid_0's auc: 0.789539	valid_0's binary_logloss: 0.355963
[512]	valid_0's auc: 0.789557	valid_0's binary_logloss: 0.35595
[513]	valid_0's auc: 0.789567	valid_0's binary_logloss: 0.35594
[514]	valid_0's auc: 0.789597	valid_0's binary_logloss: 0.355917
[515]	valid_0's auc: 0.789628	valid_0's binary_logloss: 0.355895
[516]	valid_0's auc: 0.789661	valid_0's binary_logloss: 0.355874
[517]	valid_0's auc: 0.789684	valid_0's binary_logloss: 0.355859
[518]	valid_0's auc: 0.789703	valid_0's binary_logloss: 0.355846
[519]	valid_0's auc: 0.789718	valid_0's binary_logloss: 0.355836
[520]	valid_0's auc: 0.789734	valid_0's binary_logloss: 0.355823
[521]	valid_0's auc: 0.789746	valid_0's binary_logloss: 0.355811
[522]	valid_0's auc: 0.789791	valid_0's binary_logloss: 0.355781
[523]	valid_0's auc: 0.789801	valid_0's binary_logloss: 0.355774
[524]	valid_0's auc: 0.789815	valid_0's binary_logloss: 0.355762
[525]	valid_0's auc: 0.78984

[637]	valid_0's auc: 0.791712	valid_0's binary_logloss: 0.354365
[638]	valid_0's auc: 0.791728	valid_0's binary_logloss: 0.354355
[639]	valid_0's auc: 0.791744	valid_0's binary_logloss: 0.354343
[640]	valid_0's auc: 0.791783	valid_0's binary_logloss: 0.354315
[641]	valid_0's auc: 0.791796	valid_0's binary_logloss: 0.354307
[642]	valid_0's auc: 0.791813	valid_0's binary_logloss: 0.354296
[643]	valid_0's auc: 0.79183	valid_0's binary_logloss: 0.354285
[644]	valid_0's auc: 0.791839	valid_0's binary_logloss: 0.354277
[645]	valid_0's auc: 0.791848	valid_0's binary_logloss: 0.35427
[646]	valid_0's auc: 0.791857	valid_0's binary_logloss: 0.354264
[647]	valid_0's auc: 0.79189	valid_0's binary_logloss: 0.354238
[648]	valid_0's auc: 0.791914	valid_0's binary_logloss: 0.354223
[649]	valid_0's auc: 0.791921	valid_0's binary_logloss: 0.354217
[650]	valid_0's auc: 0.791937	valid_0's binary_logloss: 0.354208
[651]	valid_0's auc: 0.791966	valid_0's binary_logloss: 0.354189
[652]	valid_0's auc: 0.79197

[764]	valid_0's auc: 0.793604	valid_0's binary_logloss: 0.353068
[765]	valid_0's auc: 0.793621	valid_0's binary_logloss: 0.353056
[766]	valid_0's auc: 0.793626	valid_0's binary_logloss: 0.353052
[767]	valid_0's auc: 0.793638	valid_0's binary_logloss: 0.353044
[768]	valid_0's auc: 0.793656	valid_0's binary_logloss: 0.353031
[769]	valid_0's auc: 0.793675	valid_0's binary_logloss: 0.353017
[770]	valid_0's auc: 0.793684	valid_0's binary_logloss: 0.353011
[771]	valid_0's auc: 0.793691	valid_0's binary_logloss: 0.353006
[772]	valid_0's auc: 0.793701	valid_0's binary_logloss: 0.353
[773]	valid_0's auc: 0.793714	valid_0's binary_logloss: 0.352992
[774]	valid_0's auc: 0.793729	valid_0's binary_logloss: 0.352983
[775]	valid_0's auc: 0.793743	valid_0's binary_logloss: 0.352974
[776]	valid_0's auc: 0.793747	valid_0's binary_logloss: 0.352972
[777]	valid_0's auc: 0.793755	valid_0's binary_logloss: 0.352965
[778]	valid_0's auc: 0.79376	valid_0's binary_logloss: 0.352961
[779]	valid_0's auc: 0.793766

[891]	valid_0's auc: 0.794998	valid_0's binary_logloss: 0.352125
[892]	valid_0's auc: 0.795008	valid_0's binary_logloss: 0.352117
[893]	valid_0's auc: 0.795012	valid_0's binary_logloss: 0.352113
[894]	valid_0's auc: 0.79506	valid_0's binary_logloss: 0.352076
[895]	valid_0's auc: 0.795065	valid_0's binary_logloss: 0.352072
[896]	valid_0's auc: 0.795066	valid_0's binary_logloss: 0.352071
[897]	valid_0's auc: 0.795071	valid_0's binary_logloss: 0.352068
[898]	valid_0's auc: 0.795083	valid_0's binary_logloss: 0.352061
[899]	valid_0's auc: 0.795096	valid_0's binary_logloss: 0.352054
[900]	valid_0's auc: 0.795102	valid_0's binary_logloss: 0.35205
[901]	valid_0's auc: 0.795156	valid_0's binary_logloss: 0.352017
[902]	valid_0's auc: 0.795166	valid_0's binary_logloss: 0.352012
[903]	valid_0's auc: 0.795171	valid_0's binary_logloss: 0.352009
[904]	valid_0's auc: 0.795186	valid_0's binary_logloss: 0.351997
[905]	valid_0's auc: 0.795193	valid_0's binary_logloss: 0.351993
[906]	valid_0's auc: 0.7952

[1018]	valid_0's auc: 0.796391	valid_0's binary_logloss: 0.351187
[1019]	valid_0's auc: 0.796401	valid_0's binary_logloss: 0.35118
[1020]	valid_0's auc: 0.796404	valid_0's binary_logloss: 0.351178
[1021]	valid_0's auc: 0.79641	valid_0's binary_logloss: 0.351175
[1022]	valid_0's auc: 0.796413	valid_0's binary_logloss: 0.351172
[1023]	valid_0's auc: 0.796414	valid_0's binary_logloss: 0.351172
[1024]	valid_0's auc: 0.796419	valid_0's binary_logloss: 0.351169
[1025]	valid_0's auc: 0.796424	valid_0's binary_logloss: 0.351166
[1026]	valid_0's auc: 0.796428	valid_0's binary_logloss: 0.351163
[1027]	valid_0's auc: 0.796438	valid_0's binary_logloss: 0.351155
[1028]	valid_0's auc: 0.796439	valid_0's binary_logloss: 0.351155
[1029]	valid_0's auc: 0.796439	valid_0's binary_logloss: 0.351154
[1030]	valid_0's auc: 0.796443	valid_0's binary_logloss: 0.351152
[1031]	valid_0's auc: 0.796446	valid_0's binary_logloss: 0.35115
[1032]	valid_0's auc: 0.79645	valid_0's binary_logloss: 0.351148
[1033]	valid_0

[1143]	valid_0's auc: 0.797329	valid_0's binary_logloss: 0.350533
[1144]	valid_0's auc: 0.797332	valid_0's binary_logloss: 0.350531
[1145]	valid_0's auc: 0.797335	valid_0's binary_logloss: 0.350529
[1146]	valid_0's auc: 0.797337	valid_0's binary_logloss: 0.350528
[1147]	valid_0's auc: 0.797338	valid_0's binary_logloss: 0.350527
[1148]	valid_0's auc: 0.797342	valid_0's binary_logloss: 0.350525
[1149]	valid_0's auc: 0.79735	valid_0's binary_logloss: 0.350521
[1150]	valid_0's auc: 0.797353	valid_0's binary_logloss: 0.350518
[1151]	valid_0's auc: 0.797355	valid_0's binary_logloss: 0.350517
[1152]	valid_0's auc: 0.79736	valid_0's binary_logloss: 0.350514
[1153]	valid_0's auc: 0.797361	valid_0's binary_logloss: 0.350513
[1154]	valid_0's auc: 0.79736	valid_0's binary_logloss: 0.350513
[1155]	valid_0's auc: 0.797354	valid_0's binary_logloss: 0.350517
[1156]	valid_0's auc: 0.797354	valid_0's binary_logloss: 0.350517
[1157]	valid_0's auc: 0.79737	valid_0's binary_logloss: 0.350503
[1158]	valid_0

[1268]	valid_0's auc: 0.797878	valid_0's binary_logloss: 0.350157
[1269]	valid_0's auc: 0.797885	valid_0's binary_logloss: 0.350153
[1270]	valid_0's auc: 0.79789	valid_0's binary_logloss: 0.350151
[1271]	valid_0's auc: 0.797893	valid_0's binary_logloss: 0.350149
[1272]	valid_0's auc: 0.797897	valid_0's binary_logloss: 0.350144
[1273]	valid_0's auc: 0.797904	valid_0's binary_logloss: 0.35014
[1274]	valid_0's auc: 0.797904	valid_0's binary_logloss: 0.350138
[1275]	valid_0's auc: 0.797915	valid_0's binary_logloss: 0.350131
[1276]	valid_0's auc: 0.797919	valid_0's binary_logloss: 0.350129
[1277]	valid_0's auc: 0.797927	valid_0's binary_logloss: 0.350124
[1278]	valid_0's auc: 0.797928	valid_0's binary_logloss: 0.350123
[1279]	valid_0's auc: 0.797931	valid_0's binary_logloss: 0.35012
[1280]	valid_0's auc: 0.797932	valid_0's binary_logloss: 0.350119
[1281]	valid_0's auc: 0.797937	valid_0's binary_logloss: 0.350116
[1282]	valid_0's auc: 0.797939	valid_0's binary_logloss: 0.350115
[1283]	valid_

[1393]	valid_0's auc: 0.798339	valid_0's binary_logloss: 0.349843
[1394]	valid_0's auc: 0.798342	valid_0's binary_logloss: 0.349842
[1395]	valid_0's auc: 0.798345	valid_0's binary_logloss: 0.34984
[1396]	valid_0's auc: 0.798347	valid_0's binary_logloss: 0.349839
[1397]	valid_0's auc: 0.798352	valid_0's binary_logloss: 0.349836
[1398]	valid_0's auc: 0.798351	valid_0's binary_logloss: 0.349835
[1399]	valid_0's auc: 0.798356	valid_0's binary_logloss: 0.349833
[1400]	valid_0's auc: 0.798347	valid_0's binary_logloss: 0.349838
[1401]	valid_0's auc: 0.79835	valid_0's binary_logloss: 0.349836
[1402]	valid_0's auc: 0.798354	valid_0's binary_logloss: 0.349833
[1403]	valid_0's auc: 0.798356	valid_0's binary_logloss: 0.349833
[1404]	valid_0's auc: 0.798356	valid_0's binary_logloss: 0.349833
[1405]	valid_0's auc: 0.798358	valid_0's binary_logloss: 0.349832
[1406]	valid_0's auc: 0.798359	valid_0's binary_logloss: 0.34983
[1407]	valid_0's auc: 0.79836	valid_0's binary_logloss: 0.34983
[1408]	valid_0'

[1518]	valid_0's auc: 0.798655	valid_0's binary_logloss: 0.349629
[1519]	valid_0's auc: 0.798658	valid_0's binary_logloss: 0.349627
[1520]	valid_0's auc: 0.798662	valid_0's binary_logloss: 0.349624
[1521]	valid_0's auc: 0.798662	valid_0's binary_logloss: 0.349624
[1522]	valid_0's auc: 0.798665	valid_0's binary_logloss: 0.349623
[1523]	valid_0's auc: 0.798665	valid_0's binary_logloss: 0.349623
[1524]	valid_0's auc: 0.798668	valid_0's binary_logloss: 0.349621
[1525]	valid_0's auc: 0.798681	valid_0's binary_logloss: 0.349608
[1526]	valid_0's auc: 0.798686	valid_0's binary_logloss: 0.349606
[1527]	valid_0's auc: 0.79869	valid_0's binary_logloss: 0.349604
[1528]	valid_0's auc: 0.798691	valid_0's binary_logloss: 0.349604
[1529]	valid_0's auc: 0.798695	valid_0's binary_logloss: 0.3496
[1530]	valid_0's auc: 0.798698	valid_0's binary_logloss: 0.349599
[1531]	valid_0's auc: 0.798751	valid_0's binary_logloss: 0.349562
[1532]	valid_0's auc: 0.798756	valid_0's binary_logloss: 0.349559
[1533]	valid_

[1643]	valid_0's auc: 0.799115	valid_0's binary_logloss: 0.349313
[1644]	valid_0's auc: 0.799116	valid_0's binary_logloss: 0.349313
[1645]	valid_0's auc: 0.79912	valid_0's binary_logloss: 0.349309
[1646]	valid_0's auc: 0.79912	valid_0's binary_logloss: 0.349307
[1647]	valid_0's auc: 0.799123	valid_0's binary_logloss: 0.349306
[1648]	valid_0's auc: 0.799123	valid_0's binary_logloss: 0.349306
[1649]	valid_0's auc: 0.799124	valid_0's binary_logloss: 0.349305
[1650]	valid_0's auc: 0.799123	valid_0's binary_logloss: 0.349306
[1651]	valid_0's auc: 0.799127	valid_0's binary_logloss: 0.349303
[1652]	valid_0's auc: 0.799129	valid_0's binary_logloss: 0.349303
[1653]	valid_0's auc: 0.79913	valid_0's binary_logloss: 0.349302
[1654]	valid_0's auc: 0.799132	valid_0's binary_logloss: 0.349301
[1655]	valid_0's auc: 0.799136	valid_0's binary_logloss: 0.349296
[1656]	valid_0's auc: 0.79914	valid_0's binary_logloss: 0.349292
[1657]	valid_0's auc: 0.79914	valid_0's binary_logloss: 0.349292
[1658]	valid_0'

**保存模型**

In [517]:
import pickle
# Persist the trained booster. The context manager closes (and flushes) the file handle
# even if pickling raises, instead of leaving an open handle behind.
# NOTE: a pickle file can execute arbitrary code when loaded — only unpickle trusted files.
with open('model/gbm.pkl', 'wb') as model_file:
    pickle.dump(gbm, model_file)

**保存结果**

In [None]:
# Score the test rows and write the submission file.
y_pred = gbm.predict(X_test)
print(y_pred.shape)
# The predictions are attached by position: result_append and X_test are both cut from `data`
# at row train.shape[0]. NOTE(review): this assumes the left merges performed in between
# neither added nor reordered rows — confirm.
result_append['answer'] = y_pred
# Tab-separated output without header row and without index column.
result_append.to_csv('result.txt', index=False, header=False, sep='\t')

**特征重要性**

In [None]:
# Feature names in training order: used_feat without its first entry (the 'answer' label).
col = list(used_feat)
col.pop(0)
# Importance of each feature as reported by the trained booster.
importance = gbm.feature_importance()
# importance
# print(col)
# print(importance)

In [None]:
# One row per feature: raw importance plus its share of the total, formatted as a percentage.
importance_df = pd.DataFrame({'col': col, 'imp': importance})
imp_share = importance_df['imp'] / importance_df['imp'].sum()
importance_df['imp_rate'] = imp_share.map('{:.2%}'.format)
importance_df