In [7]:
import pandas as pd

# --- Member (user) profile table: tab-separated, no header row ---
user_info = pd.read_csv('data/member_info_0926.txt', sep='\t', header=None)
user_info.columns = (
    ['uid', 'gender', 'visit_freq']
    + ['u_2_cat_' + s for s in 'abcde']
    + ['u_multi_cat_' + s for s in 'abcde']
    + ['u_salt_value', 'u_focus_topic', 'u_like_topic']
)

# --- Question table: tab-separated, no header row ---
question_info = pd.read_csv('data/question_info_0926.txt', sep='\t', header=None)
question_info.columns = ['qid', 'q_start_date',
                         'q_head_sw', 'q_head_w',
                         'q_desc_sw', 'q_desc_w',
                         'q_topic']

# --- Training invitations (carry the 'answer' label) ---
# Each invitation is enriched with the member columns (joined on uid) and the
# question columns (joined on qid); left joins keep every invitation row.
train = pd.read_csv('data/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'i_start_date', 'answer']
train = (
    train
    .merge(user_info, how='left', on='uid')
    .merge(question_info, how='left', on='qid')
)

# --- Evaluation invitations (no label column), enriched the same way ---
test = pd.read_csv('data/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'i_start_date']
test = (
    test
    .merge(user_info, how='left', on='uid')
    .merge(question_info, how='left', on='qid')
)

In [146]:
# Stack the labelled training invitations on top of the unlabelled test
# invitations so every feature is engineered once over both. Rows
# [0, train.shape[0]) are train, the remaining rows are test; 'answer' is NaN
# for the test rows because `test` has no such column. sort=True orders the
# columns alphabetically.
# ignore_index=True gives the stacked frame a unique 0..n-1 index instead of
# repeating each input frame's own index labels.
data = pd.concat([train, test], axis=0, sort=True, ignore_index=True)

In [147]:
# Key columns of the test rows, kept aside to build the submission file.
# .copy() makes this an independent frame, so adding the prediction column to
# it later does not write into a slice of `data` (SettingWithCopyWarning).
# .iloc makes the positional split explicit: test rows start at train.shape[0].
result_append = data[['qid', 'uid', 'i_start_date']].iloc[train.shape[0]:].copy()

**时间解析**

In [148]:
def _number_after(stamp, part, marker):
    """Return the integer that follows `marker` in the `part`-th '-'-separated field of `stamp`."""
    return int(stamp.split('-')[part].split(marker)[1])


# The timestamps are parsed as '<..>D<day>-<..>H<hour>' strings: the day number
# follows 'D' in the first field, the hour follows 'H' in the second one.
# 'i' = invitation time, 'q' = question creation time.
for prefix in ('i', 'q'):
    stamps = data[prefix + '_start_date']
    data[prefix + '_start_day'] = stamps.apply(_number_after, args=(0, 'D'))
    data[prefix + '_start_hour'] = stamps.apply(_number_after, args=(1, 'H'))

**历史回答数据**

In [149]:
# Historical answer records: tab-separated, no header row.
answer_info = pd.read_csv('data/answer_info_0926.txt', sep='\t', header=None)
answer_info.columns = [
    'aid', 'qid', 'uid', 'a_start_date', 'a_sw', 'a_w',
    'good', 'recommend', 'round_table', 'picture', 'video',
    'num_word', 'num_agree', 'num_cancel', 'num_commend', 'num_collect',
    'num_thank', 'num_report', 'num_helpless', 'num_disagree',
]

# Keep only the join keys (qid, uid) and the num_* counters; the answer id,
# timestamp, text columns and flag columns are dropped.
unused_answer_cols = ['aid', 'a_start_date', 'a_sw', 'a_w', 'good', 'recommend',
                      'round_table', 'picture', 'video']
answer_info = answer_info.drop(columns=unused_answer_cols)

In [150]:
# Attach the answer details (word count, up-votes and the other num_* counters)
# of each invitation. Invitations without a matching answer get 0 there.
# Note that fillna(0) runs over the whole frame, so it also zero-fills the
# missing 'answer' label of the test rows and any other NaN in `data`.
rows_before_merge = len(data)
data = data.merge(answer_info, on=['qid', 'uid'], how='left').fillna(0)

# Later cells split train/test purely by row position, so the left join must not
# have duplicated any invitation (it would if answer_info repeated a (qid, uid) pair).
assert len(data) == rows_before_merge, (
    'merge with answer_info changed the row count: {} -> {}'.format(rows_before_merge, len(data))
)

# Cast the merged counters and the label to integer dtype.
int_columns = ['num_word', 'num_agree', 'num_cancel', 'num_commend', 'num_collect',
               'num_thank', 'num_report', 'num_helpless', 'num_disagree']
data[int_columns] = data[int_columns].astype('int64')
data['answer'] = data['answer'].astype('int64')

**用户历史回答统计特征**

In [151]:
# Load the per-user statistics table (tab-separated; the first row is used as the header).
# NOTE(review): presumably produced by a separate feature-extraction step — confirm where
# 'feature/user_stat_raw.txt' comes from.
user_stat_raw = pd.read_csv('feature/user_stat_raw.txt', sep='\t')

In [152]:
# Bring the per-user totals onto every invitation row (left join on uid).
data = pd.merge(data, user_stat_raw, how='left', on='uid')

# Cast the totals to integers; users without statistics count as 0.
u_stat_columns = ['u_total_{}_raw'.format(stat) for stat in (
    'word', 'agree', 'cancel', 'commend', 'collect', 'thank',
    'report', 'helpless', 'disagree', 'invite', 'answer')]
u_stat_filled = data[u_stat_columns].fillna(0)
data[u_stat_columns] = u_stat_filled.astype('int64')

In [160]:
# Guard against label leakage: from every per-user total, subtract the value
# contributed by the current row, and store the result without the '_raw' suffix
# (e.g. u_total_word_raw - num_word -> u_total_word).
for raw_col in u_stat_columns:
    if raw_col != 'u_total_invite_raw':   # the invite total is left untouched
        name_parts = raw_col.split('_')
        adjusted_col = '_'.join(name_parts[:3])   # name of the new, adjusted column
        is_answer_total = raw_col == 'u_total_answer_raw'
        # the answer total is reduced by the row's label, every other total by its num_* column
        own_col = 'answer' if is_answer_total else 'num_' + name_parts[2]

        data[adjusted_col] = data[raw_col] - data[own_col]
        if is_answer_total:
            # Some users with answer == 1 in the invite data have no record in
            # answer_info, which leaves -1 here; map it to 0.
            data[adjusted_col] = data[adjusted_col].replace(-1, 0)


In [479]:
# data['u_total_answer'] = data['u_total_answer'].replace(-1, 0)

**类别特征 encoding**

In [171]:
from sklearn.preprocessing import LabelEncoder
# Columns whose values are replaced by integer codes.
class_feat = ['uid', 'qid', 'gender', 'visit_freq'] + ['u_multi_cat_' + s for s in 'abcde']
encoder = LabelEncoder()
for feat in class_feat:
    # fit_transform learns the label set of this column and encodes it in one call
    data[feat] = encoder.fit_transform(data[feat])

**计数特征**

In [172]:
# Frequency ("count") feature for every id / categorical column.
# NOTE: this cell rewrites the source columns in place (rare-value bucketing and
# a +1 shift), so running it twice shifts the codes again.
count_base_feats = (
    ['uid', 'qid', 'gender', 'visit_freq']
    + ['u_2_cat_' + s for s in 'abcde']
    + ['u_multi_cat_' + s for s in 'abcde']
)
for feat in count_base_feats:
    col_name = '{}_count'.format(feat)

    # Pass 1: how often does each value occur over train + test?
    first_count = data[feat].map(data[feat].value_counts().astype(int))

    # Values seen only once are merged into one bucket: set to -1 here, which
    # becomes 0 after every code is shifted up by one.
    data.loc[first_count < 2, feat] = -1
    data[feat] = data[feat] + 1

    # Pass 2: recount on the bucketed column, then min-max scale the counts.
    freq = data[feat].map(data[feat].value_counts().astype(int))
    lowest, highest = freq.min(), freq.max()
    data[col_name] = (freq - lowest) / (highest - lowest)

In [174]:
# Display the column names of `data` after the feature engineering above.
data.columns

Index(['answer', 'gender', 'i_start_date', 'q_desc_sw', 'q_desc_w',
       'q_head_sw', 'q_head_w', 'q_start_date', 'q_topic', 'qid', 'u_2_cat_a',
       'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_focus_topic',
       'u_like_topic', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c',
       'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid', 'visit_freq',
       'i_start_day', 'i_start_hour', 'q_start_day', 'q_start_hour',
       'num_word', 'num_agree', 'num_cancel', 'num_commend', 'num_collect',
       'num_thank', 'num_report', 'num_helpless', 'num_disagree',
       'u_total_word_raw', 'u_total_agree_raw', 'u_total_cancel_raw',
       'u_total_commend_raw', 'u_total_collect_raw', 'u_total_thank_raw',
       'u_total_report_raw', 'u_total_helpless_raw', 'u_total_disagree_raw',
       'u_total_invite_raw', 'u_total_answer_raw', 'u_total_word',
       'u_total_agree', 'u_total_cancel', 'u_total_commend', 'u_total_collect',
       'u_total_thank', 'u_total_report', 'u_t

**选择训练特征**

In [507]:
# Columns handed to the model. 'answer' (the label) comes first: the split cell
# drops it by name and the importance cell drops position 0.
used_feat = ['answer']

# question_info: only the encoded question id
# (q_desc_sw, q_desc_w, q_head_sw, q_head_w, q_start_date and q_topic are left out).
used_feat += ['qid']

# member_info: id, profile and category columns
# (u_focus_topic and u_like_topic are left out).
used_feat += ['uid', 'gender', 'visit_freq',
              'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e',
              'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c', 'u_multi_cat_d',
              'u_multi_cat_e', 'u_salt_value']

# answer_info (the per-invitation num_* columns) is not used.
# invite_info: the raw i_start_date string is not used (its parsed parts are, below).

# Count features (u_total_invite_raw is left out because it overlaps with uid_count).
used_feat += ['uid_count', 'qid_count', 'gender_count', 'visit_freq_count',
              'u_2_cat_a_count', 'u_2_cat_b_count', 'u_2_cat_c_count',
              'u_2_cat_d_count', 'u_2_cat_e_count',
              'u_multi_cat_a_count', 'u_multi_cat_b_count', 'u_multi_cat_c_count',
              'u_multi_cat_d_count', 'u_multi_cat_e_count']

# Parsed time features.
used_feat += ['i_start_hour', 'i_start_day', 'q_start_day', 'q_start_hour']

# Leak-adjusted per-user history features. Only this subset is used; the
# cancel/commend/thank/report/helpless/disagree totals and all unadjusted
# '*_raw' totals are left out.
used_feat += ['u_total_word', 'u_total_agree', 'u_total_collect', 'u_total_answer']


# Sanity checks on the selection.
data_columns = list(data.columns)
assert len(set(data_columns)) == len(data_columns)   # no duplicated column names in data
print('data:', len(data_columns))

assert len(set(used_feat)) == len(used_feat)         # no feature selected twice
print('used_feat:', len(used_feat))

# Every selected feature must exist, otherwise data[used_feat] fails later on.
missing_feat = [feat for feat in used_feat if feat not in data.columns]
assert not missing_feat, 'used_feat has columns missing from data: {}'.format(missing_feat)

# Number of columns in data that are not selected.
print('t1-t2:', len(set(data_columns) - set(used_feat)))

print('columns for train:', used_feat)

data: 73
used_feat: 38
t1-t2: 35
columns for train: ['answer', 'qid', 'uid', 'gender', 'visit_freq', 'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c', 'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid_count', 'qid_count', 'gender_count', 'visit_freq_count', 'u_2_cat_a_count', 'u_2_cat_b_count', 'u_2_cat_c_count', 'u_2_cat_d_count', 'u_2_cat_e_count', 'u_multi_cat_a_count', 'u_multi_cat_b_count', 'u_multi_cat_c_count', 'u_multi_cat_d_count', 'u_multi_cat_e_count', 'i_start_hour', 'i_start_day', 'q_start_day', 'q_start_hour', 'u_total_word', 'u_total_agree', 'u_total_collect', 'u_total_answer']


**划分数据集**

In [508]:
# 验证集做法要同测试集做法一致
import copy
def process_eval_feat(used_feat):
    """Build the feature list used for the validation rows.

    Every leak-adjusted user-history column ('u_total_<stat>') found in
    `used_feat` is swapped for its unadjusted '<name>_raw' counterpart; all
    other entries are kept as they are, in the same order.

    Args:
        used_feat: list of feature names selected for training.

    Returns:
        A new list of the same length; `used_feat` itself is not modified.
    """
    adjusted_stats = ('word', 'agree', 'cancel', 'commend', 'collect',
                      'thank', 'report', 'helpless', 'disagree', 'answer')
    replace_feat = tuple('u_total_' + stat for stat in adjusted_stats)
    return [feat + '_raw' if feat in replace_feat else feat for feat in used_feat]

In [509]:
# 获取使用特征中的类别特征
def get_cat_feat(used_feat):
    """Return the entries of `used_feat` that are categorical columns.

    Args:
        used_feat: iterable of feature names selected for training.

    Returns:
        A list with the categorical names, in the order they appear in
        `used_feat` (repeated names are kept).
    """
    all_cat_feat = (
        ['uid', 'qid', 'gender', 'visit_freq']
        + ['u_multi_cat_' + s for s in 'abcde']
        + ['u_2_cat_' + s for s in 'abcde']
    )
    return [feat for feat in used_feat if feat in all_cat_feat]

In [510]:
from lightgbm import LGBMClassifier
import lightgbm as lgb

is_eval = True   # if True, hold out the last `days_eval` days of the training period for validation
days_eval = 2
# NOTE(review): 3867 is presumably the last i_start_day among the training invitations — confirm.
last_train_day = 3867
cat_feat = get_cat_feat(used_feat)
n_train = train.shape[0]

# Rows [0, n_train) of `data` are the labelled training invitations; the rest are test rows.
labelled = data[used_feat][:n_train]

if not is_eval:
    y_train = labelled['answer']
    X_train = labelled.drop(['answer'], axis=1)

    train_set_lgb = lgb.Dataset(X_train, y_train, free_raw_data=False, categorical_feature=cat_feat)
    # No hold-out in this mode: the training set is also passed as the validation set.
    eval_set_lgb = train_set_lgb

else:
    # True for rows before the hold-out window, False for the last `days_eval` days.
    train_idx = data['i_start_day'][:n_train] < (last_train_day - days_eval + 1)

    y_train = labelled[train_idx]['answer']
    X_train = labelled[train_idx].drop(['answer'], axis=1)

    # Validation rows use the unadjusted '*_raw' history columns in place of the
    # leak-adjusted ones, in the same column positions.
    eval_feature = process_eval_feat(used_feat)
    print('used_feat:', used_feat)
    print('eval_feature:', eval_feature)
    labelled_eval = data[eval_feature][:n_train]
    y_eval = labelled_eval[~train_idx]['answer']
    X_eval = labelled_eval[~train_idx].drop(['answer'], axis=1)

    train_set_lgb = lgb.Dataset(X_train, y_train, free_raw_data=False, categorical_feature=cat_feat)
    # Use the same categorical column list as the training set; class_feat (the
    # label-encoded columns) lacks the u_2_cat_* columns that cat_feat contains.
    eval_set_lgb = lgb.Dataset(X_eval, y_eval, free_raw_data=False, categorical_feature=cat_feat,
                               reference=train_set_lgb)

# Test rows keep the leak-adjusted column names of used_feat.
X_test = data[used_feat][n_train:].drop(['answer'], axis=1)

used_feat: ['answer', 'qid', 'uid', 'gender', 'visit_freq', 'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c', 'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid_count', 'qid_count', 'gender_count', 'visit_freq_count', 'u_2_cat_a_count', 'u_2_cat_b_count', 'u_2_cat_c_count', 'u_2_cat_d_count', 'u_2_cat_e_count', 'u_multi_cat_a_count', 'u_multi_cat_b_count', 'u_multi_cat_c_count', 'u_multi_cat_d_count', 'u_multi_cat_e_count', 'i_start_hour', 'i_start_day', 'q_start_day', 'q_start_hour', 'u_total_word', 'u_total_agree', 'u_total_collect', 'u_total_answer']
eval_feature: ['answer', 'qid', 'uid', 'gender', 'visit_freq', 'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c', 'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid_count', 'qid_count', 'gender_count', 'visit_freq_count', 'u_2_cat_a_count', 'u_2_cat_b_count', 'u_2_cat_c_count', 'u_2_cat_d_count', 'u_

In [511]:
# Shapes of the training split (labels, then features) ...
for part in (y_train, X_train):
    print(part.shape)

# ... followed by the shapes of the validation split.
for part in (y_eval, X_eval):
    print(part.shape)

(8787227,)
(8787227, 37)
(701935,)
(701935, 37)


**训练**

In [512]:
# LightGBM training parameters (binary classification, gradient-boosted trees).
params = dict(
    # model type and tree shape
    boosting_type='gbdt',
    num_leaves=64,
    learning_rate=0.01,
    # histogram construction
    max_bin=425,
    subsample_for_bin=50000,
    objective='binary',
    # split / leaf constraints
    min_split_gain=0,
    min_child_weight=5,
    min_child_samples=10,
    # row and column sampling
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=1,
    # L1 / L2 regularisation
    reg_alpha=3,
    reg_lambda=5,
    # runtime
    seed=1000,
    n_jobs=-1,
    silent=True,
    # metrics reported on the validation set
    metric=['auc', 'binary_logloss'],
)

In [513]:
# Train up to 2000 boosting rounds and stop once the validation scores have not
# improved for 10 consecutive rounds.
# Early stopping is requested through the callback rather than the
# `early_stopping_rounds=` keyword, which lgb.train no longer accepts in
# LightGBM 4.x; the callback form also works on older releases.
gbm = lgb.train(params,
                train_set_lgb,
                num_boost_round=2000,
                valid_sets=[eval_set_lgb],
                callbacks=[lgb.early_stopping(stopping_rounds=10)]
               )

[1]	valid_0's auc: 0.743831	valid_0's binary_logloss: 0.433854
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.74507	valid_0's binary_logloss: 0.432727
[3]	valid_0's auc: 0.745193	valid_0's binary_logloss: 0.431607
[4]	valid_0's auc: 0.746643	valid_0's binary_logloss: 0.430543
[5]	valid_0's auc: 0.74706	valid_0's binary_logloss: 0.429497
[6]	valid_0's auc: 0.747462	valid_0's binary_logloss: 0.42847
[7]	valid_0's auc: 0.747704	valid_0's binary_logloss: 0.427461
[8]	valid_0's auc: 0.749059	valid_0's binary_logloss: 0.426482
[9]	valid_0's auc: 0.749981	valid_0's binary_logloss: 0.425516
[10]	valid_0's auc: 0.750286	valid_0's binary_logloss: 0.424586
[11]	valid_0's auc: 0.750527	valid_0's binary_logloss: 0.423691
[12]	valid_0's auc: 0.75183	valid_0's binary_logloss: 0.422792
[13]	valid_0's auc: 0.752282	valid_0's binary_logloss: 0.421918
[14]	valid_0's auc: 0.752483	valid_0's binary_logloss: 0.421053
[15]	valid_0's auc: 0.752948	valid_0's binary_logloss: 0

[129]	valid_0's auc: 0.774083	valid_0's binary_logloss: 0.376105
[130]	valid_0's auc: 0.774125	valid_0's binary_logloss: 0.375939
[131]	valid_0's auc: 0.774206	valid_0's binary_logloss: 0.375776
[132]	valid_0's auc: 0.774267	valid_0's binary_logloss: 0.37561
[133]	valid_0's auc: 0.77435	valid_0's binary_logloss: 0.375456
[134]	valid_0's auc: 0.774417	valid_0's binary_logloss: 0.375304
[135]	valid_0's auc: 0.774517	valid_0's binary_logloss: 0.375149
[136]	valid_0's auc: 0.774587	valid_0's binary_logloss: 0.375001
[137]	valid_0's auc: 0.774672	valid_0's binary_logloss: 0.374846
[138]	valid_0's auc: 0.77481	valid_0's binary_logloss: 0.374689
[139]	valid_0's auc: 0.774898	valid_0's binary_logloss: 0.37454
[140]	valid_0's auc: 0.774972	valid_0's binary_logloss: 0.374384
[141]	valid_0's auc: 0.775034	valid_0's binary_logloss: 0.374243
[142]	valid_0's auc: 0.775089	valid_0's binary_logloss: 0.374092
[143]	valid_0's auc: 0.775163	valid_0's binary_logloss: 0.373942
[144]	valid_0's auc: 0.775257

[256]	valid_0's auc: 0.781981	valid_0's binary_logloss: 0.363881
[257]	valid_0's auc: 0.782034	valid_0's binary_logloss: 0.363835
[258]	valid_0's auc: 0.782046	valid_0's binary_logloss: 0.363795
[259]	valid_0's auc: 0.782083	valid_0's binary_logloss: 0.363742
[260]	valid_0's auc: 0.782128	valid_0's binary_logloss: 0.363693
[261]	valid_0's auc: 0.782172	valid_0's binary_logloss: 0.363645
[262]	valid_0's auc: 0.782206	valid_0's binary_logloss: 0.363593
[263]	valid_0's auc: 0.782219	valid_0's binary_logloss: 0.363555
[264]	valid_0's auc: 0.782283	valid_0's binary_logloss: 0.36349
[265]	valid_0's auc: 0.782312	valid_0's binary_logloss: 0.363437
[266]	valid_0's auc: 0.782369	valid_0's binary_logloss: 0.363382
[267]	valid_0's auc: 0.782419	valid_0's binary_logloss: 0.36332
[268]	valid_0's auc: 0.782497	valid_0's binary_logloss: 0.363254
[269]	valid_0's auc: 0.782526	valid_0's binary_logloss: 0.36321
[270]	valid_0's auc: 0.782544	valid_0's binary_logloss: 0.363174
[271]	valid_0's auc: 0.78260

[383]	valid_0's auc: 0.787462	valid_0's binary_logloss: 0.358752
[384]	valid_0's auc: 0.787479	valid_0's binary_logloss: 0.358733
[385]	valid_0's auc: 0.787526	valid_0's binary_logloss: 0.358698
[386]	valid_0's auc: 0.787546	valid_0's binary_logloss: 0.358671
[387]	valid_0's auc: 0.787573	valid_0's binary_logloss: 0.358649
[388]	valid_0's auc: 0.787597	valid_0's binary_logloss: 0.358629
[389]	valid_0's auc: 0.787637	valid_0's binary_logloss: 0.358597
[390]	valid_0's auc: 0.787672	valid_0's binary_logloss: 0.358571
[391]	valid_0's auc: 0.787697	valid_0's binary_logloss: 0.358549
[392]	valid_0's auc: 0.787711	valid_0's binary_logloss: 0.358527
[393]	valid_0's auc: 0.787738	valid_0's binary_logloss: 0.358504
[394]	valid_0's auc: 0.787754	valid_0's binary_logloss: 0.35849
[395]	valid_0's auc: 0.787789	valid_0's binary_logloss: 0.358461
[396]	valid_0's auc: 0.787823	valid_0's binary_logloss: 0.358437
[397]	valid_0's auc: 0.787857	valid_0's binary_logloss: 0.358408
[398]	valid_0's auc: 0.787

[510]	valid_0's auc: 0.790253	valid_0's binary_logloss: 0.356442
[511]	valid_0's auc: 0.790287	valid_0's binary_logloss: 0.356417
[512]	valid_0's auc: 0.790298	valid_0's binary_logloss: 0.356409
[513]	valid_0's auc: 0.790319	valid_0's binary_logloss: 0.356392
[514]	valid_0's auc: 0.790348	valid_0's binary_logloss: 0.356374
[515]	valid_0's auc: 0.790366	valid_0's binary_logloss: 0.356361
[516]	valid_0's auc: 0.790376	valid_0's binary_logloss: 0.35635
[517]	valid_0's auc: 0.79039	valid_0's binary_logloss: 0.356339
[518]	valid_0's auc: 0.79042	valid_0's binary_logloss: 0.35632
[519]	valid_0's auc: 0.790433	valid_0's binary_logloss: 0.356308
[520]	valid_0's auc: 0.790446	valid_0's binary_logloss: 0.356298
[521]	valid_0's auc: 0.790466	valid_0's binary_logloss: 0.356282
[522]	valid_0's auc: 0.790473	valid_0's binary_logloss: 0.356276
[523]	valid_0's auc: 0.790486	valid_0's binary_logloss: 0.356267
[524]	valid_0's auc: 0.790493	valid_0's binary_logloss: 0.356263
[525]	valid_0's auc: 0.790505

[637]	valid_0's auc: 0.79199	valid_0's binary_logloss: 0.355189
[638]	valid_0's auc: 0.792003	valid_0's binary_logloss: 0.355181
[639]	valid_0's auc: 0.792008	valid_0's binary_logloss: 0.355178
[640]	valid_0's auc: 0.792021	valid_0's binary_logloss: 0.35517
[641]	valid_0's auc: 0.792049	valid_0's binary_logloss: 0.355155
[642]	valid_0's auc: 0.792064	valid_0's binary_logloss: 0.355145
[643]	valid_0's auc: 0.792078	valid_0's binary_logloss: 0.355135
[644]	valid_0's auc: 0.792083	valid_0's binary_logloss: 0.355131
[645]	valid_0's auc: 0.792107	valid_0's binary_logloss: 0.355114
[646]	valid_0's auc: 0.792131	valid_0's binary_logloss: 0.355098
[647]	valid_0's auc: 0.792144	valid_0's binary_logloss: 0.355088
[648]	valid_0's auc: 0.792158	valid_0's binary_logloss: 0.355079
[649]	valid_0's auc: 0.792169	valid_0's binary_logloss: 0.355071
[650]	valid_0's auc: 0.792175	valid_0's binary_logloss: 0.355066
[651]	valid_0's auc: 0.792191	valid_0's binary_logloss: 0.355055
[652]	valid_0's auc: 0.7921

[764]	valid_0's auc: 0.793392	valid_0's binary_logloss: 0.354276
[765]	valid_0's auc: 0.793395	valid_0's binary_logloss: 0.354273
[766]	valid_0's auc: 0.793402	valid_0's binary_logloss: 0.354268
[767]	valid_0's auc: 0.793407	valid_0's binary_logloss: 0.354266
[768]	valid_0's auc: 0.79342	valid_0's binary_logloss: 0.354256
[769]	valid_0's auc: 0.793429	valid_0's binary_logloss: 0.35425
[770]	valid_0's auc: 0.793435	valid_0's binary_logloss: 0.354244
[771]	valid_0's auc: 0.793445	valid_0's binary_logloss: 0.354236
[772]	valid_0's auc: 0.793454	valid_0's binary_logloss: 0.354229
[773]	valid_0's auc: 0.793464	valid_0's binary_logloss: 0.354221
[774]	valid_0's auc: 0.793476	valid_0's binary_logloss: 0.354218
[775]	valid_0's auc: 0.793482	valid_0's binary_logloss: 0.354214
[776]	valid_0's auc: 0.79349	valid_0's binary_logloss: 0.354207
[777]	valid_0's auc: 0.793496	valid_0's binary_logloss: 0.354202
[778]	valid_0's auc: 0.7935	valid_0's binary_logloss: 0.354199
[779]	valid_0's auc: 0.79351	v

[891]	valid_0's auc: 0.794275	valid_0's binary_logloss: 0.353698
[892]	valid_0's auc: 0.794285	valid_0's binary_logloss: 0.353691
[893]	valid_0's auc: 0.79429	valid_0's binary_logloss: 0.353688
[894]	valid_0's auc: 0.794276	valid_0's binary_logloss: 0.353697
[895]	valid_0's auc: 0.794283	valid_0's binary_logloss: 0.353693
[896]	valid_0's auc: 0.794285	valid_0's binary_logloss: 0.353692
[897]	valid_0's auc: 0.794291	valid_0's binary_logloss: 0.353689
[898]	valid_0's auc: 0.794296	valid_0's binary_logloss: 0.353685
[899]	valid_0's auc: 0.794298	valid_0's binary_logloss: 0.353684
[900]	valid_0's auc: 0.794313	valid_0's binary_logloss: 0.353674
[901]	valid_0's auc: 0.794317	valid_0's binary_logloss: 0.353671
[902]	valid_0's auc: 0.794342	valid_0's binary_logloss: 0.353664
[903]	valid_0's auc: 0.794346	valid_0's binary_logloss: 0.353662
[904]	valid_0's auc: 0.79435	valid_0's binary_logloss: 0.353659
[905]	valid_0's auc: 0.794375	valid_0's binary_logloss: 0.353642
[906]	valid_0's auc: 0.7943

[1018]	valid_0's auc: 0.795081	valid_0's binary_logloss: 0.35321
[1019]	valid_0's auc: 0.795088	valid_0's binary_logloss: 0.353206
[1020]	valid_0's auc: 0.795094	valid_0's binary_logloss: 0.353202
[1021]	valid_0's auc: 0.795097	valid_0's binary_logloss: 0.3532
[1022]	valid_0's auc: 0.795106	valid_0's binary_logloss: 0.353193
[1023]	valid_0's auc: 0.795101	valid_0's binary_logloss: 0.353196
[1024]	valid_0's auc: 0.795103	valid_0's binary_logloss: 0.353195
[1025]	valid_0's auc: 0.795105	valid_0's binary_logloss: 0.353194
[1026]	valid_0's auc: 0.795109	valid_0's binary_logloss: 0.353191
[1027]	valid_0's auc: 0.795114	valid_0's binary_logloss: 0.353188
[1028]	valid_0's auc: 0.79512	valid_0's binary_logloss: 0.353185
[1029]	valid_0's auc: 0.795123	valid_0's binary_logloss: 0.353184
[1030]	valid_0's auc: 0.795123	valid_0's binary_logloss: 0.353184
[1031]	valid_0's auc: 0.795119	valid_0's binary_logloss: 0.353187
[1032]	valid_0's auc: 0.795123	valid_0's binary_logloss: 0.353185
[1033]	valid_0

[1143]	valid_0's auc: 0.795661	valid_0's binary_logloss: 0.352859
[1144]	valid_0's auc: 0.795664	valid_0's binary_logloss: 0.352857
[1145]	valid_0's auc: 0.795667	valid_0's binary_logloss: 0.352855
[1146]	valid_0's auc: 0.795669	valid_0's binary_logloss: 0.352854
[1147]	valid_0's auc: 0.795669	valid_0's binary_logloss: 0.352854
[1148]	valid_0's auc: 0.795672	valid_0's binary_logloss: 0.352853
[1149]	valid_0's auc: 0.79568	valid_0's binary_logloss: 0.352847
[1150]	valid_0's auc: 0.795684	valid_0's binary_logloss: 0.352845
[1151]	valid_0's auc: 0.795691	valid_0's binary_logloss: 0.352839
[1152]	valid_0's auc: 0.795698	valid_0's binary_logloss: 0.352835
[1153]	valid_0's auc: 0.795698	valid_0's binary_logloss: 0.352835
[1154]	valid_0's auc: 0.795702	valid_0's binary_logloss: 0.352832
[1155]	valid_0's auc: 0.795702	valid_0's binary_logloss: 0.352832
[1156]	valid_0's auc: 0.795704	valid_0's binary_logloss: 0.352831
[1157]	valid_0's auc: 0.795704	valid_0's binary_logloss: 0.352832
[1158]	vali

[1268]	valid_0's auc: 0.796031	valid_0's binary_logloss: 0.352625
[1269]	valid_0's auc: 0.796032	valid_0's binary_logloss: 0.352624
[1270]	valid_0's auc: 0.796036	valid_0's binary_logloss: 0.352621
[1271]	valid_0's auc: 0.796036	valid_0's binary_logloss: 0.352621
[1272]	valid_0's auc: 0.796045	valid_0's binary_logloss: 0.352616
[1273]	valid_0's auc: 0.796057	valid_0's binary_logloss: 0.352609
[1274]	valid_0's auc: 0.796058	valid_0's binary_logloss: 0.352608
[1275]	valid_0's auc: 0.796058	valid_0's binary_logloss: 0.352608
[1276]	valid_0's auc: 0.796059	valid_0's binary_logloss: 0.352608
[1277]	valid_0's auc: 0.79606	valid_0's binary_logloss: 0.352607
[1278]	valid_0's auc: 0.796063	valid_0's binary_logloss: 0.352605
[1279]	valid_0's auc: 0.796066	valid_0's binary_logloss: 0.352602
[1280]	valid_0's auc: 0.796066	valid_0's binary_logloss: 0.352602
[1281]	valid_0's auc: 0.796073	valid_0's binary_logloss: 0.352596
[1282]	valid_0's auc: 0.796074	valid_0's binary_logloss: 0.352595
[1283]	vali

[1393]	valid_0's auc: 0.796384	valid_0's binary_logloss: 0.352402
[1394]	valid_0's auc: 0.796386	valid_0's binary_logloss: 0.352401
[1395]	valid_0's auc: 0.796387	valid_0's binary_logloss: 0.352401
[1396]	valid_0's auc: 0.796389	valid_0's binary_logloss: 0.3524
[1397]	valid_0's auc: 0.79639	valid_0's binary_logloss: 0.352399
[1398]	valid_0's auc: 0.796391	valid_0's binary_logloss: 0.352398
[1399]	valid_0's auc: 0.796394	valid_0's binary_logloss: 0.352396
[1400]	valid_0's auc: 0.796398	valid_0's binary_logloss: 0.352394
[1401]	valid_0's auc: 0.796398	valid_0's binary_logloss: 0.352394
[1402]	valid_0's auc: 0.796401	valid_0's binary_logloss: 0.352391
[1403]	valid_0's auc: 0.796403	valid_0's binary_logloss: 0.35239
[1404]	valid_0's auc: 0.796403	valid_0's binary_logloss: 0.35239
[1405]	valid_0's auc: 0.796405	valid_0's binary_logloss: 0.352388
[1406]	valid_0's auc: 0.796407	valid_0's binary_logloss: 0.352387
[1407]	valid_0's auc: 0.796409	valid_0's binary_logloss: 0.352385
[1408]	valid_0'

[1518]	valid_0's auc: 0.79663	valid_0's binary_logloss: 0.35225
[1519]	valid_0's auc: 0.796634	valid_0's binary_logloss: 0.352247
[1520]	valid_0's auc: 0.796638	valid_0's binary_logloss: 0.352243
[1521]	valid_0's auc: 0.796648	valid_0's binary_logloss: 0.352237
[1522]	valid_0's auc: 0.796651	valid_0's binary_logloss: 0.352234
[1523]	valid_0's auc: 0.796653	valid_0's binary_logloss: 0.352233
[1524]	valid_0's auc: 0.796654	valid_0's binary_logloss: 0.352232
[1525]	valid_0's auc: 0.796655	valid_0's binary_logloss: 0.352231
[1526]	valid_0's auc: 0.796659	valid_0's binary_logloss: 0.35223
[1527]	valid_0's auc: 0.796659	valid_0's binary_logloss: 0.35223
[1528]	valid_0's auc: 0.796661	valid_0's binary_logloss: 0.352228
[1529]	valid_0's auc: 0.796661	valid_0's binary_logloss: 0.352228
[1530]	valid_0's auc: 0.796662	valid_0's binary_logloss: 0.352228
[1531]	valid_0's auc: 0.796663	valid_0's binary_logloss: 0.352228
[1532]	valid_0's auc: 0.796664	valid_0's binary_logloss: 0.352227
[1533]	valid_0

[1643]	valid_0's auc: 0.796832	valid_0's binary_logloss: 0.352121
[1644]	valid_0's auc: 0.796832	valid_0's binary_logloss: 0.352121
[1645]	valid_0's auc: 0.796834	valid_0's binary_logloss: 0.35212
[1646]	valid_0's auc: 0.796834	valid_0's binary_logloss: 0.35212
[1647]	valid_0's auc: 0.796834	valid_0's binary_logloss: 0.35212
[1648]	valid_0's auc: 0.796839	valid_0's binary_logloss: 0.352117
[1649]	valid_0's auc: 0.796839	valid_0's binary_logloss: 0.352117
[1650]	valid_0's auc: 0.796839	valid_0's binary_logloss: 0.352117
[1651]	valid_0's auc: 0.796841	valid_0's binary_logloss: 0.352116
Early stopping, best iteration is:
[1641]	valid_0's auc: 0.79684	valid_0's binary_logloss: 0.352115


**保存模型**

In [517]:
import pickle
# Persist the trained booster. The `with` block closes the file handle once the
# dump finishes (or fails), instead of leaving it open until garbage collection.
with open('model/gbm.pkl', 'wb') as model_file:
    pickle.dump(gbm, model_file)

**保存结果**

In [514]:
# Score the test rows and write the submission file (tab-separated, no header, no index).
y_pred = gbm.predict(X_test)
print(y_pred.shape)
# assign() returns a new frame with the extra column, so nothing is written into
# a slice of `data` (avoids pandas' SettingWithCopyWarning).
result_append = result_append.assign(answer=y_pred)
result_append.to_csv('result.txt', index=False, header=False, sep='\t')

(1141683,)


**特征重要性**

In [515]:
# Feature names in training-column order: used_feat without its first entry
# (the label 'answer').
col = list(used_feat[1:])
# Importance score of each feature as reported by the trained booster.
importance = gbm.feature_importance()

In [516]:
# One row per feature: name, importance, and its share of the total importance
# formatted as a percentage string with two decimals.
importance_df = pd.DataFrame({'col': col, 'imp': importance})
imp_share = importance_df['imp'] / importance_df['imp'].sum()
importance_df['imp_rate'] = imp_share.map('{:.2%}'.format)
importance_df

Unnamed: 0,col,imp,imp_rate
0,qid,10687,10.34%
1,uid,467,0.45%
2,gender,803,0.78%
3,visit_freq,328,0.32%
4,u_2_cat_a,49,0.05%
5,u_2_cat_b,21,0.02%
6,u_2_cat_c,112,0.11%
7,u_2_cat_d,195,0.19%
8,u_2_cat_e,4,0.00%
9,u_multi_cat_a,14160,13.70%
