In [7]:
import pandas as pd

# 用户信息
user_info = pd.read_csv('data/member_info_0926.txt', header=None, sep='\t')
user_info.columns = ['uid','gender','visit_freq','u_2_cat_a','u_2_cat_b','u_2_cat_c','u_2_cat_d',
                     'u_2_cat_e','u_multi_cat_a','u_multi_cat_b','u_multi_cat_c','u_multi_cat_d',
                     'u_multi_cat_e','u_salt_value','u_focus_topic','u_like_topic']

# 问题信息
question_info = pd.read_csv('data/question_info_0926.txt', header=None, sep='\t')
question_info.columns = ['qid','q_start_date','q_head_sw','q_head_w','q_desc_sw','q_desc_w',
                         'q_topic']

# 训练数据
train = pd.read_csv('data/invite_info_0926.txt', header=None, sep='\t')
train.columns = ['qid', 'uid', 'i_start_date','answer']
train = pd.merge(train, user_info, how='left', on='uid')
train = pd.merge(train, question_info, how='left', on='qid')

# 测试数据
test = pd.read_csv('data/invite_info_evaluate_0926.txt', header=None, sep='\t')
test.columns = ['qid', 'uid', 'i_start_date']
test = pd.merge(test, user_info, how='left', on='uid')
test = pd.merge(test, question_info, how='left', on='qid')

In [90]:
# 数据合并
data = pd.concat([train, test], axis=0, sort=True)

In [91]:
# 用于保存提交结果
result_append = data[['qid', 'uid', 'i_start_date']][train.shape[0]:]

In [92]:
# 时间解析
data['i_start_day'] = data['i_start_date'].apply(lambda x: int(x.split('-')[0].split('D')[1]))
data['i_start_hour'] = data['i_start_date'].apply(lambda x: int(x.split('-')[1].split('H')[1]))

data['q_start_day'] = data['q_start_date'].apply(lambda x: int(x.split('-')[0].split('D')[1]))
data['q_start_hour'] = data['q_start_date'].apply(lambda x: int(x.split('-')[1].split('H')[1]))

In [93]:
# 答案
answer_info = pd.read_csv('data/answer_info_0926.txt', header=None, sep='\t')
answer_info.columns = ['aid', 'qid', 'uid', 'a_start_date', 'a_sw', 'a_w', 'good', 'recommend', 
                       'round_table', 'picture', 'video', 'num_word', 'num_agree', 'num_cancel',
                       'num_commend', 'num_collect', 'num_thank', 'num_report', 'num_helpless', 'num_disagree']

answer_info = answer_info.drop(['aid', 'a_start_date', 'a_sw', 'a_w', 'good', 'recommend', 'round_table', 
                                'picture', 'video'], axis=1)

In [97]:
int_columns

['num_word',
 'num_agree',
 'num_cancel',
 'num_commend',
 'num_collect',
 'num_thank',
 'num_report',
 'num_helpless',
 'num_disagree']

In [100]:
data.columns

Index(['answer', 'gender', 'i_start_date', 'q_desc_sw', 'q_desc_w',
       'q_head_sw', 'q_head_w', 'q_start_date', 'q_topic', 'qid', 'u_2_cat_a',
       'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_focus_topic',
       'u_like_topic', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c',
       'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid', 'visit_freq',
       'i_start_day', 'i_start_hour', 'q_start_day', 'q_start_hour'],
      dtype='object')

In [101]:
# 该条邀请的回答详情，包括各种点赞数等所有数量（若无回答，补0）
data = data.merge(answer_info, on=['qid', 'uid'], how='left').fillna(0)

# 数据类型转换
int_columns = ['num_word', 'num_agree', 'num_cancel','num_commend', 'num_collect', 
               'num_thank', 'num_report', 'num_helpless', 'num_disagree']
data[int_columns] = data[int_columns].astype('int64')

In [102]:
# 用户历史回答统计特征
user_stat_raw = pd.read_csv('feature/user_stat_raw.txt', sep='\t')

In [103]:
data = data.merge(user_stat_raw, on='uid', how='left')

# 数据类型转换
u_stat_columns = ['u_total_word_raw', 'u_total_agree_raw', 'u_total_cancel_raw','u_total_commend_raw', 
                'u_total_collect_raw', 'u_total_thank_raw','u_total_report_raw', 'u_total_helpless_raw', 
                'u_total_disagree_raw', 'u_total_invite_raw']
data[u_stat_columns] = data[u_stat_columns].fillna(0).astype('int64')

In [104]:
# 防标签泄露
for c in u_stat_columns[:-1]:
    temp = c.split('_')
    c_1 = 'num_' + temp[2]    # 当前 num
    c_2 = '_'.join([temp[0], temp[1], temp[2]])   # 处理后的新列名
    
    # total raw 减去当前的 num
    data[c_2] = data[c] - data[c_1]


In [105]:
data.columns

Index(['answer', 'gender', 'i_start_date', 'q_desc_sw', 'q_desc_w',
       'q_head_sw', 'q_head_w', 'q_start_date', 'q_topic', 'qid', 'u_2_cat_a',
       'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d', 'u_2_cat_e', 'u_focus_topic',
       'u_like_topic', 'u_multi_cat_a', 'u_multi_cat_b', 'u_multi_cat_c',
       'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value', 'uid', 'visit_freq',
       'i_start_day', 'i_start_hour', 'q_start_day', 'q_start_hour',
       'num_word', 'num_agree', 'num_cancel', 'num_commend', 'num_collect',
       'num_thank', 'num_report', 'num_helpless', 'num_disagree',
       'u_total_word_raw', 'u_total_agree_raw', 'u_total_cancel_raw',
       'u_total_commend_raw', 'u_total_collect_raw', 'u_total_thank_raw',
       'u_total_report_raw', 'u_total_helpless_raw', 'u_total_disagree_raw',
       'u_total_invite_raw', 'u_total_word', 'u_total_agree', 'u_total_cancel',
       'u_total_commend', 'u_total_collect', 'u_total_thank', 'u_total_report',
       'u_total_helpless', 'u_tot

In [106]:
data['answer'] = data['answer'].astype('int64')

In [107]:
# 类别特征 encoding
from sklearn.preprocessing import LabelEncoder
class_feat = ['uid','qid','gender', 'visit_freq','u_multi_cat_a','u_multi_cat_b',
              'u_multi_cat_c','u_multi_cat_d','u_multi_cat_e']
encoder = LabelEncoder()
for feat in class_feat:
    encoder.fit(data[feat])
    data[feat] = encoder.transform(data[feat])

In [108]:
# 计数特征
for feat in ['uid','qid','gender', 'visit_freq','u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d',
       'u_2_cat_e','u_multi_cat_a','u_multi_cat_b','u_multi_cat_c','u_multi_cat_d','u_multi_cat_e']:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [110]:
# 去掉特征
drop_feat = ['q_head_sw','q_head_w','q_desc_sw','q_desc_w','q_topic', 
             'u_focus_topic','u_like_topic','q_start_date','i_start_date', 
#              '性别_count', 
#              '用户二分类特征a_count', '用户二分类特征b_count', '用户二分类特征c_count', 
#              '用户二分类特征d_count','用户二分类特征e_count'
            ]

# 去掉 answer_info
drop_feat += ['num_word', 'num_agree', 'num_cancel','num_commend', 'num_collect', 
              'num_thank', 'num_report', 'num_helpless', 'num_disagree']

# 去掉历史回答的 raw 统计特征
# drop_feat += ['u_total_word_raw', 'u_total_agree_raw', 'u_total_cancel_raw','u_total_commend_raw', 
#               'u_total_collect_raw', 'u_total_thank_raw','u_total_report_raw', 'u_total_helpless_raw', 
#               'u_total_disagree_raw']

# 去掉历史回答处理后的统计特征
drop_feat += ['u_total_word', 'u_total_agree', 'u_total_cancel','u_total_commend', 
              'u_total_collect', 'u_total_thank','u_total_report', 'u_total_helpless', 
              'u_total_disagree']

used_data = data.drop(drop_feat, axis=1)
print('used columns:', used_data.columns)

used columns: Index(['answer', 'gender', 'qid', 'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c',
       'u_2_cat_d', 'u_2_cat_e', 'u_multi_cat_a', 'u_multi_cat_b',
       'u_multi_cat_c', 'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value',
       'uid', 'visit_freq', 'i_start_day', 'i_start_hour', 'q_start_day',
       'q_start_hour', 'u_total_word_raw', 'u_total_agree_raw',
       'u_total_cancel_raw', 'u_total_commend_raw', 'u_total_collect_raw',
       'u_total_thank_raw', 'u_total_report_raw', 'u_total_helpless_raw',
       'u_total_disagree_raw', 'u_total_invite_raw', 'uid_count', 'qid_count',
       'gender_count', 'visit_freq_count', 'u_2_cat_a_count',
       'u_2_cat_b_count', 'u_2_cat_c_count', 'u_2_cat_d_count',
       'u_2_cat_e_count', 'u_multi_cat_a_count', 'u_multi_cat_b_count',
       'u_multi_cat_c_count', 'u_multi_cat_d_count', 'u_multi_cat_e_count'],
      dtype='object')


In [113]:
# 划分数据集
from lightgbm import LGBMClassifier
import lightgbm as lgb

is_eval = True   # 若为 true，取训练集中的后面的 days_eval 天作为验证集
days_eval = 1

if not is_eval:
    y_train = used_data[:train.shape[0]]['answer']
    X_train = used_data[:train.shape[0]].drop(['answer'], axis=1)
    
    train_set_lgb = lgb.Dataset(X_train, y_train, free_raw_data=False, categorical_feature=class_feat)
    eval_set_lgb = None
#     eval_set = [(X_train, y_train)] 
else:
    train_idx = used_data[:train.shape[0]]['i_start_day'] < (3867 - days_eval + 1)
    y_train = used_data[:train.shape[0]][train_idx]['answer']
    X_train = used_data[:train.shape[0]][train_idx].drop(['answer'], axis=1)
    
    y_eval = used_data[:train.shape[0]][~train_idx]['answer']
    X_eval = used_data[:train.shape[0]][~train_idx].drop(['answer'], axis=1)
    
    train_set_lgb = lgb.Dataset(X_train, y_train, free_raw_data=False, categorical_feature=class_feat)
    eval_set_lgb = lgb.Dataset(X_eval, y_eval, free_raw_data=False, categorical_feature=class_feat, reference=train_set_lgb)
#     eval_set = [(X_eval, y_eval)]
X_test = used_data[train.shape[0]:].drop(['answer'], axis=1)

In [114]:
print(y_train.shape)
print(X_train.shape)

print(y_eval.shape)
print(X_eval.shape)

(9141216,)
(9141216, 43)
(347946,)
(347946, 43)


In [115]:
# model_lgb = LGBMClassifier(boosting_type='gbdt', num_leaves=64, learning_rate=0.01, n_estimators=200,
#                            max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
#                            min_child_weight=5, min_child_samples=10, subsample=0.8, subsample_freq=1,
#                            colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, n_jobs=-1, silent=True)
params = {
    'boosting_type': 'gbdt', 
    'num_leaves': 64, 
    'learning_rate': 0.01, 
    'n_estimators': 200,
    'max_bin': 425, 
    'subsample_for_bin': 50000, 
    'objective': 'binary', 
    'min_split_gain': 0,
    'min_child_weight': 5,
    'min_child_samples': 10, 
    'subsample': 0.8, 
    'subsample_freq': 1,
    'colsample_bytree': 1, 
    'reg_alpha': 3, 
    'reg_lambda': 5, 
    'seed': 1000, 
    'n_jobs': -1, 
    'silent': True,
    'metric': ['auc', 'binary_logloss'],
}

In [None]:
# model_lgb.fit(X_train, y_train, 
#                   eval_names=['train'],
#                   eval_metric=['logloss','auc'],
#                   eval_set=eval_set,
#                   early_stopping_rounds=10)
gbm = lgb.train(params, train_set_lgb, valid_sets=eval_set_lgb, early_stopping_rounds=10)

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[1]	valid_0's auc: 0.725229	valid_0's binary_logloss: 0.432742
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.725207	valid_0's binary_logloss: 0.431542
[3]	valid_0's auc: 0.725191	valid_0's binary_logloss: 0.430387
[4]	valid_0's auc: 0.727722	valid_0's binary_logloss: 0.429318
[5]	valid_0's auc: 0.727146	valid_0's binary_logloss: 0.428212
[6]	valid_0's auc: 0.727667	valid_0's binary_logloss: 0.427154
[7]	valid_0's auc: 0.728532	valid_0's binary_logloss: 0.426131
[8]	valid_0's auc: 0.730707	valid_0's binary_logloss: 0.425133
[9]	valid_0's auc: 0.730903	valid_0's binary_logloss: 0.424179
[10]	valid_0's auc: 0.730499	valid_0's binary_logloss: 0.423254
[11]	valid_0's auc: 0.73068	valid_0's binary_logloss: 0.422328
[12]	valid_0's auc: 0.732197	valid_0's binary_logloss: 0.421427
[13]	valid_0's auc: 0.732398	valid_0's binary_logloss: 0.420531
[14]	valid_0's auc: 0.733354	valid_0's binary_logloss: 0.419639
[15]	valid_0's auc: 0.734128	valid_0's binary_logloss

In [30]:
y_pred = gbm.predict(X_test)
print(y_pred.shape)
result_append['answer'] = y_pred
result_append.to_csv('result.txt', index=False, header=False, sep='\t')

(1141683,)


In [31]:
col = list(used_data.columns)
del col[1]
importance = gbm.feature_importance()
importance
# print(col)
# print(importance)

array([ 282,  666,    0,    0,    0,    9,    0,   32,   29,    0,   20,
          0, 2331,  132,   27,  735, 1096, 1717,  234, 1618, 2225,    0,
       1443,    0,    0,    0,    0,    0,    4,    0,    0,    0,    0],
      dtype=int32)

In [35]:
importance_df = pd.DataFrame({'col':col, 'imp': importance})
importance_df['imp_rate'] = (importance_df['imp'] / importance_df['imp'].sum()).apply(lambda x: format(x, '.2%'))
importance_df

Unnamed: 0,col,imp,imp_rate
0,answer,282,2.24%
1,qid,666,5.29%
2,u_2_cat_a,0,0.00%
3,u_2_cat_b,0,0.00%
4,u_2_cat_c,0,0.00%
5,u_2_cat_d,9,0.07%
6,u_2_cat_e,0,0.00%
7,u_multi_cat_a,32,0.25%
8,u_multi_cat_b,29,0.23%
9,u_multi_cat_c,0,0.00%
