In [7]:
import pandas as pd

# 用户信息
user_info = pd.read_csv('data/member_info_0926.txt', header=None, sep='\t')
user_info.columns = ['uid','gender','visit_freq','u_2_cat_a','u_2_cat_b','u_2_cat_c','u_2_cat_d',
                     'u_2_cat_e','u_multi_cat_a','u_multi_cat_b','u_multi_cat_c','u_multi_cat_d',
                     'u_multi_cat_e','u_salt_value','u_focus_topic','u_like_topic']

# 问题信息
question_info = pd.read_csv('data/question_info_0926.txt', header=None, sep='\t')
question_info.columns = ['qid','q_start_date','q_head_sw','q_head_w','q_desc_sw','q_desc_w',
                         'q_topic']

# 训练数据
train = pd.read_csv('data/invite_info_0926.txt', header=None, sep='\t')
train.columns = ['qid', 'uid', 'i_start_date','answer']
train = pd.merge(train, user_info, how='left', on='uid')
train = pd.merge(train, question_info, how='left', on='qid')

# 测试数据
test = pd.read_csv('data/invite_info_evaluate_0926.txt', header=None, sep='\t')
test.columns = ['qid', 'uid', 'i_start_date']
test = pd.merge(test, user_info, how='left', on='uid')
test = pd.merge(test, question_info, how='left', on='qid')

In [8]:
# 数据合并
data = pd.concat([train, test], axis=0, sort=True)

In [9]:
# 用于保存提交结果
result_append = data[['qid', 'uid', 'i_start_date']][train.shape[0]:]

In [10]:
# 时间解析
data['i_start_day'] = data['i_start_date'].apply(lambda x: int(x.split('-')[0].split('D')[1]))
data['i_start_hour'] = data['i_start_date'].apply(lambda x: int(x.split('-')[1].split('H')[1]))

data['q_start_day'] = data['q_start_date'].apply(lambda x: int(x.split('-')[0].split('D')[1]))
data['q_start_hour'] = data['q_start_date'].apply(lambda x: int(x.split('-')[1].split('H')[1]))

In [11]:
# 类别特征 encoding
from sklearn.preprocessing import LabelEncoder
class_feat = ['uid','qid','gender', 'visit_freq','u_multi_cat_a','u_multi_cat_b',
              'u_multi_cat_c','u_multi_cat_d','u_multi_cat_e']
encoder = LabelEncoder()
for feat in class_feat:
    encoder.fit(data[feat])
    data[feat] = encoder.transform(data[feat])

In [12]:
# 计数特征
for feat in ['uid','qid','gender', 'visit_freq','u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c', 'u_2_cat_d',
       'u_2_cat_e','u_multi_cat_a','u_multi_cat_b','u_multi_cat_c','u_multi_cat_d','u_multi_cat_e']:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [13]:
# 去掉特征
drop_feat = ['q_head_sw','q_head_w','q_desc_sw','q_desc_w','q_topic', 
             'u_focus_topic','u_like_topic','q_start_date','i_start_date', 
#              '性别_count', 
#              '用户二分类特征a_count', '用户二分类特征b_count', '用户二分类特征c_count', 
#              '用户二分类特征d_count','用户二分类特征e_count'
            ]
data = data.drop(drop_feat, axis=1)
print('used columns:', data.columns)

used columns: Index(['answer', 'gender', 'qid', 'u_2_cat_a', 'u_2_cat_b', 'u_2_cat_c',
       'u_2_cat_d', 'u_2_cat_e', 'u_multi_cat_a', 'u_multi_cat_b',
       'u_multi_cat_c', 'u_multi_cat_d', 'u_multi_cat_e', 'u_salt_value',
       'uid', 'visit_freq', 'i_start_day', 'i_start_hour', 'q_start_day',
       'q_start_hour', 'uid_count', 'qid_count', 'gender_count',
       'visit_freq_count', 'u_2_cat_a_count', 'u_2_cat_b_count',
       'u_2_cat_c_count', 'u_2_cat_d_count', 'u_2_cat_e_count',
       'u_multi_cat_a_count', 'u_multi_cat_b_count', 'u_multi_cat_c_count',
       'u_multi_cat_d_count', 'u_multi_cat_e_count'],
      dtype='object')


In [14]:
# 划分数据集
from lightgbm import LGBMClassifier
import lightgbm as lgb

is_eval = True   # 若为 true，取训练集中的后面的 days_eval 天作为验证集
days_eval = 1
# 划分训练集和测试集
if not is_eval:
    y_train = data[:train.shape[0]]['answer']
    X_train = data[:train.shape[0]].drop(['answer'], axis=1)
    
    train_set_lgb = lgb.Dataset(X_train, y_train, free_raw_data=False, categorical_feature=class_feat)
    eval_set_lgb = None
#     eval_set = [(X_train, y_train)] 
else:
    train_idx = data[:train.shape[0]]['i_start_day'] < (3867 - days_eval + 1)
    y_train = data[:train.shape[0]][train_idx]['answer']
    X_train = data[:train.shape[0]][train_idx].drop(['answer'], axis=1)
    
    y_eval = data[:train.shape[0]][~train_idx]['answer']
    X_eval = data[:train.shape[0]][~train_idx].drop(['answer'], axis=1)
    
    train_set_lgb = lgb.Dataset(X_train, y_train, free_raw_data=False, categorical_feature=class_feat)
    eval_set_lgb = lgb.Dataset(X_eval, y_eval, free_raw_data=False, categorical_feature=class_feat, reference=train_set_lgb)
#     eval_set = [(X_eval, y_eval)]
X_test = data[train.shape[0]:].drop(['answer'], axis=1)

In [15]:
print(y_train.shape)
print(X_train.shape)

print(y_eval.shape)
print(X_eval.shape)

(9141216,)
(9141216, 33)
(347946,)
(347946, 33)


In [23]:
# model_lgb = LGBMClassifier(boosting_type='gbdt', num_leaves=64, learning_rate=0.01, n_estimators=200,
#                            max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
#                            min_child_weight=5, min_child_samples=10, subsample=0.8, subsample_freq=1,
#                            colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, n_jobs=-1, silent=True)
params = {
    'boosting_type': 'gbdt', 
    'num_leaves': 64, 
    'learning_rate': 0.01, 
    'n_estimators': 200,
    'max_bin': 425, 
    'subsample_for_bin': 50000, 
    'objective': 'binary', 
    'min_split_gain': 0,
    'min_child_weight': 5,
    'min_child_samples': 10, 
    'subsample': 0.8, 
    'subsample_freq': 1,
    'colsample_bytree': 1, 
    'reg_alpha': 3, 
    'reg_lambda': 5, 
    'seed': 1000, 
    'n_jobs': -1, 
    'silent': True,
    'metric': ['auc', 'binary_logloss'],
}

In [24]:
# model_lgb.fit(X_train, y_train, 
#                   eval_names=['train'],
#                   eval_metric=['logloss','auc'],
#                   eval_set=eval_set,
#                   early_stopping_rounds=10)
gbm = lgb.train(params, train_set_lgb, valid_sets=eval_set_lgb, early_stopping_rounds=10)

[1]	valid_0's auc: 0.744073	valid_0's binary_logloss: 0.432854
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.751933	valid_0's binary_logloss: 0.431736
[3]	valid_0's auc: 0.751337	valid_0's binary_logloss: 0.430647
[4]	valid_0's auc: 0.75383	valid_0's binary_logloss: 0.429585
[5]	valid_0's auc: 0.753539	valid_0's binary_logloss: 0.42855
[6]	valid_0's auc: 0.753627	valid_0's binary_logloss: 0.427526
[7]	valid_0's auc: 0.753735	valid_0's binary_logloss: 0.426519
[8]	valid_0's auc: 0.753887	valid_0's binary_logloss: 0.425565
[9]	valid_0's auc: 0.754309	valid_0's binary_logloss: 0.424607
[10]	valid_0's auc: 0.754622	valid_0's binary_logloss: 0.423693
[11]	valid_0's auc: 0.754876	valid_0's binary_logloss: 0.422767
[12]	valid_0's auc: 0.754656	valid_0's binary_logloss: 0.421871
[13]	valid_0's auc: 0.755171	valid_0's binary_logloss: 0.421013
[14]	valid_0's auc: 0.755104	valid_0's binary_logloss: 0.420181
[15]	valid_0's auc: 0.755243	valid_0's binary_logloss:

[128]	valid_0's auc: 0.769368	valid_0's binary_logloss: 0.376993
[129]	valid_0's auc: 0.769448	valid_0's binary_logloss: 0.376847
[130]	valid_0's auc: 0.769526	valid_0's binary_logloss: 0.376695
[131]	valid_0's auc: 0.769596	valid_0's binary_logloss: 0.376547
[132]	valid_0's auc: 0.769627	valid_0's binary_logloss: 0.376403
[133]	valid_0's auc: 0.769716	valid_0's binary_logloss: 0.376246
[134]	valid_0's auc: 0.769745	valid_0's binary_logloss: 0.376106
[135]	valid_0's auc: 0.769827	valid_0's binary_logloss: 0.37596
[136]	valid_0's auc: 0.769928	valid_0's binary_logloss: 0.375802
[137]	valid_0's auc: 0.769991	valid_0's binary_logloss: 0.375677
[138]	valid_0's auc: 0.770082	valid_0's binary_logloss: 0.375527
[139]	valid_0's auc: 0.770135	valid_0's binary_logloss: 0.375387
[140]	valid_0's auc: 0.770181	valid_0's binary_logloss: 0.375253
[141]	valid_0's auc: 0.77027	valid_0's binary_logloss: 0.375107
[142]	valid_0's auc: 0.770315	valid_0's binary_logloss: 0.374975
[143]	valid_0's auc: 0.7704

In [30]:
y_pred = gbm.predict(X_test)
print(y_pred.shape)
result_append['answer'] = y_pred
result_append.to_csv('result.txt', index=False, header=False, sep='\t')

(1141683,)


In [31]:
col = list(data.columns)
del col[1]
importance = gbm.feature_importance()
importance
# print(col)
# print(importance)

array([ 282,  666,    0,    0,    0,    9,    0,   32,   29,    0,   20,
          0, 2331,  132,   27,  735, 1096, 1717,  234, 1618, 2225,    0,
       1443,    0,    0,    0,    0,    0,    4,    0,    0,    0,    0],
      dtype=int32)

In [32]:
importance_df = pd.DataFrame({'col':col, 'imp': importance})
importance_df

Unnamed: 0,col,imp
0,answer,282
1,qid,666
2,u_2_cat_a,0
3,u_2_cat_b,0
4,u_2_cat_c,0
5,u_2_cat_d,9
6,u_2_cat_e,0
7,u_multi_cat_a,32
8,u_multi_cat_b,29
9,u_multi_cat_c,0
