In [68]:
# -*- coding: utf-8 -*-


import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [69]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

import warnings
warnings.filterwarnings('ignore')


def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))


def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

In [70]:
base_path = './data'

In [71]:
# 加载邀请回答数据

train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

sub = test.copy()

sub_size = len(sub)

train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])

del train['dt'], test['dt']

[2019-11-27 06:56:51,053] INFO in <ipython-input-71-d8d667ebe145>: invite (9489162, 4)
[2019-11-27 06:56:54,180] INFO in <ipython-input-71-d8d667ebe145>: test (1141683, 3)


In [72]:
# 加载问题
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
logging.info("ques %s", ques.shape)

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])

del ques['q_dt']

[2019-11-27 06:58:36,395] INFO in <ipython-input-72-9063dae39e17>: ques (1829900, 3)


In [73]:
# 加载回答
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
del ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

ans = pd.merge(ans, ques, on='qid', how='left')
del ques

[2019-11-27 07:00:31,188] INFO in <ipython-input-73-0f04b531e5be>: ans (4513735, 18)


In [74]:
ans.columns

Index(['aid', 'qid', 'uid', 'is_good', 'is_rec', 'is_dest', 'has_img',
       'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment',
       'reci_mark', 'reci_tks', 'reci_xxx', 'reci_no_help', 'reci_dis',
       'a_day', 'a_hour', 'topic', 'q_day', 'q_hour'],
      dtype='object')

In [75]:
# 回答距提问的天数
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']

In [76]:
# 时间窗口划分
train_start = 3838
train_end = 3867

val_start = 3868
val_end = 3874

label_end = 3867
label_start = label_end - 6

train_label_feature_end = label_end - 7
train_label_feature_start = train_label_feature_end - 22

train_ans_feature_end = label_end - 7
train_ans_feature_start = train_ans_feature_end - 50

val_label_feature_end = val_start - 1
val_label_feature_start = val_label_feature_end - 22

val_ans_feature_end = val_start - 1
val_ans_feature_start = val_ans_feature_end - 50

# 3838~3860
train_label_feature = train[(train['day'] >= train_label_feature_start) & (train['day'] <= train_label_feature_end)]
logging.info("train_label_feature %s", train_label_feature.shape)

# 3845~3867
val_label_feature = train[(train['day'] >= val_label_feature_start) & (train['day'] <= val_label_feature_end)]
logging.info("val_label_feature %s", val_label_feature.shape)

# 3861~3867
train_label = train[(train['day'] > train_label_feature_end)]

logging.info("train feature start %s end %s, label start %s end %s", train_label_feature['day'].min(),
             train_label_feature['day'].max(), train_label['day'].min(), train_label['day'].max())

logging.info("test feature start %s end %s, label start %s end %s", val_label_feature['day'].min(),
             val_label_feature['day'].max(), test['day'].min(), test['day'].max())

[2019-11-27 07:01:13,895] INFO in <ipython-input-76-5d7f58c037fb>: train_label_feature (6895493, 5)
[2019-11-27 07:01:14,879] INFO in <ipython-input-76-5d7f58c037fb>: val_label_feature (7583553, 5)
[2019-11-27 07:01:15,389] INFO in <ipython-input-76-5d7f58c037fb>: train feature start 3838 end 3860, label start 3861 end 3867
[2019-11-27 07:01:15,426] INFO in <ipython-input-76-5d7f58c037fb>: test feature start 3845 end 3867, label start 3868 end 3874


In [77]:
# ans的时间范围 3807~3874

# 3810~3860
train_ans_feature = ans[(ans['a_day'] >= train_ans_feature_start) & (ans['a_day'] <= train_ans_feature_end)]

# 3817~3867
val_ans_feature = ans[(ans['a_day'] >= val_ans_feature_start) & (ans['a_day'] <= val_ans_feature_end)]

logging.info("train ans feature %s, start %s end %s", train_ans_feature.shape, train_ans_feature['a_day'].min(),
             train_ans_feature['a_day'].max())

logging.info("val ans feature %s, start %s end %s", val_ans_feature.shape, val_ans_feature['a_day'].min(),
             val_ans_feature['a_day'].max())

fea_cols = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
            'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
            'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']

[2019-11-27 07:01:21,180] INFO in <ipython-input-77-cc959fd6ea4e>: train ans feature (3828707, 23), start 3807 end 3860
[2019-11-27 07:01:21,200] INFO in <ipython-input-77-cc959fd6ea4e>: val ans feature (4162522, 23), start 3814 end 3867


In [78]:
def extract_feature1(target, label_feature, ans_feature):
    # 问题特征
    t1 = label_feature.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['qid', 'q_inv_mean', 'q_inv_sum', 'q_inv_std', 'q_inv_count'] # 回答率,回答次数,标准差,邀请次数
    target = pd.merge(target, t1, on='qid', how='left')

    # 用户特征
    t1 = label_feature.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['uid', 'u_inv_mean', 'u_inv_sum', 'u_inv_std', 'u_inv_count']
    target = pd.merge(target, t1, on='uid', how='left')
    #
    # train_size = len(train)
    # data = pd.concat((train, test), sort=True)

    # 回答部分特征
    t1 = ans_feature.groupby('qid')['aid'].count().reset_index()
    t1.columns = ['qid', 'q_ans_count']          # 在 answer_info 中的回答次数
    target = pd.merge(target, t1, on='qid', how='left')

    t1 = ans_feature.groupby('uid')['aid'].count().reset_index()
    t1.columns = ['uid', 'u_ans_count']          # 在 answer_info 中的回答次数
    target = pd.merge(target, t1, on='uid', how='left')

    for col in fea_cols:
        t1 = ans_feature.groupby('uid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['uid', f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        target = pd.merge(target, t1, on='uid', how='left')

        t1 = ans_feature.groupby('qid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['qid', f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
        target = pd.merge(target, t1, on='qid', how='left')
        logging.info("extract %s", col)
    return target

In [79]:
train_label = extract_feature1(train_label, train_label_feature, train_ans_feature)
test = extract_feature1(test, val_label_feature, val_ans_feature)

[2019-11-27 07:02:34,625] INFO in <ipython-input-78-0fafb00ac390>: extract is_good
[2019-11-27 07:02:57,442] INFO in <ipython-input-78-0fafb00ac390>: extract is_rec
[2019-11-27 07:03:21,116] INFO in <ipython-input-78-0fafb00ac390>: extract is_dest
[2019-11-27 07:03:44,858] INFO in <ipython-input-78-0fafb00ac390>: extract has_img
[2019-11-27 07:04:08,815] INFO in <ipython-input-78-0fafb00ac390>: extract has_video
[2019-11-27 07:04:33,625] INFO in <ipython-input-78-0fafb00ac390>: extract word_count
[2019-11-27 07:04:58,909] INFO in <ipython-input-78-0fafb00ac390>: extract reci_cheer
[2019-11-27 07:05:24,266] INFO in <ipython-input-78-0fafb00ac390>: extract reci_uncheer
[2019-11-27 07:05:50,765] INFO in <ipython-input-78-0fafb00ac390>: extract reci_comment
[2019-11-27 07:06:14,743] INFO in <ipython-input-78-0fafb00ac390>: extract reci_mark
[2019-11-27 07:06:30,578] INFO in <ipython-input-78-0fafb00ac390>: extract reci_tks
[2019-11-27 07:06:57,470] INFO in <ipython-input-78-0fafb00ac390>: 

In [80]:
# 特征提取结束
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)
assert len(test) == sub_size

[2019-11-27 07:15:00,259] INFO in <ipython-input-80-464c91a59cf8>: train shape (2593669, 105), test shape (1141683, 104)


In [81]:
# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic']

del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

q_lb = LabelEncoder()
q_lb.fit(list(train_label['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train_label['qid_enc'] = q_lb.transform(train_label['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train_label['uid_enc'] = u_lb.transform(train_label['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

[2019-11-27 07:15:17,972] INFO in <ipython-input-81-7cebf38ed8f9>: user (1931654, 14)
[2019-11-27 07:15:24,496] INFO in <ipython-input-81-7cebf38ed8f9>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-11-27 07:15:24,505] INFO in <ipython-input-81-7cebf38ed8f9>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-11-27 07:15:26,227] INFO in <ipython-input-81-7cebf38ed8f9>: encode gender
[2019-11-27 07:15:28,042] INFO in <ipython-input-81-7cebf38ed8f9>: encode freq
[2019-11-27 07:15:29,687] INFO in <ipython-input-81-7cebf38ed8f9>: encode uf_c1
[2019-11-27 07:15:31,306] INFO in <ipython-input-81-7cebf38ed8f9>: encode uf_c2
[2019-11-27 07:15:32,885] INFO in <ipython-input-81-7cebf38ed8f9>: encode uf_c3
[2019-11-27 07:15:34,431] INFO in

In [82]:
# merge user
train_label = pd.merge(train_label, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

data = pd.concat((train_label, test), axis=0, sort=True)
del train_label, test

[2019-11-27 07:16:18,355] INFO in <ipython-input-82-5d1ad498aeb5>: train shape (2593669, 120), test shape (1141683, 119)


In [83]:
# count编码
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [84]:
# 压缩数据
t = data.dtypes
for x in t[t == 'int64'].index:
    data[x] = data[x].astype('int32')

for x in t[t == 'float64'].index:
    data[x] = data[x].astype('float32')

data['wk'] = data['day'] % 7

feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

In [85]:
train_label = train[(train['day'] > train_label_feature_end)]

In [86]:
# target编码
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len(train_label)][feature_cols]
y_train_all = data.iloc[:len(train_label)]['label']
test = data.iloc[len(train_label):]
del data
assert len(test) == sub_size

logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)

sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]

[2019-11-27 07:18:50,580] INFO in <ipython-input-86-20048b165b8f>: feature size 126
[2019-11-27 07:18:50,899] INFO in <ipython-input-86-20048b165b8f>: train shape (2593669, 5), test shape (1141683, 130)


[1]	valid_0's auc: 0.758209	valid_0's binary_logloss: 0.426574
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.775093	valid_0's binary_logloss: 0.416574
[3]	valid_0's auc: 0.780504	valid_0's binary_logloss: 0.408526
[4]	valid_0's auc: 0.784522	valid_0's binary_logloss: 0.401732
[5]	valid_0's auc: 0.785633	valid_0's binary_logloss: 0.396236
[6]	valid_0's auc: 0.788326	valid_0's binary_logloss: 0.391159
[7]	valid_0's auc: 0.790373	valid_0's binary_logloss: 0.38688
[8]	valid_0's auc: 0.792092	valid_0's binary_logloss: 0.383054
[9]	valid_0's auc: 0.794116	valid_0's binary_logloss: 0.379763
[10]	valid_0's auc: 0.796005	valid_0's binary_logloss: 0.376867
[11]	valid_0's auc: 0.797395	valid_0's binary_logloss: 0.374089
[12]	valid_0's auc: 0.798549	valid_0's binary_logloss: 0.371696
[13]	valid_0's auc: 0.799592	valid_0's binary_logloss: 0.369706
[14]	valid_0's auc: 0.800246	valid_0's binary_logloss: 0.36779
[15]	valid_0's auc: 0.801392	valid_0's binary_logloss:

[129]	valid_0's auc: 0.828529	valid_0's binary_logloss: 0.336377
[130]	valid_0's auc: 0.828577	valid_0's binary_logloss: 0.336336
[131]	valid_0's auc: 0.828612	valid_0's binary_logloss: 0.336304
[132]	valid_0's auc: 0.828626	valid_0's binary_logloss: 0.336293
[133]	valid_0's auc: 0.828716	valid_0's binary_logloss: 0.33621
[134]	valid_0's auc: 0.828731	valid_0's binary_logloss: 0.336198
[135]	valid_0's auc: 0.828764	valid_0's binary_logloss: 0.336166
[136]	valid_0's auc: 0.828796	valid_0's binary_logloss: 0.33614
[137]	valid_0's auc: 0.828802	valid_0's binary_logloss: 0.336135
[138]	valid_0's auc: 0.828815	valid_0's binary_logloss: 0.336127
[139]	valid_0's auc: 0.828859	valid_0's binary_logloss: 0.336091
[140]	valid_0's auc: 0.828889	valid_0's binary_logloss: 0.336064
[141]	valid_0's auc: 0.828919	valid_0's binary_logloss: 0.336039
[142]	valid_0's auc: 0.828976	valid_0's binary_logloss: 0.335984
[143]	valid_0's auc: 0.829004	valid_0's binary_logloss: 0.33596
[144]	valid_0's auc: 0.82902

[256]	valid_0's auc: 0.831482	valid_0's binary_logloss: 0.333861
[257]	valid_0's auc: 0.831492	valid_0's binary_logloss: 0.333854
[258]	valid_0's auc: 0.831499	valid_0's binary_logloss: 0.333848
[259]	valid_0's auc: 0.831524	valid_0's binary_logloss: 0.333828
[260]	valid_0's auc: 0.831528	valid_0's binary_logloss: 0.333826
[261]	valid_0's auc: 0.831575	valid_0's binary_logloss: 0.33379
[262]	valid_0's auc: 0.831604	valid_0's binary_logloss: 0.333764
[263]	valid_0's auc: 0.831636	valid_0's binary_logloss: 0.333732
[264]	valid_0's auc: 0.831676	valid_0's binary_logloss: 0.333697
[265]	valid_0's auc: 0.8317	valid_0's binary_logloss: 0.333678
[266]	valid_0's auc: 0.831709	valid_0's binary_logloss: 0.333671
[267]	valid_0's auc: 0.831712	valid_0's binary_logloss: 0.333668
[268]	valid_0's auc: 0.83176	valid_0's binary_logloss: 0.333622
[269]	valid_0's auc: 0.831787	valid_0's binary_logloss: 0.333598
[270]	valid_0's auc: 0.831812	valid_0's binary_logloss: 0.333576
[271]	valid_0's auc: 0.831859

[383]	valid_0's auc: 0.833122	valid_0's binary_logloss: 0.332501
[384]	valid_0's auc: 0.83312	valid_0's binary_logloss: 0.332503
[385]	valid_0's auc: 0.833141	valid_0's binary_logloss: 0.332489
[386]	valid_0's auc: 0.833155	valid_0's binary_logloss: 0.332479
[387]	valid_0's auc: 0.833166	valid_0's binary_logloss: 0.33247
[388]	valid_0's auc: 0.833177	valid_0's binary_logloss: 0.33246
[389]	valid_0's auc: 0.833182	valid_0's binary_logloss: 0.332456
[390]	valid_0's auc: 0.833186	valid_0's binary_logloss: 0.332452
[391]	valid_0's auc: 0.833185	valid_0's binary_logloss: 0.332453
[392]	valid_0's auc: 0.83319	valid_0's binary_logloss: 0.33245
[393]	valid_0's auc: 0.833198	valid_0's binary_logloss: 0.332444
[394]	valid_0's auc: 0.833214	valid_0's binary_logloss: 0.332418
[395]	valid_0's auc: 0.833245	valid_0's binary_logloss: 0.332395
[396]	valid_0's auc: 0.83325	valid_0's binary_logloss: 0.332391
[397]	valid_0's auc: 0.833251	valid_0's binary_logloss: 0.332389
[398]	valid_0's auc: 0.833259	v

[510]	valid_0's auc: 0.834254	valid_0's binary_logloss: 0.331536
[511]	valid_0's auc: 0.834262	valid_0's binary_logloss: 0.331531
[512]	valid_0's auc: 0.834286	valid_0's binary_logloss: 0.331516
[513]	valid_0's auc: 0.83429	valid_0's binary_logloss: 0.331511
[514]	valid_0's auc: 0.834293	valid_0's binary_logloss: 0.33151
[515]	valid_0's auc: 0.834299	valid_0's binary_logloss: 0.331506
[516]	valid_0's auc: 0.8343	valid_0's binary_logloss: 0.331505
[517]	valid_0's auc: 0.834306	valid_0's binary_logloss: 0.331499
[518]	valid_0's auc: 0.834308	valid_0's binary_logloss: 0.331497
[519]	valid_0's auc: 0.834308	valid_0's binary_logloss: 0.331494
[520]	valid_0's auc: 0.834324	valid_0's binary_logloss: 0.331483
[521]	valid_0's auc: 0.834329	valid_0's binary_logloss: 0.331477
[522]	valid_0's auc: 0.834334	valid_0's binary_logloss: 0.33147
[523]	valid_0's auc: 0.834344	valid_0's binary_logloss: 0.331462
[524]	valid_0's auc: 0.834344	valid_0's binary_logloss: 0.331462
[525]	valid_0's auc: 0.83435	v

[637]	valid_0's auc: 0.834887	valid_0's binary_logloss: 0.330993
[638]	valid_0's auc: 0.834911	valid_0's binary_logloss: 0.330966
[639]	valid_0's auc: 0.834931	valid_0's binary_logloss: 0.33095
[640]	valid_0's auc: 0.834945	valid_0's binary_logloss: 0.330939
[641]	valid_0's auc: 0.834953	valid_0's binary_logloss: 0.330935
[642]	valid_0's auc: 0.83497	valid_0's binary_logloss: 0.330922
[643]	valid_0's auc: 0.834972	valid_0's binary_logloss: 0.330922
[644]	valid_0's auc: 0.834982	valid_0's binary_logloss: 0.330915
[645]	valid_0's auc: 0.834987	valid_0's binary_logloss: 0.330911
[646]	valid_0's auc: 0.834989	valid_0's binary_logloss: 0.330909
[647]	valid_0's auc: 0.835	valid_0's binary_logloss: 0.330903
[648]	valid_0's auc: 0.835014	valid_0's binary_logloss: 0.33089
[649]	valid_0's auc: 0.835013	valid_0's binary_logloss: 0.33089
[650]	valid_0's auc: 0.835057	valid_0's binary_logloss: 0.330854
[651]	valid_0's auc: 0.835066	valid_0's binary_logloss: 0.330847
[652]	valid_0's auc: 0.835077	va

[764]	valid_0's auc: 0.83557	valid_0's binary_logloss: 0.330394
[765]	valid_0's auc: 0.835578	valid_0's binary_logloss: 0.330388
[766]	valid_0's auc: 0.835581	valid_0's binary_logloss: 0.330386
[767]	valid_0's auc: 0.835579	valid_0's binary_logloss: 0.330387
[768]	valid_0's auc: 0.835578	valid_0's binary_logloss: 0.330388
[769]	valid_0's auc: 0.835578	valid_0's binary_logloss: 0.330387
[770]	valid_0's auc: 0.835579	valid_0's binary_logloss: 0.330386
[771]	valid_0's auc: 0.835577	valid_0's binary_logloss: 0.330388
[772]	valid_0's auc: 0.835578	valid_0's binary_logloss: 0.330387
[773]	valid_0's auc: 0.835577	valid_0's binary_logloss: 0.330388
[774]	valid_0's auc: 0.835576	valid_0's binary_logloss: 0.330387
[775]	valid_0's auc: 0.835574	valid_0's binary_logloss: 0.330389
[776]	valid_0's auc: 0.835573	valid_0's binary_logloss: 0.33039
[777]	valid_0's auc: 0.835575	valid_0's binary_logloss: 0.330388
[778]	valid_0's auc: 0.835581	valid_0's binary_logloss: 0.330383
[779]	valid_0's auc: 0.8355

[891]	valid_0's auc: 0.836169	valid_0's binary_logloss: 0.329901
[892]	valid_0's auc: 0.836173	valid_0's binary_logloss: 0.329898
[893]	valid_0's auc: 0.836177	valid_0's binary_logloss: 0.329894
[894]	valid_0's auc: 0.836182	valid_0's binary_logloss: 0.329889
[895]	valid_0's auc: 0.836185	valid_0's binary_logloss: 0.329887
[896]	valid_0's auc: 0.83619	valid_0's binary_logloss: 0.329884
[897]	valid_0's auc: 0.836202	valid_0's binary_logloss: 0.329876
[898]	valid_0's auc: 0.836207	valid_0's binary_logloss: 0.329873
[899]	valid_0's auc: 0.836212	valid_0's binary_logloss: 0.329869
[900]	valid_0's auc: 0.836217	valid_0's binary_logloss: 0.329866
[901]	valid_0's auc: 0.836222	valid_0's binary_logloss: 0.329861
[902]	valid_0's auc: 0.836224	valid_0's binary_logloss: 0.32986
[903]	valid_0's auc: 0.836224	valid_0's binary_logloss: 0.329859
[904]	valid_0's auc: 0.836224	valid_0's binary_logloss: 0.329858
[905]	valid_0's auc: 0.836225	valid_0's binary_logloss: 0.329858
[906]	valid_0's auc: 0.8362

[1018]	valid_0's auc: 0.836647	valid_0's binary_logloss: 0.329501
[1019]	valid_0's auc: 0.836647	valid_0's binary_logloss: 0.329501
[1020]	valid_0's auc: 0.836645	valid_0's binary_logloss: 0.329503
[1021]	valid_0's auc: 0.836645	valid_0's binary_logloss: 0.329504
[1022]	valid_0's auc: 0.836646	valid_0's binary_logloss: 0.329503
[1023]	valid_0's auc: 0.836646	valid_0's binary_logloss: 0.329503
[1024]	valid_0's auc: 0.83665	valid_0's binary_logloss: 0.329498
[1025]	valid_0's auc: 0.836649	valid_0's binary_logloss: 0.329498
[1026]	valid_0's auc: 0.836654	valid_0's binary_logloss: 0.329494
[1027]	valid_0's auc: 0.836655	valid_0's binary_logloss: 0.329493
[1028]	valid_0's auc: 0.836661	valid_0's binary_logloss: 0.329488
[1029]	valid_0's auc: 0.836672	valid_0's binary_logloss: 0.329481
[1030]	valid_0's auc: 0.836682	valid_0's binary_logloss: 0.329475
[1031]	valid_0's auc: 0.836682	valid_0's binary_logloss: 0.329475
[1032]	valid_0's auc: 0.836684	valid_0's binary_logloss: 0.329474
[1033]	vali

[1143]	valid_0's auc: 0.837005	valid_0's binary_logloss: 0.329197
[1144]	valid_0's auc: 0.837004	valid_0's binary_logloss: 0.329197
[1145]	valid_0's auc: 0.83701	valid_0's binary_logloss: 0.329193
[1146]	valid_0's auc: 0.837019	valid_0's binary_logloss: 0.329187
[1147]	valid_0's auc: 0.837019	valid_0's binary_logloss: 0.329186
[1148]	valid_0's auc: 0.83702	valid_0's binary_logloss: 0.329187
[1149]	valid_0's auc: 0.837027	valid_0's binary_logloss: 0.32918
[1150]	valid_0's auc: 0.837028	valid_0's binary_logloss: 0.329178
[1151]	valid_0's auc: 0.837035	valid_0's binary_logloss: 0.329174
[1152]	valid_0's auc: 0.837035	valid_0's binary_logloss: 0.329174
[1153]	valid_0's auc: 0.837039	valid_0's binary_logloss: 0.329169
[1154]	valid_0's auc: 0.837041	valid_0's binary_logloss: 0.329167
[1155]	valid_0's auc: 0.837048	valid_0's binary_logloss: 0.32916
[1156]	valid_0's auc: 0.837051	valid_0's binary_logloss: 0.329158
[1157]	valid_0's auc: 0.837053	valid_0's binary_logloss: 0.329156
[1158]	valid_0

[1268]	valid_0's auc: 0.837295	valid_0's binary_logloss: 0.328944
[1269]	valid_0's auc: 0.837293	valid_0's binary_logloss: 0.328946
[1270]	valid_0's auc: 0.837296	valid_0's binary_logloss: 0.328944
[1271]	valid_0's auc: 0.837302	valid_0's binary_logloss: 0.328936
[1272]	valid_0's auc: 0.837304	valid_0's binary_logloss: 0.328936
[1273]	valid_0's auc: 0.837306	valid_0's binary_logloss: 0.328935
[1274]	valid_0's auc: 0.837308	valid_0's binary_logloss: 0.328934
[1275]	valid_0's auc: 0.837308	valid_0's binary_logloss: 0.328933
[1276]	valid_0's auc: 0.837315	valid_0's binary_logloss: 0.328923
[1277]	valid_0's auc: 0.837333	valid_0's binary_logloss: 0.32891
[1278]	valid_0's auc: 0.837333	valid_0's binary_logloss: 0.32891
[1279]	valid_0's auc: 0.837334	valid_0's binary_logloss: 0.328909
[1280]	valid_0's auc: 0.837334	valid_0's binary_logloss: 0.328908
[1281]	valid_0's auc: 0.837334	valid_0's binary_logloss: 0.328908
[1282]	valid_0's auc: 0.837341	valid_0's binary_logloss: 0.328902
[1283]	valid

[1393]	valid_0's auc: 0.837628	valid_0's binary_logloss: 0.328665
[1394]	valid_0's auc: 0.837631	valid_0's binary_logloss: 0.328662
[1395]	valid_0's auc: 0.837639	valid_0's binary_logloss: 0.328658
[1396]	valid_0's auc: 0.83764	valid_0's binary_logloss: 0.328657
[1397]	valid_0's auc: 0.837643	valid_0's binary_logloss: 0.328654
[1398]	valid_0's auc: 0.837639	valid_0's binary_logloss: 0.328656
[1399]	valid_0's auc: 0.837647	valid_0's binary_logloss: 0.328651
[1400]	valid_0's auc: 0.837652	valid_0's binary_logloss: 0.328646
[1401]	valid_0's auc: 0.837654	valid_0's binary_logloss: 0.328644
[1402]	valid_0's auc: 0.837658	valid_0's binary_logloss: 0.328642
[1403]	valid_0's auc: 0.837662	valid_0's binary_logloss: 0.328637
[1404]	valid_0's auc: 0.837679	valid_0's binary_logloss: 0.328623
[1405]	valid_0's auc: 0.837675	valid_0's binary_logloss: 0.328626
[1406]	valid_0's auc: 0.837678	valid_0's binary_logloss: 0.328621
[1407]	valid_0's auc: 0.837677	valid_0's binary_logloss: 0.328622
[1408]	vali

[1518]	valid_0's auc: 0.837948	valid_0's binary_logloss: 0.328392
[1519]	valid_0's auc: 0.837951	valid_0's binary_logloss: 0.32839
[1520]	valid_0's auc: 0.837949	valid_0's binary_logloss: 0.328391
[1521]	valid_0's auc: 0.837949	valid_0's binary_logloss: 0.328391
[1522]	valid_0's auc: 0.83796	valid_0's binary_logloss: 0.328384
[1523]	valid_0's auc: 0.837961	valid_0's binary_logloss: 0.328383
[1524]	valid_0's auc: 0.837963	valid_0's binary_logloss: 0.328382
[1525]	valid_0's auc: 0.837969	valid_0's binary_logloss: 0.328378
[1526]	valid_0's auc: 0.837977	valid_0's binary_logloss: 0.328373
[1527]	valid_0's auc: 0.837982	valid_0's binary_logloss: 0.328369
[1528]	valid_0's auc: 0.837985	valid_0's binary_logloss: 0.328365
[1529]	valid_0's auc: 0.83799	valid_0's binary_logloss: 0.328362
[1530]	valid_0's auc: 0.837991	valid_0's binary_logloss: 0.32836
[1531]	valid_0's auc: 0.837997	valid_0's binary_logloss: 0.328354
[1532]	valid_0's auc: 0.838001	valid_0's binary_logloss: 0.328351
[1533]	valid_0

[1643]	valid_0's auc: 0.838371	valid_0's binary_logloss: 0.328032
[1644]	valid_0's auc: 0.838377	valid_0's binary_logloss: 0.328027
[1645]	valid_0's auc: 0.838378	valid_0's binary_logloss: 0.328025
[1646]	valid_0's auc: 0.838378	valid_0's binary_logloss: 0.328025
[1647]	valid_0's auc: 0.838376	valid_0's binary_logloss: 0.328027
[1648]	valid_0's auc: 0.838382	valid_0's binary_logloss: 0.328023
[1649]	valid_0's auc: 0.838379	valid_0's binary_logloss: 0.328025
[1650]	valid_0's auc: 0.838383	valid_0's binary_logloss: 0.328023
[1651]	valid_0's auc: 0.838389	valid_0's binary_logloss: 0.328017
[1652]	valid_0's auc: 0.83839	valid_0's binary_logloss: 0.328016
[1653]	valid_0's auc: 0.838391	valid_0's binary_logloss: 0.328016
[1654]	valid_0's auc: 0.83839	valid_0's binary_logloss: 0.328017
[1655]	valid_0's auc: 0.838391	valid_0's binary_logloss: 0.328017
[1656]	valid_0's auc: 0.8384	valid_0's binary_logloss: 0.328011
[1657]	valid_0's auc: 0.838404	valid_0's binary_logloss: 0.328008
[1658]	valid_0

[1768]	valid_0's auc: 0.838611	valid_0's binary_logloss: 0.327818
[1769]	valid_0's auc: 0.838612	valid_0's binary_logloss: 0.327817
[1770]	valid_0's auc: 0.838612	valid_0's binary_logloss: 0.327818
[1771]	valid_0's auc: 0.83861	valid_0's binary_logloss: 0.327818
[1772]	valid_0's auc: 0.838608	valid_0's binary_logloss: 0.327819
[1773]	valid_0's auc: 0.838611	valid_0's binary_logloss: 0.327817
[1774]	valid_0's auc: 0.838611	valid_0's binary_logloss: 0.327817
[1775]	valid_0's auc: 0.838609	valid_0's binary_logloss: 0.327818
[1776]	valid_0's auc: 0.838606	valid_0's binary_logloss: 0.32782
[1777]	valid_0's auc: 0.838613	valid_0's binary_logloss: 0.327815
[1778]	valid_0's auc: 0.838617	valid_0's binary_logloss: 0.327812
[1779]	valid_0's auc: 0.838624	valid_0's binary_logloss: 0.327807
[1780]	valid_0's auc: 0.838623	valid_0's binary_logloss: 0.327808
[1781]	valid_0's auc: 0.838625	valid_0's binary_logloss: 0.327807
[1782]	valid_0's auc: 0.838629	valid_0's binary_logloss: 0.327803
[1783]	valid

[1893]	valid_0's auc: 0.838783	valid_0's binary_logloss: 0.327667
[1894]	valid_0's auc: 0.838788	valid_0's binary_logloss: 0.327664
[1895]	valid_0's auc: 0.838787	valid_0's binary_logloss: 0.327664
[1896]	valid_0's auc: 0.838791	valid_0's binary_logloss: 0.327662
[1897]	valid_0's auc: 0.838793	valid_0's binary_logloss: 0.327659
[1898]	valid_0's auc: 0.838795	valid_0's binary_logloss: 0.327657
[1899]	valid_0's auc: 0.838796	valid_0's binary_logloss: 0.327656
[1900]	valid_0's auc: 0.838795	valid_0's binary_logloss: 0.327657
[1901]	valid_0's auc: 0.838795	valid_0's binary_logloss: 0.327658
[1902]	valid_0's auc: 0.838796	valid_0's binary_logloss: 0.327657
[1903]	valid_0's auc: 0.838795	valid_0's binary_logloss: 0.327657
[1904]	valid_0's auc: 0.838797	valid_0's binary_logloss: 0.327656
[1905]	valid_0's auc: 0.838797	valid_0's binary_logloss: 0.327655
[1906]	valid_0's auc: 0.838802	valid_0's binary_logloss: 0.32765
[1907]	valid_0's auc: 0.8388	valid_0's binary_logloss: 0.327651
[1908]	valid_

In [87]:
sub.to_csv('./result/2000_0.838811.txt', index=None, header=None, sep='\t')

In [88]:
fi = pd.DataFrame({'feature': feature_cols, 'imp': model_lgb.feature_importances_})
fi['rate'] = fi['imp'] / fi['imp'].sum()

In [89]:
fi

Unnamed: 0,feature,imp,rate
0,freq,397,0.006663
1,gender,331,0.005556
2,hour,2433,0.040836
3,q_ans_count,351,0.005891
4,q_diff_qa_days_max,454,0.007620
...,...,...,...
121,uf_c2_count,386,0.006479
122,uf_c3_count,1258,0.021114
123,uf_c4_count,1228,0.020611
124,uf_c5_count,0,0.000000


In [100]:
fi.sort_values(by='rate', ascending=False)[:60]

Unnamed: 0,feature,imp,rate
54,score,3442,0.057771
117,qid_enc_count,3064,0.051427
2,hour,2433,0.040836
53,qid_enc,2239,0.03758
115,uid_enc,2191,0.036774
116,uid_enc_count,1966,0.032998
66,u_inv_mean,1674,0.028097
65,u_inv_count,1580,0.026519
57,u_diff_qa_days_mean,1447,0.024287
125,wk,1348,0.022625


In [91]:
feature_cols

['freq',
 'gender',
 'hour',
 'q_ans_count',
 'q_diff_qa_days_max',
 'q_diff_qa_days_mean',
 'q_diff_qa_days_sum',
 'q_has_img_max',
 'q_has_img_mean',
 'q_has_img_sum',
 'q_has_video_max',
 'q_has_video_mean',
 'q_has_video_sum',
 'q_inv_count',
 'q_inv_mean',
 'q_inv_std',
 'q_inv_sum',
 'q_is_dest_max',
 'q_is_dest_mean',
 'q_is_dest_sum',
 'q_is_good_max',
 'q_is_good_mean',
 'q_is_good_sum',
 'q_is_rec_max',
 'q_is_rec_mean',
 'q_is_rec_sum',
 'q_reci_cheer_max',
 'q_reci_cheer_mean',
 'q_reci_cheer_sum',
 'q_reci_comment_max',
 'q_reci_comment_mean',
 'q_reci_comment_sum',
 'q_reci_dis_max',
 'q_reci_dis_mean',
 'q_reci_dis_sum',
 'q_reci_mark_max',
 'q_reci_mark_mean',
 'q_reci_mark_sum',
 'q_reci_no_help_max',
 'q_reci_no_help_mean',
 'q_reci_no_help_sum',
 'q_reci_tks_max',
 'q_reci_tks_mean',
 'q_reci_tks_sum',
 'q_reci_uncheer_max',
 'q_reci_uncheer_mean',
 'q_reci_uncheer_sum',
 'q_reci_xxx_max',
 'q_reci_xxx_mean',
 'q_reci_xxx_sum',
 'q_word_count_max',
 'q_word_count_mea