In [1]:
# -*- coding: utf-8 -*-


import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [4]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

import warnings
warnings.filterwarnings('ignore')


def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))


def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

In [5]:
base_path = './data'

In [7]:
# 加载邀请回答数据

train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

sub = test.copy()

sub_size = len(sub)

train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])

del train['dt'], test['dt']

[2019-11-25 07:06:53,893] INFO in <ipython-input-7-d8d667ebe145>: invite (9489162, 4)
[2019-11-25 07:06:55,284] INFO in <ipython-input-7-d8d667ebe145>: test (1141683, 3)


In [8]:
# 加载问题
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
logging.info("ques %s", ques.shape)

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])

del ques['q_dt']

[2019-11-25 07:08:12,286] INFO in <ipython-input-8-1bb9e9c15e2e>: ques (1829900, 3)


In [10]:
# 加载回答
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
del ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

ans = pd.merge(ans, ques, on='qid')
del ques

[2019-11-25 07:09:45,518] INFO in <ipython-input-10-0f04b531e5be>: ans (4513735, 18)


In [11]:
# 回答距提问的天数
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']

In [27]:
# 时间窗口划分
train_start = 3838
train_end = 3867

val_start = 3868
val_end = 3874

label_end = 3867
label_start = label_end - 6

train_label_feature_end = label_end - 7
train_label_feature_start = train_label_feature_end - 22

train_ans_feature_end = label_end - 7
train_ans_feature_start = train_ans_feature_end - 50

val_label_feature_end = val_start - 1
val_label_feature_start = val_label_feature_end - 22

val_ans_feature_end = val_start - 1
val_ans_feature_start = val_ans_feature_end - 50

# 3838~3860
train_label_feature = train[(train['day'] >= train_label_feature_start) & (train['day'] <= train_label_feature_end)]
logging.info("train_label_feature %s", train_label_feature.shape)

# 3845~3867
val_label_feature = train[(train['day'] >= val_label_feature_start) & (train['day'] <= val_label_feature_end)]
logging.info("val_label_feature %s", val_label_feature.shape)

# 3861~3867
train_label = train[(train['day'] > train_label_feature_end)]

logging.info("train feature start %s end %s, label start %s end %s", train_label_feature['day'].min(),
             train_label_feature['day'].max(), train_label['day'].min(), train_label['day'].max())

logging.info("test feature start %s end %s, label start %s end %s", val_label_feature['day'].min(),
             val_label_feature['day'].max(), test['day'].min(), test['day'].max())

SyntaxError: invalid character in identifier (<ipython-input-27-10c3d59857a1>, line 22)

In [13]:
# 确定ans的时间范围
# 3807~3874

# 3810~3860
train_ans_feature = ans[(ans['a_day'] >= train_ans_feature_start) & (ans['a_day'] <= train_ans_feature_end)]

# 3817~3867
val_ans_feature = ans[(ans['a_day'] >= val_ans_feature_start) & (ans['a_day'] <= val_ans_feature_end)]

logging.info("train ans feature %s, start %s end %s", train_ans_feature.shape, train_ans_feature['a_day'].min(),
             train_ans_feature['a_day'].max())

logging.info("val ans feature %s, start %s end %s", val_ans_feature.shape, val_ans_feature['a_day'].min(),
             val_ans_feature['a_day'].max())

fea_cols = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
            'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
            'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']

[2019-11-25 07:10:35,967] INFO in <ipython-input-13-aa15f72a2344>: train ans feature (3700178, 23), start 3810 end 3860
[2019-11-25 07:10:35,984] INFO in <ipython-input-13-aa15f72a2344>: val ans feature (3992334, 23), start 3817 end 3867


In [14]:
def extract_feature1(target, label_feature, ans_feature):
    # 问题特征
    t1 = label_feature.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['qid', 'q_inv_mean', 'q_inv_sum', 'q_inv_std', 'q_inv_count']
    target = pd.merge(target, t1, on='qid', how='left')

    # 用户特征
    t1 = label_feature.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['uid', 'u_inv_mean', 'u_inv_sum', 'u_inv_std', 'u_inv_count']
    target = pd.merge(target, t1, on='uid', how='left')
    #
    # train_size = len(train)
    # data = pd.concat((train, test), sort=True)

    # 回答部分特征

    t1 = ans_feature.groupby('qid')['aid'].count().reset_index()
    t1.columns = ['qid', 'q_ans_count']
    target = pd.merge(target, t1, on='qid', how='left')

    t1 = ans_feature.groupby('uid')['aid'].count().reset_index()
    t1.columns = ['uid', 'u_ans_count']
    target = pd.merge(target, t1, on='uid', how='left')

    for col in fea_cols:
        t1 = ans_feature.groupby('uid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['uid', f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        target = pd.merge(target, t1, on='uid', how='left')

        t1 = ans_feature.groupby('qid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['qid', f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
        target = pd.merge(target, t1, on='qid', how='left')
        logging.info("extract %s", col)
    return target

In [15]:
train_label = extract_feature1(train_label, train_label_feature, train_ans_feature)
test = extract_feature1(test, val_label_feature, val_ans_feature)

[2019-11-25 07:11:40,500] INFO in <ipython-input-14-0fafb00ac390>: extract is_good
[2019-11-25 07:12:04,879] INFO in <ipython-input-14-0fafb00ac390>: extract is_rec
[2019-11-25 07:12:33,370] INFO in <ipython-input-14-0fafb00ac390>: extract is_dest
[2019-11-25 07:13:01,602] INFO in <ipython-input-14-0fafb00ac390>: extract has_img
[2019-11-25 07:13:30,159] INFO in <ipython-input-14-0fafb00ac390>: extract has_video
[2019-11-25 07:13:56,802] INFO in <ipython-input-14-0fafb00ac390>: extract word_count
[2019-11-25 07:14:23,174] INFO in <ipython-input-14-0fafb00ac390>: extract reci_cheer
[2019-11-25 07:14:49,895] INFO in <ipython-input-14-0fafb00ac390>: extract reci_uncheer
[2019-11-25 07:15:16,324] INFO in <ipython-input-14-0fafb00ac390>: extract reci_comment
[2019-11-25 07:15:43,431] INFO in <ipython-input-14-0fafb00ac390>: extract reci_mark
[2019-11-25 07:16:10,044] INFO in <ipython-input-14-0fafb00ac390>: extract reci_tks
[2019-11-25 07:16:34,616] INFO in <ipython-input-14-0fafb00ac390>: 

In [16]:
# 特征提取结束
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)
assert len(test) == sub_size

[2019-11-25 07:22:51,139] INFO in <ipython-input-16-464c91a59cf8>: train shape (2593669, 105), test shape (1141683, 104)


In [18]:
# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic']

del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

q_lb = LabelEncoder()
q_lb.fit(list(train_label['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train_label['qid_enc'] = q_lb.transform(train_label['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train_label['uid_enc'] = u_lb.transform(train_label['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

[2019-11-25 07:27:01,172] INFO in <ipython-input-18-7cebf38ed8f9>: user (1931654, 14)
[2019-11-25 07:27:08,012] INFO in <ipython-input-18-7cebf38ed8f9>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-11-25 07:27:08,020] INFO in <ipython-input-18-7cebf38ed8f9>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-11-25 07:27:09,867] INFO in <ipython-input-18-7cebf38ed8f9>: encode gender
[2019-11-25 07:27:11,683] INFO in <ipython-input-18-7cebf38ed8f9>: encode freq
[2019-11-25 07:27:13,432] INFO in <ipython-input-18-7cebf38ed8f9>: encode uf_c1
[2019-11-25 07:27:15,086] INFO in <ipython-input-18-7cebf38ed8f9>: encode uf_c2
[2019-11-25 07:27:16,652] INFO in <ipython-input-18-7cebf38ed8f9>: encode uf_c3
[2019-11-25 07:27:18,258] INFO in

In [19]:
# merge user
train_label = pd.merge(train_label, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

data = pd.concat((train_label, test), axis=0, sort=True)
del train_label, test

[2019-11-25 07:28:03,321] INFO in <ipython-input-19-5d1ad498aeb5>: train shape (2593669, 120), test shape (1141683, 119)


In [20]:
# count编码
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [21]:
# 压缩数据
t = data.dtypes
for x in t[t == 'int64'].index:
    data[x] = data[x].astype('int32')

for x in t[t == 'float64'].index:
    data[x] = data[x].astype('float32')

data['wk'] = data['day'] % 7

feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

In [24]:
train_label = train[(train['day'] > train_label_feature_end)]

In [25]:
# target编码
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len(train_label)][feature_cols]
y_train_all = data.iloc[:len(train_label)]['label']
test = data.iloc[len(train_label):]
del data
assert len(test) == sub_size

logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)

sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]

[2019-11-25 07:35:38,630] INFO in <ipython-input-25-20048b165b8f>: feature size 126
[2019-11-25 07:35:39,301] INFO in <ipython-input-25-20048b165b8f>: train shape (2593669, 5), test shape (1141683, 130)


[1]	valid_0's auc: 0.7582	valid_0's binary_logloss: 0.426574
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.775326	valid_0's binary_logloss: 0.416562
[3]	valid_0's auc: 0.78073	valid_0's binary_logloss: 0.408496
[4]	valid_0's auc: 0.784492	valid_0's binary_logloss: 0.401698
[5]	valid_0's auc: 0.785532	valid_0's binary_logloss: 0.39619
[6]	valid_0's auc: 0.787347	valid_0's binary_logloss: 0.391481
[7]	valid_0's auc: 0.78993	valid_0's binary_logloss: 0.387075
[8]	valid_0's auc: 0.79221	valid_0's binary_logloss: 0.383291
[9]	valid_0's auc: 0.793956	valid_0's binary_logloss: 0.379851
[10]	valid_0's auc: 0.795762	valid_0's binary_logloss: 0.377004
[11]	valid_0's auc: 0.797256	valid_0's binary_logloss: 0.374117
[12]	valid_0's auc: 0.798762	valid_0's binary_logloss: 0.371673
[13]	valid_0's auc: 0.799829	valid_0's binary_logloss: 0.369623
[14]	valid_0's auc: 0.800952	valid_0's binary_logloss: 0.367658
[15]	valid_0's auc: 0.802123	valid_0's binary_logloss: 0.3

[129]	valid_0's auc: 0.828533	valid_0's binary_logloss: 0.336331
[130]	valid_0's auc: 0.828574	valid_0's binary_logloss: 0.336286
[131]	valid_0's auc: 0.828627	valid_0's binary_logloss: 0.336235
[132]	valid_0's auc: 0.828685	valid_0's binary_logloss: 0.336175
[133]	valid_0's auc: 0.828767	valid_0's binary_logloss: 0.336114
[134]	valid_0's auc: 0.828835	valid_0's binary_logloss: 0.336061
[135]	valid_0's auc: 0.828889	valid_0's binary_logloss: 0.336019
[136]	valid_0's auc: 0.828917	valid_0's binary_logloss: 0.33599
[137]	valid_0's auc: 0.828973	valid_0's binary_logloss: 0.335952
[138]	valid_0's auc: 0.829	valid_0's binary_logloss: 0.33593
[139]	valid_0's auc: 0.82906	valid_0's binary_logloss: 0.335884
[140]	valid_0's auc: 0.829072	valid_0's binary_logloss: 0.335872
[141]	valid_0's auc: 0.829121	valid_0's binary_logloss: 0.33582
[142]	valid_0's auc: 0.829164	valid_0's binary_logloss: 0.335785
[143]	valid_0's auc: 0.829192	valid_0's binary_logloss: 0.335764
[144]	valid_0's auc: 0.829242	va

[256]	valid_0's auc: 0.831475	valid_0's binary_logloss: 0.333847
[257]	valid_0's auc: 0.831489	valid_0's binary_logloss: 0.333837
[258]	valid_0's auc: 0.831492	valid_0's binary_logloss: 0.333834
[259]	valid_0's auc: 0.831496	valid_0's binary_logloss: 0.333831
[260]	valid_0's auc: 0.831503	valid_0's binary_logloss: 0.333828
[261]	valid_0's auc: 0.831511	valid_0's binary_logloss: 0.333818
[262]	valid_0's auc: 0.831533	valid_0's binary_logloss: 0.333797
[263]	valid_0's auc: 0.831539	valid_0's binary_logloss: 0.333792
[264]	valid_0's auc: 0.831556	valid_0's binary_logloss: 0.333777
[265]	valid_0's auc: 0.831593	valid_0's binary_logloss: 0.333752
[266]	valid_0's auc: 0.831635	valid_0's binary_logloss: 0.333723
[267]	valid_0's auc: 0.83165	valid_0's binary_logloss: 0.33371
[268]	valid_0's auc: 0.831656	valid_0's binary_logloss: 0.333706
[269]	valid_0's auc: 0.83167	valid_0's binary_logloss: 0.333695
[270]	valid_0's auc: 0.831672	valid_0's binary_logloss: 0.333695
[271]	valid_0's auc: 0.83168

[383]	valid_0's auc: 0.832828	valid_0's binary_logloss: 0.332704
[384]	valid_0's auc: 0.832846	valid_0's binary_logloss: 0.332689
[385]	valid_0's auc: 0.832853	valid_0's binary_logloss: 0.332684
[386]	valid_0's auc: 0.832863	valid_0's binary_logloss: 0.332673
[387]	valid_0's auc: 0.83286	valid_0's binary_logloss: 0.332675
[388]	valid_0's auc: 0.832897	valid_0's binary_logloss: 0.33264
[389]	valid_0's auc: 0.832903	valid_0's binary_logloss: 0.332637
[390]	valid_0's auc: 0.832908	valid_0's binary_logloss: 0.332631
[391]	valid_0's auc: 0.832921	valid_0's binary_logloss: 0.332621
[392]	valid_0's auc: 0.832929	valid_0's binary_logloss: 0.332616
[393]	valid_0's auc: 0.832937	valid_0's binary_logloss: 0.33261
[394]	valid_0's auc: 0.832937	valid_0's binary_logloss: 0.332609
[395]	valid_0's auc: 0.832939	valid_0's binary_logloss: 0.332607
[396]	valid_0's auc: 0.832959	valid_0's binary_logloss: 0.332592
[397]	valid_0's auc: 0.832958	valid_0's binary_logloss: 0.332594
[398]	valid_0's auc: 0.83295

[510]	valid_0's auc: 0.834011	valid_0's binary_logloss: 0.331665
[511]	valid_0's auc: 0.834016	valid_0's binary_logloss: 0.331661
[512]	valid_0's auc: 0.834022	valid_0's binary_logloss: 0.331658
[513]	valid_0's auc: 0.834023	valid_0's binary_logloss: 0.331658
[514]	valid_0's auc: 0.834028	valid_0's binary_logloss: 0.331654
[515]	valid_0's auc: 0.834029	valid_0's binary_logloss: 0.331652
[516]	valid_0's auc: 0.834031	valid_0's binary_logloss: 0.331652
[517]	valid_0's auc: 0.834032	valid_0's binary_logloss: 0.33165
[518]	valid_0's auc: 0.83404	valid_0's binary_logloss: 0.331641
[519]	valid_0's auc: 0.834055	valid_0's binary_logloss: 0.331628
[520]	valid_0's auc: 0.834055	valid_0's binary_logloss: 0.331627
[521]	valid_0's auc: 0.834058	valid_0's binary_logloss: 0.331625
[522]	valid_0's auc: 0.834068	valid_0's binary_logloss: 0.331618
[523]	valid_0's auc: 0.834069	valid_0's binary_logloss: 0.331617
[524]	valid_0's auc: 0.834069	valid_0's binary_logloss: 0.331616
[525]	valid_0's auc: 0.8340

[637]	valid_0's auc: 0.834726	valid_0's binary_logloss: 0.331076
[638]	valid_0's auc: 0.834733	valid_0's binary_logloss: 0.331072
[639]	valid_0's auc: 0.834731	valid_0's binary_logloss: 0.331073
[640]	valid_0's auc: 0.83475	valid_0's binary_logloss: 0.331059
[641]	valid_0's auc: 0.834764	valid_0's binary_logloss: 0.331049
[642]	valid_0's auc: 0.834765	valid_0's binary_logloss: 0.331048
[643]	valid_0's auc: 0.834766	valid_0's binary_logloss: 0.331048
[644]	valid_0's auc: 0.834776	valid_0's binary_logloss: 0.331039
[645]	valid_0's auc: 0.834825	valid_0's binary_logloss: 0.330996
[646]	valid_0's auc: 0.834828	valid_0's binary_logloss: 0.330994
[647]	valid_0's auc: 0.83484	valid_0's binary_logloss: 0.330983
[648]	valid_0's auc: 0.834843	valid_0's binary_logloss: 0.33098
[649]	valid_0's auc: 0.834847	valid_0's binary_logloss: 0.330977
[650]	valid_0's auc: 0.83485	valid_0's binary_logloss: 0.330975
[651]	valid_0's auc: 0.834855	valid_0's binary_logloss: 0.330971
[652]	valid_0's auc: 0.834854

[764]	valid_0's auc: 0.835575	valid_0's binary_logloss: 0.33037
[765]	valid_0's auc: 0.835577	valid_0's binary_logloss: 0.330368
[766]	valid_0's auc: 0.8356	valid_0's binary_logloss: 0.330351
[767]	valid_0's auc: 0.835599	valid_0's binary_logloss: 0.330352
[768]	valid_0's auc: 0.835608	valid_0's binary_logloss: 0.330342
[769]	valid_0's auc: 0.835607	valid_0's binary_logloss: 0.330343
[770]	valid_0's auc: 0.835609	valid_0's binary_logloss: 0.330341
[771]	valid_0's auc: 0.835622	valid_0's binary_logloss: 0.330329
[772]	valid_0's auc: 0.835624	valid_0's binary_logloss: 0.330329
[773]	valid_0's auc: 0.835634	valid_0's binary_logloss: 0.33032
[774]	valid_0's auc: 0.835633	valid_0's binary_logloss: 0.330321
[775]	valid_0's auc: 0.835635	valid_0's binary_logloss: 0.33032
[776]	valid_0's auc: 0.835635	valid_0's binary_logloss: 0.33032
[777]	valid_0's auc: 0.835637	valid_0's binary_logloss: 0.330318
[778]	valid_0's auc: 0.835639	valid_0's binary_logloss: 0.330315
[779]	valid_0's auc: 0.835639	v

[891]	valid_0's auc: 0.836107	valid_0's binary_logloss: 0.32991
[892]	valid_0's auc: 0.836112	valid_0's binary_logloss: 0.329908
[893]	valid_0's auc: 0.836116	valid_0's binary_logloss: 0.329904
[894]	valid_0's auc: 0.836121	valid_0's binary_logloss: 0.3299
[895]	valid_0's auc: 0.836122	valid_0's binary_logloss: 0.3299
[896]	valid_0's auc: 0.836129	valid_0's binary_logloss: 0.329892
[897]	valid_0's auc: 0.836135	valid_0's binary_logloss: 0.329885
[898]	valid_0's auc: 0.836131	valid_0's binary_logloss: 0.329887
[899]	valid_0's auc: 0.836139	valid_0's binary_logloss: 0.329882
[900]	valid_0's auc: 0.836141	valid_0's binary_logloss: 0.329881
[901]	valid_0's auc: 0.836142	valid_0's binary_logloss: 0.329882
[902]	valid_0's auc: 0.836141	valid_0's binary_logloss: 0.329883
[903]	valid_0's auc: 0.836143	valid_0's binary_logloss: 0.32988
[904]	valid_0's auc: 0.836142	valid_0's binary_logloss: 0.32988
[905]	valid_0's auc: 0.83614	valid_0's binary_logloss: 0.329882
[906]	valid_0's auc: 0.836143	val

[1018]	valid_0's auc: 0.836639	valid_0's binary_logloss: 0.329454
[1019]	valid_0's auc: 0.836652	valid_0's binary_logloss: 0.329444
[1020]	valid_0's auc: 0.836661	valid_0's binary_logloss: 0.329436
[1021]	valid_0's auc: 0.836683	valid_0's binary_logloss: 0.32942
[1022]	valid_0's auc: 0.836686	valid_0's binary_logloss: 0.329418
[1023]	valid_0's auc: 0.836685	valid_0's binary_logloss: 0.329418
[1024]	valid_0's auc: 0.8367	valid_0's binary_logloss: 0.329403
[1025]	valid_0's auc: 0.8367	valid_0's binary_logloss: 0.329402
[1026]	valid_0's auc: 0.83671	valid_0's binary_logloss: 0.329393
[1027]	valid_0's auc: 0.836709	valid_0's binary_logloss: 0.329394
[1028]	valid_0's auc: 0.836711	valid_0's binary_logloss: 0.329393
[1029]	valid_0's auc: 0.836717	valid_0's binary_logloss: 0.329388
[1030]	valid_0's auc: 0.836732	valid_0's binary_logloss: 0.329377
[1031]	valid_0's auc: 0.836737	valid_0's binary_logloss: 0.329372
[1032]	valid_0's auc: 0.836741	valid_0's binary_logloss: 0.329367
[1033]	valid_0's

[1143]	valid_0's auc: 0.837128	valid_0's binary_logloss: 0.329029
[1144]	valid_0's auc: 0.837129	valid_0's binary_logloss: 0.329028
[1145]	valid_0's auc: 0.837129	valid_0's binary_logloss: 0.329027
[1146]	valid_0's auc: 0.837128	valid_0's binary_logloss: 0.329029
[1147]	valid_0's auc: 0.837128	valid_0's binary_logloss: 0.329029
[1148]	valid_0's auc: 0.837129	valid_0's binary_logloss: 0.329027
[1149]	valid_0's auc: 0.837131	valid_0's binary_logloss: 0.329025
[1150]	valid_0's auc: 0.837136	valid_0's binary_logloss: 0.329022
[1151]	valid_0's auc: 0.837152	valid_0's binary_logloss: 0.32901
[1152]	valid_0's auc: 0.837155	valid_0's binary_logloss: 0.329006
[1153]	valid_0's auc: 0.837159	valid_0's binary_logloss: 0.329003
[1154]	valid_0's auc: 0.837167	valid_0's binary_logloss: 0.328997
[1155]	valid_0's auc: 0.837169	valid_0's binary_logloss: 0.328996
[1156]	valid_0's auc: 0.83718	valid_0's binary_logloss: 0.32899
[1157]	valid_0's auc: 0.837187	valid_0's binary_logloss: 0.328982
[1158]	valid_

[1268]	valid_0's auc: 0.8375	valid_0's binary_logloss: 0.328713
[1269]	valid_0's auc: 0.837508	valid_0's binary_logloss: 0.328708
[1270]	valid_0's auc: 0.837519	valid_0's binary_logloss: 0.328698
[1271]	valid_0's auc: 0.837519	valid_0's binary_logloss: 0.328697
[1272]	valid_0's auc: 0.837519	valid_0's binary_logloss: 0.328698
[1273]	valid_0's auc: 0.837519	valid_0's binary_logloss: 0.328698
[1274]	valid_0's auc: 0.837515	valid_0's binary_logloss: 0.3287
[1275]	valid_0's auc: 0.837524	valid_0's binary_logloss: 0.328693
[1276]	valid_0's auc: 0.837533	valid_0's binary_logloss: 0.328685
[1277]	valid_0's auc: 0.837538	valid_0's binary_logloss: 0.328681
[1278]	valid_0's auc: 0.83754	valid_0's binary_logloss: 0.328679
[1279]	valid_0's auc: 0.837542	valid_0's binary_logloss: 0.328677
[1280]	valid_0's auc: 0.837543	valid_0's binary_logloss: 0.328676
[1281]	valid_0's auc: 0.837545	valid_0's binary_logloss: 0.328675
[1282]	valid_0's auc: 0.837542	valid_0's binary_logloss: 0.328677
[1283]	valid_0'

[1393]	valid_0's auc: 0.837679	valid_0's binary_logloss: 0.328558
[1394]	valid_0's auc: 0.837681	valid_0's binary_logloss: 0.328556
[1395]	valid_0's auc: 0.83769	valid_0's binary_logloss: 0.32855
[1396]	valid_0's auc: 0.837697	valid_0's binary_logloss: 0.328545
[1397]	valid_0's auc: 0.837697	valid_0's binary_logloss: 0.328545
[1398]	valid_0's auc: 0.837701	valid_0's binary_logloss: 0.328542
[1399]	valid_0's auc: 0.837713	valid_0's binary_logloss: 0.328533
[1400]	valid_0's auc: 0.837716	valid_0's binary_logloss: 0.328531
[1401]	valid_0's auc: 0.837711	valid_0's binary_logloss: 0.328533
[1402]	valid_0's auc: 0.837719	valid_0's binary_logloss: 0.328526
[1403]	valid_0's auc: 0.837726	valid_0's binary_logloss: 0.328522
[1404]	valid_0's auc: 0.837725	valid_0's binary_logloss: 0.328523
[1405]	valid_0's auc: 0.837732	valid_0's binary_logloss: 0.328518
[1406]	valid_0's auc: 0.837742	valid_0's binary_logloss: 0.328505
[1407]	valid_0's auc: 0.837746	valid_0's binary_logloss: 0.328502
[1408]	valid

[1518]	valid_0's auc: 0.83802	valid_0's binary_logloss: 0.328293
[1519]	valid_0's auc: 0.838019	valid_0's binary_logloss: 0.328294
[1520]	valid_0's auc: 0.838017	valid_0's binary_logloss: 0.328295
[1521]	valid_0's auc: 0.838028	valid_0's binary_logloss: 0.328286
[1522]	valid_0's auc: 0.838028	valid_0's binary_logloss: 0.328286
[1523]	valid_0's auc: 0.838032	valid_0's binary_logloss: 0.328282
[1524]	valid_0's auc: 0.838033	valid_0's binary_logloss: 0.328281
[1525]	valid_0's auc: 0.838032	valid_0's binary_logloss: 0.328282
[1526]	valid_0's auc: 0.83804	valid_0's binary_logloss: 0.328273
[1527]	valid_0's auc: 0.838038	valid_0's binary_logloss: 0.328274
[1528]	valid_0's auc: 0.838038	valid_0's binary_logloss: 0.328274
[1529]	valid_0's auc: 0.838035	valid_0's binary_logloss: 0.328276
[1530]	valid_0's auc: 0.838036	valid_0's binary_logloss: 0.328277
[1531]	valid_0's auc: 0.838035	valid_0's binary_logloss: 0.328278
[1532]	valid_0's auc: 0.838035	valid_0's binary_logloss: 0.328278
[1533]	valid

[1643]	valid_0's auc: 0.838285	valid_0's binary_logloss: 0.328067
[1644]	valid_0's auc: 0.838285	valid_0's binary_logloss: 0.328067
[1645]	valid_0's auc: 0.838285	valid_0's binary_logloss: 0.328067
[1646]	valid_0's auc: 0.838289	valid_0's binary_logloss: 0.328065
[1647]	valid_0's auc: 0.83829	valid_0's binary_logloss: 0.328065
[1648]	valid_0's auc: 0.838292	valid_0's binary_logloss: 0.328063
[1649]	valid_0's auc: 0.838294	valid_0's binary_logloss: 0.328062
[1650]	valid_0's auc: 0.838295	valid_0's binary_logloss: 0.328061
[1651]	valid_0's auc: 0.838294	valid_0's binary_logloss: 0.328062
[1652]	valid_0's auc: 0.838296	valid_0's binary_logloss: 0.328059
[1653]	valid_0's auc: 0.838297	valid_0's binary_logloss: 0.328058
[1654]	valid_0's auc: 0.838293	valid_0's binary_logloss: 0.328061
[1655]	valid_0's auc: 0.838295	valid_0's binary_logloss: 0.328058
[1656]	valid_0's auc: 0.838296	valid_0's binary_logloss: 0.328057
[1657]	valid_0's auc: 0.838299	valid_0's binary_logloss: 0.328054
[1658]	vali

[1768]	valid_0's auc: 0.838481	valid_0's binary_logloss: 0.327905
[1769]	valid_0's auc: 0.838481	valid_0's binary_logloss: 0.327905
[1770]	valid_0's auc: 0.838478	valid_0's binary_logloss: 0.327907
[1771]	valid_0's auc: 0.838476	valid_0's binary_logloss: 0.327909
[1772]	valid_0's auc: 0.838477	valid_0's binary_logloss: 0.327908
[1773]	valid_0's auc: 0.838479	valid_0's binary_logloss: 0.327907
[1774]	valid_0's auc: 0.838481	valid_0's binary_logloss: 0.327906
[1775]	valid_0's auc: 0.83849	valid_0's binary_logloss: 0.327899
[1776]	valid_0's auc: 0.838486	valid_0's binary_logloss: 0.327901
[1777]	valid_0's auc: 0.838485	valid_0's binary_logloss: 0.327902
[1778]	valid_0's auc: 0.838484	valid_0's binary_logloss: 0.327903
[1779]	valid_0's auc: 0.838485	valid_0's binary_logloss: 0.327902
[1780]	valid_0's auc: 0.838487	valid_0's binary_logloss: 0.327901
[1781]	valid_0's auc: 0.838487	valid_0's binary_logloss: 0.327899
[1782]	valid_0's auc: 0.838487	valid_0's binary_logloss: 0.327899
[1783]	vali

[1893]	valid_0's auc: 0.838631	valid_0's binary_logloss: 0.327785
[1894]	valid_0's auc: 0.83863	valid_0's binary_logloss: 0.327785
[1895]	valid_0's auc: 0.838631	valid_0's binary_logloss: 0.327785
[1896]	valid_0's auc: 0.838636	valid_0's binary_logloss: 0.327781
[1897]	valid_0's auc: 0.838636	valid_0's binary_logloss: 0.327779
[1898]	valid_0's auc: 0.838642	valid_0's binary_logloss: 0.327774
[1899]	valid_0's auc: 0.838647	valid_0's binary_logloss: 0.32777
[1900]	valid_0's auc: 0.838648	valid_0's binary_logloss: 0.32777
[1901]	valid_0's auc: 0.838652	valid_0's binary_logloss: 0.327767
[1902]	valid_0's auc: 0.838656	valid_0's binary_logloss: 0.327765
[1903]	valid_0's auc: 0.83866	valid_0's binary_logloss: 0.327759
[1904]	valid_0's auc: 0.838659	valid_0's binary_logloss: 0.327761
[1905]	valid_0's auc: 0.838661	valid_0's binary_logloss: 0.327757
[1906]	valid_0's auc: 0.838659	valid_0's binary_logloss: 0.327759
[1907]	valid_0's auc: 0.838657	valid_0's binary_logloss: 0.32776
[1908]	valid_0'

In [26]:
sub.to_csv('./result/2000_0.838811.txt', index=None, header=None, sep='\t')

In [38]:
fi = pd.DataFrame({'feature': feature_cols, 'imp': model_lgb.feature_importances_})
fi['rate'] = fi['imp'] / fi['imp'].sum()

In [39]:
fi

Unnamed: 0,feature,imp,rate
0,freq,428,0.007133
1,gender,342,0.005700
2,hour,2436,0.040600
3,q_ans_count,326,0.005433
4,q_diff_qa_days_max,464,0.007733
...,...,...,...
121,uf_c2_count,384,0.006400
122,uf_c3_count,1194,0.019900
123,uf_c4_count,1183,0.019717
124,uf_c5_count,0,0.000000


In [43]:
fi.sort_values(by='rate', ascending=False)[:20]

Unnamed: 0,feature,imp,rate
54,score,3315,0.05525
117,qid_enc_count,3030,0.0505
2,hour,2436,0.0406
53,qid_enc,2256,0.0376
115,uid_enc,2239,0.037317
116,uid_enc_count,1985,0.033083
66,u_inv_mean,1619,0.026983
65,u_inv_count,1527,0.02545
57,u_diff_qa_days_mean,1429,0.023817
67,u_inv_std,1367,0.022783
