In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging
import pickle

In [2]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [3]:
base_path = './data'
feature_path = './feature'

In [4]:
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']

del train['dt']
logging.info("train %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
sub = test.copy()
sub_size = len(sub)

del test['dt']
logging.info("test %s", test.shape)

[2019-12-10 13:55:24,237] INFO in <ipython-input-4-c899d16ad23a>: train (9489162, 3)
[2019-12-10 13:55:25,717] INFO in <ipython-input-4-c899d16ad23a>: test (1141683, 2)


In [5]:
# 加载 ans kfold feature
all_col = ['day', 'hour', 'q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count', 
           'u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count', 'q_ans_kfold_count',
           'u_ans_kfold_count', 'q_is_good_sum', 'q_is_good_max', 'q_is_good_mean', 'u_is_good_sum',
           'u_is_good_max', 'u_is_good_mean', 'q_is_rec_sum', 'q_is_rec_max', 'q_is_rec_mean', 'u_is_rec_sum',
           'u_is_rec_max', 'u_is_rec_mean', 'q_is_dest_sum', 'q_is_dest_max', 'q_is_dest_mean', 
           'u_is_dest_sum', 'u_is_dest_max', 'u_is_dest_mean', 'q_has_img_sum', 'q_has_img_max', 
           'q_has_img_mean', 'u_has_img_sum', 'u_has_img_max', 'u_has_img_mean', 'q_has_video_sum', 
           'q_has_video_max', 'q_has_video_mean', 'u_has_video_sum', 'u_has_video_max', 'u_has_video_mean',
           'q_word_count_sum', 'q_word_count_max', 'q_word_count_mean', 'u_word_count_sum', 'u_word_count_max',
           'u_word_count_mean', 'q_reci_cheer_sum', 'q_reci_cheer_max', 'q_reci_cheer_mean', 'u_reci_cheer_sum',
           'u_reci_cheer_max', 'u_reci_cheer_mean', 'q_reci_uncheer_sum', 'q_reci_uncheer_max', 
           'q_reci_uncheer_mean', 'u_reci_uncheer_sum', 'u_reci_uncheer_max', 'u_reci_uncheer_mean', 
           'q_reci_comment_sum', 'q_reci_comment_max', 'q_reci_comment_mean', 'u_reci_comment_sum', 
           'u_reci_comment_max', 'u_reci_comment_mean', 'q_reci_mark_sum', 'q_reci_mark_max', 
           'q_reci_mark_mean', 'u_reci_mark_sum', 'u_reci_mark_max', 'u_reci_mark_mean', 'q_reci_tks_sum',
           'q_reci_tks_max', 'q_reci_tks_mean', 'u_reci_tks_sum', 'u_reci_tks_max', 'u_reci_tks_mean',
           'q_reci_xxx_sum', 'q_reci_xxx_max', 'q_reci_xxx_mean', 'u_reci_xxx_sum', 'u_reci_xxx_max', 
           'u_reci_xxx_mean', 'q_reci_no_help_sum', 'q_reci_no_help_max', 'q_reci_no_help_mean', 
           'u_reci_no_help_sum', 'u_reci_no_help_max', 'u_reci_no_help_mean', 'q_reci_dis_sum', 
           'q_reci_dis_max', 'q_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_dis_max', 'u_reci_dis_mean', 
           'q_diff_qa_days_sum', 'q_diff_qa_days_max', 'q_diff_qa_days_mean', 'u_diff_qa_days_sum', 
           'u_diff_qa_days_max', 'u_diff_qa_days_mean']
drop_col = ['u_is_rec_mean', 'u_reci_uncheer_mean', 'q_is_dest_sum', 'u_reci_uncheer_sum', 'u_is_rec_max', 
             'u_is_dest_mean','q_reci_uncheer_mean', 'q_reci_uncheer_sum', 'u_is_dest_sum', 'q_is_dest_max',
             'q_reci_uncheer_max', 'u_reci_tks_max', 'q_reci_mark_max','u_reci_dis_max', 'q_has_video_mean',
             'q_reci_no_help_mean', 'count_u_topic', 'u_has_video_mean', 'q_reci_dis_sum', 'q_reci_mark_sum',
             'q_reci_tks_sum','q_reci_tks_max','q_reci_dis_max','u_reci_mark_max','q_is_good_mean',
             'q_reci_no_help_sum', 'q_reci_xxx_max', 'u_reci_xxx_max','u_reci_no_help_sum','u_reci_xxx_sum',
              'u_is_good_mean','q_reci_no_help_max','u_has_img_max','u_is_good_sum','u_reci_no_help_max',
              'u_has_video_sum','uf_b5','q_reci_xxx_sum','q_is_good_sum','q_has_img_max','q_has_video_sum',
              'q_has_video_max','u_has_video_max','q_is_good_max','q_is_rec_max','u_is_good_max',
              'q_is_dest_mean','u_reci_uncheer_max','uf_c5_count','u_is_dest_max','q_is_rec_mean',
              'q_is_rec_sum','u_is_rec_sum', 'q_reci_xxx_mean','u_reci_xxx_mean','u_reci_comment_max',
              'q_reci_comment_sum','u_reci_cheer_max','u_reci_dis_sum','u_reci_tks_sum','q_has_img_sum',
              'q_reci_comment_max','q_reci_cheer_max','u_reci_no_help_mean','u_has_img_sum','u_reci_mark_sum']
use_col = list(set(all_col) - set(drop_col))

t1 = pd.read_csv(f'{feature_path}/train_kfold_feature.txt', sep='\t', usecols=use_col)
train = pd.concat([train, t1], axis=1)
logging.info("train %s", train.shape)


t1 = pd.read_csv(f'{feature_path}/test_kfold_feature.txt', sep='\t', usecols=use_col)
test = pd.concat([test, t1], axis=1)
logging.info("test %s", test.shape)

[2019-12-10 13:56:31,518] INFO in <ipython-input-5-082dff1845e2>: train (9489162, 42)
[2019-12-10 13:56:38,673] INFO in <ipython-input-5-082dff1845e2>: test (1141683, 41)


In [6]:
train['week'] = train['day']%7
test['week'] = test['day']%7

In [7]:
# 加载 invete feature 1
t1 = pd.read_csv(f'{feature_path}/train_invite_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)
logging.info("train %s", train.shape)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)
logging.info("test %s", test.shape)

[2019-12-10 13:57:00,745] INFO in <ipython-input-7-25a44eae3c1c>: train (9489162, 61)
[2019-12-10 13:57:03,345] INFO in <ipython-input-7-25a44eae3c1c>: test (1141683, 60)


In [8]:
# 加载 invete feature 2
t1 = pd.read_csv(f'{feature_path}/train_invite_feature_2.txt', sep='\t')
train = pd.concat([train, t1], axis=1)
logging.info("train %s", train.shape)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature_2.txt', sep='\t')
test = pd.concat([test, t1], axis=1)
logging.info("test %s", test.shape)

[2019-12-10 13:57:37,373] INFO in <ipython-input-8-898ae5923687>: train (9489162, 89)
[2019-12-10 13:57:41,051] INFO in <ipython-input-8-898ae5923687>: test (1141683, 88)


In [9]:
# 加载 kfold topic feature, QU
t1 = pd.read_csv(f'{feature_path}/train_kfold_topic_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)
logging.info("train %s", train.shape)

t1 = pd.read_csv(f'{feature_path}/test_kfold_topic_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)
logging.info("test %s", test.shape)

[2019-12-10 13:58:19,941] INFO in <ipython-input-9-dff71a8a2332>: train (9489162, 113)
[2019-12-10 13:58:24,332] INFO in <ipython-input-9-dff71a8a2332>: test (1141683, 112)


In [10]:
# 加载 user kfold topic feature，UU
t1 = pd.read_csv(f'{feature_path}/train_kfold_ut_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)
logging.info("train %s", train.shape)

t1 = pd.read_csv(f'{feature_path}/test_kfold_ut_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)
logging.info("test %s", test.shape)

[2019-12-10 13:58:52,385] INFO in <ipython-input-10-35a09705a8ca>: train (9489162, 129)
[2019-12-10 13:58:55,741] INFO in <ipython-input-10-35a09705a8ca>: test (1141683, 128)


In [11]:
# 加载 kfold label 特征
# t1 = pickle.load(open(f'{feature_path}/train_kfold_label_feature.pkl', 'rb'))
# train = pd.concat([train, t1], axis=1)
# logging.info("train %s", train.shape)

# t1 = pickle.load(open(f'{feature_path}/test_kfold_label_feature.pkl', 'rb'))
# test = pd.concat([test, t1], axis=1)
# logging.info("test %s", test.shape)

In [12]:
# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq', 'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',  'score', 'follow_topic', 'inter_topic']

del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)
    
logging.info('encoding qid...')    
q_lb = LabelEncoder()
q_lb.fit(list(train['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train['qid_enc'] = q_lb.transform(train['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])
logging.info('add qid_enc')

logging.info('encoding uid...')
u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train['uid_enc'] = u_lb.transform(train['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])
logging.info('add uid_enc')

# merge user
train = pd.merge(train, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train.shape, test.shape)

[2019-12-10 13:59:05,144] INFO in <ipython-input-12-9ca2dee00e76>: user (1931654, 14)
[2019-12-10 13:59:08,738] INFO in <ipython-input-12-9ca2dee00e76>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-12-10 13:59:08,745] INFO in <ipython-input-12-9ca2dee00e76>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-10 13:59:09,607] INFO in <ipython-input-12-9ca2dee00e76>: encode gender
[2019-12-10 13:59:10,522] INFO in <ipython-input-12-9ca2dee00e76>: encode freq
[2019-12-10 13:59:11,349] INFO in <ipython-input-12-9ca2dee00e76>: encode uf_c1
[2019-12-10 13:59:12,148] INFO in <ipython-input-12-9ca2dee00e76>: encode uf_c2
[2019-12-10 13:59:12,934] INFO in <ipython-input-12-9ca2dee00e76>: encode uf_c3
[2019-12-10 13:59:13,706] INFO in

In [13]:
data = pd.concat((train, test), axis=0, sort=True)
len_train = len(train)
del train

In [14]:
# feat_counts_save = []

# other_feats = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
# for fj in other_feats:
#     feat_counts_save.append(fj+'_count')

# fi = 'qid_enc'
# other_feats = ['freq', 'gender', 'score', 'uid_enc', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'hour', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'day']
# for fj in other_feats:
#     feat_counts_save.append(fi+'_'+fj+'_count')
# fi = 'uid_enc'
# other_feats = ['hour', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'day']
# for fj in other_feats:
#     feat_counts_save.append(fi+'_'+fj+'_count')
# fi = 'day'
# other_feats = ['freq', 'gender', 'score', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'hour']
# for fj in other_feats:
#     feat_counts_save.append(fi+'_'+fj+'_count')

# print(feat_counts_save)

# # data[feat_counts_save].to_csv(f'{feature_path}/count_features.txt', index=None, header=None, sep='\t')
# # 加载 invete feature 2
# t1 = pd.read_csv(f'{feature_path}/count_features.txt', sep='\t', header=None)
# t1.columns = feat_counts_save
# print(len(t1), len(data))
# t1 = t1.reset_index(drop= True)
# data = data.reset_index(drop= True)
# data = pd.concat([data, t1], axis=1)

In [15]:
# count 特征
count_feat = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_feat:
    logging.info('counting %s', feat)
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

[2019-12-10 14:00:29,609] INFO in <ipython-input-15-1e9177a7eb97>: counting uid_enc
[2019-12-10 14:00:45,997] INFO in <ipython-input-15-1e9177a7eb97>: counting qid_enc
[2019-12-10 14:01:01,215] INFO in <ipython-input-15-1e9177a7eb97>: counting gender
[2019-12-10 14:01:14,385] INFO in <ipython-input-15-1e9177a7eb97>: counting freq
[2019-12-10 14:01:27,586] INFO in <ipython-input-15-1e9177a7eb97>: counting uf_c1
[2019-12-10 14:01:41,479] INFO in <ipython-input-15-1e9177a7eb97>: counting uf_c2
[2019-12-10 14:01:55,003] INFO in <ipython-input-15-1e9177a7eb97>: counting uf_c3
[2019-12-10 14:02:08,769] INFO in <ipython-input-15-1e9177a7eb97>: counting uf_c4
[2019-12-10 14:02:22,901] INFO in <ipython-input-15-1e9177a7eb97>: counting uf_c5


In [32]:
drop_feat = ['label', 'uid', 'qid', 'dt']
# drop_feat += ['u_is_rec_mean', 'u_reci_uncheer_mean', 'q_is_dest_sum', 'u_reci_uncheer_sum', 'u_is_rec_max', 
#              'u_is_dest_mean','q_reci_uncheer_mean', 'q_reci_uncheer_sum', 'u_is_dest_sum', 'q_is_dest_max',
#              'q_reci_uncheer_max', 'u_reci_tks_max', 'q_reci_mark_max','u_reci_dis_max', 'q_has_video_mean',
#              'q_reci_no_help_mean', 'count_u_topic', 'u_has_video_mean', 'q_reci_dis_sum', 'q_reci_mark_sum',
#              'q_reci_tks_sum','q_reci_tks_max','q_reci_dis_max','u_reci_mark_max','q_is_good_mean',
#              'q_reci_no_help_sum', 'q_reci_xxx_max', 'u_reci_xxx_max','u_reci_no_help_sum','u_reci_xxx_sum',
#               'u_is_good_mean','q_reci_no_help_max','u_has_img_max','u_is_good_sum','u_reci_no_help_max',
#               'u_has_video_sum','uf_b5','q_reci_xxx_sum','q_is_good_sum','q_has_img_max','q_has_video_sum',
#               'q_has_video_max','u_has_video_max','q_is_good_max','q_is_rec_max','u_is_good_max',
#               'q_is_dest_mean','u_reci_uncheer_max','uf_c5_count','u_is_dest_max','q_is_rec_mean',
#               'q_is_rec_sum','u_is_rec_sum', 'q_reci_xxx_mean','u_reci_xxx_mean','u_reci_comment_max',
#               'q_reci_comment_sum','u_reci_cheer_max','u_reci_dis_sum','u_reci_tks_sum','q_has_img_sum',
#               'q_reci_comment_max','q_reci_cheer_max','u_reci_no_help_mean','u_has_img_sum','u_reci_mark_sum']
# drop_feat += ['q_is_good_sum', 'q_is_good_max', 'q_is_good_mean', 'u_is_good_sum', 'u_is_good_max', 
#               'u_is_good_mean', 'q_is_rec_sum', 'q_is_rec_max', 'q_is_rec_mean', 'u_is_rec_sum', 
#               'u_is_rec_max', 'u_is_rec_mean', 'q_is_dest_sum', 'q_is_dest_max', 'q_is_dest_mean', 
#               'u_is_dest_sum', 'u_is_dest_max', 'u_is_dest_mean', 'q_has_img_sum', 'q_has_img_max', 
#               'q_has_img_mean', 'u_has_img_sum', 'u_has_img_max', 'u_has_img_mean', 'q_has_video_sum', 
#               'q_has_video_max', 'q_has_video_mean', 'u_has_video_sum', 'u_has_video_max', 'u_has_video_mean', 
#               'q_word_count_sum', 'q_word_count_max', 'q_word_count_mean', 'u_word_count_sum', 
#               'u_word_count_max', 'u_word_count_mean', 'q_reci_cheer_sum', 'q_reci_cheer_max', 
#               'q_reci_cheer_mean', 'u_reci_cheer_sum', 'u_reci_cheer_max', 'u_reci_cheer_mean', 
#               'q_reci_uncheer_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'u_reci_uncheer_sum',
#               'u_reci_uncheer_max', 'u_reci_uncheer_mean', 'q_reci_comment_sum', 'q_reci_comment_max', 
#               'q_reci_comment_mean', 'u_reci_comment_sum', 'u_reci_comment_max', 'u_reci_comment_mean', 
#               'q_reci_mark_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'u_reci_mark_sum', 'u_reci_mark_max', 
#               'u_reci_mark_mean', 'q_reci_tks_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 'u_reci_tks_sum', 
#               'u_reci_tks_max', 'u_reci_tks_mean', 'q_reci_xxx_sum', 'q_reci_xxx_max', 'q_reci_xxx_mean', 
#               'u_reci_xxx_sum', 'u_reci_xxx_max', 'u_reci_xxx_mean', 'q_reci_no_help_sum', 'q_reci_no_help_max',
#               'q_reci_no_help_mean', 'u_reci_no_help_sum', 'u_reci_no_help_max', 'u_reci_no_help_mean', 
#               'q_reci_dis_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_dis_max', 
#               'u_reci_dis_mean']

feature_with_day = [x for x in data.columns if x not in drop_feat]
feature_cols = [x for x in data.columns if x not in drop_feat+['day']]
# feature_cols

In [35]:
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len_train][feature_with_day]
y_train_all = data.iloc[:len_train]['label']
X_test = data.iloc[len_train:]
assert len(X_test) == sub_size

# fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
#     break

# X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
#                                  y_train_all.iloc[train_idx], \
#                                  y_train_all.iloc[val_idx]
# del X_train_all

X_train = X_train_all.loc[X_train_all['day']<3867, feature_cols]
X_val = X_train_all.loc[X_train_all['day']==3867, feature_cols]
y_train = y_train_all[X_train_all['day']<3867]
y_val = y_train_all[X_train_all['day']==3867]
del X_train_all

logging.info("train shape %s, val shape %s, test shape %s", X_train.shape, X_val.shape, X_test.shape)

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)



[2019-12-10 15:52:59,016] INFO in <ipython-input-35-cdf4c334029b>: feature size 149
[2019-12-10 15:55:05,740] INFO in <ipython-input-35-cdf4c334029b>: train shape (9141216, 149), val shape (347946, 149), test shape (1141683, 153)


[1]	valid_0's auc: 0.730113	valid_0's binary_logloss: 0.422614
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.743115	valid_0's binary_logloss: 0.414125
[3]	valid_0's auc: 0.761552	valid_0's binary_logloss: 0.406219
[4]	valid_0's auc: 0.766067	valid_0's binary_logloss: 0.399879
[5]	valid_0's auc: 0.770836	valid_0's binary_logloss: 0.394369
[6]	valid_0's auc: 0.778932	valid_0's binary_logloss: 0.389271
[7]	valid_0's auc: 0.783106	valid_0's binary_logloss: 0.384788
[8]	valid_0's auc: 0.788171	valid_0's binary_logloss: 0.380597
[9]	valid_0's auc: 0.789766	valid_0's binary_logloss: 0.377112
[10]	valid_0's auc: 0.792357	valid_0's binary_logloss: 0.373507
[11]	valid_0's auc: 0.795494	valid_0's binary_logloss: 0.370226
[12]	valid_0's auc: 0.797779	valid_0's binary_logloss: 0.367684
[13]	valid_0's auc: 0.7989	valid_0's binary_logloss: 0.365353
[14]	valid_0's auc: 0.800048	valid_0's binary_logloss: 0.363391
[15]	valid_0's auc: 0.800678	valid_0's binary_logloss:

[129]	valid_0's auc: 0.837848	valid_0's binary_logloss: 0.325985
[130]	valid_0's auc: 0.837973	valid_0's binary_logloss: 0.325907
[131]	valid_0's auc: 0.838064	valid_0's binary_logloss: 0.325822
[132]	valid_0's auc: 0.837967	valid_0's binary_logloss: 0.325831
[133]	valid_0's auc: 0.838004	valid_0's binary_logloss: 0.325782
[134]	valid_0's auc: 0.838139	valid_0's binary_logloss: 0.325676
[135]	valid_0's auc: 0.838194	valid_0's binary_logloss: 0.325634
[136]	valid_0's auc: 0.838293	valid_0's binary_logloss: 0.325552
[137]	valid_0's auc: 0.838354	valid_0's binary_logloss: 0.325501
[138]	valid_0's auc: 0.838369	valid_0's binary_logloss: 0.325461
[139]	valid_0's auc: 0.838346	valid_0's binary_logloss: 0.325464
[140]	valid_0's auc: 0.83845	valid_0's binary_logloss: 0.3254
[141]	valid_0's auc: 0.838486	valid_0's binary_logloss: 0.325364
[142]	valid_0's auc: 0.838607	valid_0's binary_logloss: 0.325229
[143]	valid_0's auc: 0.838654	valid_0's binary_logloss: 0.325162
[144]	valid_0's auc: 0.83870

[256]	valid_0's auc: 0.843033	valid_0's binary_logloss: 0.321968
[257]	valid_0's auc: 0.843068	valid_0's binary_logloss: 0.321935
[258]	valid_0's auc: 0.843091	valid_0's binary_logloss: 0.32192
[259]	valid_0's auc: 0.843066	valid_0's binary_logloss: 0.321917
[260]	valid_0's auc: 0.843076	valid_0's binary_logloss: 0.321907
[261]	valid_0's auc: 0.843115	valid_0's binary_logloss: 0.321876
[262]	valid_0's auc: 0.84313	valid_0's binary_logloss: 0.321862
[263]	valid_0's auc: 0.843072	valid_0's binary_logloss: 0.322016
[264]	valid_0's auc: 0.8431	valid_0's binary_logloss: 0.32199
[265]	valid_0's auc: 0.84318	valid_0's binary_logloss: 0.321915
[266]	valid_0's auc: 0.843215	valid_0's binary_logloss: 0.321898
[267]	valid_0's auc: 0.843255	valid_0's binary_logloss: 0.321872
[268]	valid_0's auc: 0.843274	valid_0's binary_logloss: 0.321858
[269]	valid_0's auc: 0.843193	valid_0's binary_logloss: 0.32204
[270]	valid_0's auc: 0.843219	valid_0's binary_logloss: 0.322034
[271]	valid_0's auc: 0.843154	va

[383]	valid_0's auc: 0.845361	valid_0's binary_logloss: 0.320202
[384]	valid_0's auc: 0.845378	valid_0's binary_logloss: 0.320192
[385]	valid_0's auc: 0.845383	valid_0's binary_logloss: 0.320193
[386]	valid_0's auc: 0.845402	valid_0's binary_logloss: 0.320173
[387]	valid_0's auc: 0.845416	valid_0's binary_logloss: 0.320145
[388]	valid_0's auc: 0.845443	valid_0's binary_logloss: 0.320126
[389]	valid_0's auc: 0.845433	valid_0's binary_logloss: 0.320132
[390]	valid_0's auc: 0.845463	valid_0's binary_logloss: 0.320097
[391]	valid_0's auc: 0.845481	valid_0's binary_logloss: 0.320094
[392]	valid_0's auc: 0.845487	valid_0's binary_logloss: 0.320089
[393]	valid_0's auc: 0.845418	valid_0's binary_logloss: 0.320213
[394]	valid_0's auc: 0.845419	valid_0's binary_logloss: 0.320237
[395]	valid_0's auc: 0.845428	valid_0's binary_logloss: 0.320236
[396]	valid_0's auc: 0.845481	valid_0's binary_logloss: 0.320185
[397]	valid_0's auc: 0.845546	valid_0's binary_logloss: 0.320156
[398]	valid_0's auc: 0.84

[510]	valid_0's auc: 0.847032	valid_0's binary_logloss: 0.319007
[511]	valid_0's auc: 0.847041	valid_0's binary_logloss: 0.319
[512]	valid_0's auc: 0.847034	valid_0's binary_logloss: 0.319004
[513]	valid_0's auc: 0.847044	valid_0's binary_logloss: 0.318991
[514]	valid_0's auc: 0.847052	valid_0's binary_logloss: 0.318982
[515]	valid_0's auc: 0.847065	valid_0's binary_logloss: 0.318972
[516]	valid_0's auc: 0.847082	valid_0's binary_logloss: 0.318962
[517]	valid_0's auc: 0.847161	valid_0's binary_logloss: 0.318896
[518]	valid_0's auc: 0.847171	valid_0's binary_logloss: 0.318889
[519]	valid_0's auc: 0.847202	valid_0's binary_logloss: 0.318877
[520]	valid_0's auc: 0.847222	valid_0's binary_logloss: 0.318858
[521]	valid_0's auc: 0.847249	valid_0's binary_logloss: 0.318837
[522]	valid_0's auc: 0.847254	valid_0's binary_logloss: 0.318839
[523]	valid_0's auc: 0.847262	valid_0's binary_logloss: 0.318833
[524]	valid_0's auc: 0.847268	valid_0's binary_logloss: 0.318823
[525]	valid_0's auc: 0.84726

[637]	valid_0's auc: 0.848057	valid_0's binary_logloss: 0.318135
[638]	valid_0's auc: 0.848058	valid_0's binary_logloss: 0.318131
[639]	valid_0's auc: 0.848051	valid_0's binary_logloss: 0.31814
[640]	valid_0's auc: 0.848086	valid_0's binary_logloss: 0.318104
[641]	valid_0's auc: 0.848081	valid_0's binary_logloss: 0.318109
[642]	valid_0's auc: 0.848081	valid_0's binary_logloss: 0.318103
[643]	valid_0's auc: 0.84811	valid_0's binary_logloss: 0.318088
[644]	valid_0's auc: 0.848183	valid_0's binary_logloss: 0.318022
[645]	valid_0's auc: 0.848184	valid_0's binary_logloss: 0.318022
[646]	valid_0's auc: 0.848188	valid_0's binary_logloss: 0.318018
[647]	valid_0's auc: 0.848206	valid_0's binary_logloss: 0.318002
[648]	valid_0's auc: 0.848207	valid_0's binary_logloss: 0.318007
[649]	valid_0's auc: 0.848221	valid_0's binary_logloss: 0.318
[650]	valid_0's auc: 0.848225	valid_0's binary_logloss: 0.317995
[651]	valid_0's auc: 0.848241	valid_0's binary_logloss: 0.317982
[652]	valid_0's auc: 0.848251	

[764]	valid_0's auc: 0.849087	valid_0's binary_logloss: 0.31728
[765]	valid_0's auc: 0.849112	valid_0's binary_logloss: 0.31725
[766]	valid_0's auc: 0.849116	valid_0's binary_logloss: 0.31725
[767]	valid_0's auc: 0.849129	valid_0's binary_logloss: 0.317239
[768]	valid_0's auc: 0.849131	valid_0's binary_logloss: 0.31724
[769]	valid_0's auc: 0.849132	valid_0's binary_logloss: 0.317237
[770]	valid_0's auc: 0.849141	valid_0's binary_logloss: 0.317232
[771]	valid_0's auc: 0.849149	valid_0's binary_logloss: 0.317223
[772]	valid_0's auc: 0.849149	valid_0's binary_logloss: 0.317224
[773]	valid_0's auc: 0.849154	valid_0's binary_logloss: 0.317216
[774]	valid_0's auc: 0.849178	valid_0's binary_logloss: 0.317194
[775]	valid_0's auc: 0.849204	valid_0's binary_logloss: 0.317174
[776]	valid_0's auc: 0.849202	valid_0's binary_logloss: 0.317175
[777]	valid_0's auc: 0.849214	valid_0's binary_logloss: 0.317161
[778]	valid_0's auc: 0.849222	valid_0's binary_logloss: 0.317155
[779]	valid_0's auc: 0.849238

[891]	valid_0's auc: 0.849716	valid_0's binary_logloss: 0.316703
[892]	valid_0's auc: 0.849717	valid_0's binary_logloss: 0.316702
[893]	valid_0's auc: 0.849719	valid_0's binary_logloss: 0.3167
[894]	valid_0's auc: 0.849723	valid_0's binary_logloss: 0.316695
[895]	valid_0's auc: 0.849733	valid_0's binary_logloss: 0.31669
[896]	valid_0's auc: 0.849722	valid_0's binary_logloss: 0.316731
[897]	valid_0's auc: 0.849726	valid_0's binary_logloss: 0.316726
[898]	valid_0's auc: 0.849725	valid_0's binary_logloss: 0.316727
[899]	valid_0's auc: 0.849728	valid_0's binary_logloss: 0.316727
[900]	valid_0's auc: 0.849741	valid_0's binary_logloss: 0.316716
[901]	valid_0's auc: 0.849745	valid_0's binary_logloss: 0.316712
[902]	valid_0's auc: 0.849747	valid_0's binary_logloss: 0.316709
[903]	valid_0's auc: 0.849748	valid_0's binary_logloss: 0.316712
[904]	valid_0's auc: 0.849764	valid_0's binary_logloss: 0.316701
[905]	valid_0's auc: 0.849766	valid_0's binary_logloss: 0.316693
[906]	valid_0's auc: 0.84975

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1000,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [19]:
pickle.dump(model_lgb, open('./model/model.pkl', 'wb'))

In [20]:
sub['label'] = model_lgb.predict_proba(X_test[feature_cols])[:, 1]
sub.to_csv('./result/2000.txt', index=None, header=None, sep='\t')

In [21]:
fi = pd.DataFrame({'feature': feature_cols, 'imp': model_lgb.feature_importances_})
fi['rate'] = fi['imp'] / fi['imp'].sum()
fi_sorted = fi.sort_values(by='rate', ascending=False)

In [22]:
pickle.dump(fi_sorted, open('./fi/fi.pkl', 'wb'))

In [23]:
# import pickle
# pickle.dump(fi_sorted, open('./feature_importance.pkl', 'wb'))

In [24]:
fi_sorted[:60]

Unnamed: 0,feature,imp,rate
77,score,2586,0.0431
2,diff_iq_hour,2263,0.037717
140,uid_enc_count,1523,0.025383
59,qid_hour_count,1490,0.024833
5,hour,1488,0.0248
98,u_inv_kfold_mean,1273,0.021217
50,qid_day_count,1095,0.01825
97,u_inv_kfold_count,1083,0.01805
134,uid_hour_std,1070,0.017833
122,uid_diff_day_daymean,1059,0.01765


In [25]:
test['score_label_mean'].value_counts()

KeyError: 'score_label_mean'