In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [2]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [3]:
base_path = './data'
feature_path = './feature'

In [4]:
# 加载邀请回答数据
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
del train['dt']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
sub = test.copy()
sub_size = len(sub)
del test['dt']
logging.info("test %s", test.shape)



[2019-11-28 15:50:04,314] INFO in <ipython-input-4-f1f9bce9582b>: invite (9489162, 3)
[2019-11-28 15:50:05,991] INFO in <ipython-input-4-f1f9bce9582b>: test (1141683, 2)


In [5]:
# 加载 kfold feature
t1 = pd.read_csv(f'{feature_path}/train_kfold_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [6]:
# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq',
                'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 
                'score', 'follow_topic', 'inter_topic']

del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)
    
q_lb = LabelEncoder()
q_lb.fit(list(train['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train['qid_enc'] = q_lb.transform(train['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train['uid_enc'] = u_lb.transform(train['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# merge user
train = pd.merge(train, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train.shape, test.shape)

[2019-11-28 15:52:23,550] INFO in <ipython-input-6-af285af762e4>: user (1931654, 14)
[2019-11-28 15:52:26,946] INFO in <ipython-input-6-af285af762e4>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-11-28 15:52:26,952] INFO in <ipython-input-6-af285af762e4>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-11-28 15:52:27,812] INFO in <ipython-input-6-af285af762e4>: encode gender
[2019-11-28 15:52:28,660] INFO in <ipython-input-6-af285af762e4>: encode freq
[2019-11-28 15:52:29,488] INFO in <ipython-input-6-af285af762e4>: encode uf_c1
[2019-11-28 15:52:30,299] INFO in <ipython-input-6-af285af762e4>: encode uf_c2
[2019-11-28 15:52:31,113] INFO in <ipython-input-6-af285af762e4>: encode uf_c3
[2019-11-28 15:52:31,919] INFO in <ipytho

In [11]:
train.columns

Index(['qid', 'uid', 'label', 'day', 'hour', 'q_inv_kfold_mean',
       'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count',
       'u_inv_kfold_mean',
       ...
       'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4',
       'uf_c5', 'score'],
      dtype='object', length=120)

In [12]:
test.columns

Index(['qid', 'uid', 'day', 'hour', 'q_inv_kfold_mean', 'q_inv_kfold_sum',
       'q_inv_kfold_std', 'q_inv_kfold_count', 'u_inv_kfold_mean',
       'u_inv_kfold_sum',
       ...
       'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4',
       'uf_c5', 'score'],
      dtype='object', length=119)

In [7]:
data = pd.concat((train, test), axis=0, sort=True)

In [8]:
# count编码
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [15]:
data[-20:]

Unnamed: 0,day,freq,gender,hour,label,q_ans_kfold_count,q_diff_qa_days_max,q_diff_qa_days_mean,q_diff_qa_days_sum,q_has_img_max,...,uid_enc_count,qid_enc_count,gender_count,freq_count,uf_c1_count,uf_c2_count,uf_c3_count,uf_c4_count,uf_c5_count,wk
1141663,3871,1,3,8,,,,,,,...,7.9e-05,0.0,1.0,1.0,1.0,1.0,0.028967,1.0,1.0,0
1141664,3872,1,2,18,,,,,,,...,0.000237,2.4e-05,0.027183,1.0,1.0,1.0,0.441988,0.129433,1.0,1
1141665,3872,2,1,15,,,,,,,...,5.7e-05,5.9e-05,0.0,0.24599,1.0,1.0,0.432284,0.005611,1.0,1
1141666,3873,1,3,21,,,,,,,...,3.1e-05,1.2e-05,1.0,1.0,0.042962,0.372573,0.069194,1.0,1.0,2
1141667,3868,1,3,14,,,,,,,...,5.3e-05,6e-06,1.0,1.0,1.0,1.0,1.0,0.071494,1.0,4
1141668,3870,3,3,14,,1.0,0.0,0.0,0.0,0.0,...,0.0,0.000349,1.0,0.023868,1.0,1.0,0.020596,1.0,1.0,6
1141669,3869,5,3,8,,15.0,5.0,3.533333,53.0,1.0,...,1.0,0.002174,1.0,0.934679,0.05124,0.372573,0.612471,0.179357,1.0,5
1141670,3870,1,3,22,,,,,,,...,6.6e-05,1.0,1.0,1.0,1.0,1.0,1.0,0.071494,1.0,6
1141671,3870,2,2,16,,10.0,13.0,8.0,80.0,1.0,...,0.0,0.000195,0.027183,0.24599,1.0,1.0,0.187605,0.009302,1.0,6
1141672,3869,1,2,8,,,,,,,...,3.5e-05,2.4e-05,0.027183,1.0,0.057592,0.372573,0.128859,0.011044,1.0,5


In [27]:
print(list(data.columns))

['day', 'freq', 'gender', 'hour', 'label', 'q_ans_kfold_count', 'q_diff_qa_days_max', 'q_diff_qa_days_mean', 'q_diff_qa_days_sum', 'q_has_img_max', 'q_has_img_mean', 'q_has_img_sum', 'q_has_video_max', 'q_has_video_mean', 'q_has_video_sum', 'q_inv_kfold_count', 'q_inv_kfold_mean', 'q_inv_kfold_std', 'q_inv_kfold_sum', 'q_is_dest_max', 'q_is_dest_mean', 'q_is_dest_sum', 'q_is_good_max', 'q_is_good_mean', 'q_is_good_sum', 'q_is_rec_max', 'q_is_rec_mean', 'q_is_rec_sum', 'q_reci_cheer_max', 'q_reci_cheer_mean', 'q_reci_cheer_sum', 'q_reci_comment_max', 'q_reci_comment_mean', 'q_reci_comment_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 'q_reci_dis_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'q_reci_mark_sum', 'q_reci_no_help_max', 'q_reci_no_help_mean', 'q_reci_no_help_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 'q_reci_tks_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'q_reci_uncheer_sum', 'q_reci_xxx_max', 'q_reci_xxx_mean', 'q_reci_xxx_sum', 'q_word_count_max', 'q_word_count_mean', '

In [9]:
data['wk'] = data['day'] % 7
# 选特征
# feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

In [29]:
drop_feat = ('label', 'uid', 'qid', 'dt', 'day') 
# drop_feat += ('q_ans_kfold_count', 'q_diff_qa_days_max', 'q_diff_qa_days_mean', 'q_diff_qa_days_sum', 
#               'q_has_img_max', 'q_has_img_mean', 'q_has_img_sum', 'q_has_video_max', 'q_has_video_mean', 
#               'q_has_video_sum','q_is_dest_max', 'q_is_dest_mean', 'q_is_dest_sum', 'q_is_good_max', 
#               'q_is_good_mean', 'q_is_good_sum', 'q_is_rec_max', 'q_is_rec_mean', 'q_is_rec_sum', 
#               'q_reci_cheer_max', 'q_reci_cheer_mean', 'q_reci_cheer_sum', 'q_reci_comment_max', 
#               'q_reci_comment_mean', 'q_reci_comment_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 
#               'q_reci_dis_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'q_reci_mark_sum', 'q_reci_no_help_max',
#               'q_reci_no_help_mean', 'q_reci_no_help_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 
#               'q_reci_tks_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'q_reci_uncheer_sum', 
#               'q_reci_xxx_max', 'q_reci_xxx_mean', 'q_reci_xxx_sum', 'q_word_count_max', 'q_word_count_mean', 
#               'q_word_count_sum')
# drop_feat += ('u_ans_kfold_count', 'u_diff_qa_days_max', 'u_diff_qa_days_mean', 'u_diff_qa_days_sum', 
#               'u_has_img_max', 'u_has_img_mean', 'u_has_img_sum', 'u_has_video_max', 'u_has_video_mean', 
#               'u_has_video_sum', 'u_is_dest_max', 'u_is_dest_mean', 'u_is_dest_sum', 'u_is_good_max', 
#               'u_is_good_mean', 'u_is_good_sum', 'u_is_rec_max', 'u_is_rec_mean', 'u_is_rec_sum', 'u_reci_cheer_max', 
#               'u_reci_cheer_mean', 'u_reci_cheer_sum', 'u_reci_comment_max', 'u_reci_comment_mean',
#               'u_reci_comment_sum', 'u_reci_dis_max', 'u_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_mark_max', 
#               'u_reci_mark_mean', 'u_reci_mark_sum', 'u_reci_no_help_max', 'u_reci_no_help_mean', 
#               'u_reci_no_help_sum', 'u_reci_tks_max', 'u_reci_tks_mean', 'u_reci_tks_sum', 'u_reci_uncheer_max', 
#               'u_reci_uncheer_mean', 'u_reci_uncheer_sum', 'u_reci_xxx_max', 'u_reci_xxx_mean', 'u_reci_xxx_sum', 
#               'u_word_count_max', 'u_word_count_mean', 'u_word_count_sum')
feature_cols = [x for x in data.columns if x not in drop_feat]
# feature_cols

['freq',
 'gender',
 'hour',
 'q_inv_kfold_count',
 'q_inv_kfold_mean',
 'q_inv_kfold_std',
 'q_inv_kfold_sum',
 'qid_enc',
 'score',
 'u_inv_kfold_count',
 'u_inv_kfold_mean',
 'u_inv_kfold_std',
 'u_inv_kfold_sum',
 'uf_b1',
 'uf_b2',
 'uf_b3',
 'uf_b4',
 'uf_b5',
 'uf_c1',
 'uf_c2',
 'uf_c3',
 'uf_c4',
 'uf_c5',
 'uid_enc',
 'uid_enc_count',
 'qid_enc_count',
 'gender_count',
 'freq_count',
 'uf_c1_count',
 'uf_c2_count',
 'uf_c3_count',
 'uf_c4_count',
 'uf_c5_count',
 'wk']

In [30]:
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len(train)][feature_cols]
y_train_all = data.iloc[:len(train)]['label']
X_test = data.iloc[len(train):]
assert len(X_test) == sub_size

logging.info("train shape %s, test shape %s", train.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)



[2019-11-29 01:46:18,044] INFO in <ipython-input-30-20ad4e7d53e4>: feature size 34
[2019-11-29 01:46:38,937] INFO in <ipython-input-30-20ad4e7d53e4>: train shape (9489162, 120), test shape (1141683, 119)


[1]	valid_0's auc: 0.777536	valid_0's binary_logloss: 0.451265
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.784692	valid_0's binary_logloss: 0.439019
[3]	valid_0's auc: 0.789297	valid_0's binary_logloss: 0.429074
[4]	valid_0's auc: 0.7918	valid_0's binary_logloss: 0.421178
[5]	valid_0's auc: 0.79395	valid_0's binary_logloss: 0.414455
[6]	valid_0's auc: 0.795128	valid_0's binary_logloss: 0.408824
[7]	valid_0's auc: 0.796642	valid_0's binary_logloss: 0.404102
[8]	valid_0's auc: 0.79781	valid_0's binary_logloss: 0.400014
[9]	valid_0's auc: 0.799027	valid_0's binary_logloss: 0.396441
[10]	valid_0's auc: 0.799628	valid_0's binary_logloss: 0.393478
[11]	valid_0's auc: 0.800115	valid_0's binary_logloss: 0.390918
[12]	valid_0's auc: 0.801427	valid_0's binary_logloss: 0.388504
[13]	valid_0's auc: 0.802045	valid_0's binary_logloss: 0.386516
[14]	valid_0's auc: 0.802836	valid_0's binary_logloss: 0.384707
[15]	valid_0's auc: 0.803337	valid_0's binary_logloss: 0

[129]	valid_0's auc: 0.817858	valid_0's binary_logloss: 0.362921
[130]	valid_0's auc: 0.817875	valid_0's binary_logloss: 0.362904
[131]	valid_0's auc: 0.817909	valid_0's binary_logloss: 0.362871
[132]	valid_0's auc: 0.817949	valid_0's binary_logloss: 0.362836
[133]	valid_0's auc: 0.817996	valid_0's binary_logloss: 0.362798
[134]	valid_0's auc: 0.818009	valid_0's binary_logloss: 0.362782
[135]	valid_0's auc: 0.818023	valid_0's binary_logloss: 0.362765
[136]	valid_0's auc: 0.818034	valid_0's binary_logloss: 0.362754
[137]	valid_0's auc: 0.818044	valid_0's binary_logloss: 0.362747
[138]	valid_0's auc: 0.818061	valid_0's binary_logloss: 0.362733
[139]	valid_0's auc: 0.818079	valid_0's binary_logloss: 0.362719
[140]	valid_0's auc: 0.818089	valid_0's binary_logloss: 0.362707
[141]	valid_0's auc: 0.818121	valid_0's binary_logloss: 0.36268
[142]	valid_0's auc: 0.818157	valid_0's binary_logloss: 0.362648
[143]	valid_0's auc: 0.818194	valid_0's binary_logloss: 0.36262
[144]	valid_0's auc: 0.8182

[256]	valid_0's auc: 0.819557	valid_0's binary_logloss: 0.361396
[257]	valid_0's auc: 0.819557	valid_0's binary_logloss: 0.361396
[258]	valid_0's auc: 0.819559	valid_0's binary_logloss: 0.361395
[259]	valid_0's auc: 0.819567	valid_0's binary_logloss: 0.361388
[260]	valid_0's auc: 0.819586	valid_0's binary_logloss: 0.361374
[261]	valid_0's auc: 0.81959	valid_0's binary_logloss: 0.36137
[262]	valid_0's auc: 0.81959	valid_0's binary_logloss: 0.36137
[263]	valid_0's auc: 0.819593	valid_0's binary_logloss: 0.361367
[264]	valid_0's auc: 0.819595	valid_0's binary_logloss: 0.361366
[265]	valid_0's auc: 0.819609	valid_0's binary_logloss: 0.361355
[266]	valid_0's auc: 0.819633	valid_0's binary_logloss: 0.361337
[267]	valid_0's auc: 0.81964	valid_0's binary_logloss: 0.361332
[268]	valid_0's auc: 0.819646	valid_0's binary_logloss: 0.361327
[269]	valid_0's auc: 0.819663	valid_0's binary_logloss: 0.361315
[270]	valid_0's auc: 0.819668	valid_0's binary_logloss: 0.36131
[271]	valid_0's auc: 0.819669	v

[383]	valid_0's auc: 0.820336	valid_0's binary_logloss: 0.360727
[384]	valid_0's auc: 0.820343	valid_0's binary_logloss: 0.360721
[385]	valid_0's auc: 0.820358	valid_0's binary_logloss: 0.360711
[386]	valid_0's auc: 0.820364	valid_0's binary_logloss: 0.360706
[387]	valid_0's auc: 0.820374	valid_0's binary_logloss: 0.360698
[388]	valid_0's auc: 0.82038	valid_0's binary_logloss: 0.360691
[389]	valid_0's auc: 0.820383	valid_0's binary_logloss: 0.360689
[390]	valid_0's auc: 0.820385	valid_0's binary_logloss: 0.360686
[391]	valid_0's auc: 0.820385	valid_0's binary_logloss: 0.360686
[392]	valid_0's auc: 0.820386	valid_0's binary_logloss: 0.360686
[393]	valid_0's auc: 0.820387	valid_0's binary_logloss: 0.360685
[394]	valid_0's auc: 0.820387	valid_0's binary_logloss: 0.360685
[395]	valid_0's auc: 0.820391	valid_0's binary_logloss: 0.360682
[396]	valid_0's auc: 0.820392	valid_0's binary_logloss: 0.360681
[397]	valid_0's auc: 0.820392	valid_0's binary_logloss: 0.360681
[398]	valid_0's auc: 0.820

[510]	valid_0's auc: 0.820746	valid_0's binary_logloss: 0.360375
[511]	valid_0's auc: 0.820746	valid_0's binary_logloss: 0.360374
[512]	valid_0's auc: 0.820748	valid_0's binary_logloss: 0.360371
[513]	valid_0's auc: 0.820748	valid_0's binary_logloss: 0.360371
[514]	valid_0's auc: 0.820752	valid_0's binary_logloss: 0.360368
[515]	valid_0's auc: 0.820752	valid_0's binary_logloss: 0.360368
[516]	valid_0's auc: 0.820752	valid_0's binary_logloss: 0.360367
[517]	valid_0's auc: 0.820753	valid_0's binary_logloss: 0.360367
[518]	valid_0's auc: 0.820752	valid_0's binary_logloss: 0.360368
[519]	valid_0's auc: 0.820752	valid_0's binary_logloss: 0.360367
[520]	valid_0's auc: 0.820753	valid_0's binary_logloss: 0.360367
[521]	valid_0's auc: 0.820752	valid_0's binary_logloss: 0.360367
[522]	valid_0's auc: 0.820753	valid_0's binary_logloss: 0.360367
[523]	valid_0's auc: 0.820753	valid_0's binary_logloss: 0.360367
[524]	valid_0's auc: 0.820764	valid_0's binary_logloss: 0.360359
[525]	valid_0's auc: 0.82

[637]	valid_0's auc: 0.821088	valid_0's binary_logloss: 0.360098
[638]	valid_0's auc: 0.82109	valid_0's binary_logloss: 0.360097
[639]	valid_0's auc: 0.821091	valid_0's binary_logloss: 0.360093
[640]	valid_0's auc: 0.821092	valid_0's binary_logloss: 0.360092
[641]	valid_0's auc: 0.821095	valid_0's binary_logloss: 0.36009
[642]	valid_0's auc: 0.821097	valid_0's binary_logloss: 0.360087
[643]	valid_0's auc: 0.821097	valid_0's binary_logloss: 0.360087
[644]	valid_0's auc: 0.821097	valid_0's binary_logloss: 0.360086
[645]	valid_0's auc: 0.821098	valid_0's binary_logloss: 0.360086
[646]	valid_0's auc: 0.821098	valid_0's binary_logloss: 0.360086
[647]	valid_0's auc: 0.821098	valid_0's binary_logloss: 0.360087
[648]	valid_0's auc: 0.821098	valid_0's binary_logloss: 0.360086
[649]	valid_0's auc: 0.821101	valid_0's binary_logloss: 0.360084
[650]	valid_0's auc: 0.821103	valid_0's binary_logloss: 0.360082
[651]	valid_0's auc: 0.821104	valid_0's binary_logloss: 0.36008
[652]	valid_0's auc: 0.82110

[764]	valid_0's auc: 0.821261	valid_0's binary_logloss: 0.359947
[765]	valid_0's auc: 0.821262	valid_0's binary_logloss: 0.359946
[766]	valid_0's auc: 0.821264	valid_0's binary_logloss: 0.359945
[767]	valid_0's auc: 0.821263	valid_0's binary_logloss: 0.359945
[768]	valid_0's auc: 0.821264	valid_0's binary_logloss: 0.359944
[769]	valid_0's auc: 0.821265	valid_0's binary_logloss: 0.359943
[770]	valid_0's auc: 0.821266	valid_0's binary_logloss: 0.359943
[771]	valid_0's auc: 0.821266	valid_0's binary_logloss: 0.359943
[772]	valid_0's auc: 0.821266	valid_0's binary_logloss: 0.359943
[773]	valid_0's auc: 0.821266	valid_0's binary_logloss: 0.359943
[774]	valid_0's auc: 0.821267	valid_0's binary_logloss: 0.359941
[775]	valid_0's auc: 0.821266	valid_0's binary_logloss: 0.359942
[776]	valid_0's auc: 0.821267	valid_0's binary_logloss: 0.359941
[777]	valid_0's auc: 0.821267	valid_0's binary_logloss: 0.359941
[778]	valid_0's auc: 0.821269	valid_0's binary_logloss: 0.359939
[779]	valid_0's auc: 0.82

[891]	valid_0's auc: 0.821415	valid_0's binary_logloss: 0.359811
[892]	valid_0's auc: 0.821416	valid_0's binary_logloss: 0.35981
[893]	valid_0's auc: 0.821416	valid_0's binary_logloss: 0.35981
[894]	valid_0's auc: 0.821418	valid_0's binary_logloss: 0.359808
[895]	valid_0's auc: 0.82142	valid_0's binary_logloss: 0.359807
[896]	valid_0's auc: 0.82142	valid_0's binary_logloss: 0.359806
[897]	valid_0's auc: 0.82142	valid_0's binary_logloss: 0.359807
[898]	valid_0's auc: 0.82142	valid_0's binary_logloss: 0.359807
[899]	valid_0's auc: 0.821421	valid_0's binary_logloss: 0.359806
[900]	valid_0's auc: 0.821422	valid_0's binary_logloss: 0.359805
[901]	valid_0's auc: 0.821422	valid_0's binary_logloss: 0.359805
[902]	valid_0's auc: 0.821422	valid_0's binary_logloss: 0.359805
[903]	valid_0's auc: 0.821422	valid_0's binary_logloss: 0.359805
[904]	valid_0's auc: 0.821422	valid_0's binary_logloss: 0.359806
[905]	valid_0's auc: 0.821422	valid_0's binary_logloss: 0.359806
[906]	valid_0's auc: 0.821422	v

[1018]	valid_0's auc: 0.821549	valid_0's binary_logloss: 0.359701
[1019]	valid_0's auc: 0.821551	valid_0's binary_logloss: 0.359699
[1020]	valid_0's auc: 0.821551	valid_0's binary_logloss: 0.359699
[1021]	valid_0's auc: 0.821551	valid_0's binary_logloss: 0.359699
[1022]	valid_0's auc: 0.821551	valid_0's binary_logloss: 0.359699
[1023]	valid_0's auc: 0.821551	valid_0's binary_logloss: 0.359699
[1024]	valid_0's auc: 0.821551	valid_0's binary_logloss: 0.359699
[1025]	valid_0's auc: 0.821553	valid_0's binary_logloss: 0.359697
[1026]	valid_0's auc: 0.821553	valid_0's binary_logloss: 0.359697
[1027]	valid_0's auc: 0.821553	valid_0's binary_logloss: 0.359697
[1028]	valid_0's auc: 0.821553	valid_0's binary_logloss: 0.359697
[1029]	valid_0's auc: 0.821555	valid_0's binary_logloss: 0.359696
[1030]	valid_0's auc: 0.821555	valid_0's binary_logloss: 0.359696
[1031]	valid_0's auc: 0.821555	valid_0's binary_logloss: 0.359696
[1032]	valid_0's auc: 0.821556	valid_0's binary_logloss: 0.359695
[1033]	val

[1143]	valid_0's auc: 0.821626	valid_0's binary_logloss: 0.359643
[1144]	valid_0's auc: 0.821628	valid_0's binary_logloss: 0.359641
[1145]	valid_0's auc: 0.821629	valid_0's binary_logloss: 0.359641
[1146]	valid_0's auc: 0.821631	valid_0's binary_logloss: 0.359639
[1147]	valid_0's auc: 0.821632	valid_0's binary_logloss: 0.359638
[1148]	valid_0's auc: 0.821633	valid_0's binary_logloss: 0.359637
[1149]	valid_0's auc: 0.821633	valid_0's binary_logloss: 0.359637
[1150]	valid_0's auc: 0.821634	valid_0's binary_logloss: 0.359636
[1151]	valid_0's auc: 0.821635	valid_0's binary_logloss: 0.359635
[1152]	valid_0's auc: 0.821637	valid_0's binary_logloss: 0.359634
[1153]	valid_0's auc: 0.821637	valid_0's binary_logloss: 0.359634
[1154]	valid_0's auc: 0.821637	valid_0's binary_logloss: 0.359634
[1155]	valid_0's auc: 0.821638	valid_0's binary_logloss: 0.359633
[1156]	valid_0's auc: 0.821638	valid_0's binary_logloss: 0.359633
[1157]	valid_0's auc: 0.821639	valid_0's binary_logloss: 0.359632
[1158]	val

[1268]	valid_0's auc: 0.821725	valid_0's binary_logloss: 0.359559
[1269]	valid_0's auc: 0.821726	valid_0's binary_logloss: 0.359558
[1270]	valid_0's auc: 0.821727	valid_0's binary_logloss: 0.359557
[1271]	valid_0's auc: 0.821728	valid_0's binary_logloss: 0.359556
[1272]	valid_0's auc: 0.821729	valid_0's binary_logloss: 0.359556
[1273]	valid_0's auc: 0.821728	valid_0's binary_logloss: 0.359556
[1274]	valid_0's auc: 0.821728	valid_0's binary_logloss: 0.359556
[1275]	valid_0's auc: 0.821728	valid_0's binary_logloss: 0.359556
[1276]	valid_0's auc: 0.821728	valid_0's binary_logloss: 0.359556
[1277]	valid_0's auc: 0.821729	valid_0's binary_logloss: 0.359555
[1278]	valid_0's auc: 0.82173	valid_0's binary_logloss: 0.359555
[1279]	valid_0's auc: 0.82173	valid_0's binary_logloss: 0.359555
[1280]	valid_0's auc: 0.821732	valid_0's binary_logloss: 0.359553
[1281]	valid_0's auc: 0.821732	valid_0's binary_logloss: 0.359553
[1282]	valid_0's auc: 0.821731	valid_0's binary_logloss: 0.359554
[1283]	valid

[1393]	valid_0's auc: 0.821782	valid_0's binary_logloss: 0.359512
[1394]	valid_0's auc: 0.821781	valid_0's binary_logloss: 0.359512
[1395]	valid_0's auc: 0.821782	valid_0's binary_logloss: 0.359512
[1396]	valid_0's auc: 0.821783	valid_0's binary_logloss: 0.359511
[1397]	valid_0's auc: 0.821782	valid_0's binary_logloss: 0.359511
[1398]	valid_0's auc: 0.821781	valid_0's binary_logloss: 0.359512
[1399]	valid_0's auc: 0.821782	valid_0's binary_logloss: 0.359511
[1400]	valid_0's auc: 0.821783	valid_0's binary_logloss: 0.35951
[1401]	valid_0's auc: 0.821783	valid_0's binary_logloss: 0.35951
[1402]	valid_0's auc: 0.821783	valid_0's binary_logloss: 0.35951
[1403]	valid_0's auc: 0.821784	valid_0's binary_logloss: 0.359509
[1404]	valid_0's auc: 0.821783	valid_0's binary_logloss: 0.35951
[1405]	valid_0's auc: 0.821785	valid_0's binary_logloss: 0.359508
[1406]	valid_0's auc: 0.821785	valid_0's binary_logloss: 0.359508
[1407]	valid_0's auc: 0.821786	valid_0's binary_logloss: 0.359507
[1408]	valid_0

[1518]	valid_0's auc: 0.821839	valid_0's binary_logloss: 0.359463
[1519]	valid_0's auc: 0.821839	valid_0's binary_logloss: 0.359464
[1520]	valid_0's auc: 0.821839	valid_0's binary_logloss: 0.359463
[1521]	valid_0's auc: 0.821839	valid_0's binary_logloss: 0.359463
[1522]	valid_0's auc: 0.82184	valid_0's binary_logloss: 0.359462
[1523]	valid_0's auc: 0.821841	valid_0's binary_logloss: 0.359461
[1524]	valid_0's auc: 0.821843	valid_0's binary_logloss: 0.35946
[1525]	valid_0's auc: 0.821842	valid_0's binary_logloss: 0.35946
[1526]	valid_0's auc: 0.821844	valid_0's binary_logloss: 0.359459
[1527]	valid_0's auc: 0.821849	valid_0's binary_logloss: 0.359455
[1528]	valid_0's auc: 0.821848	valid_0's binary_logloss: 0.359455
[1529]	valid_0's auc: 0.821851	valid_0's binary_logloss: 0.359454
[1530]	valid_0's auc: 0.821851	valid_0's binary_logloss: 0.359454
[1531]	valid_0's auc: 0.821851	valid_0's binary_logloss: 0.359454
[1532]	valid_0's auc: 0.821852	valid_0's binary_logloss: 0.359453
[1533]	valid_

[1643]	valid_0's auc: 0.821904	valid_0's binary_logloss: 0.35941
[1644]	valid_0's auc: 0.821903	valid_0's binary_logloss: 0.35941
[1645]	valid_0's auc: 0.821903	valid_0's binary_logloss: 0.35941
[1646]	valid_0's auc: 0.821903	valid_0's binary_logloss: 0.35941
[1647]	valid_0's auc: 0.821903	valid_0's binary_logloss: 0.359411
[1648]	valid_0's auc: 0.821902	valid_0's binary_logloss: 0.359411
[1649]	valid_0's auc: 0.821902	valid_0's binary_logloss: 0.359412
[1650]	valid_0's auc: 0.821902	valid_0's binary_logloss: 0.359411
[1651]	valid_0's auc: 0.8219	valid_0's binary_logloss: 0.359413
[1652]	valid_0's auc: 0.8219	valid_0's binary_logloss: 0.359413
[1653]	valid_0's auc: 0.8219	valid_0's binary_logloss: 0.359413
[1654]	valid_0's auc: 0.8219	valid_0's binary_logloss: 0.359413
[1655]	valid_0's auc: 0.8219	valid_0's binary_logloss: 0.359412
[1656]	valid_0's auc: 0.821901	valid_0's binary_logloss: 0.359412
[1657]	valid_0's auc: 0.821902	valid_0's binary_logloss: 0.359412
[1658]	valid_0's auc: 0.

[1768]	valid_0's auc: 0.821929	valid_0's binary_logloss: 0.359389
[1769]	valid_0's auc: 0.821929	valid_0's binary_logloss: 0.359389
[1770]	valid_0's auc: 0.821929	valid_0's binary_logloss: 0.35939
[1771]	valid_0's auc: 0.821927	valid_0's binary_logloss: 0.35939
[1772]	valid_0's auc: 0.821927	valid_0's binary_logloss: 0.35939
[1773]	valid_0's auc: 0.821928	valid_0's binary_logloss: 0.359389
[1774]	valid_0's auc: 0.82193	valid_0's binary_logloss: 0.359388
[1775]	valid_0's auc: 0.821929	valid_0's binary_logloss: 0.359388
[1776]	valid_0's auc: 0.821933	valid_0's binary_logloss: 0.359385
[1777]	valid_0's auc: 0.821934	valid_0's binary_logloss: 0.359385
[1778]	valid_0's auc: 0.821934	valid_0's binary_logloss: 0.359385
[1779]	valid_0's auc: 0.821934	valid_0's binary_logloss: 0.359385
[1780]	valid_0's auc: 0.821934	valid_0's binary_logloss: 0.359386
[1781]	valid_0's auc: 0.821936	valid_0's binary_logloss: 0.359382
[1782]	valid_0's auc: 0.821938	valid_0's binary_logloss: 0.359381
[1783]	valid_0

[1893]	valid_0's auc: 0.821981	valid_0's binary_logloss: 0.359341
[1894]	valid_0's auc: 0.821982	valid_0's binary_logloss: 0.359341
[1895]	valid_0's auc: 0.821983	valid_0's binary_logloss: 0.35934
[1896]	valid_0's auc: 0.821982	valid_0's binary_logloss: 0.359341
[1897]	valid_0's auc: 0.821983	valid_0's binary_logloss: 0.35934
[1898]	valid_0's auc: 0.821983	valid_0's binary_logloss: 0.35934
[1899]	valid_0's auc: 0.821983	valid_0's binary_logloss: 0.359341
[1900]	valid_0's auc: 0.821983	valid_0's binary_logloss: 0.35934
[1901]	valid_0's auc: 0.821983	valid_0's binary_logloss: 0.35934
[1902]	valid_0's auc: 0.821984	valid_0's binary_logloss: 0.359339
[1903]	valid_0's auc: 0.821984	valid_0's binary_logloss: 0.359339
[1904]	valid_0's auc: 0.821986	valid_0's binary_logloss: 0.359337
[1905]	valid_0's auc: 0.821987	valid_0's binary_logloss: 0.359337
[1906]	valid_0's auc: 0.821987	valid_0's binary_logloss: 0.359337
[1907]	valid_0's auc: 0.821987	valid_0's binary_logloss: 0.359337
[1908]	valid_0'

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1000,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [31]:
sub = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
sub.columns = ['qid', 'uid', 'dt']
sub['label'] = model_lgb.predict_proba(X_test[feature_cols])[:, 1]

In [32]:
sub[:20]

Unnamed: 0,qid,uid,dt,label
0,Q1493039281,M64135255,D3870-H9,0.111413
1,Q2023398782,M2536956560,D3872-H22,0.007898
2,Q4151338694,M3294926344,D3874-H15,0.165985
3,Q3271436624,M3744310794,D3873-H4,0.442629
4,Q3314287018,M1349051752,D3872-H19,0.049092
5,Q4214103875,M2007129506,D3871-H13,0.175161
6,Q1421177878,M3927950819,D3873-H14,0.214217
7,Q3598252818,M2871943120,D3873-H9,0.564212
8,Q568518135,M998566127,D3872-H18,0.856189
9,Q2242868437,M1307039867,D3874-H11,0.286607


In [24]:
sub.to_csv('./result/kfold_2000.txt', index=None, header=None, sep='\t')

In [25]:
fi = pd.DataFrame({'feature': feature_cols, 'imp': model_lgb.feature_importances_})
fi['rate'] = fi['imp'] / fi['imp'].sum()
fi

Unnamed: 0,feature,imp,rate
0,freq,418,0.007702
1,gender,306,0.005638
2,hour,2393,0.044094
3,q_ans_kfold_count,725,0.013359
4,q_diff_qa_days_max,1326,0.024433
...,...,...,...
121,uf_c2_count,256,0.004717
122,uf_c3_count,605,0.011148
123,uf_c4_count,594,0.010945
124,uf_c5_count,0,0.000000


In [26]:
fi.sort_values(by='rate', ascending=False)[:60]

Unnamed: 0,feature,imp,rate
54,score,3241,0.05972
2,hour,2393,0.044094
14,q_inv_kfold_mean,1856,0.034199
116,uid_enc_count,1626,0.029961
66,u_inv_kfold_mean,1589,0.02928
13,q_inv_kfold_count,1492,0.027492
5,q_diff_qa_days_mean,1417,0.02611
4,q_diff_qa_days_max,1326,0.024433
65,u_inv_kfold_count,1308,0.024102
117,qid_enc_count,1279,0.023567
