In [44]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [45]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [46]:
base_path = './data'
feature_path = './feature'

In [47]:
# 加载邀请回答数据
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
del train['dt']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
sub = test.copy()
sub_size = len(sub)
del test['dt']
logging.info("test %s", test.shape)



[2019-11-29 13:55:44,937] INFO in <ipython-input-47-f5fcce844f4e>: invite (9489162, 3)
[2019-11-29 13:55:46,416] INFO in <ipython-input-47-f5fcce844f4e>: test (1141683, 2)


In [48]:
# 加载 kfold feature
t1 = pd.read_csv(f'{feature_path}/train_kfold_feature_4.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_feature_2.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [49]:
# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq',
                'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 
                'score', 'follow_topic', 'inter_topic']

del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)
    
q_lb = LabelEncoder()
q_lb.fit(list(train['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train['qid_enc'] = q_lb.transform(train['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train['uid_enc'] = u_lb.transform(train['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# merge user
train = pd.merge(train, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train.shape, test.shape)

[2019-11-29 13:57:44,052] INFO in <ipython-input-49-af285af762e4>: user (1931654, 14)
[2019-11-29 13:57:47,710] INFO in <ipython-input-49-af285af762e4>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-11-29 13:57:47,715] INFO in <ipython-input-49-af285af762e4>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-11-29 13:57:48,617] INFO in <ipython-input-49-af285af762e4>: encode gender
[2019-11-29 13:57:49,470] INFO in <ipython-input-49-af285af762e4>: encode freq
[2019-11-29 13:57:50,303] INFO in <ipython-input-49-af285af762e4>: encode uf_c1
[2019-11-29 13:57:51,137] INFO in <ipython-input-49-af285af762e4>: encode uf_c2
[2019-11-29 13:57:51,941] INFO in <ipython-input-49-af285af762e4>: encode uf_c3
[2019-11-29 13:57:52,723] INFO in

In [50]:
train.columns

Index(['qid', 'uid', 'label', 'day', 'hour', 'q_inv_kfold_mean',
       'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count',
       'u_inv_kfold_mean',
       ...
       'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4',
       'uf_c5', 'score'],
      dtype='object', length=120)

In [51]:
test.columns

Index(['qid', 'uid', 'day', 'hour', 'q_inv_kfold_mean', 'q_inv_kfold_sum',
       'q_inv_kfold_std', 'q_inv_kfold_count', 'u_inv_kfold_mean',
       'u_inv_kfold_sum',
       ...
       'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4',
       'uf_c5', 'score'],
      dtype='object', length=119)

In [52]:
data = pd.concat((train, test), axis=0, sort=True)

In [53]:
# count编码
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [54]:
data[-20:]

Unnamed: 0,day,freq,gender,hour,label,q_ans_kfold_count,q_diff_qa_days_max,q_diff_qa_days_mean,q_diff_qa_days_sum,q_has_img_max,...,uid_enc,uid_enc_count,qid_enc_count,gender_count,freq_count,uf_c1_count,uf_c2_count,uf_c3_count,uf_c4_count,uf_c5_count
1141663,3871,1,3,8,,,,,,,...,942361,7.9e-05,0.0,1.0,1.0,1.0,1.0,0.028967,1.0,1.0
1141664,3872,1,2,18,,,,,,,...,561542,0.000237,2.4e-05,0.027183,1.0,1.0,1.0,0.441988,0.129433,1.0
1141665,3872,2,1,15,,,,,,,...,1442840,5.7e-05,5.9e-05,0.0,0.24599,1.0,1.0,0.432284,0.005611,1.0
1141666,3873,1,3,21,,,,,,,...,245886,3.1e-05,1.2e-05,1.0,1.0,0.042962,0.372573,0.069194,1.0,1.0
1141667,3868,1,3,14,,,,,,,...,930792,5.3e-05,6e-06,1.0,1.0,1.0,1.0,1.0,0.071494,1.0
1141668,3870,3,3,14,,1.0,0.0,0.0,0.0,0.0,...,1285244,0.0,0.000349,1.0,0.023868,1.0,1.0,0.020596,1.0,1.0
1141669,3869,5,3,8,,15.0,5.0,3.533333,53.0,1.0,...,0,1.0,0.002174,1.0,0.934679,0.05124,0.372573,0.612471,0.179357,1.0
1141670,3870,1,3,22,,,,,,,...,1185999,6.6e-05,1.0,1.0,1.0,1.0,1.0,1.0,0.071494,1.0
1141671,3870,2,2,16,,10.0,13.0,8.0,80.0,1.0,...,1071387,0.0,0.000195,0.027183,0.24599,1.0,1.0,0.187605,0.009302,1.0
1141672,3869,1,2,8,,,,,,,...,14280,3.5e-05,2.4e-05,0.027183,1.0,0.057592,0.372573,0.128859,0.011044,1.0


In [55]:
print(list(data.columns))

['day', 'freq', 'gender', 'hour', 'label', 'q_ans_kfold_count', 'q_diff_qa_days_max', 'q_diff_qa_days_mean', 'q_diff_qa_days_sum', 'q_has_img_max', 'q_has_img_mean', 'q_has_img_sum', 'q_has_video_max', 'q_has_video_mean', 'q_has_video_sum', 'q_inv_kfold_count', 'q_inv_kfold_mean', 'q_inv_kfold_std', 'q_inv_kfold_sum', 'q_is_dest_max', 'q_is_dest_mean', 'q_is_dest_sum', 'q_is_good_max', 'q_is_good_mean', 'q_is_good_sum', 'q_is_rec_max', 'q_is_rec_mean', 'q_is_rec_sum', 'q_reci_cheer_max', 'q_reci_cheer_mean', 'q_reci_cheer_sum', 'q_reci_comment_max', 'q_reci_comment_mean', 'q_reci_comment_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 'q_reci_dis_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'q_reci_mark_sum', 'q_reci_no_help_max', 'q_reci_no_help_mean', 'q_reci_no_help_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 'q_reci_tks_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'q_reci_uncheer_sum', 'q_reci_xxx_max', 'q_reci_xxx_mean', 'q_reci_xxx_sum', 'q_word_count_max', 'q_word_count_mean', '

In [56]:
data['wk'] = data['day'] % 7
# 选特征
# feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

In [57]:
drop_feat = ('label', 'uid', 'qid', 'dt', 'day') 
# drop_feat += ('q_ans_kfold_count', 'q_diff_qa_days_max', 'q_diff_qa_days_mean', 'q_diff_qa_days_sum', 
#               'q_has_img_max', 'q_has_img_mean', 'q_has_img_sum', 'q_has_video_max', 'q_has_video_mean', 
#               'q_has_video_sum','q_is_dest_max', 'q_is_dest_mean', 'q_is_dest_sum', 'q_is_good_max', 
#               'q_is_good_mean', 'q_is_good_sum', 'q_is_rec_max', 'q_is_rec_mean', 'q_is_rec_sum', 
#               'q_reci_cheer_max', 'q_reci_cheer_mean', 'q_reci_cheer_sum', 'q_reci_comment_max', 
#               'q_reci_comment_mean', 'q_reci_comment_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 
#               'q_reci_dis_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'q_reci_mark_sum', 'q_reci_no_help_max',
#               'q_reci_no_help_mean', 'q_reci_no_help_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 
#               'q_reci_tks_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'q_reci_uncheer_sum', 
#               'q_reci_xxx_max', 'q_reci_xxx_mean', 'q_reci_xxx_sum', 'q_word_count_max', 'q_word_count_mean', 
#               'q_word_count_sum')
# drop_feat += ('u_ans_kfold_count', 'u_diff_qa_days_max', 'u_diff_qa_days_mean', 'u_diff_qa_days_sum', 
#               'u_has_img_max', 'u_has_img_mean', 'u_has_img_sum', 'u_has_video_max', 'u_has_video_mean', 
#               'u_has_video_sum', 'u_is_dest_max', 'u_is_dest_mean', 'u_is_dest_sum', 'u_is_good_max', 
#               'u_is_good_mean', 'u_is_good_sum', 'u_is_rec_max', 'u_is_rec_mean', 'u_is_rec_sum', 'u_reci_cheer_max', 
#               'u_reci_cheer_mean', 'u_reci_cheer_sum', 'u_reci_comment_max', 'u_reci_comment_mean',
#               'u_reci_comment_sum', 'u_reci_dis_max', 'u_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_mark_max', 
#               'u_reci_mark_mean', 'u_reci_mark_sum', 'u_reci_no_help_max', 'u_reci_no_help_mean', 
#               'u_reci_no_help_sum', 'u_reci_tks_max', 'u_reci_tks_mean', 'u_reci_tks_sum', 'u_reci_uncheer_max', 
#               'u_reci_uncheer_mean', 'u_reci_uncheer_sum', 'u_reci_xxx_max', 'u_reci_xxx_mean', 'u_reci_xxx_sum', 
#               'u_word_count_max', 'u_word_count_mean', 'u_word_count_sum')
feature_cols = [x for x in data.columns if x not in drop_feat]
# feature_cols

In [58]:
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len(train)][feature_cols]
y_train_all = data.iloc[:len(train)]['label']
X_test = data.iloc[len(train):]
assert len(X_test) == sub_size

logging.info("train shape %s, test shape %s", train.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)



[2019-11-29 14:00:31,872] INFO in <ipython-input-58-20ad4e7d53e4>: feature size 126
[2019-11-29 14:01:00,777] INFO in <ipython-input-58-20ad4e7d53e4>: train shape (9489162, 120), test shape (1141683, 119)


[1]	valid_0's auc: 0.744286	valid_0's binary_logloss: 0.455363
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.75728	valid_0's binary_logloss: 0.446032
[3]	valid_0's auc: 0.764131	valid_0's binary_logloss: 0.438107
[4]	valid_0's auc: 0.76821	valid_0's binary_logloss: 0.431624
[5]	valid_0's auc: 0.770609	valid_0's binary_logloss: 0.426187
[6]	valid_0's auc: 0.773889	valid_0's binary_logloss: 0.421301
[7]	valid_0's auc: 0.775747	valid_0's binary_logloss: 0.417166
[8]	valid_0's auc: 0.777326	valid_0's binary_logloss: 0.413463
[9]	valid_0's auc: 0.779739	valid_0's binary_logloss: 0.410073
[10]	valid_0's auc: 0.78152	valid_0's binary_logloss: 0.40705
[11]	valid_0's auc: 0.782788	valid_0's binary_logloss: 0.404438
[12]	valid_0's auc: 0.783706	valid_0's binary_logloss: 0.402181
[13]	valid_0's auc: 0.784601	valid_0's binary_logloss: 0.400171
[14]	valid_0's auc: 0.786074	valid_0's binary_logloss: 0.398259
[15]	valid_0's auc: 0.78735	valid_0's binary_logloss: 0.

[129]	valid_0's auc: 0.815798	valid_0's binary_logloss: 0.365236
[130]	valid_0's auc: 0.815895	valid_0's binary_logloss: 0.365152
[131]	valid_0's auc: 0.816004	valid_0's binary_logloss: 0.365063
[132]	valid_0's auc: 0.816076	valid_0's binary_logloss: 0.365011
[133]	valid_0's auc: 0.816125	valid_0's binary_logloss: 0.364973
[134]	valid_0's auc: 0.81619	valid_0's binary_logloss: 0.364905
[135]	valid_0's auc: 0.816248	valid_0's binary_logloss: 0.364863
[136]	valid_0's auc: 0.816333	valid_0's binary_logloss: 0.364801
[137]	valid_0's auc: 0.816387	valid_0's binary_logloss: 0.364755
[138]	valid_0's auc: 0.816467	valid_0's binary_logloss: 0.364696
[139]	valid_0's auc: 0.816504	valid_0's binary_logloss: 0.364661
[140]	valid_0's auc: 0.816573	valid_0's binary_logloss: 0.364604
[141]	valid_0's auc: 0.81663	valid_0's binary_logloss: 0.364561
[142]	valid_0's auc: 0.816677	valid_0's binary_logloss: 0.364523
[143]	valid_0's auc: 0.816737	valid_0's binary_logloss: 0.364475
[144]	valid_0's auc: 0.8167

[256]	valid_0's auc: 0.821197	valid_0's binary_logloss: 0.360672
[257]	valid_0's auc: 0.821223	valid_0's binary_logloss: 0.360654
[258]	valid_0's auc: 0.82125	valid_0's binary_logloss: 0.36063
[259]	valid_0's auc: 0.821265	valid_0's binary_logloss: 0.360616
[260]	valid_0's auc: 0.821281	valid_0's binary_logloss: 0.360599
[261]	valid_0's auc: 0.821292	valid_0's binary_logloss: 0.360588
[262]	valid_0's auc: 0.821337	valid_0's binary_logloss: 0.36056
[263]	valid_0's auc: 0.821389	valid_0's binary_logloss: 0.360517
[264]	valid_0's auc: 0.82145	valid_0's binary_logloss: 0.360464
[265]	valid_0's auc: 0.821496	valid_0's binary_logloss: 0.360428
[266]	valid_0's auc: 0.821548	valid_0's binary_logloss: 0.36039
[267]	valid_0's auc: 0.82158	valid_0's binary_logloss: 0.360368
[268]	valid_0's auc: 0.821607	valid_0's binary_logloss: 0.360339
[269]	valid_0's auc: 0.82163	valid_0's binary_logloss: 0.360317
[270]	valid_0's auc: 0.821652	valid_0's binary_logloss: 0.360301
[271]	valid_0's auc: 0.821664	va

[383]	valid_0's auc: 0.823843	valid_0's binary_logloss: 0.358389
[384]	valid_0's auc: 0.82385	valid_0's binary_logloss: 0.358383
[385]	valid_0's auc: 0.823857	valid_0's binary_logloss: 0.358377
[386]	valid_0's auc: 0.82387	valid_0's binary_logloss: 0.358363
[387]	valid_0's auc: 0.823874	valid_0's binary_logloss: 0.35836
[388]	valid_0's auc: 0.823894	valid_0's binary_logloss: 0.358346
[389]	valid_0's auc: 0.823899	valid_0's binary_logloss: 0.358342
[390]	valid_0's auc: 0.823922	valid_0's binary_logloss: 0.358326
[391]	valid_0's auc: 0.82394	valid_0's binary_logloss: 0.358308
[392]	valid_0's auc: 0.823946	valid_0's binary_logloss: 0.358302
[393]	valid_0's auc: 0.823972	valid_0's binary_logloss: 0.358284
[394]	valid_0's auc: 0.823984	valid_0's binary_logloss: 0.358274
[395]	valid_0's auc: 0.824001	valid_0's binary_logloss: 0.358262
[396]	valid_0's auc: 0.824021	valid_0's binary_logloss: 0.358237
[397]	valid_0's auc: 0.824049	valid_0's binary_logloss: 0.35821
[398]	valid_0's auc: 0.824057	

[510]	valid_0's auc: 0.825408	valid_0's binary_logloss: 0.357041
[511]	valid_0's auc: 0.825424	valid_0's binary_logloss: 0.357029
[512]	valid_0's auc: 0.825432	valid_0's binary_logloss: 0.357022
[513]	valid_0's auc: 0.825434	valid_0's binary_logloss: 0.35702
[514]	valid_0's auc: 0.825444	valid_0's binary_logloss: 0.357011
[515]	valid_0's auc: 0.825446	valid_0's binary_logloss: 0.357009
[516]	valid_0's auc: 0.825474	valid_0's binary_logloss: 0.35698
[517]	valid_0's auc: 0.825484	valid_0's binary_logloss: 0.356973
[518]	valid_0's auc: 0.825491	valid_0's binary_logloss: 0.356966
[519]	valid_0's auc: 0.825495	valid_0's binary_logloss: 0.356962
[520]	valid_0's auc: 0.825496	valid_0's binary_logloss: 0.356961
[521]	valid_0's auc: 0.825498	valid_0's binary_logloss: 0.356959
[522]	valid_0's auc: 0.825502	valid_0's binary_logloss: 0.356957
[523]	valid_0's auc: 0.825519	valid_0's binary_logloss: 0.356942
[524]	valid_0's auc: 0.825544	valid_0's binary_logloss: 0.356918
[525]	valid_0's auc: 0.8255

[637]	valid_0's auc: 0.826537	valid_0's binary_logloss: 0.356057
[638]	valid_0's auc: 0.826548	valid_0's binary_logloss: 0.356048
[639]	valid_0's auc: 0.826548	valid_0's binary_logloss: 0.356048
[640]	valid_0's auc: 0.826556	valid_0's binary_logloss: 0.356041
[641]	valid_0's auc: 0.826559	valid_0's binary_logloss: 0.356037
[642]	valid_0's auc: 0.826566	valid_0's binary_logloss: 0.356031
[643]	valid_0's auc: 0.826568	valid_0's binary_logloss: 0.35603
[644]	valid_0's auc: 0.826602	valid_0's binary_logloss: 0.355998
[645]	valid_0's auc: 0.826614	valid_0's binary_logloss: 0.355989
[646]	valid_0's auc: 0.826632	valid_0's binary_logloss: 0.355973
[647]	valid_0's auc: 0.826652	valid_0's binary_logloss: 0.355954
[648]	valid_0's auc: 0.826652	valid_0's binary_logloss: 0.355953
[649]	valid_0's auc: 0.826664	valid_0's binary_logloss: 0.355943
[650]	valid_0's auc: 0.826666	valid_0's binary_logloss: 0.355941
[651]	valid_0's auc: 0.826674	valid_0's binary_logloss: 0.355934
[652]	valid_0's auc: 0.826

[764]	valid_0's auc: 0.827427	valid_0's binary_logloss: 0.355258
[765]	valid_0's auc: 0.827431	valid_0's binary_logloss: 0.355254
[766]	valid_0's auc: 0.827434	valid_0's binary_logloss: 0.355252
[767]	valid_0's auc: 0.827438	valid_0's binary_logloss: 0.355249
[768]	valid_0's auc: 0.82744	valid_0's binary_logloss: 0.355247
[769]	valid_0's auc: 0.827446	valid_0's binary_logloss: 0.355243
[770]	valid_0's auc: 0.827449	valid_0's binary_logloss: 0.355241
[771]	valid_0's auc: 0.827464	valid_0's binary_logloss: 0.355225
[772]	valid_0's auc: 0.827487	valid_0's binary_logloss: 0.355206
[773]	valid_0's auc: 0.827498	valid_0's binary_logloss: 0.355197
[774]	valid_0's auc: 0.827508	valid_0's binary_logloss: 0.355189
[775]	valid_0's auc: 0.827508	valid_0's binary_logloss: 0.355188
[776]	valid_0's auc: 0.827512	valid_0's binary_logloss: 0.355183
[777]	valid_0's auc: 0.827521	valid_0's binary_logloss: 0.355172
[778]	valid_0's auc: 0.827531	valid_0's binary_logloss: 0.355164
[779]	valid_0's auc: 0.827

[891]	valid_0's auc: 0.828243	valid_0's binary_logloss: 0.354525
[892]	valid_0's auc: 0.828253	valid_0's binary_logloss: 0.354517
[893]	valid_0's auc: 0.828262	valid_0's binary_logloss: 0.35451
[894]	valid_0's auc: 0.828265	valid_0's binary_logloss: 0.354507
[895]	valid_0's auc: 0.828291	valid_0's binary_logloss: 0.354481
[896]	valid_0's auc: 0.828295	valid_0's binary_logloss: 0.354477
[897]	valid_0's auc: 0.828302	valid_0's binary_logloss: 0.354473
[898]	valid_0's auc: 0.828318	valid_0's binary_logloss: 0.354458
[899]	valid_0's auc: 0.828325	valid_0's binary_logloss: 0.35445
[900]	valid_0's auc: 0.828331	valid_0's binary_logloss: 0.354444
[901]	valid_0's auc: 0.828338	valid_0's binary_logloss: 0.354437
[902]	valid_0's auc: 0.828339	valid_0's binary_logloss: 0.354436
[903]	valid_0's auc: 0.828341	valid_0's binary_logloss: 0.354434
[904]	valid_0's auc: 0.828348	valid_0's binary_logloss: 0.354428
[905]	valid_0's auc: 0.828358	valid_0's binary_logloss: 0.354417
[906]	valid_0's auc: 0.8283

[1018]	valid_0's auc: 0.828801	valid_0's binary_logloss: 0.354007
[1019]	valid_0's auc: 0.828801	valid_0's binary_logloss: 0.354007
[1020]	valid_0's auc: 0.828802	valid_0's binary_logloss: 0.354006
[1021]	valid_0's auc: 0.828804	valid_0's binary_logloss: 0.354004
[1022]	valid_0's auc: 0.828807	valid_0's binary_logloss: 0.354002
[1023]	valid_0's auc: 0.828815	valid_0's binary_logloss: 0.353995
[1024]	valid_0's auc: 0.828818	valid_0's binary_logloss: 0.353992
[1025]	valid_0's auc: 0.828818	valid_0's binary_logloss: 0.353992
[1026]	valid_0's auc: 0.82882	valid_0's binary_logloss: 0.35399
[1027]	valid_0's auc: 0.828825	valid_0's binary_logloss: 0.353986
[1028]	valid_0's auc: 0.828826	valid_0's binary_logloss: 0.353986
[1029]	valid_0's auc: 0.828829	valid_0's binary_logloss: 0.353983
[1030]	valid_0's auc: 0.828834	valid_0's binary_logloss: 0.353979
[1031]	valid_0's auc: 0.82884	valid_0's binary_logloss: 0.353973
[1032]	valid_0's auc: 0.828843	valid_0's binary_logloss: 0.35397
[1033]	valid_0

[1143]	valid_0's auc: 0.829255	valid_0's binary_logloss: 0.353602
[1144]	valid_0's auc: 0.82926	valid_0's binary_logloss: 0.353598
[1145]	valid_0's auc: 0.82926	valid_0's binary_logloss: 0.353598
[1146]	valid_0's auc: 0.829271	valid_0's binary_logloss: 0.353588
[1147]	valid_0's auc: 0.829289	valid_0's binary_logloss: 0.353574
[1148]	valid_0's auc: 0.829295	valid_0's binary_logloss: 0.353568
[1149]	valid_0's auc: 0.829297	valid_0's binary_logloss: 0.353566
[1150]	valid_0's auc: 0.829298	valid_0's binary_logloss: 0.353565
[1151]	valid_0's auc: 0.829301	valid_0's binary_logloss: 0.353564
[1152]	valid_0's auc: 0.829303	valid_0's binary_logloss: 0.353562
[1153]	valid_0's auc: 0.829307	valid_0's binary_logloss: 0.353559
[1154]	valid_0's auc: 0.829313	valid_0's binary_logloss: 0.353554
[1155]	valid_0's auc: 0.829314	valid_0's binary_logloss: 0.353553
[1156]	valid_0's auc: 0.829314	valid_0's binary_logloss: 0.353553
[1157]	valid_0's auc: 0.829316	valid_0's binary_logloss: 0.353552
[1158]	valid

[1268]	valid_0's auc: 0.829716	valid_0's binary_logloss: 0.353172
[1269]	valid_0's auc: 0.829719	valid_0's binary_logloss: 0.353169
[1270]	valid_0's auc: 0.82972	valid_0's binary_logloss: 0.353168
[1271]	valid_0's auc: 0.829723	valid_0's binary_logloss: 0.353167
[1272]	valid_0's auc: 0.829724	valid_0's binary_logloss: 0.353166
[1273]	valid_0's auc: 0.829724	valid_0's binary_logloss: 0.353166
[1274]	valid_0's auc: 0.82973	valid_0's binary_logloss: 0.35316
[1275]	valid_0's auc: 0.829733	valid_0's binary_logloss: 0.353157
[1276]	valid_0's auc: 0.829751	valid_0's binary_logloss: 0.353145
[1277]	valid_0's auc: 0.829759	valid_0's binary_logloss: 0.353137
[1278]	valid_0's auc: 0.829759	valid_0's binary_logloss: 0.353137
[1279]	valid_0's auc: 0.829761	valid_0's binary_logloss: 0.353136
[1280]	valid_0's auc: 0.829762	valid_0's binary_logloss: 0.353135
[1281]	valid_0's auc: 0.829765	valid_0's binary_logloss: 0.353133
[1282]	valid_0's auc: 0.829766	valid_0's binary_logloss: 0.353132
[1283]	valid_

[1393]	valid_0's auc: 0.830189	valid_0's binary_logloss: 0.352756
[1394]	valid_0's auc: 0.830191	valid_0's binary_logloss: 0.352754
[1395]	valid_0's auc: 0.830194	valid_0's binary_logloss: 0.35275
[1396]	valid_0's auc: 0.830194	valid_0's binary_logloss: 0.35275
[1397]	valid_0's auc: 0.830194	valid_0's binary_logloss: 0.35275
[1398]	valid_0's auc: 0.830193	valid_0's binary_logloss: 0.35275
[1399]	valid_0's auc: 0.830193	valid_0's binary_logloss: 0.35275
[1400]	valid_0's auc: 0.830193	valid_0's binary_logloss: 0.35275
[1401]	valid_0's auc: 0.830201	valid_0's binary_logloss: 0.352744
[1402]	valid_0's auc: 0.830217	valid_0's binary_logloss: 0.352732
[1403]	valid_0's auc: 0.83022	valid_0's binary_logloss: 0.352729
[1404]	valid_0's auc: 0.830224	valid_0's binary_logloss: 0.352727
[1405]	valid_0's auc: 0.830224	valid_0's binary_logloss: 0.352727
[1406]	valid_0's auc: 0.830227	valid_0's binary_logloss: 0.352723
[1407]	valid_0's auc: 0.830228	valid_0's binary_logloss: 0.352723
[1408]	valid_0's 

[1518]	valid_0's auc: 0.830514	valid_0's binary_logloss: 0.352474
[1519]	valid_0's auc: 0.83052	valid_0's binary_logloss: 0.352469
[1520]	valid_0's auc: 0.830522	valid_0's binary_logloss: 0.352467
[1521]	valid_0's auc: 0.830521	valid_0's binary_logloss: 0.352468
[1522]	valid_0's auc: 0.830525	valid_0's binary_logloss: 0.352465
[1523]	valid_0's auc: 0.830534	valid_0's binary_logloss: 0.352457
[1524]	valid_0's auc: 0.830536	valid_0's binary_logloss: 0.352455
[1525]	valid_0's auc: 0.830536	valid_0's binary_logloss: 0.352455
[1526]	valid_0's auc: 0.830548	valid_0's binary_logloss: 0.352444
[1527]	valid_0's auc: 0.830559	valid_0's binary_logloss: 0.352437
[1528]	valid_0's auc: 0.830562	valid_0's binary_logloss: 0.352434
[1529]	valid_0's auc: 0.830562	valid_0's binary_logloss: 0.352434
[1530]	valid_0's auc: 0.830563	valid_0's binary_logloss: 0.352432
[1531]	valid_0's auc: 0.830564	valid_0's binary_logloss: 0.352432
[1532]	valid_0's auc: 0.830565	valid_0's binary_logloss: 0.352431
[1533]	vali

[1643]	valid_0's auc: 0.830881	valid_0's binary_logloss: 0.352141
[1644]	valid_0's auc: 0.830883	valid_0's binary_logloss: 0.352139
[1645]	valid_0's auc: 0.830887	valid_0's binary_logloss: 0.352136
[1646]	valid_0's auc: 0.830888	valid_0's binary_logloss: 0.352135
[1647]	valid_0's auc: 0.830889	valid_0's binary_logloss: 0.352133
[1648]	valid_0's auc: 0.83089	valid_0's binary_logloss: 0.352132
[1649]	valid_0's auc: 0.830892	valid_0's binary_logloss: 0.352131
[1650]	valid_0's auc: 0.830893	valid_0's binary_logloss: 0.35213
[1651]	valid_0's auc: 0.830896	valid_0's binary_logloss: 0.352127
[1652]	valid_0's auc: 0.830897	valid_0's binary_logloss: 0.352126
[1653]	valid_0's auc: 0.8309	valid_0's binary_logloss: 0.352124
[1654]	valid_0's auc: 0.830901	valid_0's binary_logloss: 0.352123
[1655]	valid_0's auc: 0.830902	valid_0's binary_logloss: 0.352122
[1656]	valid_0's auc: 0.830903	valid_0's binary_logloss: 0.352121
[1657]	valid_0's auc: 0.830904	valid_0's binary_logloss: 0.35212
[1658]	valid_0'

[1768]	valid_0's auc: 0.831158	valid_0's binary_logloss: 0.351897
[1769]	valid_0's auc: 0.831159	valid_0's binary_logloss: 0.351896
[1770]	valid_0's auc: 0.831164	valid_0's binary_logloss: 0.351893
[1771]	valid_0's auc: 0.831167	valid_0's binary_logloss: 0.35189
[1772]	valid_0's auc: 0.831168	valid_0's binary_logloss: 0.351888
[1773]	valid_0's auc: 0.831171	valid_0's binary_logloss: 0.351886
[1774]	valid_0's auc: 0.831172	valid_0's binary_logloss: 0.351885
[1775]	valid_0's auc: 0.831171	valid_0's binary_logloss: 0.351886
[1776]	valid_0's auc: 0.831173	valid_0's binary_logloss: 0.351885
[1777]	valid_0's auc: 0.831186	valid_0's binary_logloss: 0.351874
[1778]	valid_0's auc: 0.831189	valid_0's binary_logloss: 0.351872
[1779]	valid_0's auc: 0.831189	valid_0's binary_logloss: 0.351872
[1780]	valid_0's auc: 0.831188	valid_0's binary_logloss: 0.351873
[1781]	valid_0's auc: 0.831189	valid_0's binary_logloss: 0.351872
[1782]	valid_0's auc: 0.831189	valid_0's binary_logloss: 0.351872
[1783]	vali

[1893]	valid_0's auc: 0.831464	valid_0's binary_logloss: 0.351617
[1894]	valid_0's auc: 0.831465	valid_0's binary_logloss: 0.351616
[1895]	valid_0's auc: 0.831465	valid_0's binary_logloss: 0.351616
[1896]	valid_0's auc: 0.831467	valid_0's binary_logloss: 0.351613
[1897]	valid_0's auc: 0.831469	valid_0's binary_logloss: 0.351612
[1898]	valid_0's auc: 0.83147	valid_0's binary_logloss: 0.351611
[1899]	valid_0's auc: 0.83147	valid_0's binary_logloss: 0.351612
[1900]	valid_0's auc: 0.831471	valid_0's binary_logloss: 0.351611
[1901]	valid_0's auc: 0.831477	valid_0's binary_logloss: 0.351605
[1902]	valid_0's auc: 0.831482	valid_0's binary_logloss: 0.351601
[1903]	valid_0's auc: 0.831485	valid_0's binary_logloss: 0.351597
[1904]	valid_0's auc: 0.831488	valid_0's binary_logloss: 0.351595
[1905]	valid_0's auc: 0.831487	valid_0's binary_logloss: 0.351595
[1906]	valid_0's auc: 0.83149	valid_0's binary_logloss: 0.351593
[1907]	valid_0's auc: 0.831491	valid_0's binary_logloss: 0.351592
[1908]	valid_

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1000,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [59]:
# sub = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
# sub.columns = ['qid', 'uid', 'dt']
sub['label'] = model_lgb.predict_proba(X_test[feature_cols])[:, 1]

In [60]:
sub[:20]

Unnamed: 0,qid,uid,dt,label
0,Q1493039281,M64135255,D3870-H9,0.112841
1,Q2023398782,M2536956560,D3872-H22,0.001505
2,Q4151338694,M3294926344,D3874-H15,0.07556
3,Q3271436624,M3744310794,D3873-H4,0.503449
4,Q3314287018,M1349051752,D3872-H19,0.03176
5,Q4214103875,M2007129506,D3871-H13,0.010617
6,Q1421177878,M3927950819,D3873-H14,0.216128
7,Q3598252818,M2871943120,D3873-H9,0.369479
8,Q568518135,M998566127,D3872-H18,0.629476
9,Q2242868437,M1307039867,D3874-H11,0.049391


In [61]:
sub.to_csv('./result/kfold_2000_4fold_deal_ans.txt', index=None, header=None, sep='\t')

In [62]:
fi = pd.DataFrame({'feature': feature_cols, 'imp': model_lgb.feature_importances_})
fi['rate'] = fi['imp'] / fi['imp'].sum()
fi

Unnamed: 0,feature,imp,rate
0,freq,425,0.007098
1,gender,298,0.004977
2,hour,2207,0.036857
3,q_ans_kfold_count,900,0.015030
4,q_diff_qa_days_max,1542,0.025752
...,...,...,...
121,uf_c2_count,241,0.004025
122,uf_c3_count,643,0.010738
123,uf_c4_count,539,0.009001
124,uf_c5_count,0,0.000000


In [63]:
fi.sort_values(by='rate', ascending=False)[:60]

Unnamed: 0,feature,imp,rate
117,qid_enc_count,3436,0.057381
54,score,3392,0.056647
116,uid_enc_count,2854,0.047662
2,hour,2207,0.036857
65,u_inv_kfold_count,1983,0.033116
66,u_inv_kfold_mean,1786,0.029826
55,u_ans_kfold_count,1713,0.028607
5,q_diff_qa_days_mean,1612,0.026921
57,u_diff_qa_days_mean,1561,0.026069
4,q_diff_qa_days_max,1542,0.025752
