In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [2]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [3]:
base_path = './data'
feature_path = './feature'

In [4]:
# 加载邀请回答数据
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
del train['dt']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
sub = test.copy()
sub_size = len(sub)
del test['dt']
logging.info("test %s", test.shape)



[2019-12-03 07:55:18,530] INFO in <ipython-input-4-f5fcce844f4e>: invite (9489162, 3)
[2019-12-03 07:55:21,254] INFO in <ipython-input-4-f5fcce844f4e>: test (1141683, 2)


In [5]:
# 加载 kfold feature
t1 = pd.read_csv(f'{feature_path}/train_kfold_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [6]:
# 加载 user 过去两个月的回答统计特征（除当条记录）
# t1 = pd.read_csv(f'{feature_path}/train_ua_feature.txt', sep='\t')
# train = pd.concat([train, t1], axis=1)

# t1 = pd.read_csv(f'{feature_path}/test_ua_feature.txt', sep='\t')
# test = pd.concat([test, t1], axis=1)

In [7]:
# 加载 invete feature 1
t1 = pd.read_csv(f'{feature_path}/train_invite_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [9]:
# 加载 invete feature 2
t1 = pd.read_csv(f'{feature_path}/train_invite_feature_2.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature_2.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [12]:
# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq',
                'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 
                'score', 'follow_topic', 'inter_topic']

del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)
    
q_lb = LabelEncoder()
q_lb.fit(list(train['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train['qid_enc'] = q_lb.transform(train['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train['uid_enc'] = u_lb.transform(train['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# merge user
train = pd.merge(train, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train.shape, test.shape)

[2019-12-03 08:01:36,148] INFO in <ipython-input-12-af285af762e4>: user (1931654, 14)
[2019-12-03 08:01:42,547] INFO in <ipython-input-12-af285af762e4>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-12-03 08:01:42,556] INFO in <ipython-input-12-af285af762e4>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-03 08:01:44,243] INFO in <ipython-input-12-af285af762e4>: encode gender
[2019-12-03 08:01:45,916] INFO in <ipython-input-12-af285af762e4>: encode freq
[2019-12-03 08:01:47,574] INFO in <ipython-input-12-af285af762e4>: encode uf_c1
[2019-12-03 08:01:49,184] INFO in <ipython-input-12-af285af762e4>: encode uf_c2
[2019-12-03 08:01:50,751] INFO in <ipython-input-12-af285af762e4>: encode uf_c3
[2019-12-03 08:01:52,318] INFO in

In [13]:
data = pd.concat((train, test), axis=0, sort=True)
len_train = len(train)
del train

In [14]:
data

Unnamed: 0,day,diff_iq_day,diff_iq_hour,freq,gender,hour,intersection_ft_count,intersection_it_count,intersection_it_score,label,...,uid_hour_count,uid_hour_max,uid_hour_mean,uid_hour_median,uid_hour_min,uid_hour_std,uid_week_count,uid_week_mean,uid_week_median,uid_week_std
0,3865,4,95,4,2,22,1,0,0.000000,0.0,...,2,23,20.400000,22.0,12,4.722288,2,2.200000,2.0,1.643168
1,3844,21,495,1,2,11,0,0,0.000000,0.0,...,2,14,9.875000,9.5,7,3.044316,2,3.625000,4.0,1.922610
2,3862,1,24,4,2,15,0,0,0.000000,0.0,...,4,19,13.071428,13.0,7,3.561855,3,3.357143,3.5,1.736803
3,3849,2,37,0,2,11,0,1,1.066367,0.0,...,2,20,8.714286,8.0,0,6.421689,1,3.428571,3.0,1.718249
4,3867,20,469,1,2,4,0,0,0.000000,0.0,...,2,23,14.900000,16.0,4,6.740425,2,3.000000,3.0,2.108185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141678,3869,0,0,4,2,20,0,0,0.000000,,...,1,23,19.777779,21.0,13,3.419714,1,3.333333,3.0,2.000000
1141679,3872,0,1,1,2,21,0,0,0.000000,,...,1,23,21.000000,21.5,18,2.160247,1,4.000000,4.5,2.160247
1141680,3871,1,27,0,2,15,1,0,0.000000,,...,1,21,16.916666,18.0,11,3.315483,2,3.083333,3.0,2.020726
1141681,3871,0,8,4,2,8,0,0,0.000000,,...,10,23,11.421053,8.0,7,6.176261,1,3.000000,3.0,1.795055


In [15]:
# count 特征
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [16]:
data['wk'] = data['day'] % 7

In [17]:
drop_feat = ('label', 'uid', 'qid', 'dt', 'day') 
# drop_feat += ('q_ans_kfold_count', 'q_diff_qa_days_max', 'q_diff_qa_days_mean', 'q_diff_qa_days_sum', 
#               'q_has_img_max', 'q_has_img_mean', 'q_has_img_sum', 'q_has_video_max', 'q_has_video_mean', 
#               'q_has_video_sum','q_is_dest_max', 'q_is_dest_mean', 'q_is_dest_sum', 'q_is_good_max', 
#               'q_is_good_mean', 'q_is_good_sum', 'q_is_rec_max', 'q_is_rec_mean', 'q_is_rec_sum', 
#               'q_reci_cheer_max', 'q_reci_cheer_mean', 'q_reci_cheer_sum', 'q_reci_comment_max', 
#               'q_reci_comment_mean', 'q_reci_comment_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 
#               'q_reci_dis_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'q_reci_mark_sum', 'q_reci_no_help_max',
#               'q_reci_no_help_mean', 'q_reci_no_help_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 
#               'q_reci_tks_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'q_reci_uncheer_sum', 
#               'q_reci_xxx_max', 'q_reci_xxx_mean', 'q_reci_xxx_sum', 'q_word_count_max', 'q_word_count_mean', 
#               'q_word_count_sum')
# drop_feat += ('u_ans_kfold_count', 'u_diff_qa_days_max', 'u_diff_qa_days_mean', 'u_diff_qa_days_sum', 
#               'u_has_img_max', 'u_has_img_mean', 'u_has_img_sum', 'u_has_video_max', 'u_has_video_mean', 
#               'u_has_video_sum', 'u_is_dest_max', 'u_is_dest_mean', 'u_is_dest_sum', 'u_is_good_max', 
#               'u_is_good_mean', 'u_is_good_sum', 'u_is_rec_max', 'u_is_rec_mean', 'u_is_rec_sum', 'u_reci_cheer_max', 
#               'u_reci_cheer_mean', 'u_reci_cheer_sum', 'u_reci_comment_max', 'u_reci_comment_mean',
#               'u_reci_comment_sum', 'u_reci_dis_max', 'u_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_mark_max', 
#               'u_reci_mark_mean', 'u_reci_mark_sum', 'u_reci_no_help_max', 'u_reci_no_help_mean', 
#               'u_reci_no_help_sum', 'u_reci_tks_max', 'u_reci_tks_mean', 'u_reci_tks_sum', 'u_reci_uncheer_max', 
#               'u_reci_uncheer_mean', 'u_reci_uncheer_sum', 'u_reci_xxx_max', 'u_reci_xxx_mean', 'u_reci_xxx_sum', 
#               'u_word_count_max', 'u_word_count_mean', 'u_word_count_sum')
# drop_feat += ('u_total_answer',)
feature_cols = [x for x in data.columns if x not in drop_feat]
# feature_cols

In [18]:
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len_train][feature_cols]
y_train_all = data.iloc[:len_train]['label']
X_test = data.iloc[len_train:]
assert len(X_test) == sub_size

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

logging.info("train shape %s, val shape %s, test shape %s", X_train.shape, X_val.shape, X_test.shape)

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)



[2019-12-03 08:08:03,083] INFO in <ipython-input-18-20ad4e7d53e4>: feature size 172
[2019-12-03 08:12:20,825] INFO in <ipython-input-18-20ad4e7d53e4>: train shape (9489162, 166), test shape (1141683, 165)


[1]	valid_0's auc: 0.763383	valid_0's binary_logloss: 0.452917
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.779588	valid_0's binary_logloss: 0.441553
[3]	valid_0's auc: 0.785646	valid_0's binary_logloss: 0.432483
[4]	valid_0's auc: 0.792269	valid_0's binary_logloss: 0.424924
[5]	valid_0's auc: 0.795037	valid_0's binary_logloss: 0.418391
[6]	valid_0's auc: 0.798738	valid_0's binary_logloss: 0.412499
[7]	valid_0's auc: 0.80065	valid_0's binary_logloss: 0.407784
[8]	valid_0's auc: 0.805181	valid_0's binary_logloss: 0.40248
[9]	valid_0's auc: 0.807694	valid_0's binary_logloss: 0.398183
[10]	valid_0's auc: 0.809915	valid_0's binary_logloss: 0.394545
[11]	valid_0's auc: 0.812131	valid_0's binary_logloss: 0.390852
[12]	valid_0's auc: 0.8138	valid_0's binary_logloss: 0.387913
[13]	valid_0's auc: 0.814767	valid_0's binary_logloss: 0.385485
[14]	valid_0's auc: 0.816367	valid_0's binary_logloss: 0.382714
[15]	valid_0's auc: 0.817849	valid_0's binary_logloss: 0

[129]	valid_0's auc: 0.852843	valid_0's binary_logloss: 0.334617
[130]	valid_0's auc: 0.852921	valid_0's binary_logloss: 0.334538
[131]	valid_0's auc: 0.853023	valid_0's binary_logloss: 0.334434
[132]	valid_0's auc: 0.853101	valid_0's binary_logloss: 0.334355
[133]	valid_0's auc: 0.853197	valid_0's binary_logloss: 0.334265
[134]	valid_0's auc: 0.853295	valid_0's binary_logloss: 0.334163
[135]	valid_0's auc: 0.853393	valid_0's binary_logloss: 0.334064
[136]	valid_0's auc: 0.853468	valid_0's binary_logloss: 0.333986
[137]	valid_0's auc: 0.85362	valid_0's binary_logloss: 0.333826
[138]	valid_0's auc: 0.853732	valid_0's binary_logloss: 0.333705
[139]	valid_0's auc: 0.853835	valid_0's binary_logloss: 0.333613
[140]	valid_0's auc: 0.853906	valid_0's binary_logloss: 0.333536
[141]	valid_0's auc: 0.853985	valid_0's binary_logloss: 0.333451
[142]	valid_0's auc: 0.854105	valid_0's binary_logloss: 0.333338
[143]	valid_0's auc: 0.854164	valid_0's binary_logloss: 0.333284
[144]	valid_0's auc: 0.854

[256]	valid_0's auc: 0.860917	valid_0's binary_logloss: 0.326304
[257]	valid_0's auc: 0.860945	valid_0's binary_logloss: 0.326272
[258]	valid_0's auc: 0.860991	valid_0's binary_logloss: 0.326225
[259]	valid_0's auc: 0.861032	valid_0's binary_logloss: 0.326186
[260]	valid_0's auc: 0.861065	valid_0's binary_logloss: 0.326152
[261]	valid_0's auc: 0.861092	valid_0's binary_logloss: 0.326123
[262]	valid_0's auc: 0.861132	valid_0's binary_logloss: 0.326085
[263]	valid_0's auc: 0.861193	valid_0's binary_logloss: 0.326026
[264]	valid_0's auc: 0.861223	valid_0's binary_logloss: 0.325995
[265]	valid_0's auc: 0.861252	valid_0's binary_logloss: 0.325967
[266]	valid_0's auc: 0.861287	valid_0's binary_logloss: 0.325929
[267]	valid_0's auc: 0.861312	valid_0's binary_logloss: 0.325898
[268]	valid_0's auc: 0.861341	valid_0's binary_logloss: 0.32587
[269]	valid_0's auc: 0.861368	valid_0's binary_logloss: 0.325839
[270]	valid_0's auc: 0.861394	valid_0's binary_logloss: 0.325815
[271]	valid_0's auc: 0.861

[383]	valid_0's auc: 0.864291	valid_0's binary_logloss: 0.322735
[384]	valid_0's auc: 0.864317	valid_0's binary_logloss: 0.322705
[385]	valid_0's auc: 0.864358	valid_0's binary_logloss: 0.32266
[386]	valid_0's auc: 0.864398	valid_0's binary_logloss: 0.322616
[387]	valid_0's auc: 0.864434	valid_0's binary_logloss: 0.32257
[388]	valid_0's auc: 0.864446	valid_0's binary_logloss: 0.322558
[389]	valid_0's auc: 0.86446	valid_0's binary_logloss: 0.322545
[390]	valid_0's auc: 0.864484	valid_0's binary_logloss: 0.322522
[391]	valid_0's auc: 0.86451	valid_0's binary_logloss: 0.322498
[392]	valid_0's auc: 0.864518	valid_0's binary_logloss: 0.322491
[393]	valid_0's auc: 0.864547	valid_0's binary_logloss: 0.32246
[394]	valid_0's auc: 0.864587	valid_0's binary_logloss: 0.322408
[395]	valid_0's auc: 0.864619	valid_0's binary_logloss: 0.322373
[396]	valid_0's auc: 0.864644	valid_0's binary_logloss: 0.322346
[397]	valid_0's auc: 0.864659	valid_0's binary_logloss: 0.322327
[398]	valid_0's auc: 0.864678	

[510]	valid_0's auc: 0.866379	valid_0's binary_logloss: 0.320475
[511]	valid_0's auc: 0.866384	valid_0's binary_logloss: 0.320468
[512]	valid_0's auc: 0.866395	valid_0's binary_logloss: 0.320458
[513]	valid_0's auc: 0.866405	valid_0's binary_logloss: 0.320446
[514]	valid_0's auc: 0.866433	valid_0's binary_logloss: 0.320418
[515]	valid_0's auc: 0.866441	valid_0's binary_logloss: 0.320409
[516]	valid_0's auc: 0.866447	valid_0's binary_logloss: 0.320404
[517]	valid_0's auc: 0.866463	valid_0's binary_logloss: 0.320382
[518]	valid_0's auc: 0.866472	valid_0's binary_logloss: 0.320373
[519]	valid_0's auc: 0.866474	valid_0's binary_logloss: 0.32037
[520]	valid_0's auc: 0.866487	valid_0's binary_logloss: 0.320355
[521]	valid_0's auc: 0.866496	valid_0's binary_logloss: 0.320343
[522]	valid_0's auc: 0.866501	valid_0's binary_logloss: 0.320335
[523]	valid_0's auc: 0.866518	valid_0's binary_logloss: 0.320317
[524]	valid_0's auc: 0.866532	valid_0's binary_logloss: 0.320303
[525]	valid_0's auc: 0.866

[637]	valid_0's auc: 0.867803	valid_0's binary_logloss: 0.318934
[638]	valid_0's auc: 0.867809	valid_0's binary_logloss: 0.318925
[639]	valid_0's auc: 0.867816	valid_0's binary_logloss: 0.318919
[640]	valid_0's auc: 0.867827	valid_0's binary_logloss: 0.318905
[641]	valid_0's auc: 0.867835	valid_0's binary_logloss: 0.318896
[642]	valid_0's auc: 0.867848	valid_0's binary_logloss: 0.318882
[643]	valid_0's auc: 0.867862	valid_0's binary_logloss: 0.318868
[644]	valid_0's auc: 0.867872	valid_0's binary_logloss: 0.318857
[645]	valid_0's auc: 0.867883	valid_0's binary_logloss: 0.318846
[646]	valid_0's auc: 0.867884	valid_0's binary_logloss: 0.318844
[647]	valid_0's auc: 0.867885	valid_0's binary_logloss: 0.318843
[648]	valid_0's auc: 0.8679	valid_0's binary_logloss: 0.318824
[649]	valid_0's auc: 0.867918	valid_0's binary_logloss: 0.318806
[650]	valid_0's auc: 0.867937	valid_0's binary_logloss: 0.318788
[651]	valid_0's auc: 0.867946	valid_0's binary_logloss: 0.31878
[652]	valid_0's auc: 0.86795

[764]	valid_0's auc: 0.868956	valid_0's binary_logloss: 0.317716
[765]	valid_0's auc: 0.86897	valid_0's binary_logloss: 0.317699
[766]	valid_0's auc: 0.868977	valid_0's binary_logloss: 0.317691
[767]	valid_0's auc: 0.868988	valid_0's binary_logloss: 0.317678
[768]	valid_0's auc: 0.868997	valid_0's binary_logloss: 0.317668
[769]	valid_0's auc: 0.869017	valid_0's binary_logloss: 0.317644
[770]	valid_0's auc: 0.869046	valid_0's binary_logloss: 0.317619
[771]	valid_0's auc: 0.869059	valid_0's binary_logloss: 0.317603
[772]	valid_0's auc: 0.869068	valid_0's binary_logloss: 0.317594
[773]	valid_0's auc: 0.869072	valid_0's binary_logloss: 0.317591
[774]	valid_0's auc: 0.869073	valid_0's binary_logloss: 0.317589
[775]	valid_0's auc: 0.869074	valid_0's binary_logloss: 0.317589
[776]	valid_0's auc: 0.869079	valid_0's binary_logloss: 0.317584
[777]	valid_0's auc: 0.86908	valid_0's binary_logloss: 0.317583
[778]	valid_0's auc: 0.869095	valid_0's binary_logloss: 0.317569
[779]	valid_0's auc: 0.8691

[891]	valid_0's auc: 0.869816	valid_0's binary_logloss: 0.316763
[892]	valid_0's auc: 0.869818	valid_0's binary_logloss: 0.316762
[893]	valid_0's auc: 0.869821	valid_0's binary_logloss: 0.316757
[894]	valid_0's auc: 0.869832	valid_0's binary_logloss: 0.316746
[895]	valid_0's auc: 0.869842	valid_0's binary_logloss: 0.316733
[896]	valid_0's auc: 0.869856	valid_0's binary_logloss: 0.316716
[897]	valid_0's auc: 0.869873	valid_0's binary_logloss: 0.316699
[898]	valid_0's auc: 0.869893	valid_0's binary_logloss: 0.316679
[899]	valid_0's auc: 0.869894	valid_0's binary_logloss: 0.316677
[900]	valid_0's auc: 0.869901	valid_0's binary_logloss: 0.316669
[901]	valid_0's auc: 0.86991	valid_0's binary_logloss: 0.316659
[902]	valid_0's auc: 0.86991	valid_0's binary_logloss: 0.316659
[903]	valid_0's auc: 0.869911	valid_0's binary_logloss: 0.316657
[904]	valid_0's auc: 0.86992	valid_0's binary_logloss: 0.31665
[905]	valid_0's auc: 0.869921	valid_0's binary_logloss: 0.316648
[906]	valid_0's auc: 0.869924

[1018]	valid_0's auc: 0.870744	valid_0's binary_logloss: 0.315748
[1019]	valid_0's auc: 0.870745	valid_0's binary_logloss: 0.315748
[1020]	valid_0's auc: 0.870752	valid_0's binary_logloss: 0.315741
[1021]	valid_0's auc: 0.870759	valid_0's binary_logloss: 0.315734
[1022]	valid_0's auc: 0.870766	valid_0's binary_logloss: 0.315727
[1023]	valid_0's auc: 0.870773	valid_0's binary_logloss: 0.31572
[1024]	valid_0's auc: 0.870774	valid_0's binary_logloss: 0.315718
[1025]	valid_0's auc: 0.870778	valid_0's binary_logloss: 0.315714
[1026]	valid_0's auc: 0.870783	valid_0's binary_logloss: 0.315705
[1027]	valid_0's auc: 0.87079	valid_0's binary_logloss: 0.315697
[1028]	valid_0's auc: 0.870798	valid_0's binary_logloss: 0.315689
[1029]	valid_0's auc: 0.870813	valid_0's binary_logloss: 0.315675
[1030]	valid_0's auc: 0.870822	valid_0's binary_logloss: 0.315666
[1031]	valid_0's auc: 0.870822	valid_0's binary_logloss: 0.315666
[1032]	valid_0's auc: 0.870827	valid_0's binary_logloss: 0.315661
[1033]	valid

[1143]	valid_0's auc: 0.871373	valid_0's binary_logloss: 0.315052
[1144]	valid_0's auc: 0.871377	valid_0's binary_logloss: 0.315048
[1145]	valid_0's auc: 0.871379	valid_0's binary_logloss: 0.315046
[1146]	valid_0's auc: 0.871381	valid_0's binary_logloss: 0.315043
[1147]	valid_0's auc: 0.871381	valid_0's binary_logloss: 0.315043
[1148]	valid_0's auc: 0.871382	valid_0's binary_logloss: 0.315042
[1149]	valid_0's auc: 0.871383	valid_0's binary_logloss: 0.315041
[1150]	valid_0's auc: 0.871383	valid_0's binary_logloss: 0.315041
[1151]	valid_0's auc: 0.871386	valid_0's binary_logloss: 0.315038
[1152]	valid_0's auc: 0.871389	valid_0's binary_logloss: 0.315035
[1153]	valid_0's auc: 0.87139	valid_0's binary_logloss: 0.315034
[1154]	valid_0's auc: 0.871392	valid_0's binary_logloss: 0.315031
[1155]	valid_0's auc: 0.871393	valid_0's binary_logloss: 0.315031
[1156]	valid_0's auc: 0.871393	valid_0's binary_logloss: 0.31503
[1157]	valid_0's auc: 0.871395	valid_0's binary_logloss: 0.315028
[1158]	valid

[1268]	valid_0's auc: 0.871898	valid_0's binary_logloss: 0.314474
[1269]	valid_0's auc: 0.871912	valid_0's binary_logloss: 0.314457
[1270]	valid_0's auc: 0.871917	valid_0's binary_logloss: 0.314451
[1271]	valid_0's auc: 0.871929	valid_0's binary_logloss: 0.314438
[1272]	valid_0's auc: 0.87193	valid_0's binary_logloss: 0.314436
[1273]	valid_0's auc: 0.871931	valid_0's binary_logloss: 0.314435
[1274]	valid_0's auc: 0.871936	valid_0's binary_logloss: 0.31443
[1275]	valid_0's auc: 0.871938	valid_0's binary_logloss: 0.314427
[1276]	valid_0's auc: 0.87194	valid_0's binary_logloss: 0.314425
[1277]	valid_0's auc: 0.871943	valid_0's binary_logloss: 0.314421
[1278]	valid_0's auc: 0.871949	valid_0's binary_logloss: 0.314414
[1279]	valid_0's auc: 0.87195	valid_0's binary_logloss: 0.314412
[1280]	valid_0's auc: 0.871951	valid_0's binary_logloss: 0.314412
[1281]	valid_0's auc: 0.871953	valid_0's binary_logloss: 0.31441
[1282]	valid_0's auc: 0.871952	valid_0's binary_logloss: 0.314411
[1283]	valid_0'

[1393]	valid_0's auc: 0.872355	valid_0's binary_logloss: 0.313967
[1394]	valid_0's auc: 0.872355	valid_0's binary_logloss: 0.313967
[1395]	valid_0's auc: 0.872356	valid_0's binary_logloss: 0.313965
[1396]	valid_0's auc: 0.872356	valid_0's binary_logloss: 0.313965
[1397]	valid_0's auc: 0.872356	valid_0's binary_logloss: 0.313964
[1398]	valid_0's auc: 0.872356	valid_0's binary_logloss: 0.313964
[1399]	valid_0's auc: 0.872362	valid_0's binary_logloss: 0.313956
[1400]	valid_0's auc: 0.872369	valid_0's binary_logloss: 0.313947
[1401]	valid_0's auc: 0.872373	valid_0's binary_logloss: 0.313942
[1402]	valid_0's auc: 0.872377	valid_0's binary_logloss: 0.313938
[1403]	valid_0's auc: 0.872377	valid_0's binary_logloss: 0.313937
[1404]	valid_0's auc: 0.872379	valid_0's binary_logloss: 0.313935
[1405]	valid_0's auc: 0.872381	valid_0's binary_logloss: 0.313934
[1406]	valid_0's auc: 0.872388	valid_0's binary_logloss: 0.313926
[1407]	valid_0's auc: 0.87239	valid_0's binary_logloss: 0.313924
[1408]	vali

[1518]	valid_0's auc: 0.872804	valid_0's binary_logloss: 0.313461
[1519]	valid_0's auc: 0.87281	valid_0's binary_logloss: 0.313454
[1520]	valid_0's auc: 0.872814	valid_0's binary_logloss: 0.31345
[1521]	valid_0's auc: 0.872813	valid_0's binary_logloss: 0.31345
[1522]	valid_0's auc: 0.872813	valid_0's binary_logloss: 0.313449
[1523]	valid_0's auc: 0.872813	valid_0's binary_logloss: 0.313449
[1524]	valid_0's auc: 0.872816	valid_0's binary_logloss: 0.313445
[1525]	valid_0's auc: 0.872821	valid_0's binary_logloss: 0.313439
[1526]	valid_0's auc: 0.872823	valid_0's binary_logloss: 0.313437
[1527]	valid_0's auc: 0.872836	valid_0's binary_logloss: 0.313423
[1528]	valid_0's auc: 0.872838	valid_0's binary_logloss: 0.31342
[1529]	valid_0's auc: 0.872844	valid_0's binary_logloss: 0.313413
[1530]	valid_0's auc: 0.872848	valid_0's binary_logloss: 0.31341
[1531]	valid_0's auc: 0.872851	valid_0's binary_logloss: 0.313408
[1532]	valid_0's auc: 0.872854	valid_0's binary_logloss: 0.313404
[1533]	valid_0'

[1643]	valid_0's auc: 0.873197	valid_0's binary_logloss: 0.313008
[1644]	valid_0's auc: 0.873197	valid_0's binary_logloss: 0.313007
[1645]	valid_0's auc: 0.873198	valid_0's binary_logloss: 0.313005
[1646]	valid_0's auc: 0.873199	valid_0's binary_logloss: 0.313004
[1647]	valid_0's auc: 0.873203	valid_0's binary_logloss: 0.313
[1648]	valid_0's auc: 0.873202	valid_0's binary_logloss: 0.313
[1649]	valid_0's auc: 0.873207	valid_0's binary_logloss: 0.312995
[1650]	valid_0's auc: 0.87321	valid_0's binary_logloss: 0.312992
[1651]	valid_0's auc: 0.873212	valid_0's binary_logloss: 0.312989
[1652]	valid_0's auc: 0.87322	valid_0's binary_logloss: 0.312982
[1653]	valid_0's auc: 0.873222	valid_0's binary_logloss: 0.31298
[1654]	valid_0's auc: 0.873232	valid_0's binary_logloss: 0.312972
[1655]	valid_0's auc: 0.873232	valid_0's binary_logloss: 0.312972
[1656]	valid_0's auc: 0.873237	valid_0's binary_logloss: 0.312966
[1657]	valid_0's auc: 0.87324	valid_0's binary_logloss: 0.312961
[1658]	valid_0's auc

[1768]	valid_0's auc: 0.873549	valid_0's binary_logloss: 0.31261
[1769]	valid_0's auc: 0.873556	valid_0's binary_logloss: 0.312604
[1770]	valid_0's auc: 0.873557	valid_0's binary_logloss: 0.312602
[1771]	valid_0's auc: 0.873559	valid_0's binary_logloss: 0.3126
[1772]	valid_0's auc: 0.87356	valid_0's binary_logloss: 0.3126
[1773]	valid_0's auc: 0.873562	valid_0's binary_logloss: 0.312598
[1774]	valid_0's auc: 0.873562	valid_0's binary_logloss: 0.312598
[1775]	valid_0's auc: 0.873563	valid_0's binary_logloss: 0.312597
[1776]	valid_0's auc: 0.873564	valid_0's binary_logloss: 0.312596
[1777]	valid_0's auc: 0.873566	valid_0's binary_logloss: 0.312594
[1778]	valid_0's auc: 0.873567	valid_0's binary_logloss: 0.312592
[1779]	valid_0's auc: 0.87357	valid_0's binary_logloss: 0.312589
[1780]	valid_0's auc: 0.873573	valid_0's binary_logloss: 0.312586
[1781]	valid_0's auc: 0.873582	valid_0's binary_logloss: 0.312576
[1782]	valid_0's auc: 0.873587	valid_0's binary_logloss: 0.312571
[1783]	valid_0's 

[1893]	valid_0's auc: 0.873952	valid_0's binary_logloss: 0.312155
[1894]	valid_0's auc: 0.873955	valid_0's binary_logloss: 0.312153
[1895]	valid_0's auc: 0.873958	valid_0's binary_logloss: 0.312149
[1896]	valid_0's auc: 0.873962	valid_0's binary_logloss: 0.312146
[1897]	valid_0's auc: 0.873963	valid_0's binary_logloss: 0.312146
[1898]	valid_0's auc: 0.873967	valid_0's binary_logloss: 0.312141
[1899]	valid_0's auc: 0.873969	valid_0's binary_logloss: 0.312139
[1900]	valid_0's auc: 0.873973	valid_0's binary_logloss: 0.312135
[1901]	valid_0's auc: 0.873977	valid_0's binary_logloss: 0.312131
[1902]	valid_0's auc: 0.873989	valid_0's binary_logloss: 0.312116
[1903]	valid_0's auc: 0.87399	valid_0's binary_logloss: 0.312115
[1904]	valid_0's auc: 0.873991	valid_0's binary_logloss: 0.312113
[1905]	valid_0's auc: 0.873993	valid_0's binary_logloss: 0.312111
[1906]	valid_0's auc: 0.873991	valid_0's binary_logloss: 0.312113
[1907]	valid_0's auc: 0.873992	valid_0's binary_logloss: 0.312112
[1908]	vali

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1000,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [19]:
# sub = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
# sub.columns = ['qid', 'uid', 'dt']
sub['label'] = model_lgb.predict_proba(X_test[feature_cols])[:, 1]

In [20]:
sub.to_csv('./result/2000_add_invite.txt', index=None, header=None, sep='\t')

In [21]:
fi = pd.DataFrame({'feature': feature_cols, 'imp': model_lgb.feature_importances_})
fi['rate'] = fi['imp'] / fi['imp'].sum()
fi

Unnamed: 0,feature,imp,rate
0,diff_iq_day,831,0.013850
1,diff_iq_hour,2609,0.043483
2,freq,332,0.005533
3,gender,221,0.003683
4,hour,1487,0.024783
...,...,...,...
167,uf_c2_count,130,0.002167
168,uf_c3_count,275,0.004583
169,uf_c4_count,249,0.004150
170,uf_c5_count,0,0.000000


In [2]:
fi.sort_values(by='rate', ascending=False)[-30:]

NameError: name 'fi' is not defined