In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [2]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [3]:
base_path = './data'
feature_path = './feature'

In [4]:
# 加载邀请回答数据
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
del train['dt']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
sub = test.copy()
sub_size = len(sub)
del test['dt']
logging.info("test %s", test.shape)



[2019-12-06 14:52:42,398] INFO in <ipython-input-4-f5fcce844f4e>: invite (9489162, 3)
[2019-12-06 14:52:45,097] INFO in <ipython-input-4-f5fcce844f4e>: test (1141683, 2)


In [5]:
# 加载 kfold feature
t1 = pd.read_csv(f'{feature_path}/train_kfold_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [6]:
# 加载 user 过去两个月的回答统计特征（除当条记录）
# t1 = pd.read_csv(f'{feature_path}/train_ua_feature.txt', sep='\t')
# train = pd.concat([train, t1], axis=1)

# t1 = pd.read_csv(f'{feature_path}/test_ua_feature.txt', sep='\t')
# test = pd.concat([test, t1], axis=1)

In [7]:
# 加载 invete feature 1
t1 = pd.read_csv(f'{feature_path}/train_invite_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [8]:
# 加载 invete feature 2
t1 = pd.read_csv(f'{feature_path}/train_invite_feature_2.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature_2.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [9]:
# 加载 kfold topic feature
t1 = pd.read_csv(f'{feature_path}/train_kfold_topic_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_topic_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [10]:
# 加载 topic feature
# t1 = pd.read_csv(f'{feature_path}/train_topic_feature.txt', sep='\t')
# train = pd.concat([train, t1], axis=1)

# t1 = pd.read_csv(f'{feature_path}/test_topic_feature.txt', sep='\t')
# test = pd.concat([test, t1], axis=1)

In [11]:
# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq',
                'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 
                'score', 'follow_topic', 'inter_topic']

del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)
    
q_lb = LabelEncoder()
q_lb.fit(list(train['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train['qid_enc'] = q_lb.transform(train['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train['uid_enc'] = u_lb.transform(train['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# merge user
train = pd.merge(train, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train.shape, test.shape)

[2019-12-06 15:00:16,831] INFO in <ipython-input-11-af285af762e4>: user (1931654, 14)
[2019-12-06 15:00:23,755] INFO in <ipython-input-11-af285af762e4>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-12-06 15:00:23,765] INFO in <ipython-input-11-af285af762e4>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-06 15:00:25,685] INFO in <ipython-input-11-af285af762e4>: encode gender
[2019-12-06 15:00:27,540] INFO in <ipython-input-11-af285af762e4>: encode freq
[2019-12-06 15:00:29,164] INFO in <ipython-input-11-af285af762e4>: encode uf_c1
[2019-12-06 15:00:30,737] INFO in <ipython-input-11-af285af762e4>: encode uf_c2
[2019-12-06 15:00:32,314] INFO in <ipython-input-11-af285af762e4>: encode uf_c3
[2019-12-06 15:00:33,857] INFO in

In [12]:
data = pd.concat((train, test), axis=0, sort=True)
len_train = len(train)
del train

In [13]:
data

Unnamed: 0,day,diff_iq_day,diff_iq_hour,freq,gender,hour,intersection_ft_count,intersection_it_count,intersection_it_score,label,...,uid_hour_count,uid_hour_max,uid_hour_mean,uid_hour_median,uid_hour_min,uid_hour_std,uid_week_count,uid_week_mean,uid_week_median,uid_week_std
0,3865,4,95,4,2,22,1,0,0.000000,0.0,...,2,23,20.400000,22.0,12,4.722288,2,2.200000,2.0,1.643168
1,3844,21,495,1,2,11,0,0,0.000000,0.0,...,2,14,9.875000,9.5,7,3.044316,2,3.625000,4.0,1.922610
2,3862,1,24,4,2,15,0,0,0.000000,0.0,...,4,19,13.071428,13.0,7,3.561855,3,3.357143,3.5,1.736803
3,3849,2,37,0,2,11,0,1,1.066367,0.0,...,2,20,8.714286,8.0,0,6.421689,1,3.428571,3.0,1.718249
4,3867,20,469,1,2,4,0,0,0.000000,0.0,...,2,23,14.900000,16.0,4,6.740425,2,3.000000,3.0,2.108185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141678,3869,0,0,4,2,20,0,0,0.000000,,...,1,23,19.777779,21.0,13,3.419714,1,3.333333,3.0,2.000000
1141679,3872,0,1,1,2,21,0,0,0.000000,,...,1,23,21.000000,21.5,18,2.160247,1,4.000000,4.5,2.160247
1141680,3871,1,27,0,2,15,1,0,0.000000,,...,1,21,16.916666,18.0,11,3.315483,2,3.083333,3.0,2.020726
1141681,3871,0,8,4,2,8,0,0,0.000000,,...,10,23,11.421053,8.0,7,6.176261,1,3.000000,3.0,1.795055


In [14]:
# count 特征
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [15]:
data['wk'] = data['day'] % 7

In [16]:
drop_feat = ('label', 'uid', 'qid', 'dt', 'day') 
# drop_feat += ('q_ans_kfold_count', 'q_diff_qa_days_max', 'q_diff_qa_days_mean', 'q_diff_qa_days_sum', 
#               'q_has_img_max', 'q_has_img_mean', 'q_has_img_sum', 'q_has_video_max', 'q_has_video_mean', 
#               'q_has_video_sum','q_is_dest_max', 'q_is_dest_mean', 'q_is_dest_sum', 'q_is_good_max', 
#               'q_is_good_mean', 'q_is_good_sum', 'q_is_rec_max', 'q_is_rec_mean', 'q_is_rec_sum', 
#               'q_reci_cheer_max', 'q_reci_cheer_mean', 'q_reci_cheer_sum', 'q_reci_comment_max', 
#               'q_reci_comment_mean', 'q_reci_comment_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 
#               'q_reci_dis_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'q_reci_mark_sum', 'q_reci_no_help_max',
#               'q_reci_no_help_mean', 'q_reci_no_help_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 
#               'q_reci_tks_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'q_reci_uncheer_sum', 
#               'q_reci_xxx_max', 'q_reci_xxx_mean', 'q_reci_xxx_sum', 'q_word_count_max', 'q_word_count_mean', 
#               'q_word_count_sum')
# drop_feat += ('u_ans_kfold_count', 'u_diff_qa_days_max', 'u_diff_qa_days_mean', 'u_diff_qa_days_sum', 
#               'u_has_img_max', 'u_has_img_mean', 'u_has_img_sum', 'u_has_video_max', 'u_has_video_mean', 
#               'u_has_video_sum', 'u_is_dest_max', 'u_is_dest_mean', 'u_is_dest_sum', 'u_is_good_max', 
#               'u_is_good_mean', 'u_is_good_sum', 'u_is_rec_max', 'u_is_rec_mean', 'u_is_rec_sum', 'u_reci_cheer_max', 
#               'u_reci_cheer_mean', 'u_reci_cheer_sum', 'u_reci_comment_max', 'u_reci_comment_mean',
#               'u_reci_comment_sum', 'u_reci_dis_max', 'u_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_mark_max', 
#               'u_reci_mark_mean', 'u_reci_mark_sum', 'u_reci_no_help_max', 'u_reci_no_help_mean', 
#               'u_reci_no_help_sum', 'u_reci_tks_max', 'u_reci_tks_mean', 'u_reci_tks_sum', 'u_reci_uncheer_max', 
#               'u_reci_uncheer_mean', 'u_reci_uncheer_sum', 'u_reci_xxx_max', 'u_reci_xxx_mean', 'u_reci_xxx_sum', 
#               'u_word_count_max', 'u_word_count_mean', 'u_word_count_sum')
# drop_feat += ('u_total_answer',)
feature_cols = [x for x in data.columns if x not in drop_feat]
# feature_cols

In [17]:
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len_train][feature_cols]
y_train_all = data.iloc[:len_train]['label']
X_test = data.iloc[len_train:]
assert len(X_test) == sub_size

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

logging.info("train shape %s, val shape %s, test shape %s", X_train.shape, X_val.shape, X_test.shape)

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)



[2019-12-06 15:06:58,722] INFO in <ipython-input-17-727117a4265e>: feature size 196
[2019-12-06 15:08:44,624] INFO in <ipython-input-17-727117a4265e>: train shape (7591329, 196), val shape (1897833, 196), test shape (1141683, 200)


[1]	valid_0's auc: 0.759647	valid_0's binary_logloss: 0.45269
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.777267	valid_0's binary_logloss: 0.441528
[3]	valid_0's auc: 0.7855	valid_0's binary_logloss: 0.432273
[4]	valid_0's auc: 0.796452	valid_0's binary_logloss: 0.423885
[5]	valid_0's auc: 0.800777	valid_0's binary_logloss: 0.417402
[6]	valid_0's auc: 0.802773	valid_0's binary_logloss: 0.411632
[7]	valid_0's auc: 0.806289	valid_0's binary_logloss: 0.405804
[8]	valid_0's auc: 0.807992	valid_0's binary_logloss: 0.401261
[9]	valid_0's auc: 0.810447	valid_0's binary_logloss: 0.396911
[10]	valid_0's auc: 0.811862	valid_0's binary_logloss: 0.393425
[11]	valid_0's auc: 0.8139	valid_0's binary_logloss: 0.390024
[12]	valid_0's auc: 0.815931	valid_0's binary_logloss: 0.386848
[13]	valid_0's auc: 0.817318	valid_0's binary_logloss: 0.384109
[14]	valid_0's auc: 0.818725	valid_0's binary_logloss: 0.381586
[15]	valid_0's auc: 0.819946	valid_0's binary_logloss: 0.

[129]	valid_0's auc: 0.856187	valid_0's binary_logloss: 0.331019
[130]	valid_0's auc: 0.856262	valid_0's binary_logloss: 0.33094
[131]	valid_0's auc: 0.85642	valid_0's binary_logloss: 0.330723
[132]	valid_0's auc: 0.856504	valid_0's binary_logloss: 0.330642
[133]	valid_0's auc: 0.856624	valid_0's binary_logloss: 0.330517
[134]	valid_0's auc: 0.856706	valid_0's binary_logloss: 0.330401
[135]	valid_0's auc: 0.856771	valid_0's binary_logloss: 0.330328
[136]	valid_0's auc: 0.856918	valid_0's binary_logloss: 0.330188
[137]	valid_0's auc: 0.857042	valid_0's binary_logloss: 0.330073
[138]	valid_0's auc: 0.857142	valid_0's binary_logloss: 0.329919
[139]	valid_0's auc: 0.857202	valid_0's binary_logloss: 0.329855
[140]	valid_0's auc: 0.857272	valid_0's binary_logloss: 0.329785
[141]	valid_0's auc: 0.857368	valid_0's binary_logloss: 0.329687
[142]	valid_0's auc: 0.857473	valid_0's binary_logloss: 0.329592
[143]	valid_0's auc: 0.857524	valid_0's binary_logloss: 0.329523
[144]	valid_0's auc: 0.8576

[256]	valid_0's auc: 0.864578	valid_0's binary_logloss: 0.322033
[257]	valid_0's auc: 0.864602	valid_0's binary_logloss: 0.322006
[258]	valid_0's auc: 0.864651	valid_0's binary_logloss: 0.321929
[259]	valid_0's auc: 0.864687	valid_0's binary_logloss: 0.321885
[260]	valid_0's auc: 0.864727	valid_0's binary_logloss: 0.321849
[261]	valid_0's auc: 0.864773	valid_0's binary_logloss: 0.321793
[262]	valid_0's auc: 0.8648	valid_0's binary_logloss: 0.321765
[263]	valid_0's auc: 0.86484	valid_0's binary_logloss: 0.321728
[264]	valid_0's auc: 0.864882	valid_0's binary_logloss: 0.321687
[265]	valid_0's auc: 0.864924	valid_0's binary_logloss: 0.321637
[266]	valid_0's auc: 0.864956	valid_0's binary_logloss: 0.321605
[267]	valid_0's auc: 0.864984	valid_0's binary_logloss: 0.321578
[268]	valid_0's auc: 0.865021	valid_0's binary_logloss: 0.321537
[269]	valid_0's auc: 0.86506	valid_0's binary_logloss: 0.321498
[270]	valid_0's auc: 0.865121	valid_0's binary_logloss: 0.321447
[271]	valid_0's auc: 0.86515	

[383]	valid_0's auc: 0.868299	valid_0's binary_logloss: 0.318056
[384]	valid_0's auc: 0.868306	valid_0's binary_logloss: 0.318049
[385]	valid_0's auc: 0.868334	valid_0's binary_logloss: 0.318022
[386]	valid_0's auc: 0.868359	valid_0's binary_logloss: 0.318
[387]	valid_0's auc: 0.868387	valid_0's binary_logloss: 0.317971
[388]	valid_0's auc: 0.868406	valid_0's binary_logloss: 0.317947
[389]	valid_0's auc: 0.868427	valid_0's binary_logloss: 0.317921
[390]	valid_0's auc: 0.86845	valid_0's binary_logloss: 0.317895
[391]	valid_0's auc: 0.868469	valid_0's binary_logloss: 0.317873
[392]	valid_0's auc: 0.868484	valid_0's binary_logloss: 0.317854
[393]	valid_0's auc: 0.868513	valid_0's binary_logloss: 0.317825
[394]	valid_0's auc: 0.868529	valid_0's binary_logloss: 0.317807
[395]	valid_0's auc: 0.86854	valid_0's binary_logloss: 0.317794
[396]	valid_0's auc: 0.868553	valid_0's binary_logloss: 0.317779
[397]	valid_0's auc: 0.868567	valid_0's binary_logloss: 0.317761
[398]	valid_0's auc: 0.868592	

[510]	valid_0's auc: 0.870509	valid_0's binary_logloss: 0.315639
[511]	valid_0's auc: 0.870529	valid_0's binary_logloss: 0.31562
[512]	valid_0's auc: 0.87054	valid_0's binary_logloss: 0.315606
[513]	valid_0's auc: 0.870548	valid_0's binary_logloss: 0.315597
[514]	valid_0's auc: 0.870568	valid_0's binary_logloss: 0.315576
[515]	valid_0's auc: 0.870587	valid_0's binary_logloss: 0.315547
[516]	valid_0's auc: 0.870619	valid_0's binary_logloss: 0.315496
[517]	valid_0's auc: 0.870658	valid_0's binary_logloss: 0.315462
[518]	valid_0's auc: 0.870666	valid_0's binary_logloss: 0.315452
[519]	valid_0's auc: 0.87067	valid_0's binary_logloss: 0.315448
[520]	valid_0's auc: 0.870697	valid_0's binary_logloss: 0.315422
[521]	valid_0's auc: 0.870712	valid_0's binary_logloss: 0.315405
[522]	valid_0's auc: 0.870731	valid_0's binary_logloss: 0.315387
[523]	valid_0's auc: 0.870748	valid_0's binary_logloss: 0.315367
[524]	valid_0's auc: 0.870766	valid_0's binary_logloss: 0.315342
[525]	valid_0's auc: 0.8708	

[637]	valid_0's auc: 0.87209	valid_0's binary_logloss: 0.313903
[638]	valid_0's auc: 0.872091	valid_0's binary_logloss: 0.313901
[639]	valid_0's auc: 0.872094	valid_0's binary_logloss: 0.313899
[640]	valid_0's auc: 0.872104	valid_0's binary_logloss: 0.313888
[641]	valid_0's auc: 0.872121	valid_0's binary_logloss: 0.313872
[642]	valid_0's auc: 0.872142	valid_0's binary_logloss: 0.313845
[643]	valid_0's auc: 0.872166	valid_0's binary_logloss: 0.313822
[644]	valid_0's auc: 0.87218	valid_0's binary_logloss: 0.313805
[645]	valid_0's auc: 0.872205	valid_0's binary_logloss: 0.313784
[646]	valid_0's auc: 0.872221	valid_0's binary_logloss: 0.313762
[647]	valid_0's auc: 0.872243	valid_0's binary_logloss: 0.313743
[648]	valid_0's auc: 0.872244	valid_0's binary_logloss: 0.313739
[649]	valid_0's auc: 0.87225	valid_0's binary_logloss: 0.313732
[650]	valid_0's auc: 0.872258	valid_0's binary_logloss: 0.313723
[651]	valid_0's auc: 0.87226	valid_0's binary_logloss: 0.313722
[652]	valid_0's auc: 0.872271

[764]	valid_0's auc: 0.873277	valid_0's binary_logloss: 0.312573
[765]	valid_0's auc: 0.873286	valid_0's binary_logloss: 0.312562
[766]	valid_0's auc: 0.873291	valid_0's binary_logloss: 0.312555
[767]	valid_0's auc: 0.8733	valid_0's binary_logloss: 0.312547
[768]	valid_0's auc: 0.873315	valid_0's binary_logloss: 0.312528
[769]	valid_0's auc: 0.873326	valid_0's binary_logloss: 0.312516
[770]	valid_0's auc: 0.873332	valid_0's binary_logloss: 0.312509
[771]	valid_0's auc: 0.873353	valid_0's binary_logloss: 0.312488
[772]	valid_0's auc: 0.873358	valid_0's binary_logloss: 0.312482
[773]	valid_0's auc: 0.873361	valid_0's binary_logloss: 0.312479
[774]	valid_0's auc: 0.873367	valid_0's binary_logloss: 0.312473
[775]	valid_0's auc: 0.873377	valid_0's binary_logloss: 0.312461
[776]	valid_0's auc: 0.873384	valid_0's binary_logloss: 0.312451
[777]	valid_0's auc: 0.87339	valid_0's binary_logloss: 0.312445
[778]	valid_0's auc: 0.873393	valid_0's binary_logloss: 0.312442
[779]	valid_0's auc: 0.87339

[891]	valid_0's auc: 0.874069	valid_0's binary_logloss: 0.31169
[892]	valid_0's auc: 0.87408	valid_0's binary_logloss: 0.311679
[893]	valid_0's auc: 0.874092	valid_0's binary_logloss: 0.311666
[894]	valid_0's auc: 0.874097	valid_0's binary_logloss: 0.311661
[895]	valid_0's auc: 0.874098	valid_0's binary_logloss: 0.31166
[896]	valid_0's auc: 0.874099	valid_0's binary_logloss: 0.311658
[897]	valid_0's auc: 0.874102	valid_0's binary_logloss: 0.311655
[898]	valid_0's auc: 0.874106	valid_0's binary_logloss: 0.311649
[899]	valid_0's auc: 0.874107	valid_0's binary_logloss: 0.311648
[900]	valid_0's auc: 0.874109	valid_0's binary_logloss: 0.311646
[901]	valid_0's auc: 0.874111	valid_0's binary_logloss: 0.311643
[902]	valid_0's auc: 0.874123	valid_0's binary_logloss: 0.311631
[903]	valid_0's auc: 0.874142	valid_0's binary_logloss: 0.311612
[904]	valid_0's auc: 0.874146	valid_0's binary_logloss: 0.311606
[905]	valid_0's auc: 0.87415	valid_0's binary_logloss: 0.311601
[906]	valid_0's auc: 0.874166

[1018]	valid_0's auc: 0.874825	valid_0's binary_logloss: 0.310836
[1019]	valid_0's auc: 0.874826	valid_0's binary_logloss: 0.310835
[1020]	valid_0's auc: 0.874842	valid_0's binary_logloss: 0.310818
[1021]	valid_0's auc: 0.874848	valid_0's binary_logloss: 0.310811
[1022]	valid_0's auc: 0.874851	valid_0's binary_logloss: 0.310807
[1023]	valid_0's auc: 0.874853	valid_0's binary_logloss: 0.310805
[1024]	valid_0's auc: 0.874869	valid_0's binary_logloss: 0.31079
[1025]	valid_0's auc: 0.874872	valid_0's binary_logloss: 0.310787
[1026]	valid_0's auc: 0.874886	valid_0's binary_logloss: 0.310768
[1027]	valid_0's auc: 0.874898	valid_0's binary_logloss: 0.310753
[1028]	valid_0's auc: 0.874905	valid_0's binary_logloss: 0.310744
[1029]	valid_0's auc: 0.874905	valid_0's binary_logloss: 0.310744
[1030]	valid_0's auc: 0.874912	valid_0's binary_logloss: 0.310737
[1031]	valid_0's auc: 0.874927	valid_0's binary_logloss: 0.31072
[1032]	valid_0's auc: 0.874942	valid_0's binary_logloss: 0.310697
[1033]	valid

[1143]	valid_0's auc: 0.875606	valid_0's binary_logloss: 0.309945
[1144]	valid_0's auc: 0.875614	valid_0's binary_logloss: 0.309937
[1145]	valid_0's auc: 0.875621	valid_0's binary_logloss: 0.30993
[1146]	valid_0's auc: 0.875625	valid_0's binary_logloss: 0.309925
[1147]	valid_0's auc: 0.87564	valid_0's binary_logloss: 0.309911
[1148]	valid_0's auc: 0.87564	valid_0's binary_logloss: 0.309911
[1149]	valid_0's auc: 0.875648	valid_0's binary_logloss: 0.309901
[1150]	valid_0's auc: 0.875654	valid_0's binary_logloss: 0.309895
[1151]	valid_0's auc: 0.875657	valid_0's binary_logloss: 0.309891
[1152]	valid_0's auc: 0.87566	valid_0's binary_logloss: 0.309887
[1153]	valid_0's auc: 0.87566	valid_0's binary_logloss: 0.309886
[1154]	valid_0's auc: 0.875666	valid_0's binary_logloss: 0.309879
[1155]	valid_0's auc: 0.875675	valid_0's binary_logloss: 0.309865
[1156]	valid_0's auc: 0.875679	valid_0's binary_logloss: 0.309861
[1157]	valid_0's auc: 0.875689	valid_0's binary_logloss: 0.309852
[1158]	valid_0'

[1268]	valid_0's auc: 0.876265	valid_0's binary_logloss: 0.309188
[1269]	valid_0's auc: 0.876273	valid_0's binary_logloss: 0.309179
[1270]	valid_0's auc: 0.876282	valid_0's binary_logloss: 0.30917
[1271]	valid_0's auc: 0.876282	valid_0's binary_logloss: 0.30917
[1272]	valid_0's auc: 0.876283	valid_0's binary_logloss: 0.309168
[1273]	valid_0's auc: 0.876283	valid_0's binary_logloss: 0.309168
[1274]	valid_0's auc: 0.876289	valid_0's binary_logloss: 0.309161
[1275]	valid_0's auc: 0.876306	valid_0's binary_logloss: 0.30914
[1276]	valid_0's auc: 0.876307	valid_0's binary_logloss: 0.309139
[1277]	valid_0's auc: 0.876316	valid_0's binary_logloss: 0.30913
[1278]	valid_0's auc: 0.876322	valid_0's binary_logloss: 0.309123
[1279]	valid_0's auc: 0.876332	valid_0's binary_logloss: 0.309114
[1280]	valid_0's auc: 0.876333	valid_0's binary_logloss: 0.309112
[1281]	valid_0's auc: 0.876339	valid_0's binary_logloss: 0.309105
[1282]	valid_0's auc: 0.876338	valid_0's binary_logloss: 0.309105
[1283]	valid_0

[1393]	valid_0's auc: 0.876787	valid_0's binary_logloss: 0.308591
[1394]	valid_0's auc: 0.876791	valid_0's binary_logloss: 0.308588
[1395]	valid_0's auc: 0.876793	valid_0's binary_logloss: 0.308585
[1396]	valid_0's auc: 0.876794	valid_0's binary_logloss: 0.308585
[1397]	valid_0's auc: 0.876795	valid_0's binary_logloss: 0.308584
[1398]	valid_0's auc: 0.876813	valid_0's binary_logloss: 0.308566
[1399]	valid_0's auc: 0.876816	valid_0's binary_logloss: 0.308562
[1400]	valid_0's auc: 0.876822	valid_0's binary_logloss: 0.308556
[1401]	valid_0's auc: 0.876829	valid_0's binary_logloss: 0.30855
[1402]	valid_0's auc: 0.876833	valid_0's binary_logloss: 0.308545
[1403]	valid_0's auc: 0.876832	valid_0's binary_logloss: 0.308546
[1404]	valid_0's auc: 0.876836	valid_0's binary_logloss: 0.308542
[1405]	valid_0's auc: 0.876837	valid_0's binary_logloss: 0.30854
[1406]	valid_0's auc: 0.876839	valid_0's binary_logloss: 0.308537
[1407]	valid_0's auc: 0.87684	valid_0's binary_logloss: 0.308535
[1408]	valid_

[1518]	valid_0's auc: 0.877142	valid_0's binary_logloss: 0.308187
[1519]	valid_0's auc: 0.877142	valid_0's binary_logloss: 0.308186
[1520]	valid_0's auc: 0.877142	valid_0's binary_logloss: 0.308186
[1521]	valid_0's auc: 0.877143	valid_0's binary_logloss: 0.308185
[1522]	valid_0's auc: 0.877144	valid_0's binary_logloss: 0.308183
[1523]	valid_0's auc: 0.877146	valid_0's binary_logloss: 0.30818
[1524]	valid_0's auc: 0.877146	valid_0's binary_logloss: 0.30818
[1525]	valid_0's auc: 0.877148	valid_0's binary_logloss: 0.308178
[1526]	valid_0's auc: 0.877148	valid_0's binary_logloss: 0.308178
[1527]	valid_0's auc: 0.877148	valid_0's binary_logloss: 0.308178
[1528]	valid_0's auc: 0.877148	valid_0's binary_logloss: 0.308178
[1529]	valid_0's auc: 0.877154	valid_0's binary_logloss: 0.308172
[1530]	valid_0's auc: 0.877159	valid_0's binary_logloss: 0.308167
[1531]	valid_0's auc: 0.877164	valid_0's binary_logloss: 0.308162
[1532]	valid_0's auc: 0.877166	valid_0's binary_logloss: 0.308159
[1533]	valid

[1643]	valid_0's auc: 0.877473	valid_0's binary_logloss: 0.307817
[1644]	valid_0's auc: 0.877475	valid_0's binary_logloss: 0.307816
[1645]	valid_0's auc: 0.877478	valid_0's binary_logloss: 0.307813
[1646]	valid_0's auc: 0.877478	valid_0's binary_logloss: 0.307813
[1647]	valid_0's auc: 0.877478	valid_0's binary_logloss: 0.307813
[1648]	valid_0's auc: 0.877482	valid_0's binary_logloss: 0.307806
[1649]	valid_0's auc: 0.877487	valid_0's binary_logloss: 0.307802
[1650]	valid_0's auc: 0.877493	valid_0's binary_logloss: 0.307794
[1651]	valid_0's auc: 0.877495	valid_0's binary_logloss: 0.307792
[1652]	valid_0's auc: 0.877501	valid_0's binary_logloss: 0.307786
[1653]	valid_0's auc: 0.877503	valid_0's binary_logloss: 0.307783
[1654]	valid_0's auc: 0.877509	valid_0's binary_logloss: 0.307776
[1655]	valid_0's auc: 0.87752	valid_0's binary_logloss: 0.307767
[1656]	valid_0's auc: 0.877525	valid_0's binary_logloss: 0.307759
[1657]	valid_0's auc: 0.877528	valid_0's binary_logloss: 0.307756
[1658]	vali

[1768]	valid_0's auc: 0.877848	valid_0's binary_logloss: 0.307394
[1769]	valid_0's auc: 0.877855	valid_0's binary_logloss: 0.307384
[1770]	valid_0's auc: 0.87786	valid_0's binary_logloss: 0.307379
[1771]	valid_0's auc: 0.877859	valid_0's binary_logloss: 0.30738
[1772]	valid_0's auc: 0.87786	valid_0's binary_logloss: 0.307378
[1773]	valid_0's auc: 0.877865	valid_0's binary_logloss: 0.307373
[1774]	valid_0's auc: 0.877873	valid_0's binary_logloss: 0.307362
[1775]	valid_0's auc: 0.877873	valid_0's binary_logloss: 0.307361
[1776]	valid_0's auc: 0.877873	valid_0's binary_logloss: 0.307361
[1777]	valid_0's auc: 0.877876	valid_0's binary_logloss: 0.307358
[1778]	valid_0's auc: 0.877877	valid_0's binary_logloss: 0.307356
[1779]	valid_0's auc: 0.877881	valid_0's binary_logloss: 0.307351
[1780]	valid_0's auc: 0.877884	valid_0's binary_logloss: 0.307347
[1781]	valid_0's auc: 0.877886	valid_0's binary_logloss: 0.307344
[1782]	valid_0's auc: 0.877891	valid_0's binary_logloss: 0.30734
[1783]	valid_0

[1893]	valid_0's auc: 0.878177	valid_0's binary_logloss: 0.307016
[1894]	valid_0's auc: 0.878184	valid_0's binary_logloss: 0.307009
[1895]	valid_0's auc: 0.878191	valid_0's binary_logloss: 0.307002
[1896]	valid_0's auc: 0.878196	valid_0's binary_logloss: 0.306994
[1897]	valid_0's auc: 0.878197	valid_0's binary_logloss: 0.306992
[1898]	valid_0's auc: 0.878198	valid_0's binary_logloss: 0.306991
[1899]	valid_0's auc: 0.878205	valid_0's binary_logloss: 0.306983
[1900]	valid_0's auc: 0.878208	valid_0's binary_logloss: 0.306978
[1901]	valid_0's auc: 0.878208	valid_0's binary_logloss: 0.306977
[1902]	valid_0's auc: 0.87821	valid_0's binary_logloss: 0.306974
[1903]	valid_0's auc: 0.878211	valid_0's binary_logloss: 0.306972
[1904]	valid_0's auc: 0.878211	valid_0's binary_logloss: 0.306972
[1905]	valid_0's auc: 0.878213	valid_0's binary_logloss: 0.306969
[1906]	valid_0's auc: 0.878215	valid_0's binary_logloss: 0.306966
[1907]	valid_0's auc: 0.878219	valid_0's binary_logloss: 0.306962
[1908]	vali

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1000,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [18]:
# sub = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
# sub.columns = ['qid', 'uid', 'dt']
sub['label'] = model_lgb.predict_proba(X_test[feature_cols])[:, 1]

In [22]:
sub.to_csv('./result/2000_0.878438.txt', index=None, header=None, sep='\t')

In [20]:
fi = pd.DataFrame({'feature': feature_cols, 'imp': model_lgb.feature_importances_})
fi['rate'] = fi['imp'] / fi['imp'].sum()

Unnamed: 0,feature,imp,rate
0,diff_iq_day,779,0.012983
1,diff_iq_hour,2296,0.038267
2,freq,287,0.004783
3,gender,164,0.002733
4,hour,1444,0.024067
...,...,...,...
191,uf_c2_count,109,0.001817
192,uf_c3_count,267,0.004450
193,uf_c4_count,203,0.003383
194,uf_c5_count,0,0.000000


In [21]:
fi.sort_values(by='rate', ascending=False)[:60]

Unnamed: 0,feature,imp,rate
99,score,2625,0.04375
1,diff_iq_hour,2296,0.038267
186,uid_enc_count,1765,0.029417
81,qid_hour_count,1497,0.02495
4,hour,1444,0.024067
119,u_inv_kfold_mean,1252,0.020867
72,qid_day_count,1160,0.019333
181,uid_hour_std,1101,0.01835
108,u_ans_kfold_count,1098,0.0183
169,uid_diff_day_daymean,1059,0.01765
