In [1]:
import logging
import os

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [2]:
# Log line layout: timestamp, severity, originating module, then the message.
log_fmt = (
    "[%(asctime)s] %(levelname)s "
    "in %(module)s: %(message)s"
)
logging.basicConfig(level=logging.INFO, format=log_fmt)

In [3]:
# Directory holding the raw input files, and directory holding the
# pre-computed feature tables read by the cells below.
base_path, feature_path = './data', './feature'

In [4]:
# Load the invitation records: the labelled training file and the unlabelled
# evaluation file. Both are tab-separated without a header row.
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None,
                    names=['qid', 'uid', 'dt', 'label'])
train = train.drop(columns=['dt'])
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None,
                   names=['qid', 'uid', 'dt'])
# Keep an untouched copy (still containing 'dt'); it becomes the submission
# frame, and its length is used later to check the train/test split.
sub = test.copy()
sub_size = len(sub)
test = test.drop(columns=['dt'])
logging.info("test %s", test.shape)



[2019-12-01 08:45:30,731] INFO in <ipython-input-4-f5fcce844f4e>: invite (9489162, 3)
[2019-12-01 08:45:32,226] INFO in <ipython-input-4-f5fcce844f4e>: test (1141683, 2)


In [5]:
# Load the k-fold features and append them column-wise to train / test.
# pd.concat(axis=1) aligns on the index and pads with NaN when the row counts
# differ, so verify that each feature table has exactly one row per invitation
# before joining instead of silently producing a misaligned frame.
t1 = pd.read_csv(f'{feature_path}/train_kfold_feature.txt', sep='\t')
assert len(t1) == len(train), f"train_kfold_feature has {len(t1)} rows, train has {len(train)}"
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_feature.txt', sep='\t')
assert len(t1) == len(test), f"test_kfold_feature has {len(t1)} rows, test has {len(test)}"
test = pd.concat([test, t1], axis=1)

In [None]:
# Load per-user answer statistics over the past two months (excluding the current record).
# NOTE: this feature set is currently disabled; the loading code below is commented out.
# t1 = pd.read_csv(f'{feature_path}/train_ua_feature.txt', sep='\t')
# train = pd.concat([train, t1], axis=1)

# t1 = pd.read_csv(f'{feature_path}/test_ua_feature.txt', sep='\t')
# test = pd.concat([test, t1], axis=1)

In [6]:
# Load the invite features and append them column-wise to train / test.
# pd.concat(axis=1) aligns on the index and pads with NaN when the row counts
# differ, so verify that each feature table has exactly one row per invitation
# before joining instead of silently producing a misaligned frame.
t1 = pd.read_csv(f'{feature_path}/train_invite_feature.txt', sep='\t')
assert len(t1) == len(train), f"train_invite_feature has {len(t1)} rows, train has {len(train)}"
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature.txt', sep='\t')
assert len(t1) == len(test), f"test_invite_feature has {len(t1)} rows, test has {len(test)}"
test = pd.concat([test, t1], axis=1)

In [7]:
# Preview the training frame after the feature tables have been attached.
train

Unnamed: 0,qid,uid,label,day,hour,q_inv_kfold_mean,q_inv_kfold_sum,q_inv_kfold_std,q_inv_kfold_count,u_inv_kfold_mean,...,uid_hour_std,uid_week_mean,uid_week_median,uid_week_std,qid_hour_mean,qid_hour_median,qid_hour_std,qid_week_mean,qid_week_median,qid_week_std
0,Q2166419046,M401693808,0,3865,22,,,,,0.000000,...,4.722288,2.200000,2.0,1.643168,18.093023,22.0,6.903463,1.604651,1.0,1.953502
1,Q1550017551,M3392373099,0,3844,11,0.166667,1.0,0.408248,6.0,0.000000,...,3.044316,3.625000,4.0,1.922610,14.533334,13.0,6.334336,3.333333,3.0,1.988060
2,Q604029601,M2317670257,0,3862,15,,,,,0.090909,...,3.561855,3.357143,3.5,1.736803,14.500000,13.5,4.847680,3.166667,4.0,2.228602
3,Q2350061229,M1618461867,0,3849,11,,,,,0.000000,...,6.421689,3.428571,3.0,1.718249,18.000000,20.5,4.956958,4.125000,4.0,1.885092
4,Q2443223942,M3544409350,0,3867,4,0.375000,57.0,0.485723,152.0,0.000000,...,6.740425,3.000000,3.0,2.108185,12.405941,13.0,6.369700,2.775578,2.0,1.618129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9489157,Q2372512458,M4285896253,1,3849,11,,,,,0.394737,...,5.837929,2.925926,3.0,2.126667,11.000000,11.0,0.000000,6.000000,6.0,0.000000
9489158,Q3516644442,M4285896253,1,3862,12,,,,,0.314286,...,5.837929,2.925926,3.0,2.126667,12.750000,12.0,2.121320,5.000000,5.0,0.000000
9489159,Q3847094730,M4285896253,0,3852,8,0.000000,0.0,,1.0,0.394737,...,5.837929,2.925926,3.0,2.126667,8.137255,8.0,1.131717,2.058824,2.0,0.310597
9489160,Q2358485548,M4285896253,0,3864,7,,,,,0.314286,...,5.837929,2.925926,3.0,2.126667,7.243243,7.0,1.498247,0.432432,0.0,1.500750


In [8]:
# Preview the evaluation frame after the feature tables have been attached.
test

Unnamed: 0,qid,uid,day,hour,q_inv_kfold_mean,q_inv_kfold_sum,q_inv_kfold_std,q_inv_kfold_count,u_inv_kfold_mean,u_inv_kfold_sum,...,uid_hour_std,uid_week_mean,uid_week_median,uid_week_std,qid_hour_mean,qid_hour_median,qid_hour_std,qid_week_mean,qid_week_median,qid_week_std
0,Q1493039281,M64135255,3870,9,0.0,0.0,0.000000,2.0,0.125000,1.0,...,5.493431,3.600000,4.0,1.837873,11.000000,9.0,2.581989,4.857143,5.0,1.345185
1,Q2023398782,M2536956560,3872,22,,,,,0.000000,0.0,...,4.516636,2.000000,1.0,2.449490,10.344828,12.0,6.820536,3.413793,2.0,2.625660
2,Q4151338694,M3294926344,3874,15,,,,,0.100000,1.0,...,5.851703,2.583333,3.0,1.729863,15.666667,16.0,0.577350,3.000000,3.0,0.000000
3,Q3271436624,M3744310794,3873,4,0.5,3.0,0.547723,6.0,0.000000,0.0,...,7.675719,2.250000,2.5,1.707825,10.428572,10.0,4.503966,3.571429,5.0,1.988060
4,Q3314287018,M1349051752,3872,19,,,,,0.000000,0.0,...,4.549725,3.600000,4.0,1.673320,19.000000,19.0,,1.000000,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141678,Q1238703523,M2010778235,3869,20,,,,,0.142857,1.0,...,3.419714,3.333333,3.0,2.000000,19.500000,20.0,1.732051,5.083334,5.0,0.288675
1141679,Q1074024036,M3131383616,3872,21,,,,,1.000000,1.0,...,2.160247,4.000000,4.5,2.160247,20.666666,21.0,0.577350,1.000000,1.0,0.000000
1141680,Q3478846332,M1872860897,3871,15,,,,,0.000000,0.0,...,3.315483,3.083333,3.0,2.020726,13.285714,12.0,3.302236,3.285714,3.0,2.751623
1141681,Q734170704,M3574631517,3871,8,,,,,0.125000,2.0,...,6.176261,3.000000,3.0,1.795055,8.333333,8.0,2.022858,0.100000,0.0,0.402578


In [9]:
# Load the member (user) table; it is tab-separated without a header row.
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq',
                'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5',
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
                'score', 'follow_topic', 'inter_topic']

# The two topic columns are not used in this notebook.
del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

# A column with a single distinct value cannot separate any rows; drop it.
for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

# Label-encode every remaining object-typed column except the join key 'uid'.
# (The topic columns were already removed above, so only 'uid' needs excluding.)
t = user.dtypes
cats = [x for x in t[t == 'object'].index if x != 'uid']
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

# Encode question ids over train and test together so both share one mapping.
# Fit on the concatenated column directly rather than materialising two
# multi-million-element Python lists first.
q_lb = LabelEncoder()
q_lb.fit(pd.concat([train['qid'], test['qid']], ignore_index=True).astype(str))
train['qid_enc'] = q_lb.transform(train['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

# User ids are encoded against the member table; transform() raises if an
# invitation refers to a uid that is absent from it.
u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train['uid_enc'] = u_lb.transform(train['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# Attach the user attributes. validate='many_to_one' makes pandas raise if
# 'uid' is duplicated in the member table, which would otherwise duplicate
# invitation rows and break the row-wise alignment relied on downstream.
train = pd.merge(train, user, on='uid', how='left', validate='many_to_one')
test = pd.merge(test, user, on='uid', how='left', validate='many_to_one')
logging.info("train shape %s, test shape %s", train.shape, test.shape)

[2019-12-01 08:53:39,643] INFO in <ipython-input-9-af285af762e4>: user (1931654, 14)
[2019-12-01 08:53:43,071] INFO in <ipython-input-9-af285af762e4>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-12-01 08:53:43,077] INFO in <ipython-input-9-af285af762e4>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-01 08:53:44,008] INFO in <ipython-input-9-af285af762e4>: encode gender
[2019-12-01 08:53:44,977] INFO in <ipython-input-9-af285af762e4>: encode freq
[2019-12-01 08:53:45,868] INFO in <ipython-input-9-af285af762e4>: encode uf_c1
[2019-12-01 08:53:46,707] INFO in <ipython-input-9-af285af762e4>: encode uf_c2
[2019-12-01 08:53:47,549] INFO in <ipython-input-9-af285af762e4>: encode uf_c3
[2019-12-01 08:53:48,364] INFO in <ipytho

In [10]:
# Stack train on top of test so the encodings below see both sets at once.
# sort=True orders the columns alphabetically; rows coming from test have no
# 'label' value. The original row indices of each frame are kept as-is.
data = pd.concat(objs=[train, test], sort=True)

In [11]:
# Preview the combined train + test frame.
data

Unnamed: 0,day,freq,gender,hour,label,q_ans_kfold_count,q_diff_qa_days_max,q_diff_qa_days_mean,q_diff_qa_days_sum,q_has_img_max,...,uid_day_count,uid_enc,uid_hour_count,uid_hour_mean,uid_hour_median,uid_hour_std,uid_week_count,uid_week_mean,uid_week_median,uid_week_std
0,3865,4,2,22,0.0,,,,,,...,1,1508098,2,20.400000,22.0,4.722288,2,2.200000,2.0,1.643168
1,3844,1,2,11,0.0,4.0,33.0,25.00000,100.0,0.0,...,2,1196609,2,9.875000,9.5,3.044316,2,3.625000,4.0,1.922610
2,3862,4,2,15,0.0,,,,,,...,1,657985,4,13.071428,13.0,3.561855,3,3.357143,3.5,1.736803
3,3849,0,2,11,0.0,,,,,,...,1,308831,2,8.714286,8.0,6.421689,1,3.428571,3.0,1.718249
4,3867,1,2,4,0.0,32.0,13.0,9.53125,305.0,1.0,...,2,1272353,2,14.900000,16.0,6.740425,2,3.000000,3.0,2.108185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141678,3869,4,2,20,,,,,,,...,1,504328,1,19.777779,21.0,3.419714,1,3.333333,3.0,2.000000
1141679,3872,1,2,21,,,,,,,...,1,1066117,1,21.000000,21.5,2.160247,1,4.000000,4.5,2.160247
1141680,3871,0,2,15,,,,,,,...,1,435860,1,16.916666,18.0,3.315483,2,3.083333,3.0,2.020726
1141681,3871,4,2,8,,,,,,,...,1,1287650,10,11.421053,8.0,6.176261,1,3.000000,3.0,1.795055


In [12]:
# Count (frequency) encoding of the id and categorical columns.
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = f'{feat}_count'
    # How often each value occurs over train + test combined.
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    # Values occurring fewer than twice are pooled into a single code (0 after
    # the shift); every other code moves up by one.
    is_rare = data[col_name] < 2
    data.loc[is_rare, feat] = -1
    data[feat] = data[feat] + 1
    # Recount after pooling, then min-max scale the counts into [0, 1].
    pooled_counts = data[feat].map(data[feat].value_counts().astype(int))
    lowest, highest = pooled_counts.min(), pooled_counts.max()
    data[col_name] = (pooled_counts - lowest) / (highest - lowest)

In [13]:
# Position of the invitation day within a 7-day cycle.
data['wk'] = data['day'].mod(7)
# Feature selection (earlier inline variant, kept for reference):
# feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

In [14]:
# Columns that are never fed to the model: the target, the raw string ids and
# the 'dt' / 'day' columns.
drop_feat = ('label', 'uid', 'qid', 'dt', 'day')
# Optional ablations (disabled): uncomment a group to exclude it as well.
# drop_feat += ('q_ans_kfold_count', 'q_diff_qa_days_max', 'q_diff_qa_days_mean', 'q_diff_qa_days_sum', 
#               'q_has_img_max', 'q_has_img_mean', 'q_has_img_sum', 'q_has_video_max', 'q_has_video_mean', 
#               'q_has_video_sum','q_is_dest_max', 'q_is_dest_mean', 'q_is_dest_sum', 'q_is_good_max', 
#               'q_is_good_mean', 'q_is_good_sum', 'q_is_rec_max', 'q_is_rec_mean', 'q_is_rec_sum', 
#               'q_reci_cheer_max', 'q_reci_cheer_mean', 'q_reci_cheer_sum', 'q_reci_comment_max', 
#               'q_reci_comment_mean', 'q_reci_comment_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 
#               'q_reci_dis_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'q_reci_mark_sum', 'q_reci_no_help_max',
#               'q_reci_no_help_mean', 'q_reci_no_help_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 
#               'q_reci_tks_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'q_reci_uncheer_sum', 
#               'q_reci_xxx_max', 'q_reci_xxx_mean', 'q_reci_xxx_sum', 'q_word_count_max', 'q_word_count_mean', 
#               'q_word_count_sum')
# drop_feat += ('u_ans_kfold_count', 'u_diff_qa_days_max', 'u_diff_qa_days_mean', 'u_diff_qa_days_sum', 
#               'u_has_img_max', 'u_has_img_mean', 'u_has_img_sum', 'u_has_video_max', 'u_has_video_mean', 
#               'u_has_video_sum', 'u_is_dest_max', 'u_is_dest_mean', 'u_is_dest_sum', 'u_is_good_max', 
#               'u_is_good_mean', 'u_is_good_sum', 'u_is_rec_max', 'u_is_rec_mean', 'u_is_rec_sum', 'u_reci_cheer_max', 
#               'u_reci_cheer_mean', 'u_reci_cheer_sum', 'u_reci_comment_max', 'u_reci_comment_mean',
#               'u_reci_comment_sum', 'u_reci_dis_max', 'u_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_mark_max', 
#               'u_reci_mark_mean', 'u_reci_mark_sum', 'u_reci_no_help_max', 'u_reci_no_help_mean', 
#               'u_reci_no_help_sum', 'u_reci_tks_max', 'u_reci_tks_mean', 'u_reci_tks_sum', 'u_reci_uncheer_max', 
#               'u_reci_uncheer_mean', 'u_reci_uncheer_sum', 'u_reci_xxx_max', 'u_reci_xxx_mean', 'u_reci_xxx_sum', 
#               'u_word_count_max', 'u_word_count_mean', 'u_word_count_sum')
# drop_feat += ('u_total_answer',)
# Every remaining column, in its existing order, is a model feature.
feature_cols = data.columns[~data.columns.isin(drop_feat)].tolist()
# feature_cols

In [15]:
logging.info("feature size %s", len(feature_cols))

# `data` was built by stacking train on top of test, so the two parts are
# recovered positionally.
n_train = len(train)
X_train_all = data.iloc[:n_train][feature_cols]
y_train_all = data.iloc[:n_train]['label']
X_test = data.iloc[n_train:]
assert len(X_test) == sub_size

logging.info("train shape %s, test shape %s", train.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Only the first of the five stratified splits is used: 4/5 of the labelled
# rows for fitting, 1/5 held out for early stopping.
train_idx, val_idx = next(fold.split(X=X_train_all, y=y_train_all))

# X_train_all already contains exactly feature_cols, so no column re-selection
# (and no extra copy of the frame) is needed here.
X_train, X_val = X_train_all.iloc[train_idx], X_train_all.iloc[val_idx]
y_train, y_val = y_train_all.iloc[train_idx], y_train_all.iloc[val_idx]
del X_train_all

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
# Early stopping is supplied as a callback: the `early_stopping_rounds` keyword
# of fit() is not accepted by LightGBM 4.x, while the callback form is accepted
# by older releases as well.
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              callbacks=[early_stopping(50)])



[2019-12-01 08:57:01,115] INFO in <ipython-input-15-20ad4e7d53e4>: feature size 144
[2019-12-01 08:57:24,865] INFO in <ipython-input-15-20ad4e7d53e4>: train shape (9489162, 138), test shape (1141683, 137)


[1]	valid_0's auc: 0.761139	valid_0's binary_logloss: 0.453518
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.777798	valid_0's binary_logloss: 0.44269
[3]	valid_0's auc: 0.783243	valid_0's binary_logloss: 0.433951
[4]	valid_0's auc: 0.786766	valid_0's binary_logloss: 0.426539
[5]	valid_0's auc: 0.791955	valid_0's binary_logloss: 0.420382
[6]	valid_0's auc: 0.794086	valid_0's binary_logloss: 0.415145
[7]	valid_0's auc: 0.797096	valid_0's binary_logloss: 0.409934
[8]	valid_0's auc: 0.799158	valid_0's binary_logloss: 0.405679
[9]	valid_0's auc: 0.801577	valid_0's binary_logloss: 0.401624
[10]	valid_0's auc: 0.803787	valid_0's binary_logloss: 0.397939
[11]	valid_0's auc: 0.805174	valid_0's binary_logloss: 0.394891
[12]	valid_0's auc: 0.806633	valid_0's binary_logloss: 0.39204
[13]	valid_0's auc: 0.808029	valid_0's binary_logloss: 0.38927
[14]	valid_0's auc: 0.809421	valid_0's binary_logloss: 0.386911
[15]	valid_0's auc: 0.810827	valid_0's binary_logloss: 

[129]	valid_0's auc: 0.843221	valid_0's binary_logloss: 0.344288
[130]	valid_0's auc: 0.843301	valid_0's binary_logloss: 0.34421
[131]	valid_0's auc: 0.843456	valid_0's binary_logloss: 0.344059
[132]	valid_0's auc: 0.843543	valid_0's binary_logloss: 0.343971
[133]	valid_0's auc: 0.843633	valid_0's binary_logloss: 0.343889
[134]	valid_0's auc: 0.8437	valid_0's binary_logloss: 0.343814
[135]	valid_0's auc: 0.843775	valid_0's binary_logloss: 0.343742
[136]	valid_0's auc: 0.84387	valid_0's binary_logloss: 0.343665
[137]	valid_0's auc: 0.843999	valid_0's binary_logloss: 0.343542
[138]	valid_0's auc: 0.844077	valid_0's binary_logloss: 0.343469
[139]	valid_0's auc: 0.844164	valid_0's binary_logloss: 0.343384
[140]	valid_0's auc: 0.844216	valid_0's binary_logloss: 0.34334
[141]	valid_0's auc: 0.844293	valid_0's binary_logloss: 0.343271
[142]	valid_0's auc: 0.844377	valid_0's binary_logloss: 0.343192
[143]	valid_0's auc: 0.844446	valid_0's binary_logloss: 0.343124
[144]	valid_0's auc: 0.844496	

[256]	valid_0's auc: 0.849964	valid_0's binary_logloss: 0.337741
[257]	valid_0's auc: 0.849979	valid_0's binary_logloss: 0.337724
[258]	valid_0's auc: 0.85	valid_0's binary_logloss: 0.337703
[259]	valid_0's auc: 0.850024	valid_0's binary_logloss: 0.337679
[260]	valid_0's auc: 0.850058	valid_0's binary_logloss: 0.337646
[261]	valid_0's auc: 0.85008	valid_0's binary_logloss: 0.337624
[262]	valid_0's auc: 0.850114	valid_0's binary_logloss: 0.33759
[263]	valid_0's auc: 0.850155	valid_0's binary_logloss: 0.337559
[264]	valid_0's auc: 0.850182	valid_0's binary_logloss: 0.337533
[265]	valid_0's auc: 0.8502	valid_0's binary_logloss: 0.337515
[266]	valid_0's auc: 0.850221	valid_0's binary_logloss: 0.337495
[267]	valid_0's auc: 0.850236	valid_0's binary_logloss: 0.337478
[268]	valid_0's auc: 0.850261	valid_0's binary_logloss: 0.337455
[269]	valid_0's auc: 0.850291	valid_0's binary_logloss: 0.337429
[270]	valid_0's auc: 0.850323	valid_0's binary_logloss: 0.337396
[271]	valid_0's auc: 0.850353	val

[383]	valid_0's auc: 0.852745	valid_0's binary_logloss: 0.335015
[384]	valid_0's auc: 0.852762	valid_0's binary_logloss: 0.335001
[385]	valid_0's auc: 0.852786	valid_0's binary_logloss: 0.334977
[386]	valid_0's auc: 0.852803	valid_0's binary_logloss: 0.334962
[387]	valid_0's auc: 0.852807	valid_0's binary_logloss: 0.334957
[388]	valid_0's auc: 0.852825	valid_0's binary_logloss: 0.33494
[389]	valid_0's auc: 0.852849	valid_0's binary_logloss: 0.33492
[390]	valid_0's auc: 0.852863	valid_0's binary_logloss: 0.334907
[391]	valid_0's auc: 0.852869	valid_0's binary_logloss: 0.3349
[392]	valid_0's auc: 0.852891	valid_0's binary_logloss: 0.334875
[393]	valid_0's auc: 0.852914	valid_0's binary_logloss: 0.33485
[394]	valid_0's auc: 0.852939	valid_0's binary_logloss: 0.334829
[395]	valid_0's auc: 0.852949	valid_0's binary_logloss: 0.334819
[396]	valid_0's auc: 0.852959	valid_0's binary_logloss: 0.334811
[397]	valid_0's auc: 0.852991	valid_0's binary_logloss: 0.334778
[398]	valid_0's auc: 0.853027	

[510]	valid_0's auc: 0.854856	valid_0's binary_logloss: 0.332878
[511]	valid_0's auc: 0.854859	valid_0's binary_logloss: 0.332875
[512]	valid_0's auc: 0.854886	valid_0's binary_logloss: 0.332852
[513]	valid_0's auc: 0.854893	valid_0's binary_logloss: 0.332846
[514]	valid_0's auc: 0.854903	valid_0's binary_logloss: 0.332836
[515]	valid_0's auc: 0.854911	valid_0's binary_logloss: 0.332826
[516]	valid_0's auc: 0.854928	valid_0's binary_logloss: 0.332811
[517]	valid_0's auc: 0.854932	valid_0's binary_logloss: 0.332805
[518]	valid_0's auc: 0.854942	valid_0's binary_logloss: 0.332796
[519]	valid_0's auc: 0.854951	valid_0's binary_logloss: 0.332786
[520]	valid_0's auc: 0.85496	valid_0's binary_logloss: 0.332776
[521]	valid_0's auc: 0.854965	valid_0's binary_logloss: 0.332771
[522]	valid_0's auc: 0.854973	valid_0's binary_logloss: 0.332763
[523]	valid_0's auc: 0.854982	valid_0's binary_logloss: 0.332755
[524]	valid_0's auc: 0.854994	valid_0's binary_logloss: 0.33274
[525]	valid_0's auc: 0.8550

[637]	valid_0's auc: 0.856191	valid_0's binary_logloss: 0.331538
[638]	valid_0's auc: 0.856201	valid_0's binary_logloss: 0.331529
[639]	valid_0's auc: 0.856211	valid_0's binary_logloss: 0.331519
[640]	valid_0's auc: 0.856217	valid_0's binary_logloss: 0.331514
[641]	valid_0's auc: 0.856219	valid_0's binary_logloss: 0.331511
[642]	valid_0's auc: 0.856224	valid_0's binary_logloss: 0.331506
[643]	valid_0's auc: 0.856232	valid_0's binary_logloss: 0.331497
[644]	valid_0's auc: 0.856234	valid_0's binary_logloss: 0.331496
[645]	valid_0's auc: 0.856239	valid_0's binary_logloss: 0.331491
[646]	valid_0's auc: 0.856243	valid_0's binary_logloss: 0.331487
[647]	valid_0's auc: 0.856265	valid_0's binary_logloss: 0.331467
[648]	valid_0's auc: 0.856273	valid_0's binary_logloss: 0.331458
[649]	valid_0's auc: 0.856284	valid_0's binary_logloss: 0.331449
[650]	valid_0's auc: 0.856316	valid_0's binary_logloss: 0.33142
[651]	valid_0's auc: 0.856326	valid_0's binary_logloss: 0.331412
[652]	valid_0's auc: 0.856

[764]	valid_0's auc: 0.85719	valid_0's binary_logloss: 0.330543
[765]	valid_0's auc: 0.857197	valid_0's binary_logloss: 0.330535
[766]	valid_0's auc: 0.857202	valid_0's binary_logloss: 0.330531
[767]	valid_0's auc: 0.85721	valid_0's binary_logloss: 0.330524
[768]	valid_0's auc: 0.857227	valid_0's binary_logloss: 0.330509
[769]	valid_0's auc: 0.857234	valid_0's binary_logloss: 0.3305
[770]	valid_0's auc: 0.857243	valid_0's binary_logloss: 0.33049
[771]	valid_0's auc: 0.857263	valid_0's binary_logloss: 0.330469
[772]	valid_0's auc: 0.857276	valid_0's binary_logloss: 0.330457
[773]	valid_0's auc: 0.857282	valid_0's binary_logloss: 0.33045
[774]	valid_0's auc: 0.857286	valid_0's binary_logloss: 0.330446
[775]	valid_0's auc: 0.85729	valid_0's binary_logloss: 0.330443
[776]	valid_0's auc: 0.857291	valid_0's binary_logloss: 0.330441
[777]	valid_0's auc: 0.857299	valid_0's binary_logloss: 0.330434
[778]	valid_0's auc: 0.857304	valid_0's binary_logloss: 0.33043
[779]	valid_0's auc: 0.857307	val

[891]	valid_0's auc: 0.858019	valid_0's binary_logloss: 0.329701
[892]	valid_0's auc: 0.858021	valid_0's binary_logloss: 0.329699
[893]	valid_0's auc: 0.858024	valid_0's binary_logloss: 0.329697
[894]	valid_0's auc: 0.858026	valid_0's binary_logloss: 0.329694
[895]	valid_0's auc: 0.858042	valid_0's binary_logloss: 0.329678
[896]	valid_0's auc: 0.858044	valid_0's binary_logloss: 0.329676
[897]	valid_0's auc: 0.858047	valid_0's binary_logloss: 0.329673
[898]	valid_0's auc: 0.858056	valid_0's binary_logloss: 0.329665
[899]	valid_0's auc: 0.858064	valid_0's binary_logloss: 0.329657
[900]	valid_0's auc: 0.85807	valid_0's binary_logloss: 0.329651
[901]	valid_0's auc: 0.858073	valid_0's binary_logloss: 0.329648
[902]	valid_0's auc: 0.858076	valid_0's binary_logloss: 0.329645
[903]	valid_0's auc: 0.858078	valid_0's binary_logloss: 0.329643
[904]	valid_0's auc: 0.858081	valid_0's binary_logloss: 0.329641
[905]	valid_0's auc: 0.858083	valid_0's binary_logloss: 0.329638
[906]	valid_0's auc: 0.858

[1018]	valid_0's auc: 0.858694	valid_0's binary_logloss: 0.329018
[1019]	valid_0's auc: 0.858696	valid_0's binary_logloss: 0.329016
[1020]	valid_0's auc: 0.858697	valid_0's binary_logloss: 0.329014
[1021]	valid_0's auc: 0.858699	valid_0's binary_logloss: 0.329012
[1022]	valid_0's auc: 0.858699	valid_0's binary_logloss: 0.329012
[1023]	valid_0's auc: 0.858708	valid_0's binary_logloss: 0.329003
[1024]	valid_0's auc: 0.858709	valid_0's binary_logloss: 0.329001
[1025]	valid_0's auc: 0.858711	valid_0's binary_logloss: 0.328998
[1026]	valid_0's auc: 0.858714	valid_0's binary_logloss: 0.328996
[1027]	valid_0's auc: 0.858717	valid_0's binary_logloss: 0.328993
[1028]	valid_0's auc: 0.858717	valid_0's binary_logloss: 0.328993
[1029]	valid_0's auc: 0.858721	valid_0's binary_logloss: 0.328989
[1030]	valid_0's auc: 0.858735	valid_0's binary_logloss: 0.328975
[1031]	valid_0's auc: 0.858739	valid_0's binary_logloss: 0.32897
[1032]	valid_0's auc: 0.858745	valid_0's binary_logloss: 0.328965
[1033]	vali

[1143]	valid_0's auc: 0.859412	valid_0's binary_logloss: 0.328267
[1144]	valid_0's auc: 0.859415	valid_0's binary_logloss: 0.328265
[1145]	valid_0's auc: 0.859416	valid_0's binary_logloss: 0.328264
[1146]	valid_0's auc: 0.859424	valid_0's binary_logloss: 0.328256
[1147]	valid_0's auc: 0.859431	valid_0's binary_logloss: 0.328247
[1148]	valid_0's auc: 0.859443	valid_0's binary_logloss: 0.328231
[1149]	valid_0's auc: 0.859458	valid_0's binary_logloss: 0.328214
[1150]	valid_0's auc: 0.859467	valid_0's binary_logloss: 0.328206
[1151]	valid_0's auc: 0.859476	valid_0's binary_logloss: 0.328198
[1152]	valid_0's auc: 0.859495	valid_0's binary_logloss: 0.328184
[1153]	valid_0's auc: 0.8595	valid_0's binary_logloss: 0.328178
[1154]	valid_0's auc: 0.859503	valid_0's binary_logloss: 0.328176
[1155]	valid_0's auc: 0.859507	valid_0's binary_logloss: 0.328171
[1156]	valid_0's auc: 0.859511	valid_0's binary_logloss: 0.328166
[1157]	valid_0's auc: 0.859515	valid_0's binary_logloss: 0.328163
[1158]	valid

[1268]	valid_0's auc: 0.859941	valid_0's binary_logloss: 0.327726
[1269]	valid_0's auc: 0.859942	valid_0's binary_logloss: 0.327724
[1270]	valid_0's auc: 0.859943	valid_0's binary_logloss: 0.327724
[1271]	valid_0's auc: 0.859943	valid_0's binary_logloss: 0.327723
[1272]	valid_0's auc: 0.859947	valid_0's binary_logloss: 0.327718
[1273]	valid_0's auc: 0.859948	valid_0's binary_logloss: 0.327718
[1274]	valid_0's auc: 0.859948	valid_0's binary_logloss: 0.327717
[1275]	valid_0's auc: 0.85995	valid_0's binary_logloss: 0.327716
[1276]	valid_0's auc: 0.85995	valid_0's binary_logloss: 0.327715
[1277]	valid_0's auc: 0.859952	valid_0's binary_logloss: 0.327714
[1278]	valid_0's auc: 0.859953	valid_0's binary_logloss: 0.327712
[1279]	valid_0's auc: 0.859953	valid_0's binary_logloss: 0.327712
[1280]	valid_0's auc: 0.859954	valid_0's binary_logloss: 0.327711
[1281]	valid_0's auc: 0.859954	valid_0's binary_logloss: 0.327711
[1282]	valid_0's auc: 0.859959	valid_0's binary_logloss: 0.327706
[1283]	valid

[1393]	valid_0's auc: 0.860496	valid_0's binary_logloss: 0.327137
[1394]	valid_0's auc: 0.860502	valid_0's binary_logloss: 0.32713
[1395]	valid_0's auc: 0.860504	valid_0's binary_logloss: 0.327128
[1396]	valid_0's auc: 0.860505	valid_0's binary_logloss: 0.327127
[1397]	valid_0's auc: 0.860507	valid_0's binary_logloss: 0.327124
[1398]	valid_0's auc: 0.860508	valid_0's binary_logloss: 0.327122
[1399]	valid_0's auc: 0.860511	valid_0's binary_logloss: 0.32712
[1400]	valid_0's auc: 0.860512	valid_0's binary_logloss: 0.327119
[1401]	valid_0's auc: 0.860513	valid_0's binary_logloss: 0.327119
[1402]	valid_0's auc: 0.860518	valid_0's binary_logloss: 0.327113
[1403]	valid_0's auc: 0.860518	valid_0's binary_logloss: 0.327114
[1404]	valid_0's auc: 0.860526	valid_0's binary_logloss: 0.327105
[1405]	valid_0's auc: 0.86053	valid_0's binary_logloss: 0.327101
[1406]	valid_0's auc: 0.860531	valid_0's binary_logloss: 0.3271
[1407]	valid_0's auc: 0.860534	valid_0's binary_logloss: 0.327098
[1408]	valid_0'

[1518]	valid_0's auc: 0.86092	valid_0's binary_logloss: 0.326703
[1519]	valid_0's auc: 0.860921	valid_0's binary_logloss: 0.326702
[1520]	valid_0's auc: 0.860922	valid_0's binary_logloss: 0.326702
[1521]	valid_0's auc: 0.860923	valid_0's binary_logloss: 0.3267
[1522]	valid_0's auc: 0.860931	valid_0's binary_logloss: 0.326691
[1523]	valid_0's auc: 0.860932	valid_0's binary_logloss: 0.32669
[1524]	valid_0's auc: 0.860932	valid_0's binary_logloss: 0.32669
[1525]	valid_0's auc: 0.860933	valid_0's binary_logloss: 0.32669
[1526]	valid_0's auc: 0.860934	valid_0's binary_logloss: 0.326688
[1527]	valid_0's auc: 0.860939	valid_0's binary_logloss: 0.326683
[1528]	valid_0's auc: 0.86094	valid_0's binary_logloss: 0.326681
[1529]	valid_0's auc: 0.860943	valid_0's binary_logloss: 0.326678
[1530]	valid_0's auc: 0.860948	valid_0's binary_logloss: 0.326673
[1531]	valid_0's auc: 0.860948	valid_0's binary_logloss: 0.326673
[1532]	valid_0's auc: 0.860954	valid_0's binary_logloss: 0.326665
[1533]	valid_0's 

[1643]	valid_0's auc: 0.861255	valid_0's binary_logloss: 0.326361
[1644]	valid_0's auc: 0.861255	valid_0's binary_logloss: 0.326361
[1645]	valid_0's auc: 0.861256	valid_0's binary_logloss: 0.326359
[1646]	valid_0's auc: 0.861258	valid_0's binary_logloss: 0.326357
[1647]	valid_0's auc: 0.861257	valid_0's binary_logloss: 0.326357
[1648]	valid_0's auc: 0.861259	valid_0's binary_logloss: 0.326356
[1649]	valid_0's auc: 0.861259	valid_0's binary_logloss: 0.326356
[1650]	valid_0's auc: 0.861261	valid_0's binary_logloss: 0.326354
[1651]	valid_0's auc: 0.861261	valid_0's binary_logloss: 0.326354
[1652]	valid_0's auc: 0.861261	valid_0's binary_logloss: 0.326354
[1653]	valid_0's auc: 0.861262	valid_0's binary_logloss: 0.326353
[1654]	valid_0's auc: 0.861269	valid_0's binary_logloss: 0.326347
[1655]	valid_0's auc: 0.861272	valid_0's binary_logloss: 0.326342
[1656]	valid_0's auc: 0.861273	valid_0's binary_logloss: 0.326341
[1657]	valid_0's auc: 0.861274	valid_0's binary_logloss: 0.32634
[1658]	vali

[1768]	valid_0's auc: 0.861695	valid_0's binary_logloss: 0.325885
[1769]	valid_0's auc: 0.861697	valid_0's binary_logloss: 0.325882
[1770]	valid_0's auc: 0.861699	valid_0's binary_logloss: 0.32588
[1771]	valid_0's auc: 0.8617	valid_0's binary_logloss: 0.325879
[1772]	valid_0's auc: 0.8617	valid_0's binary_logloss: 0.325879
[1773]	valid_0's auc: 0.861702	valid_0's binary_logloss: 0.325877
[1774]	valid_0's auc: 0.861703	valid_0's binary_logloss: 0.325876
[1775]	valid_0's auc: 0.861709	valid_0's binary_logloss: 0.32587
[1776]	valid_0's auc: 0.861714	valid_0's binary_logloss: 0.325865
[1777]	valid_0's auc: 0.861717	valid_0's binary_logloss: 0.325861
[1778]	valid_0's auc: 0.861718	valid_0's binary_logloss: 0.32586
[1779]	valid_0's auc: 0.861724	valid_0's binary_logloss: 0.325854
[1780]	valid_0's auc: 0.861728	valid_0's binary_logloss: 0.32585
[1781]	valid_0's auc: 0.86173	valid_0's binary_logloss: 0.325848
[1782]	valid_0's auc: 0.861733	valid_0's binary_logloss: 0.325845
[1783]	valid_0's au

[1893]	valid_0's auc: 0.862076	valid_0's binary_logloss: 0.325477
[1894]	valid_0's auc: 0.862076	valid_0's binary_logloss: 0.325477
[1895]	valid_0's auc: 0.862076	valid_0's binary_logloss: 0.325476
[1896]	valid_0's auc: 0.862084	valid_0's binary_logloss: 0.32547
[1897]	valid_0's auc: 0.862084	valid_0's binary_logloss: 0.325469
[1898]	valid_0's auc: 0.862085	valid_0's binary_logloss: 0.325467
[1899]	valid_0's auc: 0.862094	valid_0's binary_logloss: 0.325459
[1900]	valid_0's auc: 0.862093	valid_0's binary_logloss: 0.325459
[1901]	valid_0's auc: 0.862095	valid_0's binary_logloss: 0.325458
[1902]	valid_0's auc: 0.862094	valid_0's binary_logloss: 0.325458
[1903]	valid_0's auc: 0.862102	valid_0's binary_logloss: 0.32545
[1904]	valid_0's auc: 0.862106	valid_0's binary_logloss: 0.325446
[1905]	valid_0's auc: 0.862111	valid_0's binary_logloss: 0.325442
[1906]	valid_0's auc: 0.862115	valid_0's binary_logloss: 0.325438
[1907]	valid_0's auc: 0.862115	valid_0's binary_logloss: 0.325437
[1908]	valid

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1000,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [16]:
# sub = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
# sub.columns = ['qid', 'uid', 'dt']
# Score the evaluation rows; column 1 of predict_proba is the probability of
# the positive class.
test_proba = model_lgb.predict_proba(X_test[feature_cols])
sub['label'] = test_proba[:, 1]

In [17]:
# Create the output directory if needed so the result of a long training run
# is not lost to a missing folder, then write the submission as a
# tab-separated file without header or index.
os.makedirs('./result', exist_ok=True)
sub.to_csv('./result/2000_add_invite.txt', index=False, header=False, sep='\t')

In [18]:
# Feature importances reported by the fitted model, plus each feature's share
# of the total importance.
fi = pd.DataFrame({'feature': feature_cols, 'imp': model_lgb.feature_importances_})
fi = fi.assign(rate=fi['imp'] / fi['imp'].sum())
fi

Unnamed: 0,feature,imp,rate
0,freq,371,0.006183
1,gender,264,0.004400
2,hour,2131,0.035517
3,q_ans_kfold_count,783,0.013050
4,q_diff_qa_days_max,1349,0.022483
...,...,...,...
139,uf_c2_count,157,0.002617
140,uf_c3_count,380,0.006333
141,uf_c4_count,313,0.005217
142,uf_c5_count,0,0.000000


In [19]:
# The 60 features with the largest importance share, highest first.
fi.sort_values(by='rate', ascending=False).head(60)

Unnamed: 0,feature,imp,rate
63,score,2955,0.04925
2,hour,2131,0.035517
55,qid_hour_count,2023,0.033717
134,uid_enc_count,1854,0.0309
129,uid_hour_std,1595,0.026583
53,qid_day_count,1564,0.026067
127,uid_hour_mean,1561,0.026017
75,u_inv_kfold_mean,1557,0.02595
64,u_ans_kfold_count,1525,0.025417
5,q_diff_qa_days_mean,1485,0.02475
