In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [2]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [3]:
base_path = './data'
feature_path = './feature'

In [4]:
# 加载邀请回答数据
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
del train['dt']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
sub = test.copy()
sub_size = len(sub)
del test['dt']
logging.info("test %s", test.shape)

[2019-12-08 08:24:41,787] INFO in <ipython-input-4-86d2094b983e>: invite (9489162, 3)
[2019-12-08 08:24:44,428] INFO in <ipython-input-4-86d2094b983e>: test (1141683, 2)


In [5]:
# 加载 ans kfold feature
t1 = pd.read_csv(f'{feature_path}/train_kfold_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [7]:
# 加载 invete feature 1
t1 = pd.read_csv(f'{feature_path}/train_invite_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [8]:
# 加载 invete feature 2
t1 = pd.read_csv(f'{feature_path}/train_invite_feature_2.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature_2.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [9]:
# 加载 kfold topic feature, QU
t1 = pd.read_csv(f'{feature_path}/train_kfold_topic_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_topic_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [10]:
# 加载 user kfold topic feature，UU
t1 = pd.read_csv(f'{feature_path}/train_kfold_ut_feature.txt', sep='\t')
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_ut_feature.txt', sep='\t')
test = pd.concat([test, t1], axis=1)

In [11]:
# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq', 'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',  'score', 'follow_topic', 'inter_topic']

del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)
    
q_lb = LabelEncoder()
q_lb.fit(list(train['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train['qid_enc'] = q_lb.transform(train['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train['uid_enc'] = u_lb.transform(train['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# merge user
train = pd.merge(train, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train.shape, test.shape)

[2019-12-08 08:32:59,676] INFO in <ipython-input-11-aaab6ed818bf>: user (1931654, 14)
[2019-12-08 08:33:06,094] INFO in <ipython-input-11-aaab6ed818bf>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-12-08 08:33:06,103] INFO in <ipython-input-11-aaab6ed818bf>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-08 08:33:07,765] INFO in <ipython-input-11-aaab6ed818bf>: encode gender
[2019-12-08 08:33:09,462] INFO in <ipython-input-11-aaab6ed818bf>: encode freq
[2019-12-08 08:33:11,118] INFO in <ipython-input-11-aaab6ed818bf>: encode uf_c1
[2019-12-08 08:33:12,717] INFO in <ipython-input-11-aaab6ed818bf>: encode uf_c2
[2019-12-08 08:33:14,280] INFO in <ipython-input-11-aaab6ed818bf>: encode uf_c3
[2019-12-08 08:33:15,801] INFO in

In [12]:
data = pd.concat((train, test), axis=0, sort=True)
len_train = len(train)
del train

In [13]:
# count 特征
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [14]:
data['wk'] = data['day'] % 7

In [15]:
drop_feat = ('label', 'uid', 'qid', 'dt', 'day') 
# drop_feat += ('q_ans_kfold_count', 'q_diff_qa_days_max', 'q_diff_qa_days_mean', 'q_diff_qa_days_sum', 
#               'q_has_img_max', 'q_has_img_mean', 'q_has_img_sum', 'q_has_video_max', 'q_has_video_mean', 
#               'q_has_video_sum','q_is_dest_max', 'q_is_dest_mean', 'q_is_dest_sum', 'q_is_good_max', 
#               'q_is_good_mean', 'q_is_good_sum', 'q_is_rec_max', 'q_is_rec_mean', 'q_is_rec_sum', 
#               'q_reci_cheer_max', 'q_reci_cheer_mean', 'q_reci_cheer_sum', 'q_reci_comment_max', 
#               'q_reci_comment_mean', 'q_reci_comment_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 
#               'q_reci_dis_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'q_reci_mark_sum', 'q_reci_no_help_max',
#               'q_reci_no_help_mean', 'q_reci_no_help_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 
#               'q_reci_tks_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'q_reci_uncheer_sum', 
#               'q_reci_xxx_max', 'q_reci_xxx_mean', 'q_reci_xxx_sum', 'q_word_count_max', 'q_word_count_mean', 
#               'q_word_count_sum')
# drop_feat += ('u_ans_kfold_count', 'u_diff_qa_days_max', 'u_diff_qa_days_mean', 'u_diff_qa_days_sum', 
#               'u_has_img_max', 'u_has_img_mean', 'u_has_img_sum', 'u_has_video_max', 'u_has_video_mean', 
#               'u_has_video_sum', 'u_is_dest_max', 'u_is_dest_mean', 'u_is_dest_sum', 'u_is_good_max', 
#               'u_is_good_mean', 'u_is_good_sum', 'u_is_rec_max', 'u_is_rec_mean', 'u_is_rec_sum', 'u_reci_cheer_max', 
#               'u_reci_cheer_mean', 'u_reci_cheer_sum', 'u_reci_comment_max', 'u_reci_comment_mean',
#               'u_reci_comment_sum', 'u_reci_dis_max', 'u_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_mark_max', 
#               'u_reci_mark_mean', 'u_reci_mark_sum', 'u_reci_no_help_max', 'u_reci_no_help_mean', 
#               'u_reci_no_help_sum', 'u_reci_tks_max', 'u_reci_tks_mean', 'u_reci_tks_sum', 'u_reci_uncheer_max', 
#               'u_reci_uncheer_mean', 'u_reci_uncheer_sum', 'u_reci_xxx_max', 'u_reci_xxx_mean', 'u_reci_xxx_sum', 
#               'u_word_count_max', 'u_word_count_mean', 'u_word_count_sum')
# drop_feat += ('u_total_answer',)
feature_cols = [x for x in data.columns if x not in drop_feat]
# feature_cols

In [None]:
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len_train][feature_cols]
y_train_all = data.iloc[:len_train]['label']
X_test = data.iloc[len_train:]
assert len(X_test) == sub_size

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

logging.info("train shape %s, val shape %s, test shape %s", X_train.shape, X_val.shape, X_test.shape)

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)



[2019-12-08 08:39:21,244] INFO in <ipython-input-16-727117a4265e>: feature size 212
[2019-12-08 09:02:59,348] INFO in <ipython-input-16-727117a4265e>: train shape (7591329, 212), val shape (1897833, 212), test shape (1141683, 216)


[1]	valid_0's auc: 0.769389	valid_0's binary_logloss: 0.451429
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.78995	valid_0's binary_logloss: 0.438202
[3]	valid_0's auc: 0.79734	valid_0's binary_logloss: 0.428292
[4]	valid_0's auc: 0.804948	valid_0's binary_logloss: 0.419089
[5]	valid_0's auc: 0.810255	valid_0's binary_logloss: 0.411435
[6]	valid_0's auc: 0.812447	valid_0's binary_logloss: 0.405087
[7]	valid_0's auc: 0.81457	valid_0's binary_logloss: 0.399576
[8]	valid_0's auc: 0.817256	valid_0's binary_logloss: 0.394607
[9]	valid_0's auc: 0.819819	valid_0's binary_logloss: 0.39018
[10]	valid_0's auc: 0.821272	valid_0's binary_logloss: 0.386256
[11]	valid_0's auc: 0.822903	valid_0's binary_logloss: 0.382728
[12]	valid_0's auc: 0.824222	valid_0's binary_logloss: 0.379687
[13]	valid_0's auc: 0.825415	valid_0's binary_logloss: 0.376931
[14]	valid_0's auc: 0.826367	valid_0's binary_logloss: 0.374309
[15]	valid_0's auc: 0.827792	valid_0's binary_logloss: 0

[129]	valid_0's auc: 0.862158	valid_0's binary_logloss: 0.323545
[130]	valid_0's auc: 0.862244	valid_0's binary_logloss: 0.323458
[131]	valid_0's auc: 0.862317	valid_0's binary_logloss: 0.323378
[132]	valid_0's auc: 0.862422	valid_0's binary_logloss: 0.323265
[133]	valid_0's auc: 0.862502	valid_0's binary_logloss: 0.323183
[134]	valid_0's auc: 0.862573	valid_0's binary_logloss: 0.323084
[135]	valid_0's auc: 0.862709	valid_0's binary_logloss: 0.322904
[136]	valid_0's auc: 0.86278	valid_0's binary_logloss: 0.322808
[137]	valid_0's auc: 0.862856	valid_0's binary_logloss: 0.322701
[138]	valid_0's auc: 0.86292	valid_0's binary_logloss: 0.322627
[139]	valid_0's auc: 0.863029	valid_0's binary_logloss: 0.32251
[140]	valid_0's auc: 0.863118	valid_0's binary_logloss: 0.322418
[141]	valid_0's auc: 0.863234	valid_0's binary_logloss: 0.322306
[142]	valid_0's auc: 0.863307	valid_0's binary_logloss: 0.32222
[143]	valid_0's auc: 0.863423	valid_0's binary_logloss: 0.322113
[144]	valid_0's auc: 0.863542

[256]	valid_0's auc: 0.869977	valid_0's binary_logloss: 0.314897
[257]	valid_0's auc: 0.870011	valid_0's binary_logloss: 0.314865
[258]	valid_0's auc: 0.870042	valid_0's binary_logloss: 0.314822
[259]	valid_0's auc: 0.870085	valid_0's binary_logloss: 0.314778
[260]	valid_0's auc: 0.870143	valid_0's binary_logloss: 0.314706
[261]	valid_0's auc: 0.870173	valid_0's binary_logloss: 0.314666
[262]	valid_0's auc: 0.870209	valid_0's binary_logloss: 0.314637
[263]	valid_0's auc: 0.870259	valid_0's binary_logloss: 0.314586
[264]	valid_0's auc: 0.870295	valid_0's binary_logloss: 0.314537
[265]	valid_0's auc: 0.870345	valid_0's binary_logloss: 0.31448
[266]	valid_0's auc: 0.870376	valid_0's binary_logloss: 0.314445
[267]	valid_0's auc: 0.870421	valid_0's binary_logloss: 0.314401
[268]	valid_0's auc: 0.870453	valid_0's binary_logloss: 0.314363
[269]	valid_0's auc: 0.870479	valid_0's binary_logloss: 0.314332
[270]	valid_0's auc: 0.870517	valid_0's binary_logloss: 0.314291
[271]	valid_0's auc: 0.870

[383]	valid_0's auc: 0.87358	valid_0's binary_logloss: 0.310915
[384]	valid_0's auc: 0.873593	valid_0's binary_logloss: 0.310902
[385]	valid_0's auc: 0.873604	valid_0's binary_logloss: 0.310891
[386]	valid_0's auc: 0.873616	valid_0's binary_logloss: 0.310879
[387]	valid_0's auc: 0.873638	valid_0's binary_logloss: 0.310855
[388]	valid_0's auc: 0.873657	valid_0's binary_logloss: 0.310836
[389]	valid_0's auc: 0.873667	valid_0's binary_logloss: 0.310825
[390]	valid_0's auc: 0.873688	valid_0's binary_logloss: 0.310802
[391]	valid_0's auc: 0.8737	valid_0's binary_logloss: 0.31078
[392]	valid_0's auc: 0.873723	valid_0's binary_logloss: 0.310748
[393]	valid_0's auc: 0.873741	valid_0's binary_logloss: 0.31073
[394]	valid_0's auc: 0.873755	valid_0's binary_logloss: 0.310715
[395]	valid_0's auc: 0.873793	valid_0's binary_logloss: 0.310676
[396]	valid_0's auc: 0.873817	valid_0's binary_logloss: 0.310649
[397]	valid_0's auc: 0.873834	valid_0's binary_logloss: 0.310631
[398]	valid_0's auc: 0.873849	

[510]	valid_0's auc: 0.875734	valid_0's binary_logloss: 0.308506
[511]	valid_0's auc: 0.875739	valid_0's binary_logloss: 0.308501
[512]	valid_0's auc: 0.875753	valid_0's binary_logloss: 0.308484
[513]	valid_0's auc: 0.87577	valid_0's binary_logloss: 0.308466
[514]	valid_0's auc: 0.875803	valid_0's binary_logloss: 0.308435
[515]	valid_0's auc: 0.875829	valid_0's binary_logloss: 0.30841
[516]	valid_0's auc: 0.875838	valid_0's binary_logloss: 0.308394
[517]	valid_0's auc: 0.875852	valid_0's binary_logloss: 0.308381
[518]	valid_0's auc: 0.875869	valid_0's binary_logloss: 0.308363
[519]	valid_0's auc: 0.875888	valid_0's binary_logloss: 0.308346
[520]	valid_0's auc: 0.875897	valid_0's binary_logloss: 0.308338
[521]	valid_0's auc: 0.875921	valid_0's binary_logloss: 0.308309
[522]	valid_0's auc: 0.875932	valid_0's binary_logloss: 0.308293
[523]	valid_0's auc: 0.875939	valid_0's binary_logloss: 0.308284
[524]	valid_0's auc: 0.875953	valid_0's binary_logloss: 0.308266
[525]	valid_0's auc: 0.8759

[637]	valid_0's auc: 0.877143	valid_0's binary_logloss: 0.306887
[638]	valid_0's auc: 0.877143	valid_0's binary_logloss: 0.306887
[639]	valid_0's auc: 0.877148	valid_0's binary_logloss: 0.306882
[640]	valid_0's auc: 0.877157	valid_0's binary_logloss: 0.306871
[641]	valid_0's auc: 0.877176	valid_0's binary_logloss: 0.306852
[642]	valid_0's auc: 0.877191	valid_0's binary_logloss: 0.306839
[643]	valid_0's auc: 0.877217	valid_0's binary_logloss: 0.306811
[644]	valid_0's auc: 0.877228	valid_0's binary_logloss: 0.306798
[645]	valid_0's auc: 0.877229	valid_0's binary_logloss: 0.306796
[646]	valid_0's auc: 0.877235	valid_0's binary_logloss: 0.306786
[647]	valid_0's auc: 0.87725	valid_0's binary_logloss: 0.306771
[648]	valid_0's auc: 0.877259	valid_0's binary_logloss: 0.306759
[649]	valid_0's auc: 0.877278	valid_0's binary_logloss: 0.306734
[650]	valid_0's auc: 0.877293	valid_0's binary_logloss: 0.306716
[651]	valid_0's auc: 0.877309	valid_0's binary_logloss: 0.306701
[652]	valid_0's auc: 0.877

[764]	valid_0's auc: 0.878144	valid_0's binary_logloss: 0.305716
[765]	valid_0's auc: 0.878149	valid_0's binary_logloss: 0.30571
[766]	valid_0's auc: 0.878164	valid_0's binary_logloss: 0.305692
[767]	valid_0's auc: 0.878165	valid_0's binary_logloss: 0.30569
[768]	valid_0's auc: 0.878171	valid_0's binary_logloss: 0.305684
[769]	valid_0's auc: 0.878181	valid_0's binary_logloss: 0.305674
[770]	valid_0's auc: 0.878183	valid_0's binary_logloss: 0.305672
[771]	valid_0's auc: 0.878187	valid_0's binary_logloss: 0.305666
[772]	valid_0's auc: 0.878202	valid_0's binary_logloss: 0.305649
[773]	valid_0's auc: 0.878204	valid_0's binary_logloss: 0.305645
[774]	valid_0's auc: 0.878221	valid_0's binary_logloss: 0.305626
[775]	valid_0's auc: 0.878224	valid_0's binary_logloss: 0.305622
[776]	valid_0's auc: 0.878225	valid_0's binary_logloss: 0.305621
[777]	valid_0's auc: 0.87823	valid_0's binary_logloss: 0.305612
[778]	valid_0's auc: 0.878239	valid_0's binary_logloss: 0.305601
[779]	valid_0's auc: 0.87826

[891]	valid_0's auc: 0.878941	valid_0's binary_logloss: 0.304786
[892]	valid_0's auc: 0.878951	valid_0's binary_logloss: 0.304775
[893]	valid_0's auc: 0.878954	valid_0's binary_logloss: 0.304771
[894]	valid_0's auc: 0.878955	valid_0's binary_logloss: 0.30477
[895]	valid_0's auc: 0.878956	valid_0's binary_logloss: 0.304769
[896]	valid_0's auc: 0.87896	valid_0's binary_logloss: 0.304765
[897]	valid_0's auc: 0.878972	valid_0's binary_logloss: 0.304748
[898]	valid_0's auc: 0.878973	valid_0's binary_logloss: 0.304747
[899]	valid_0's auc: 0.878978	valid_0's binary_logloss: 0.304741
[900]	valid_0's auc: 0.878985	valid_0's binary_logloss: 0.304734
[901]	valid_0's auc: 0.878985	valid_0's binary_logloss: 0.304733
[902]	valid_0's auc: 0.878986	valid_0's binary_logloss: 0.304732
[903]	valid_0's auc: 0.878987	valid_0's binary_logloss: 0.304732
[904]	valid_0's auc: 0.878987	valid_0's binary_logloss: 0.304731
[905]	valid_0's auc: 0.878989	valid_0's binary_logloss: 0.304729
[906]	valid_0's auc: 0.8789

[1018]	valid_0's auc: 0.879706	valid_0's binary_logloss: 0.303903
[1019]	valid_0's auc: 0.879709	valid_0's binary_logloss: 0.303899
[1020]	valid_0's auc: 0.879717	valid_0's binary_logloss: 0.30389
[1021]	valid_0's auc: 0.879727	valid_0's binary_logloss: 0.303879
[1022]	valid_0's auc: 0.879729	valid_0's binary_logloss: 0.303876
[1023]	valid_0's auc: 0.879733	valid_0's binary_logloss: 0.303871
[1024]	valid_0's auc: 0.879736	valid_0's binary_logloss: 0.303869
[1025]	valid_0's auc: 0.879744	valid_0's binary_logloss: 0.303859
[1026]	valid_0's auc: 0.879753	valid_0's binary_logloss: 0.30385
[1027]	valid_0's auc: 0.879755	valid_0's binary_logloss: 0.303849
[1028]	valid_0's auc: 0.879757	valid_0's binary_logloss: 0.303846
[1029]	valid_0's auc: 0.879758	valid_0's binary_logloss: 0.303845
[1030]	valid_0's auc: 0.879758	valid_0's binary_logloss: 0.303844
[1031]	valid_0's auc: 0.879761	valid_0's binary_logloss: 0.30384
[1032]	valid_0's auc: 0.879769	valid_0's binary_logloss: 0.303829
[1033]	valid_

In [None]:
sub['label'] = model_lgb.predict_proba(X_test[feature_cols])[:, 1]
sub.to_csv('./result/2000.txt', index=None, header=None, sep='\t')

In [None]:
fi = pd.DataFrame({'feature': feature_cols, 'imp': model_lgb.feature_importances_})
fi['rate'] = fi['imp'] / fi['imp'].sum()
fi.sort_values(by='rate', ascending=False)[:60]