In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging
import pickle

In [2]:
# Root-logger setup: timestamp, level and originating module on every record.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)

In [3]:
# base_path: directory of the raw competition files; feature_path: directory of
# the pre-computed feature files read by the cells below.
base_path, feature_path = './data', './feature'

In [4]:
# Labelled invitations: tab-separated, no header row.
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
train = train.drop(columns='dt')
logging.info("invite %s", train.shape)

# Evaluation invitations. `sub` keeps all three raw columns (it later receives
# the predicted label and is written out); `test` is the same data without `dt`.
sub = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
sub.columns = ['qid', 'uid', 'dt']
sub_size = len(sub)

test = sub.drop(columns='dt')
logging.info("test %s", test.shape)

[2019-12-09 16:42:01,979] INFO in <ipython-input-4-e22c8289580a>: invite (9489162, 3)
[2019-12-09 16:42:03,537] INFO in <ipython-input-4-e22c8289580a>: test (1141683, 2)


In [5]:
# Load the "ans" k-fold features, restricted to the columns named in `cols`.
cols = [
    'day', 'hour',
    'q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count',
    'u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count',
    'q_ans_kfold_count', 'u_ans_kfold_count',
    'q_diff_qa_days_sum', 'q_diff_qa_days_max', 'q_diff_qa_days_mean',
    'u_diff_qa_days_sum', 'u_diff_qa_days_max', 'u_diff_qa_days_mean',
]

# The feature files are appended column-wise, so they are matched to the
# invitation rows purely by row position (index alignment).
train = pd.concat(
    [train, pd.read_csv(f'{feature_path}/train_kfold_feature.txt', sep='\t', usecols=cols)],
    axis=1,
)
test = pd.concat(
    [test, pd.read_csv(f'{feature_path}/test_kfold_feature.txt', sep='\t', usecols=cols)],
    axis=1,
)

In [6]:
# `week` is the day number modulo 7 (a day-of-week style bucket).
train['week'] = train['day'].mod(7)
test['week'] = test['day'].mod(7)

In [7]:
# Load invite feature set 1 and append its columns by row position.
train = pd.concat(
    [train, pd.read_csv(f'{feature_path}/train_invite_feature.txt', sep='\t')],
    axis=1,
)
test = pd.concat(
    [test, pd.read_csv(f'{feature_path}/test_invite_feature.txt', sep='\t')],
    axis=1,
)

In [8]:
# Load invite feature set 2 and append its columns by row position.
train = pd.concat(
    [train, pd.read_csv(f'{feature_path}/train_invite_feature_2.txt', sep='\t')],
    axis=1,
)
test = pd.concat(
    [test, pd.read_csv(f'{feature_path}/test_invite_feature_2.txt', sep='\t')],
    axis=1,
)

In [9]:
# Load the k-fold topic features (QU) and append them by row position.
train = pd.concat(
    [train, pd.read_csv(f'{feature_path}/train_kfold_topic_feature.txt', sep='\t')],
    axis=1,
)
test = pd.concat(
    [test, pd.read_csv(f'{feature_path}/test_kfold_topic_feature.txt', sep='\t')],
    axis=1,
)

In [10]:
# Load the user k-fold topic features (UU) and append them by row position.
train = pd.concat(
    [train, pd.read_csv(f'{feature_path}/train_kfold_ut_feature.txt', sep='\t')],
    axis=1,
)
test = pd.concat(
    [test, pd.read_csv(f'{feature_path}/test_kfold_ut_feature.txt', sep='\t')],
    axis=1,
)

In [11]:
# Show the training frame after the feature files loaded so far were appended.
train

Unnamed: 0,qid,uid,label,day,hour,q_inv_kfold_mean,q_inv_kfold_sum,q_inv_kfold_std,q_inv_kfold_count,u_inv_kfold_mean,...,min_uu_sim,max_uu_sim,sum_uu_sim,mean_uu_sim,std_uu_sim,min_uu_sim_eucl,max_uu_sim_eucl,sum_uu_sim_eucl,mean_uu_sim_eucl,std_uu_sim_eucl
0,Q2166419046,M401693808,0,3865,22,,,,,0.000000,...,,,,,,,,,,
1,Q1550017551,M3392373099,0,3844,11,0.166667,1.0,0.408248,6.0,0.000000,...,,,,,,,,,,
2,Q604029601,M2317670257,0,3862,15,,,,,0.090909,...,0.207833,0.984294,8.984843,0.598990,0.285644,4.781230,30.308872,288.8628,19.257520,8.653774
3,Q2350061229,M1618461867,0,3849,11,,,,,0.000000,...,,,,,,,,,,
4,Q2443223942,M3544409350,0,3867,4,0.375000,57.0,0.485723,152.0,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9489157,Q2372512458,M4285896253,1,3849,11,,,,,0.394737,...,-0.373491,0.981553,24.051767,0.126588,0.302936,4.855520,40.901810,5760.9873,30.320986,7.090570
9489158,Q3516644442,M4285896253,1,3862,12,,,,,0.314286,...,-0.310439,0.981553,29.133970,0.153337,0.272248,4.855520,45.733260,6070.9185,31.952202,7.340276
9489159,Q3847094730,M4285896253,0,3852,8,0.000000,0.0,,1.0,0.394737,...,-0.373491,0.981553,24.051767,0.126588,0.302936,4.855520,40.901810,5760.9873,30.320986,7.090570
9489160,Q2358485548,M4285896253,0,3864,7,,,,,0.314286,...,-0.310439,0.981553,29.133970,0.153337,0.272248,4.855520,45.733260,6070.9185,31.952202,7.340276


In [12]:
# Load the k-fold label features: one pickled frame per key in `single_targets`,
# for train and for test, appended column-wise (matched by row position).
single_targets = ['uid', 'qid', 'freq', 'score', 
                  'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                  'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
                  'diff_iq_day', 'diff_iq_hour', 
                  'intersection_ft_count', 'intersection_it_count']
for f in single_targets:
    # Disabled: column names used by the disabled merge-based test path at the
    # bottom of this loop.
#     extract_feat_1 = [f+'_kfold_count', f+'_label_mean', f+'_label_sum', f+'_label_std']
#     extract_feat_2 = [f+'_kfold_hour_count', f+'_label_hour_mean', 
#                       f+'_label_hour_sum', f+'_label_hour_std']
#     extract_feat_3 = [f+'_kfold_week_count', f+'_label_week_mean', 
#                       f+'_label_week_sum', f+'_label_week_std']
#     extract_feat = extract_feat_1 + extract_feat_2 + extract_feat_3

    logging.info('adding kfold label feature, at: %s', f)

    # Open each pickle in a `with` block so the file handle is closed right
    # after loading instead of being left to the garbage collector.
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by this project's own feature scripts.
    with open(f'{feature_path}/single_kfold_feat/train_{f}.pkl', 'rb') as fh:
        t1 = pickle.load(fh)
    train = pd.concat([train, t1], axis=1)

    with open(f'{feature_path}/single_kfold_feat/test_{f}_merged.pkl', 'rb') as fh:
        t1 = pickle.load(fh)
    test = pd.concat([test, t1], axis=1)

    # Disabled alternative for test: merge the three per-key pickles
    # (by key, by key+hour, by key+week) instead of loading the pre-merged file.
#     t1 = pickle.load(open(f'{feature_path}/single_kfold_feat/test_{f}_t1.pkl', 'rb'))
#     test = pd.merge(test, t1, on=f, how='left')
#     t1 = pickle.load(open(f'{feature_path}/single_kfold_feat/test_{f}_t2.pkl', 'rb'))
#     test = pd.merge(test, t1, on=[f, 'hour'], how='left')
#     t1 = pickle.load(open(f'{feature_path}/single_kfold_feat/test_{f}_t3.pkl', 'rb'))
#     test = pd.merge(test, t1, on=[f, 'week'], how='left')
#     for i in range(0, len(extract_feat), 2):
#         test[extract_feat[i]] = test[extract_feat[i]].fillna(0).astype('int32')
#     for i in range(1, len(extract_feat), 2):
#         test[extract_feat[i]] = test[extract_feat[i]].astype('float32')

[2019-12-09 17:08:48,971] INFO in <ipython-input-12-792d4e8953f7>: adding kfold label feature, at: uid
[2019-12-09 17:09:02,081] INFO in <ipython-input-12-792d4e8953f7>: adding kfold label feature, at: qid
[2019-12-09 17:09:16,009] INFO in <ipython-input-12-792d4e8953f7>: adding kfold label feature, at: freq
[2019-12-09 17:09:30,321] INFO in <ipython-input-12-792d4e8953f7>: adding kfold label feature, at: score
[2019-12-09 17:09:46,115] INFO in <ipython-input-12-792d4e8953f7>: adding kfold label feature, at: uf_b1
[2019-12-09 17:10:02,078] INFO in <ipython-input-12-792d4e8953f7>: adding kfold label feature, at: uf_b2
[2019-12-09 17:10:19,345] INFO in <ipython-input-12-792d4e8953f7>: adding kfold label feature, at: uf_b3
[2019-12-09 17:10:37,132] INFO in <ipython-input-12-792d4e8953f7>: adding kfold label feature, at: uf_b4
[2019-12-09 17:10:56,136] INFO in <ipython-input-12-792d4e8953f7>: adding kfold label feature, at: uf_b5
[2019-12-09 17:11:15,780] INFO in <ipython-input-12-792d4e89

In [13]:
# Show the training frame again, now including the k-fold label feature columns.
train

Unnamed: 0,qid,uid,label,day,hour,q_inv_kfold_mean,q_inv_kfold_sum,q_inv_kfold_std,q_inv_kfold_count,u_inv_kfold_mean,...,intersection_it_count_label_sum,intersection_it_count_label_std,intersection_it_count_kfold_hour_count,intersection_it_count_label_hour_mean,intersection_it_count_label_hour_sum,intersection_it_count_label_hour_std,intersection_it_count_kfold_week_count,intersection_it_count_label_week_mean,intersection_it_count_label_week_sum,intersection_it_count_label_week_std
0,Q2166419046,M401693808,0,3865,22,,,,,0.000000,...,1179461,0.391889,260538,0.171806,44762,0.377212,818421,0.177723,145452,0.382279
1,Q1550017551,M3392373099,0,3844,11,0.166667,1.0,0.408248,6.0,0.000000,...,1130802,0.379072,489263,0.174875,85560,0.379861,966324,0.168472,162799,0.374286
2,Q604029601,M2317670257,0,3862,15,,,,,0.090909,...,1179461,0.391889,370132,0.178488,66064,0.382923,840145,0.189349,159081,0.391786
3,Q2350061229,M1618461867,0,3849,11,,,,,0.000000,...,72031,0.340461,41449,0.127506,5285,0.333543,72900,0.134280,9789,0.340955
4,Q2443223942,M3544409350,0,3867,4,0.375000,57.0,0.485723,152.0,0.000000,...,1179461,0.391889,28506,0.248614,7087,0.432217,1070070,0.187207,200325,0.390078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9489157,Q2372512458,M4285896253,1,3849,11,,,,,0.394737,...,1184442,0.382966,455098,0.183901,83693,0.387404,885195,0.180645,159906,0.384724
9489158,Q3516644442,M4285896253,1,3862,12,,,,,0.314286,...,1179461,0.391889,358731,0.185314,66478,0.388553,840145,0.189349,159081,0.391786
9489159,Q3847094730,M4285896253,0,3852,8,0.000000,0.0,,1.0,0.394737,...,1184442,0.382966,719732,0.164349,118287,0.370592,1067729,0.180605,192837,0.384691
9489160,Q2358485548,M4285896253,0,3864,7,,,,,0.314286,...,1179461,0.391889,420698,0.199923,84107,0.399942,852160,0.183287,156190,0.386902


In [14]:
# Show the test frame with the same feature columns appended (it has no `label`).
test

Unnamed: 0,qid,uid,day,hour,q_inv_kfold_mean,q_inv_kfold_sum,q_inv_kfold_std,q_inv_kfold_count,u_inv_kfold_mean,u_inv_kfold_sum,...,intersection_it_count_label_sum,intersection_it_count_label_std,intersection_it_count_kfold_hour_count,intersection_it_count_label_hour_mean,intersection_it_count_label_hour_sum,intersection_it_count_label_hour_std,intersection_it_count_kfold_week_count,intersection_it_count_label_week_mean,intersection_it_count_label_week_sum,intersection_it_count_label_week_std
0,Q1493039281,M64135255,3870,9,0.0,0.0,0.000000,2.0,0.125000,1.0,...,73763,0.338908,32422,0.164129,5321,0.370397,70854,0.138965,9846,0.345912
1,Q2023398782,M2536956560,3872,22,,,,,0.000000,0.0,...,1203297,0.385397,310342,0.153678,47692,0.360640,918459,0.172960,158856,0.378213
2,Q4151338694,M3294926344,3874,15,,,,,0.100000,1.0,...,1203297,0.385397,376357,0.174779,65779,0.379778,1070512,0.179891,192575,0.384096
3,Q3271436624,M3744310794,3873,4,0.5,3.0,0.547723,6.0,0.000000,0.0,...,1203297,0.385397,28863,0.251647,7263,0.433965,1040677,0.178691,185959,0.383093
4,Q3314287018,M1349051752,3872,19,,,,,0.000000,0.0,...,1203297,0.385397,391682,0.150698,59025,0.357754,918459,0.172960,158856,0.378213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141678,Q1238703523,M2010778235,3869,20,,,,,0.142857,1.0,...,1203297,0.385397,255560,0.177842,45449,0.382381,916839,0.180572,165555,0.384663
1141679,Q1074024036,M3131383616,3872,21,,,,,1.000000,1.0,...,1203297,0.385397,285587,0.158766,45341,0.365458,918459,0.172960,158856,0.378213
1141680,Q3478846332,M1872860897,3871,15,,,,,0.000000,0.0,...,1203297,0.385397,376357,0.174779,65779,0.379778,921743,0.176812,162975,0.381510
1141681,Q734170704,M3574631517,3871,8,,,,,0.125000,2.0,...,1203297,0.385397,749352,0.162356,121661,0.368777,921743,0.176812,162975,0.381510


In [None]:
# Load the member (user) profiles: tab-separated, no header row.
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq', 'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',  'score', 'follow_topic', 'inter_topic']

# The two topic columns are not used in this notebook.
del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

# Columns with a single distinct value carry no information; drop them.
for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

# Label-encode every remaining string column except the join key `uid`.
t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    user[d] = LabelEncoder().fit_transform(user[d])
    logging.info('encode %s', d)

# Question ids: fit on the union of train and test so both can be transformed.
# The encoder is fitted on str-cast values, so transform the same str-cast
# values to keep fit and transform consistent.
q_lb = LabelEncoder()
q_lb.fit(list(train['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train['qid_enc'] = q_lb.transform(train['qid'].astype(str))
test['qid_enc'] = q_lb.transform(test['qid'].astype(str))

# User ids: fit on the member table; transform raises if an invitation refers
# to a uid that is missing from it.
u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train['uid_enc'] = u_lb.transform(train['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# Merge user attributes. `validate='many_to_one'` makes pandas raise if `uid`
# is duplicated in `user`, which would otherwise silently multiply invitation
# rows and break the row-for-row correspondence with `sub`.
train = pd.merge(train, user, on='uid', how='left', validate='many_to_one')
test = pd.merge(test, user, on='uid', how='left', validate='many_to_one')
logging.info("train shape %s, test shape %s", train.shape, test.shape)

[2019-12-09 17:15:56,395] INFO in <ipython-input-15-aaab6ed818bf>: user (1931654, 14)
[2019-12-09 17:16:00,105] INFO in <ipython-input-15-aaab6ed818bf>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-12-09 17:16:00,110] INFO in <ipython-input-15-aaab6ed818bf>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-09 17:16:01,104] INFO in <ipython-input-15-aaab6ed818bf>: encode gender
[2019-12-09 17:16:02,010] INFO in <ipython-input-15-aaab6ed818bf>: encode freq
[2019-12-09 17:16:02,849] INFO in <ipython-input-15-aaab6ed818bf>: encode uf_c1
[2019-12-09 17:16:03,653] INFO in <ipython-input-15-aaab6ed818bf>: encode uf_c2
[2019-12-09 17:16:04,448] INFO in <ipython-input-15-aaab6ed818bf>: encode uf_c3
[2019-12-09 17:16:05,250] INFO in

In [None]:
# Stack train on top of test so later feature engineering sees both; remember
# where the training rows end so the two parts can be split again by position.
# sort=True orders the columns alphabetically.
len_train = len(train)
data = pd.concat([train, test], axis=0, sort=True)
del train

In [None]:
# Count features: for each categorical column add a min-max scaled frequency
# column named '<feat>_count'.
# NOTE: this cell rewrites data[feat] in place (rare-value collapse and +1
# shift), so running it twice changes the result.
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = f'{feat}_count'

    # Pass 1: raw frequency of each value in `data`.
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))

    # Values occurring fewer than twice are collapsed into one bucket: they are
    # set to -1 and the whole column is then shifted by +1, so the bucket is 0.
    is_rare = data[col_name] < 2
    data.loc[is_rare, feat] = -1
    data[feat] += 1

    # Pass 2: recount after the collapse, then scale the counts to [0, 1].
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    lowest, highest = data[col_name].min(), data[col_name].max()
    data[col_name] = (data[col_name] - lowest) / (highest - lowest)

In [None]:
# Columns excluded from the model inputs (the target plus raw id/date columns).
# NOTE: this is a tuple, so the commented-out `drop_feat += [...]` below would
# raise TypeError (tuple += list) if re-enabled as written.
drop_feat = ('label', 'uid', 'qid', 'dt', 'day') 
# drop_feat += ['q_is_good_sum', 'q_is_good_max', 'q_is_good_mean', 'u_is_good_sum', 'u_is_good_max', 
#               'u_is_good_mean', 'q_is_rec_sum', 'q_is_rec_max', 'q_is_rec_mean', 'u_is_rec_sum', 
#               'u_is_rec_max', 'u_is_rec_mean', 'q_is_dest_sum', 'q_is_dest_max', 'q_is_dest_mean', 
#               'u_is_dest_sum', 'u_is_dest_max', 'u_is_dest_mean', 'q_has_img_sum', 'q_has_img_max', 
#               'q_has_img_mean', 'u_has_img_sum', 'u_has_img_max', 'u_has_img_mean', 'q_has_video_sum', 
#               'q_has_video_max', 'q_has_video_mean', 'u_has_video_sum', 'u_has_video_max', 'u_has_video_mean', 
#               'q_word_count_sum', 'q_word_count_max', 'q_word_count_mean', 'u_word_count_sum', 
#               'u_word_count_max', 'u_word_count_mean', 'q_reci_cheer_sum', 'q_reci_cheer_max', 
#               'q_reci_cheer_mean', 'u_reci_cheer_sum', 'u_reci_cheer_max', 'u_reci_cheer_mean', 
#               'q_reci_uncheer_sum', 'q_reci_uncheer_max', 'q_reci_uncheer_mean', 'u_reci_uncheer_sum',
#               'u_reci_uncheer_max', 'u_reci_uncheer_mean', 'q_reci_comment_sum', 'q_reci_comment_max', 
#               'q_reci_comment_mean', 'u_reci_comment_sum', 'u_reci_comment_max', 'u_reci_comment_mean', 
#               'q_reci_mark_sum', 'q_reci_mark_max', 'q_reci_mark_mean', 'u_reci_mark_sum', 'u_reci_mark_max', 
#               'u_reci_mark_mean', 'q_reci_tks_sum', 'q_reci_tks_max', 'q_reci_tks_mean', 'u_reci_tks_sum', 
#               'u_reci_tks_max', 'u_reci_tks_mean', 'q_reci_xxx_sum', 'q_reci_xxx_max', 'q_reci_xxx_mean', 
#               'u_reci_xxx_sum', 'u_reci_xxx_max', 'u_reci_xxx_mean', 'q_reci_no_help_sum', 'q_reci_no_help_max',
#               'q_reci_no_help_mean', 'u_reci_no_help_sum', 'u_reci_no_help_max', 'u_reci_no_help_mean', 
#               'q_reci_dis_sum', 'q_reci_dis_max', 'q_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_dis_max', 
#               'u_reci_dis_mean']

# Model inputs: every column of `data`, in column order, that is not in `drop_feat`.
feature_cols = [col for col in data.columns if col not in drop_feat]
# feature_cols

In [None]:
# Show the final list of model input columns.
feature_cols

In [None]:
logging.info("feature size %s", len(feature_cols))

# Split `data` back by position: the first len_train rows are training rows,
# the rest is the test set. X_test keeps every column; callers select
# feature_cols when predicting.
X_train_all = data.iloc[:len_train][feature_cols]
y_train_all = data.iloc[:len_train]['label']
X_test = data.iloc[len_train:]
assert len(X_test) == sub_size

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Only the first of the five stratified splits is used: one train/validation
# hold-out rather than full cross-validation.
index, (train_idx, val_idx) = next(enumerate(fold.split(X=X_train_all, y=y_train_all)))

X_train = X_train_all.iloc[train_idx][feature_cols]
X_val = X_train_all.iloc[val_idx][feature_cols]
y_train = y_train_all.iloc[train_idx]
y_val = y_train_all.iloc[val_idx]
del X_train_all

logging.info("train shape %s, val shape %s, test shape %s", X_train.shape, X_val.shape, X_test.shape)

# Up to 2000 trees; training stops early once the validation metrics have not
# improved for 50 rounds.
model_lgb = LGBMClassifier(
    n_estimators=2000,
    n_jobs=-1,
    objective='binary',
    seed=1000,
    silent=True,
)
model_lgb.fit(
    X_train, y_train,
    eval_metric=['logloss', 'auc'],
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=50,
)



In [None]:
# Score the test rows; column 1 of predict_proba is taken as the predicted
# probability for label 1.
sub['label'] = model_lgb.predict_proba(X_test[feature_cols])[:, 1]

# Create the output directory if needed so the write cannot fail after a long
# training run just because './result' is missing.
os.makedirs('./result', exist_ok=True)

# Tab-separated, no header row and no index column.
sub.to_csv('./result/2000.txt', index=False, header=False, sep='\t')

In [None]:
# Feature-importance table: one row per model input, with `rate` being each
# feature's share of the summed importance, sorted from most to least important.
fi = pd.DataFrame({
    'feature': feature_cols,
    'imp': model_lgb.feature_importances_,
})
fi['rate'] = fi['imp'].div(fi['imp'].sum())
fi_sorted = fi.sort_values('rate', ascending=False)

In [None]:
# import pickle
# pickle.dump(fi_sorted, open('./feature_importance.pkl', 'wb'))

In [None]:
# The 60 highest-ranked features.
fi_sorted.head(60)

In [None]:
# Names of the features ranked 80th-to-last through 61st-to-last by importance rate.
temp = list(fi_sorted[-80:-60].feature)
temp