In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [2]:
# Root logger configuration: INFO and above, each record prefixed with
# timestamp, level and originating module.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)

In [3]:
# Directories for the raw competition files and the pre-computed feature
# files; both are relative to the working directory printed below.
base_path, feature_path = './data', './feature'
print(os.getcwd())

/extend/yezhizi/ZhiHuComp/zhcup


In [4]:
# Load the invitation records (tab-separated, no header row).
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', header=None, sep='\t')
train.columns = ['qid', 'uid', 'dt', 'label']
train = train.drop(columns=['dt'])  # dt is not kept in the modelling frame
logging.info("invite %s", train.shape)

# The evaluation file has the same layout without the label column.
test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', header=None, sep='\t')
test.columns = ['qid', 'uid', 'dt']
# Keep an untouched copy (dt included) to serve as the submission frame.
sub = test.copy()
sub_size = len(sub)
test = test.drop(columns=['dt'])
logging.info("test %s", test.shape)



[2019-12-10 17:28:42,298] INFO in <ipython-input-4-f5fcce844f4e>: invite (9489162, 3)
[2019-12-10 17:28:44,494] INFO in <ipython-input-4-f5fcce844f4e>: test (1141683, 2)


In [5]:
# Load the k-fold features and append them column-wise to train and test.
# NOTE(review): pd.concat along the columns aligns on the index, so the
# feature files are presumably row-aligned with the invite files — confirm.
t1 = pd.read_csv(f'{feature_path}/train_kfold_feature.txt', sep='\t')
train = pd.concat((train, t1), axis='columns')

t1 = pd.read_csv(f'{feature_path}/test_kfold_feature.txt', sep='\t')
test = pd.concat((test, t1), axis='columns')

In [6]:
# 加载 user 过去两个月的回答统计特征（除当条记录）
# t1 = pd.read_csv(f'{feature_path}/train_ua_feature.txt', sep='\t')
# train = pd.concat([train, t1], axis=1)

# t1 = pd.read_csv(f'{feature_path}/test_ua_feature.txt', sep='\t')
# test = pd.concat([test, t1], axis=1)

In [7]:
# Load invite feature set 1 and append it column-wise to train and test.
# NOTE(review): relies on the feature files being row-aligned with the
# invite files (concat aligns on the index) — confirm.
t1 = pd.read_csv(f'{feature_path}/train_invite_feature.txt', sep='\t')
train = pd.concat((train, t1), axis='columns')

t1 = pd.read_csv(f'{feature_path}/test_invite_feature.txt', sep='\t')
test = pd.concat((test, t1), axis='columns')

In [8]:
# Load invite feature set 2 and append it column-wise to train and test.
# NOTE(review): relies on the feature files being row-aligned with the
# invite files (concat aligns on the index) — confirm.
t1 = pd.read_csv(f'{feature_path}/train_invite_feature_2.txt', sep='\t')
train = pd.concat((train, t1), axis='columns')

t1 = pd.read_csv(f'{feature_path}/test_invite_feature_2.txt', sep='\t')
test = pd.concat((test, t1), axis='columns')

In [9]:
# # # 加载 member feature
# cate_feats = ['gender', 'freq',
#               'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5',
#               'uf_c5',
#               'cate_clustered']
# use_cols = []
# for i1 in range(len(cate_feats)):
#     for i2 in range(i1+1, len(cate_feats)):
#         use_cols.append('{}_{}_count'.format(cate_feats[i1], cate_feats[i2]))

# t1 = pd.read_csv(f'{feature_path}/train_member_feature.txt', sep='\t', usecols=use_cols)
# train = pd.concat([train, t1], axis=1)

# t1 = pd.read_csv(f'{feature_path}/test_member_feature.txt', sep='\t', usecols=use_cols)
# test = pd.concat([test, t1], axis=1)



In [10]:
# # (Disabled) Pseudo acceptance rate for count features: for every column
# # ending in '_count', the mean label per distinct count value is computed on
# # train and merged into both train and test as '<feat>_psd_lb'.
# # for feat in use_cols:
# for feat in train.keys():
#     if feat[-6:] != '_count':
#         continue

#     tmp = train[[feat, 'label']]
#     tmp.columns = [feat, feat+'_psd_lb']
#     t1 = tmp.groupby([feat]).mean().reset_index()
#     train = pd.merge(train, t1, on=[feat], how='left')
#     test = pd.merge(test, t1, on=[feat], how='left')

# Only live statement in this cell: preview the first rows of the test frame
# after the feature files have been appended.
print(test.head())

qid          uid   day  hour  q_inv_kfold_mean  q_inv_kfold_sum  \
0  Q1493039281    M64135255  3870     9               0.0              0.0   
1  Q2023398782  M2536956560  3872    22               NaN              NaN   
2  Q4151338694  M3294926344  3874    15               NaN              NaN   
3  Q3271436624  M3744310794  3873     4               0.5              3.0   
4  Q3314287018  M1349051752  3872    19               NaN              NaN   

   q_inv_kfold_std  q_inv_kfold_count  u_inv_kfold_mean  u_inv_kfold_sum  ...  \
0         0.000000                2.0             0.125              1.0  ...   
1              NaN                NaN             0.000              0.0  ...   
2              NaN                NaN             0.100              1.0  ...   
3         0.547723                6.0             0.000              0.0  ...   
4              NaN                NaN             0.000              0.0  ...   

   qid_gender_count  qid_uf_b1_count  qid_uf_b2_count  

In [11]:
# Load the member (user) table; columns 2-6 of the 21 are skipped at read time.
user = pd.read_csv(
    f'{base_path}/member_info_0926.txt',
    sep='\t',
    header=None,
    usecols=[i for i in range(21) if i not in (2, 3, 4, 5, 6)],
)
user.columns = ['uid', 'gender', 'freq',
                'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5',
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
                'score', 'follow_topic', 'inter_topic']

# The two topic-list columns are discarded right away.
user = user.drop(columns=['follow_topic', 'inter_topic'])
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

# Remove columns that hold a single distinct value.
for const_col in unq[unq == 1].index:
    del user[const_col]
    logging.info('del unq==1 %s', const_col)

# Every remaining object-typed column except the id is label-encoded.
col_dtypes = user.dtypes
not_encoded = ('follow_topic', 'inter_topic', 'uid')
cats = [col for col in col_dtypes[col_dtypes == 'object'].index if col not in not_encoded]
logging.info("user cat %s", cats)

for cat_col in cats:
    user[cat_col] = LabelEncoder().fit_transform(user[cat_col])
    logging.info('encode %s', cat_col)

# Question ids: the encoder is fitted on the ids of train and test together.
q_lb = LabelEncoder()
q_lb.fit(train['qid'].astype(str).tolist() + test['qid'].astype(str).tolist())
train['qid_enc'] = q_lb.transform(train['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

# User ids: the encoder is fitted on the member table, then applied to both frames.
u_lb = LabelEncoder().fit(user['uid'])
train['uid_enc'] = u_lb.transform(train['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# Attach the user attributes to every invite row.
train = train.merge(user, how='left', on='uid')
test = test.merge(user, how='left', on='uid')
logging.info("train shape %s, test shape %s", train.shape, test.shape)

[2019-12-10 17:33:56,656] INFO in <ipython-input-11-ddd3e7aab310>: user (1931654, 14)
[2019-12-10 17:34:00,162] INFO in <ipython-input-11-ddd3e7aab310>: user unq uid       1931654
gender          3
freq            5
uf_b1           2
uf_b2           2
uf_b3           2
uf_b4           2
uf_b5           2
uf_c1        2561
uf_c2         291
uf_c3         428
uf_c4        1556
uf_c5           2
score         732
dtype: int64
[2019-12-10 17:34:00,168] INFO in <ipython-input-11-ddd3e7aab310>: user cat ['gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-10 17:34:00,913] INFO in <ipython-input-11-ddd3e7aab310>: encode gender
[2019-12-10 17:34:01,795] INFO in <ipython-input-11-ddd3e7aab310>: encode freq
[2019-12-10 17:34:02,326] INFO in <ipython-input-11-ddd3e7aab310>: encode uf_c1
[2019-12-10 17:34:02,833] INFO in <ipython-input-11-ddd3e7aab310>: encode uf_c2
[2019-12-10 17:34:03,300] INFO in <ipython-input-11-ddd3e7aab310>: encode uf_c3
[2019-12-10 17:34:03,950] INFO in

In [12]:
# Stack train on top of test; the first len_train rows are the labelled ones.
# The original row indices are kept, so index values repeat across the two parts.
len_train = len(train)
data = pd.concat([train, test], axis=0, sort=True)
del train

In [13]:
# Inspect the stacked train+test frame (rendered as this cell's output).
data

Unnamed: 0,day,diff_iq_day,diff_iq_hour,freq,gender,hour,intersection_ft_count,intersection_it_count,intersection_it_score,label,...,uid_hour_count,uid_hour_max,uid_hour_mean,uid_hour_median,uid_hour_min,uid_hour_std,uid_week_count,uid_week_mean,uid_week_median,uid_week_std
0,3865,4,95,4,2,22,1,0,0.000000,0.0,...,2,23,20.400000,22.0,12,4.722288,2,2.200000,2.0,1.643168
1,3844,21,495,1,2,11,0,0,0.000000,0.0,...,2,14,9.875000,9.5,7,3.044316,2,3.625000,4.0,1.922610
2,3862,1,24,4,2,15,0,0,0.000000,0.0,...,4,19,13.071428,13.0,7,3.561855,3,3.357143,3.5,1.736803
3,3849,2,37,0,2,11,0,1,1.066367,0.0,...,2,20,8.714286,8.0,0,6.421689,1,3.428571,3.0,1.718249
4,3867,20,469,1,2,4,0,0,0.000000,0.0,...,2,23,14.900000,16.0,4,6.740425,2,3.000000,3.0,2.108185
5,3841,6,143,0,1,16,1,0,0.000000,0.0,...,1,20,11.800000,10.0,5,5.266245,1,3.900000,4.0,1.100505
6,3861,244,5864,0,1,20,0,0,0.000000,0.0,...,2,20,11.800000,10.0,5,5.266245,5,3.900000,4.0,1.100505
7,3850,0,2,1,2,8,0,0,0.000000,1.0,...,2,11,7.727273,9.0,0,3.951985,3,3.000000,3.0,2.529822
8,3850,1,30,4,2,19,0,0,0.000000,0.0,...,2,19,17.200000,17.0,14,2.049390,1,2.400000,3.0,1.816590
9,3839,2,44,0,1,15,0,0,0.000000,0.0,...,3,20,14.416667,15.0,8,4.294993,2,3.583333,3.5,2.020726


In [14]:
# count 特征
# count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
# for feat in count_fea:
#     col_name = '{}_count'.format(feat)
#     data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
#     data.loc[data[col_name] < 2, feat] = -1
#     data[feat] += 1
#     data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
#     data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())

In [15]:
# 所有没有归一化的计数特征, 次数小于2的计数叠加为取值为-1的计数, 然后归一化
# 操作有点复杂, 对于交叉特征, 要先找出两个都是-1的当前计数是 a, 然后找出计数小于2的数量 b, 然后这两个的计数都改为 a+b
# 理解错了, 是将小于2的特征看做一个新的特征,不是和取值-1的合并
# for feat in data.keys():
#     if feat[-6:] == '_count':
#         feat_sub = feat.split('_')[:-1]
        
#         # 切成两部分, 两部分都不是特征下一个
#         is_feat = False
#         for i in range(len(feat_sub)-1):
#             feat1 = '_'.join(feat_sub[:i+1])
#             feat2 = '_'.join(feat_sub[i+1:])
#             if ((feat1 in data.keys()) & (feat2 in data.keys())):
#                 feat_sub = [feat1, feat2]
#                 is_feat = True
#                 break
    
#         if not is_feat:
#             continue

#         idx_less = data[feat].values < 2
#         num_less = np.sum(idx_less)

#         # idx_neg_1 = (data[feat_sub[0]] == -1)
#         # for sub_f in feat_sub[1:]:
#         #     idx_neg_1 = (idx_neg_1 & (data[sub_f] == -1))
        
#         # num_neg_1 = np.sum(idx_neg_1)

#         # new_count = num_less + num_neg_1
#         # data.loc[(idx_less | idx_neg_1), feat] = new_count
#         # print(feat, num_less, num_neg_1, new_count)

#         data.loc[idx_less, feat] = num_less
#         print(feat, num_less)

#         data[feat] = (data[feat] - data[feat].min()) / (data[feat].max() - data[feat].min()).astype('float32')



In [16]:
# 'wk' is the day index modulo 7 (a weekday-like bucket; which weekday 0
# corresponds to is not established in this notebook).
data['wk'] = data['day'].mod(7)

[2019-12-10 17:35:45,405] INFO in utils: Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2019-12-10 17:35:45,406] INFO in utils: NumExpr defaulting to 8 threads.


In [17]:
# count 二阶组合特征
# count_feats = ['freq', 'gender', 'score', 'qid_enc', 'uid_enc', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'hour', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour']

# # 'qid_enc' 与其他所有特征组合计数: 877709: 839029
# count_feats = ['qid_enc', 'freq', 'gender', 'score', 'uid_enc', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'hour', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'day']

# tmp_data = data[count_feats]
# for fi in (count_feats[0:1]):
#     other_feats = [f for f in count_feats if f != fi]
#     for fj in other_feats:
#         t1 = tmp_data.groupby([fi, fj])
#         leftover = [f for f in other_feats if f != fj]

#         print(fi, fj)
#         # 计数,一次就好,都一样的 
#         t2 = t1[leftover[0]].agg(['count'])
#         t2.columns = [fi+'_'+fj+sub for sub in ['_count']]
#         data = pd.merge(data, t2, on=[fi, fj], how='left')

#         # for fk in leftover:
#         #     t2 = t1[fk].agg(['mean', 'sum', 'std'])
#         #     t2.columns = [fi+'_'+fj+'_'+fk+sub for sub in ['_mean', '_sum', '_std']]
#         #     data = pd.merge(data, t2, on=[fi, fj], how='left')
#         #     print(fi, fj, fk)
# print(data.head())

In [18]:
# # 'uid_enc', 与其他所有特征组合计数: 878069 删除了些特征 878379:839113
# count_feats = ['uid_enc', 'hour', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'day']

# tmp_data = data[count_feats]
# fi = count_feats[0]
# other_feats = [f for f in count_feats if f != fi]
# for fj in other_feats:
#     t1 = tmp_data.groupby([fi, fj])
#     leftover = [f for f in other_feats if f != fj]
#     print(fi, fj)
#     # 计数,一次就好,都一样的 
#     t2 = t1[leftover[0]].agg(['count'])
#     t2.columns = [fi+'_'+fj+sub for sub in ['_count']]
#     data = pd.merge(data, t2, on=[fi, fj], how='left')

In [35]:
# # 'hour' 与其他所有特征组合计数: 875196
# count_feats = ['hour', 'freq', 'gender', 'score', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'day']

# tmp_data = data[count_feats]
# for fi in (count_feats[0:1]):
#     other_feats = [f for f in count_feats if f != fi]
#     for fj in other_feats:
#         t1 = tmp_data.groupby([fi, fj])
#         leftover = [f for f in other_feats if f != fj]

#         print(fi, fj)
#         # 计数,一次就好,都一样的 
#         t2 = t1[leftover[0]].agg(['count'])
#         t2.columns = [fi+'_'+fj+sub for sub in ['_count']]
#         data = pd.merge(data, t2, on=[fi, fj], how='left')

In [36]:
# # 'day' 与其他所有特征组合计数: 881459: 
# count_feats = ['day', 'freq', 'gender', 'score', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'hour']

# tmp_data = data[count_feats]
# for fi in (count_feats[0:1]):
#     other_feats = [f for f in count_feats if f != fi]
#     for fj in other_feats:
#         t1 = tmp_data.groupby([fi, fj])
#         leftover = [f for f in other_feats if f != fj]

#         print(fi, fj)
#         # 计数,一次就好,都一样的 
#         t2 = t1[leftover[0]].agg(['count'])
#         t2.columns = [fi+'_'+fj+sub for sub in ['_count']]
#         data = pd.merge(data, t2, on=[fi, fj], how='left')

day freq
day gender
day score
day uf_b1
day uf_b2
day uf_b3
day uf_b4
day uf_b5
day uf_c1
day uf_c2
day uf_c3
day uf_c4
day uf_c5
day q_hour
day wk
day q_week
day diff_iq_day
day diff_iq_hour
day hour


In [48]:
# Column names of the cached count features. count_features.txt is written
# without a header row, so this list must match the order in which the
# columns were saved.
feat_counts_save = []

# First-order counts: one '<feature>_count' column per feature.
other_feats = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
feat_counts_save += [fj + '_count' for fj in other_feats]

# Second-order counts: '<fi>_<fj>_count' for each anchor feature fi, in the
# order qid_enc, uid_enc, day.
pair_feats = [
    ('qid_enc', ['freq', 'gender', 'score', 'uid_enc', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'hour', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'day']),
    ('uid_enc', ['hour', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'day']),
    ('day', ['freq', 'gender', 'score', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'q_hour', 'wk', 'q_week', 'diff_iq_day', 'diff_iq_hour', 'hour']),
]
for fi, other_feats in pair_feats:
    feat_counts_save += [fi + '_' + fj + '_count' for fj in other_feats]

print(feat_counts_save)

# The cache file was produced once with:
# data[feat_counts_save].to_csv(f'{feature_path}/count_features.txt', index=None, header=None, sep='\t')
# Load the cached count features (no header row; names are assigned from the list above).
t1 = pd.read_csv(f'{feature_path}/count_features.txt', sep='\t', header=None)
t1.columns = feat_counts_save
print(len(t1), len(data))

# The cached rows are matched to `data` purely by position. A stale or
# truncated cache would otherwise be silently mis-aligned / NaN-padded by the
# column-wise concat, so fail loudly instead.
if len(t1) != len(data):
    raise ValueError(
        'count_features.txt has %d rows but data has %d rows; regenerate the cache'
        % (len(t1), len(data))
    )

# Give `data` a fresh 0..n-1 index so it lines up with t1's default index
# (read_csv without index_col already yields such an index for t1).
data = data.reset_index(drop=True)

# Drop same-named columns first so that re-running this cell (or having
# computed the counts in-session) does not leave duplicated column names.
already_present = [col for col in feat_counts_save if col in data.columns]
data = data.drop(columns=already_present)

data = pd.concat([data, t1], axis=1)

['uid_enc_count', 'qid_enc_count', 'gender_count', 'freq_count', 'uf_c1_count', 'uf_c2_count', 'uf_c3_count', 'uf_c4_count', 'uf_c5_count', 'qid_enc_freq_count', 'qid_enc_gender_count', 'qid_enc_score_count', 'qid_enc_uid_enc_count', 'qid_enc_uf_b1_count', 'qid_enc_uf_b2_count', 'qid_enc_uf_b3_count', 'qid_enc_uf_b4_count', 'qid_enc_uf_b5_count', 'qid_enc_uf_c1_count', 'qid_enc_uf_c2_count', 'qid_enc_uf_c3_count', 'qid_enc_uf_c4_count', 'qid_enc_uf_c5_count', 'qid_enc_hour_count', 'qid_enc_q_hour_count', 'qid_enc_wk_count', 'qid_enc_q_week_count', 'qid_enc_diff_iq_day_count', 'qid_enc_diff_iq_hour_count', 'qid_enc_day_count', 'uid_enc_hour_count', 'uid_enc_q_hour_count', 'uid_enc_wk_count', 'uid_enc_q_week_count', 'uid_enc_diff_iq_day_count', 'uid_enc_diff_iq_hour_count', 'uid_enc_day_count', 'day_freq_count', 'day_gender_count', 'day_score_count', 'day_uf_b1_count', 'day_uf_b2_count', 'day_uf_b3_count', 'day_uf_b4_count', 'day_uf_b5_count', 'day_uf_c1_count', 'day_uf_c2_count', 'day_u

In [47]:
# # # 'diff_iq_day' 与其他所有特征组合计数: 8810
# count_feats = ['diff_iq_day', 'freq', 'gender', 'score', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'q_hour', 'wk', 'q_week', 'diff_iq_hour', 'hour']

# tmp_data = data[count_feats]
# for fi in (count_feats[0:1]):
#     other_feats = [f for f in count_feats if f != fi]
#     for fj in other_feats:
#         t1 = tmp_data.groupby([fi, fj])
#         leftover = [f for f in other_feats if f != fj]

#         print(fi, fj)
#         # 计数,一次就好,都一样的 
#         t2 = t1[leftover[0]].agg(['count'])
#         t2.columns = [fi+'_'+fj+sub for sub in ['_count']]
#         data = pd.merge(data, t2, on=[fi, fj], how='left')

diff_iq_day freq
diff_iq_day gender
diff_iq_day score
diff_iq_day uf_b1
diff_iq_day uf_b2
diff_iq_day uf_b3
diff_iq_day uf_b4
diff_iq_day uf_b5
diff_iq_day uf_c1
diff_iq_day uf_c2
diff_iq_day uf_c3
diff_iq_day uf_c4
diff_iq_day uf_c5
diff_iq_day q_hour
diff_iq_day wk
diff_iq_day q_week
diff_iq_day diff_iq_hour
diff_iq_day hour


In [37]:
# # 'qid_enc','wk' 与其他所有特征组合计数: 878245: 839068
# count_feats = ['qid_enc', 'wk', 'freq', 'gender', 'score', 'uid_enc', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'hour', 'q_hour', 'day', 'q_week', 'diff_iq_day', 'diff_iq_hour']

# tmp_data = data[count_feats]
# fi = count_feats[0:2]
# other_feats = [f for f in count_feats if f not in fi]
# for fj in other_feats:
#     t1 = tmp_data.groupby(fi + [fj])
#     leftover = [f for f in other_feats if f != fj]
#     print(fi + [fj])
#     # 计数,一次就好,都一样的 
#     t2 = t1[leftover[0]].agg(['count'])
#     t2.columns = ['_'.join(fi)+'_'+fj+sub for sub in ['_count']]
#     data = pd.merge(data, t2, on=fi + [fj], how='left')
# print(data.head(1))

In [38]:
# 'qid_enc','hour' 与其他所有特征组合计数: 878196
#count_feats = ['qid_enc', 'hour', 'freq', 'gender', 'score', 'uid_enc', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'q_hour', 'day', 'q_week', 'diff_iq_day', 'diff_iq_hour']

#tmp_data = data[count_feats]
#fi = count_feats[0:2]
#other_feats = [f for f in count_feats if f not in fi]
#for fj in other_feats:
#    t1 = tmp_data.groupby(fi + [fj])
#    leftover = [f for f in other_feats if f != fj]
##    print(fi + [fj])
    # 计数,一次就好,都一样的 
  ##  t2 = t1[leftover[0]].agg(['count'])
 #   t2.columns = ['_'.join(fi)+'_'+fj+sub for sub in ['_count']]
 #   data = pd.merge(data, t2, on=fi + [fj], how='left')
#print(data.head(1))

In [39]:
# # 'qid_enc','day' 与其他所有特征组合计数: 
# count_feats = ['qid_enc', 'day', 'freq', 'gender', 'score', 'uid_enc', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'q_hour', 'q_week', 'diff_iq_day', 'diff_iq_hour']

# tmp_data = data[count_feats]
# fi = count_feats[0:2]
# other_feats = [f for f in count_feats if f not in fi]
# for fj in other_feats:
#     t1 = tmp_data.groupby(fi + [fj])
#     leftover = [f for f in other_feats if f != fj]
#     print(fi + [fj])
#     # 计数,一次就好,都一样的 
#     t2 = t1[leftover[0]].agg(['count'])
#     t2.columns = ['_'.join(fi)+'_'+fj+sub for sub in ['_count']]
#     data = pd.merge(data, t2, on=fi + [fj], how='left')
# print(data.head(1))

In [49]:
# Columns excluded from the model input: the target, raw identifiers and the
# date columns ...
non_feature_cols = ['label', 'uid', 'qid', 'dt', 'day']
# ... plus a hand-picked list of features left out of training
# (the selection criterion is not recorded in this notebook).
pruned_feats = [
    'u_is_rec_mean', 'u_reci_uncheer_mean', 'q_is_dest_sum', 'u_reci_uncheer_sum', 'u_is_rec_max',
    'u_is_dest_mean', 'q_reci_uncheer_mean', 'q_reci_uncheer_sum', 'u_is_dest_sum', 'q_is_dest_max',
    'q_reci_uncheer_max', 'u_reci_tks_max', 'q_reci_mark_max', 'u_reci_dis_max', 'q_has_video_mean',
    'q_reci_no_help_mean', 'count_u_topic', 'u_has_video_mean', 'q_reci_dis_sum', 'q_reci_mark_sum',
    'q_reci_tks_sum', 'q_reci_tks_max', 'q_reci_dis_max', 'u_reci_mark_max', 'q_is_good_mean',
    'q_reci_no_help_sum', 'q_reci_xxx_max', 'u_reci_xxx_max', 'u_reci_no_help_sum', 'u_reci_xxx_sum',
    'u_is_good_mean', 'q_reci_no_help_max', 'u_has_img_max', 'u_is_good_sum', 'u_reci_no_help_max',
    'u_has_video_sum', 'uf_b5', 'q_reci_xxx_sum', 'q_is_good_sum', 'q_has_img_max', 'q_has_video_sum',
    'q_has_video_max', 'u_has_video_max', 'q_is_good_max', 'q_is_rec_max', 'u_is_good_max',
    'q_is_dest_mean', 'u_reci_uncheer_max', 'uf_c5_count', 'u_is_dest_max', 'q_is_rec_mean',
    'q_is_rec_sum', 'u_is_rec_sum', 'q_reci_xxx_mean', 'u_reci_xxx_mean', 'u_reci_comment_max',
    'q_reci_comment_sum', 'u_reci_cheer_max', 'u_reci_dis_sum', 'u_reci_tks_sum', 'q_has_img_sum',
    'q_reci_comment_max', 'q_reci_cheer_max', 'u_reci_no_help_mean', 'u_has_img_sum', 'u_reci_mark_sum',
]
drop_feat = non_feature_cols + pruned_feats

# Keep the remaining columns in the order they appear in data.
excluded = set(drop_feat)
feature_cols = [col for col in data.columns if col not in excluded]
# feature_cols

In [50]:
logging.info("feature size %s", len(feature_cols))

# Split the stacked frame back into the labelled rows and the evaluation rows.
X_train_all = data.iloc[:len_train][feature_cols]
y_train_all = data.iloc[:len_train]['label']
X_test = data.iloc[len_train:]  # still holds every column; narrowed to feature_cols at predict time
assert len(X_test) == sub_size

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Only the first of the five stratified folds is used as the hold-out split.
index, (train_idx, val_idx) = next(enumerate(fold.split(X=X_train_all, y=y_train_all)))

X_train = X_train_all.iloc[train_idx][feature_cols]
X_val = X_train_all.iloc[val_idx][feature_cols]
y_train = y_train_all.iloc[train_idx]
y_val = y_train_all.iloc[val_idx]
del X_train_all

logging.info("train shape %s, val shape %s, test shape %s", X_train.shape, X_val.shape, X_test.shape)

# Up to 2000 boosting rounds, evaluated on the hold-out split with
# early_stopping_rounds=50.
model_lgb = LGBMClassifier(objective='binary', n_estimators=2000, n_jobs=-1, seed=1000, silent=True)
model_lgb.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric=['logloss', 'auc'],
    early_stopping_rounds=50,
)



s binary_logloss: 0.306385
[1488]	valid_0's auc: 0.880053	valid_0's binary_logloss: 0.306385
[1489]	valid_0's auc: 0.880055	valid_0's binary_logloss: 0.306383
[1490]	valid_0's auc: 0.88006	valid_0's binary_logloss: 0.306376
[1491]	valid_0's auc: 0.880061	valid_0's binary_logloss: 0.306376
[1492]	valid_0's auc: 0.880064	valid_0's binary_logloss: 0.306373
[1493]	valid_0's auc: 0.880064	valid_0's binary_logloss: 0.306373
[1494]	valid_0's auc: 0.880067	valid_0's binary_logloss: 0.306368
[1495]	valid_0's auc: 0.88007	valid_0's binary_logloss: 0.306365
[1496]	valid_0's auc: 0.880074	valid_0's binary_logloss: 0.30636
[1497]	valid_0's auc: 0.880082	valid_0's binary_logloss: 0.30635
[1498]	valid_0's auc: 0.880086	valid_0's binary_logloss: 0.306346
[1499]	valid_0's auc: 0.880092	valid_0's binary_logloss: 0.30634
[1500]	valid_0's auc: 0.880093	valid_0's binary_logloss: 0.306339
[1501]	valid_0's auc: 0.880096	valid_0's binary_logloss: 0.306336
[1502]	valid_0's auc: 0.880099	valid_0's binary_loglos

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1000,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [51]:
# To rebuild the submission frame from scratch instead of reusing `sub`:
# sub = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
# sub.columns = ['qid', 'uid', 'dt']

# Column 1 of predict_proba is taken as the submitted score.
test_proba = model_lgb.predict_proba(X_test[feature_cols])
sub['label'] = test_proba[:, 1]

In [52]:
# Write the submission (tab-separated, no header, no index); the file name
# embeds the validation AUC so that different runs do not overwrite each other.
best_auc = model_lgb.best_score_['valid_0']['auc']
# to_csv does not create missing directories; make sure the output folder
# exists so a finished training run is not lost to a write error.
os.makedirs('./result', exist_ok=True)
sub.to_csv('./result/submit_%.6f.txt' % best_auc, index=None, header=None, sep='\t')

In [53]:
# Feature importances reported by the fitted model, with each feature's
# share of the total.
fi = pd.DataFrame(
    {'feature': feature_cols, 'imp': model_lgb.feature_importances_}
).assign(rate=lambda frame: frame['imp'] / frame['imp'].sum())
fi

Unnamed: 0,feature,imp,rate
0,diff_iq_day,150,0.002875
1,diff_iq_hour,489,0.009373
2,freq,153,0.002933
3,gender,96,0.001840
4,hour,909,0.017424
5,intersection_ft_count,205,0.003929
6,intersection_it_count,59,0.001131
7,intersection_it_score,176,0.003374
8,q_ans_kfold_count,550,0.010542
9,q_diff_qa_days_max,518,0.009929


In [54]:
# The 30 features with the smallest importance share.
fi.sort_values(by='rate', ascending=False).tail(30)

Unnamed: 0,feature,imp,rate
119,qid_enc_uf_c4_count,81,0.001553
159,diff_iq_day_uf_b3_count,77,0.001476
106,uf_c4_count,76,0.001457
122,qid_enc_q_hour_count,73,0.001399
157,diff_iq_day_uf_b1_count,73,0.001399
34,qid_diff_week_weekmedian,72,0.00138
86,uid_diff_week_weekmedian,71,0.001361
103,uf_c1_count,71,0.001361
131,uid_enc_q_week_count,66,0.001265
96,uid_week_median,66,0.001265


In [55]:
print(np.min(t1))
print(model_lgb.best_score_['valid_0']['auc'])

# List the columns whose names do not end in one of the aggregate suffixes.
agg_suffixes = ('_count', '_psd_lb', '_max', '_min', '_sum', '_std', 'median', 'mean')
print([feat for feat in data.keys() if not feat.endswith(agg_suffixes)])

freq  gender  score  uf_b1  uf_b2  uf_b3  uf_b4  uf_b5  \
diff_iq_day hour                                                           
-32         12       0       0    461      0      0      0      0      0   
            16       0       0    448      0      0      0      0      0   
-26         2        0       0    260      0      0      0      0      0   
-20         14       0       0    820      0      1      0      1      0   
            23       0       0    590      0      0      0      0      0   
-18         12       4       1    287      1      0      0      0      0   
            17       0       1    601      0      0      0      1      0   
            20       0       0    359      0      0      0      0      0   
            21       0       1    344      1      1      0      0      0   
-16         17       4       0    300      0      0      0      0      0   
            19       4       2    320      1      0      0      0      0   
-15         11       4       0