In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
import logging
import multiprocessing
import traceback
import pickle
import gc

In [2]:
# Log line layout: timestamp, severity, originating module, message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)

# base_path: directory the raw input tables are read from.
# feature_path: directory the derived feature files are read from / written to.
base_path, feature_path = './data', './feature'

In [3]:
# Labelled invitation records: tab-separated, no header row, so the column
# names are assigned by hand.
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

# Evaluation invitations: same layout as the training file minus the label.
test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

[2019-12-12 12:31:31,515] INFO in <ipython-input-3-af34e33feb4f>: invite (9489162, 4)
[2019-12-12 12:31:33,520] INFO in <ipython-input-3-af34e33feb4f>: test (1141683, 3)


In [4]:
def extract_day(s):
    """Return the integer day number from each timestamp token in ``s``.

    Each element is split on ``'-'``; the first part has its leading
    character stripped and the remainder is parsed as an int
    (e.g. ``'D3865-H22'`` -> ``3865``).
    """
    def _day_of(stamp):
        day_token = stamp.split('-')[0]
        return int(day_token[1:])

    return s.apply(_day_of)


def extract_hour(s):
    """Return the integer hour from each timestamp token in ``s``.

    Each element is split on ``'-'``; the second part has its leading
    character stripped and the remainder is parsed as an int
    (e.g. ``'D3865-H22'`` -> ``22``).
    """
    def _hour_of(stamp):
        hour_token = stamp.split('-')[1]
        return int(hour_token[1:])

    return s.apply(_hour_of)

# Derive the day number, a 7-way day bucket (day mod 7) and the hour from the
# invitation timestamp, identically for the training and evaluation frames.
for frame in (train, test):
    frame['day'] = extract_day(frame['dt'])
    frame['week'] = frame['day'] % 7
    frame['hour'] = extract_hour(frame['dt'])

In [5]:
# Load the user (member) table: tab-separated, no header row.
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq', 'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',  'score', 'follow_topic', 'inter_topic']
# The two topic columns are not used in this notebook; drop them before merging.
del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

# Attach the user attributes to every invitation (left join keeps all invitations).
train = pd.merge(train, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')

# Release the user table; it is not referenced again below.
del user
gc.collect()

[2019-12-12 12:33:05,324] INFO in <ipython-input-5-5451e5d8303c>: user (1931654, 14)


0

In [6]:
# Load the question table: tab-separated, no header row.
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
# Only the question id and its timestamp are kept; the text/topic columns are dropped.
del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2'], ques['topic']
logging.info("ques %s", ques.shape)

# Split the question timestamp the same way as the invitation timestamp.
ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
ques['q_week'] = ques['q_day'] % 7

del ques['q_dt']

# Attach the question time columns to every invitation (left join keeps all invitations).
train = pd.merge(train, ques, on='qid', how='left')
test = pd.merge(test, ques, on='qid', how='left')

# Release the question table; it is not referenced again below.
del ques
gc.collect()

[2019-12-12 12:34:20,053] INFO in <ipython-input-6-56a30ffdc19b>: ques (1829900, 2)


0

In [7]:
# Gap between the invitation and the question timestamp, in days and in hours,
# computed identically for the training and evaluation frames.
for frame in (train, test):
    day_gap = frame['day'] - frame['q_day']
    hour_gap = frame['hour'] - frame['q_hour']
    frame['diff_iq_day'] = day_gap
    frame['diff_iq_hour'] = day_gap * 24 + hour_gap

In [8]:
def diff_iq_day_map(x):
    """Clamp a day gap into the closed range [0, 31].

    Negative gaps become 0, gaps of 31 or more become 31, anything else is
    returned unchanged.
    """
    if x < 0:
        return 0
    return 31 if x >= 31 else x

# Clamp the day gap in both frames so train and test share the same value set.
for frame in (train, test):
    frame['diff_iq_day'] = frame['diff_iq_day'].apply(diff_iq_day_map)

def diff_iq_hour_map(x):
    """Bucket an hour gap into 5-hour bins, saturating at both ends.

    Negative gaps map to bin 0, gaps above 200 map to bin 40, everything in
    between maps to ``x // 5``.
    """
    if x > 200:
        return 40
    if x < 0:
        return 0
    return x // 5

# Bin the hour gap in BOTH frames. The evaluation frame must go through the
# same mapping as the training frame; otherwise its raw hour gaps do not match
# the binned training values when 'diff_iq_hour' is used as a lookup key.
train['diff_iq_hour'] = train['diff_iq_hour'].apply(diff_iq_hour_map)
test['diff_iq_hour'] = test['diff_iq_hour'].apply(diff_iq_hour_map)

In [9]:
def score_map(x):
    """Bucket a raw score into an ordinal bin from -1 to 7.

    Bins are defined by inclusive upper bounds: <=280 -> -1, <=300 -> 0,
    <=350 -> 1, <=400 -> 2, <=500 -> 3, <=600 -> 4, <=700 -> 5, <=800 -> 6.
    Anything that satisfies none of the bounds (including NaN, for which every
    comparison is false) falls into bin 7.
    """
    upper_bounds = (280, 300, 350, 400, 500, 600, 700, 800)
    for bucket, bound in enumerate(upper_bounds, start=-1):
        if x <= bound:
            return bucket
    return 7

# Replace the raw score by its ordinal bin in both frames.
for frame in (train, test):
    frame['score'] = frame['score'].apply(score_map)

In [10]:
# Load invite feature 2: intersection_ft_count, intersection_it_count.
# The feature files are joined column-wise by index, so they are assumed to be
# row-aligned with the invitation frames — TODO confirm against the script
# that produced them.
t1 = pd.read_csv(f'{feature_path}/train_invite_feature_2.txt', sep='\t', 
                 usecols=['intersection_ft_count', 'intersection_it_count'])
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature_2.txt', sep='\t', 
                 usecols=['intersection_ft_count', 'intersection_it_count'])
test = pd.concat([test, t1], axis=1)

# Binning for intersection_ft_count
def to_bin_1(x):
    """Cap a count at 3: values of 3 or more collapse to 3, others pass through."""
    return 3 if x >= 3 else x

# Cap intersection_ft_count in both frames.
for frame in (train, test):
    frame['intersection_ft_count'] = frame['intersection_ft_count'].apply(to_bin_1)

# Binning for intersection_it_count
def to_bin_2(x):
    """Cap a count at 4: values of 4 or more collapse to 4, others pass through."""
    return 4 if x >= 4 else x

# Cap intersection_it_count in both frames.
for frame in (train, test):
    frame['intersection_it_count'] = frame['intersection_it_count'].apply(to_bin_2)

In [11]:
# Load the k-fold topic features ("QU" — presumably question/user; confirm
# against the script that generated these files).
# Paths go through `feature_path`, like every other feature file in this
# notebook, instead of a hard-coded './feature' prefix.
# The files are joined column-wise by index, so they are assumed to be
# row-aligned with the invitation frames — TODO confirm.
t1 = pd.read_csv(f'{feature_path}/train_kfold_topic_feature.txt', sep='\t',
                 usecols=['qu_topic_count_weight', 'qu_topic_count'])
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_topic_feature.txt', sep='\t',
                 usecols=['qu_topic_count_weight', 'qu_topic_count'])
test = pd.concat([test, t1], axis=1)

def qu_weight_map(x):
    """Bucket a topic-count weight.

    Values up to 4 are returned unchanged. Above that, values are grouped in
    steps of 5: (4, 9] -> 5, (9, 14] -> 6, ..., (74, 79] -> 19. Anything that
    satisfies none of the bounds (including NaN) maps to 20.
    """
    if x <= 4:
        return x
    bucket, upper = 5, 9
    while bucket <= 19:
        if x <= upper:
            return bucket
        bucket += 1
        upper += 5
    return 20
# Bucket qu_topic_count_weight in both frames.
for frame in (train, test):
    frame['qu_topic_count_weight'] = frame['qu_topic_count_weight'].apply(qu_weight_map)

def qu_count_map(x):
    """Collapse counts of 6 or more to 5; smaller values pass through unchanged.

    NOTE(review): the threshold (6) and the cap (5) differ, so a non-integer
    value between 5 and 6 would pass through as-is — presumably the input is
    integer-valued; confirm.
    """
    return 5 if x >= 6 else x
# Cap qu_topic_count in both frames.
for frame in (train, test):
    frame['qu_topic_count'] = frame['qu_topic_count'].apply(qu_count_map)

In [12]:
# 4-fold split by day
def fold_fn(x):
    """Map a day number to its fold id (0-3) using fixed, inclusive day ranges.

    3838-3846 -> 0, 3847-3853 -> 1, 3854-3860 -> 2, 3861-3867 -> 3.
    Days outside every range yield ``None``.
    """
    day_ranges = ((3838, 3846), (3847, 3853), (3854, 3860), (3861, 3867))
    for fold_id, (first_day, last_day) in enumerate(day_ranges):
        if first_day <= x <= last_day:
            return fold_id
    return None
    
# Tag every training row with its day-based fold id; fold_fn returns None for
# days outside its ranges, so such rows would end up without a fold.
train['fold'] = train['day'].apply(fold_fn)

In [13]:
# Inspect the column set of the training frame after the merges above.
train.columns

Index(['qid', 'uid', 'dt', 'label', 'day', 'week', 'hour', 'gender', 'freq',
       'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3',
       'uf_c4', 'uf_c5', 'score', 'q_day', 'q_hour', 'q_week', 'diff_iq_day',
       'diff_iq_hour', 'intersection_ft_count', 'intersection_it_count',
       'qu_topic_count_weight', 'qu_topic_count', 'fold'],
      dtype='object')

# 当天基础统计信息

In [47]:
# Per-day label statistics on the training set
def day_stat(train_df, f):
    """Attach per-(``f``, day) label statistics to every row of ``train_df``.

    Rows are grouped by the pair ``(f, 'day')`` and ``label`` is aggregated
    per group; the group statistics are then joined back onto each row.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the columns ``f``, ``'day'`` and ``'label'``.
    f : str
        Name of the column grouped together with ``'day'``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order (left merge), with a fresh
        0..n-1 index and four columns prefixed by ``f + '_day_'``:
        ``labelcnt`` (non-null label count), ``labelsum`` (label sum),
        ``labelrate`` (label mean) and ``labelneg`` (count minus sum).

    Notes
    -----
    Each row's statistics include that row's own label.
    """
    logging.info('day answer stat on: %s', f)
    name_prefix = f + '_day_'
    cnt_col = name_prefix + 'labelcnt'
    sum_col = name_prefix + 'labelsum'
    rate_col = name_prefix + 'labelrate'
    neg_col = name_prefix + 'labelneg'

    # Named aggregation: passing a {new_name: func} dict to SeriesGroupBy.agg
    # raises SpecificationError ("nested renamer is not supported") on
    # pandas >= 1.0, so the output names are given as keyword arguments.
    day_stats = (
        train_df.groupby([f, 'day'])['label']
        .agg(**{cnt_col: 'count', sum_col: 'sum', rate_col: 'mean'})
        .reset_index()
    )
    day_stats[neg_col] = day_stats[cnt_col] - day_stats[sum_col]  # negative samples

    res = pd.merge(train_df, day_stats, on=[f, 'day'], how='left')
    return res[[cnt_col, sum_col, rate_col, neg_col]]

In [None]:
# Columns for which per-day label statistics are computed and cached
# (includes the id columns uid and qid).
single_feat = ['uid', 'qid', 'freq', 'gender', 'score', 'week', 'hour',
               'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5',
               'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
               'diff_iq_day', 'diff_iq_hour',
               'intersection_ft_count', 'intersection_it_count']

# Make sure the cache directory exists so the first dump does not fail.
os.makedirs('./temp_label_feat', exist_ok=True)

for feat in single_feat:
    t1 = day_stat(train, feat)
    logging.info('stat on %s by day, extract finished. shape: %s', feat, t1.shape)

    # Downcast 64-bit columns to 32-bit to shrink the cached file. astype
    # returns a new frame, which avoids assigning into a sliced frame.
    col_types = t1.dtypes
    downcast = {col: 'int32' for col in col_types[col_types == 'int64'].index}
    downcast.update({col: 'float32' for col in col_types[col_types == 'float64'].index})
    t1 = t1.astype(downcast)

    # The context manager guarantees the file is flushed and closed.
    with open(f'./temp_label_feat/{feat}_day.pkl', 'wb') as fh:
        pickle.dump(t1, fh)

# Kfold 一阶

In [48]:
feat = 'gender'
# Load the cached per-day statistics for `feat`. Only unpickle files written
# by this notebook: pickle can execute arbitrary code on load.
with open(f'./temp_label_feat/{feat}_day.pkl', 'rb') as fh:
    t1 = pickle.load(fh)
# Column-wise join by index; the cached frame is row-aligned with `train`.
t1 = pd.concat([train, t1], axis=1)

In [49]:
# Display the training frame joined with the per-day statistics loaded above.
t1

Unnamed: 0,qid,uid,dt,label,day,week,hour,gender,freq,uf_b1,...,diff_iq_hour,intersection_ft_count,intersection_it_count,qu_topic_count_weight,qu_topic_count,fold,gender_day_labelcnt,gender_day_labelsum,gender_day_labelrate,gender_day_labelneg
0,Q2166419046,M401693808,D3865-H22,0,3865,1,22,unknown,weekly,0,...,19,1,0,0.0,0.0,3,186746,29398,0.157422,157348
1,Q1550017551,M3392373099,D3844-H11,0,3844,1,11,unknown,monthly,1,...,40,0,0,0.0,0.0,0,115731,19632,0.169635,96099
2,Q604029601,M2317670257,D3862-H15,0,3862,5,15,unknown,weekly,1,...,4,0,0,0.0,0.0,3,176253,28267,0.160377,147986
3,Q2350061229,M1618461867,D3849-H11,0,3849,6,11,unknown,daily,1,...,7,0,1,0.0,0.0,1,113738,21990,0.193339,91748
4,Q2443223942,M3544409350,D3867-H4,0,3867,3,4,unknown,monthly,1,...,40,0,0,0.0,0.0,3,163102,26149,0.160323,136953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9489157,Q2372512458,M4285896253,D3849-H11,1,3849,6,11,male,daily,1,...,40,0,0,2.0,1.0,1,79039,15646,0.197953,63393
9489158,Q3516644442,M4285896253,D3862-H12,1,3862,5,12,male,daily,1,...,0,1,0,0.0,0.0,3,104151,16073,0.154324,88078
9489159,Q3847094730,M4285896253,D3852-H8,0,3852,2,8,male,daily,1,...,1,1,0,3.0,1.0,1,87353,15310,0.175266,72043
9489160,Q2358485548,M4285896253,D3864-H7,0,3864,0,7,male,daily,1,...,0,0,0,0.0,0.0,3,99346,15398,0.154994,83948


# Kfold 二阶交叉

In [16]:
def kfold_train_label_stat(train_df, test_df, base_feat, other_feat, n_folds=4):
    """Out-of-fold mean/median of ``base_feat`` columns grouped by ``other_feat``.

    For every pair ``(of, bf)`` two columns are produced, named
    ``of + '_' + bf + '_kfold_mean'`` and ``of + '_' + bf + '_kfold_median'``.

    * Training rows in fold ``k`` receive the statistic of ``bf`` grouped by
      ``of``, computed on all training rows whose fold is not ``k``.
    * Test rows receive the statistic computed on the whole training frame.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain ``'fold'`` plus every column in ``base_feat`` and
        ``other_feat``.
    test_df : pandas.DataFrame
        Must contain every column in ``other_feat``.
    base_feat : list of str
        Numeric columns to aggregate.
    other_feat : list of str
        Columns to group by, one at a time.
    n_folds : int, default 4
        Fold ids ``0 .. n_folds-1`` are processed. Training rows whose fold is
        outside that range keep NaN.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New feature columns only, indexed like ``train_df`` and ``test_df``.
        Column order is: for each ``of``, for each ``bf``: mean, median.
    """
    base_feat = list(base_feat)

    # Fold membership does not depend on the feature pair; compute it once.
    fold_ids = train_df['fold']
    val_masks = [fold_ids == fold_ for fold_ in range(n_folds)]

    # Only the new columns are returned, so collect them in dedicated frames
    # instead of copying the full input frames.
    train_out = pd.DataFrame(index=train_df.index)
    test_out = pd.DataFrame(index=test_df.index)
    extract_feat = []

    for of in other_feat:
        logging.info('at %s', of)
        mean_cols = {bf: of + '_' + bf + '_kfold_mean' for bf in base_feat}
        median_cols = {bf: of + '_' + bf + '_kfold_median' for bf in base_feat}
        for bf in base_feat:
            extract_feat += [mean_cols[bf], median_cols[bf]]
            train_out[mean_cols[bf]] = float('nan')
            train_out[median_cols[bf]] = float('nan')

        # train: statistics come from the other folds only
        for val_mask in val_masks:
            # One narrow slice and one group-by per fold serve all base features.
            log_df = train_df.loc[~val_mask, [of] + base_feat]
            val_keys = train_df.loc[val_mask, of]
            grouped = log_df.groupby(of)[base_feat]
            fold_mean = grouped.mean()
            fold_median = grouped.median()
            for bf in base_feat:
                train_out.loc[val_mask, mean_cols[bf]] = val_keys.map(fold_mean[bf])
                train_out.loc[val_mask, median_cols[bf]] = val_keys.map(fold_median[bf])

        # test: statistics come from the whole training frame
        grouped = train_df.groupby(of)[base_feat]
        full_mean = grouped.mean()
        full_median = grouped.median()
        for bf in base_feat:
            test_out[mean_cols[bf]] = test_df[of].map(full_mean[bf])
            test_out[median_cols[bf]] = test_df[of].map(full_median[bf])

    return train_out[extract_feat], test_out[extract_feat]

## uid 交叉

In [15]:
# Load the cached per-day uid statistics. Only unpickle files written by this
# notebook: pickle can execute arbitrary code on load.
with open('./temp_label_feat/uid_day.pkl', 'rb') as fh:
    t1 = pickle.load(fh)
# Column-wise join by index. Re-running this cell appends the columns again,
# so run it once per kernel session.
train = pd.concat([train, t1], axis=1)

In [20]:
# Per-day uid label statistics to aggregate ...
base = ['uid_day_labelcnt', 'uid_day_labelsum', 'uid_day_labelrate', 'uid_day_labelneg']
# ... grouped, one at a time, by each of these columns.
other = ['week', 'hour', 'diff_iq_day', 'diff_iq_hour', 'intersection_ft_count', 'intersection_it_count',
        'qu_topic_count', 'qu_topic_count_weight']
# tt1 / tt2: out-of-fold features for train / full-train features for test.
tt1, tt2 = kfold_train_label_stat(train, test, base, other)

[2019-12-12 09:23:04,629] INFO in <ipython-input-16-96d675d44b46>: at week
[2019-12-12 09:25:12,779] INFO in <ipython-input-16-96d675d44b46>: at hour
[2019-12-12 09:26:44,464] INFO in <ipython-input-16-96d675d44b46>: at diff_iq_day
[2019-12-12 09:27:56,639] INFO in <ipython-input-16-96d675d44b46>: at diff_iq_hour
[2019-12-12 09:30:15,586] INFO in <ipython-input-16-96d675d44b46>: at intersection_ft_count
[2019-12-12 09:33:00,103] INFO in <ipython-input-16-96d675d44b46>: at intersection_it_count
[2019-12-12 09:35:49,799] INFO in <ipython-input-16-96d675d44b46>: at qu_topic_count
[2019-12-12 09:38:56,780] INFO in <ipython-input-16-96d675d44b46>: at qu_topic_count_weight


In [37]:
# Drop every feature column that holds a single distinct non-null value in
# the training features; the same column is removed from the test features.
constant_cols = [col for col in tt1.columns if tt1[col].nunique() == 1]
for col in constant_cols:
    print(col)
    del tt1[col], tt2[col]

week_uid_day_labelcnt_kfold_median
week_uid_day_labelsum_kfold_median
week_uid_day_labelrate_kfold_median
week_uid_day_labelneg_kfold_median
hour_uid_day_labelsum_kfold_median
hour_uid_day_labelrate_kfold_median
diff_iq_day_uid_day_labelsum_kfold_median
diff_iq_day_uid_day_labelrate_kfold_median
diff_iq_day_uid_day_labelneg_kfold_median
diff_iq_hour_uid_day_labelsum_kfold_median
diff_iq_hour_uid_day_labelrate_kfold_median
diff_iq_hour_uid_day_labelneg_kfold_median
intersection_ft_count_uid_day_labelsum_kfold_median
intersection_ft_count_uid_day_labelrate_kfold_median
intersection_ft_count_uid_day_labelneg_kfold_median
intersection_it_count_uid_day_labelcnt_kfold_median
intersection_it_count_uid_day_labelsum_kfold_median
intersection_it_count_uid_day_labelrate_kfold_median
intersection_it_count_uid_day_labelneg_kfold_median
qu_topic_count_weight_uid_day_labelneg_kfold_median


In [38]:
# Compress the data: downcast 64-bit integer/float columns to 32-bit in both
# the training and the test feature frames.
for frame in (tt1, tt2):
    col_types = frame.dtypes
    for col in col_types[col_types == 'int64'].index:
        frame[col] = frame[col].astype('int32')
    for col in col_types[col_types == 'float64'].index:
        frame[col] = frame[col].astype('float32')

In [42]:
# Persist the uid second-order features. The context managers guarantee the
# files are flushed and closed.
with open(f'{feature_path}/train_kfold_uid_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt1, fh)
with open(f'{feature_path}/test_kfold_uid_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt2, fh)

## qid 交叉

In [14]:
# Load the cached per-day qid statistics. Only unpickle files written by this
# notebook: pickle can execute arbitrary code on load.
with open('./temp_label_feat/qid_day.pkl', 'rb') as fh:
    t1 = pickle.load(fh)
# Column-wise join by index. Re-running this cell appends the columns again,
# so run it once per kernel session.
train = pd.concat([train, t1], axis=1)

In [17]:
# Per-day qid label statistics to aggregate ...
base = ['qid_day_labelcnt', 'qid_day_labelsum', 'qid_day_labelrate', 'qid_day_labelneg']
# ... grouped, one at a time, by each of these columns.
other = ['gender', 'freq', 'score', 
         'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c5', 
         'diff_iq_day', 'diff_iq_hour', 'qu_topic_count', 'qu_topic_count_weight']
# tt1 / tt2: out-of-fold features for train / full-train features for test.
tt1, tt2 = kfold_train_label_stat(train, test, base, other)

[2019-12-12 12:40:18,607] INFO in <ipython-input-16-8c7a1848765e>: at gender
[2019-12-12 12:43:58,092] INFO in <ipython-input-16-8c7a1848765e>: at freq
[2019-12-12 12:47:42,038] INFO in <ipython-input-16-8c7a1848765e>: at score
[2019-12-12 12:50:29,617] INFO in <ipython-input-16-8c7a1848765e>: at uf_b1
[2019-12-12 12:53:23,770] INFO in <ipython-input-16-8c7a1848765e>: at uf_b2
[2019-12-12 12:56:19,026] INFO in <ipython-input-16-8c7a1848765e>: at uf_b3
[2019-12-12 12:59:24,867] INFO in <ipython-input-16-8c7a1848765e>: at uf_b4
[2019-12-12 13:02:40,236] INFO in <ipython-input-16-8c7a1848765e>: at uf_b5
[2019-12-12 13:06:01,747] INFO in <ipython-input-16-8c7a1848765e>: at uf_c5
[2019-12-12 13:10:23,723] INFO in <ipython-input-16-8c7a1848765e>: at diff_iq_day
[2019-12-12 13:13:25,531] INFO in <ipython-input-16-8c7a1848765e>: at diff_iq_hour
[2019-12-12 13:16:48,583] INFO in <ipython-input-16-8c7a1848765e>: at qu_topic_count
[2019-12-12 13:20:46,114] INFO in <ipython-input-16-8c7a1848765e>:

In [18]:
# Drop every feature column that holds a single distinct non-null value in
# the training features; the same column is removed from the test features.
constant_cols = [col for col in tt1.columns if tt1[col].nunique() == 1]
for col in constant_cols:
    print(col)
    del tt1[col], tt2[col]
gc.collect()

gender_qid_day_labelsum_kfold_median
uf_b1_qid_day_labelcnt_kfold_median
uf_b1_qid_day_labelsum_kfold_median
uf_b2_qid_day_labelcnt_kfold_median
uf_b2_qid_day_labelsum_kfold_median
uf_b3_qid_day_labelcnt_kfold_median
uf_b3_qid_day_labelsum_kfold_median
uf_b4_qid_day_labelsum_kfold_median
uf_b5_qid_day_labelcnt_kfold_median
uf_b5_qid_day_labelsum_kfold_median
uf_c5_qid_day_labelsum_kfold_median


0

In [19]:
# Compress the data: downcast 64-bit integer/float columns to 32-bit in both
# the training and the test feature frames.
for frame in (tt1, tt2):
    col_types = frame.dtypes
    for col in col_types[col_types == 'int64'].index:
        frame[col] = frame[col].astype('int32')
    for col in col_types[col_types == 'float64'].index:
        frame[col] = frame[col].astype('float32')

In [20]:
# Persist the qid second-order features. The context managers guarantee the
# files are flushed and closed.
with open(f'{feature_path}/train_kfold_qid_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt1, fh)
with open(f'{feature_path}/test_kfold_qid_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt2, fh)

## 二分类交叉

In [None]:
# Load the cached per-day uf_b2 statistics. Only unpickle files written by
# this notebook: pickle can execute arbitrary code on load.
with open('./temp_label_feat/uf_b2_day.pkl', 'rb') as fh:
    t1 = pickle.load(fh)
# Column-wise join by index. Re-running this cell appends the columns again,
# so run it once per kernel session.
train = pd.concat([train, t1], axis=1)

In [None]:
# Per-day uf_b2 label statistics to aggregate ...
base = ['uf_b2_day_labelcnt', 'uf_b2_day_labelsum', 'uf_b2_day_labelrate', 'uf_b2_day_labelneg']
# ... grouped, one at a time, by each of these columns.
other = ['uf_b1', 'uf_b3']
# tt1 / tt2: out-of-fold features for train / full-train features for test.
tt1, tt2 = kfold_train_label_stat(train, test, base, other)

In [37]:
# Drop every feature column that holds a single distinct non-null value in
# the training features; the same column is removed from the test features.
constant_cols = [col for col in tt1.columns if tt1[col].nunique() == 1]
for col in constant_cols:
    print(col)
    del tt1[col], tt2[col]

week_uid_day_labelcnt_kfold_median
week_uid_day_labelsum_kfold_median
week_uid_day_labelrate_kfold_median
week_uid_day_labelneg_kfold_median
hour_uid_day_labelsum_kfold_median
hour_uid_day_labelrate_kfold_median
diff_iq_day_uid_day_labelsum_kfold_median
diff_iq_day_uid_day_labelrate_kfold_median
diff_iq_day_uid_day_labelneg_kfold_median
diff_iq_hour_uid_day_labelsum_kfold_median
diff_iq_hour_uid_day_labelrate_kfold_median
diff_iq_hour_uid_day_labelneg_kfold_median
intersection_ft_count_uid_day_labelsum_kfold_median
intersection_ft_count_uid_day_labelrate_kfold_median
intersection_ft_count_uid_day_labelneg_kfold_median
intersection_it_count_uid_day_labelcnt_kfold_median
intersection_it_count_uid_day_labelsum_kfold_median
intersection_it_count_uid_day_labelrate_kfold_median
intersection_it_count_uid_day_labelneg_kfold_median
qu_topic_count_weight_uid_day_labelneg_kfold_median


In [38]:
# Compress the data: downcast 64-bit integer/float columns to 32-bit in both
# the training and the test feature frames.
for frame in (tt1, tt2):
    col_types = frame.dtypes
    for col in col_types[col_types == 'int64'].index:
        frame[col] = frame[col].astype('int32')
    for col in col_types[col_types == 'float64'].index:
        frame[col] = frame[col].astype('float32')

In [42]:
# Persist the uf_b2 second-order features under their own file names.
# Writing them to the '..._uid_2order_...' paths would silently overwrite the
# uid cross features saved earlier in this notebook.
# NOTE(review): confirm the file names expected by the downstream loader.
with open(f'{feature_path}/train_kfold_uf_b2_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt1, fh)
with open(f'{feature_path}/test_kfold_uf_b2_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt2, fh)