In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
import logging
import multiprocessing
import traceback
import pickle
import gc

In [5]:
# Log line layout: timestamp, level, originating module, then the message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

# Directory the raw input tables are read from.
base_path = './data'
# Directory precomputed feature files are read from and exported to.
feature_path = './feature'

In [6]:
# Columns shared by the labelled and the evaluation invite tables.
invite_cols = ['qid', 'uid', 'dt']

# Labelled invitations: the shared columns plus the target 'label'.
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = invite_cols + ['label']
logging.info("invite %s", train.shape)

# Evaluation invitations: same layout without the label.
test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = list(invite_cols)
logging.info("test %s", test.shape)

[2019-12-13 03:57:14,028] INFO in <ipython-input-6-af34e33feb4f>: invite (9489162, 4)
[2019-12-13 03:57:16,421] INFO in <ipython-input-6-af34e33feb4f>: test (1141683, 3)


In [7]:
def extract_day(s):
    """Return the integer day parsed from every timestamp string in ``s``.

    Each element is split on '-'; the first piece has its leading character
    stripped and the remainder is converted with ``int``.
    (Presumably the strings look like 'D<day>-H<hour>' -- confirm against data.)
    """
    def _day_of(stamp):
        day_token = stamp.split('-')[0]
        return int(day_token[1:])

    return s.apply(_day_of)


def extract_hour(s):
    """Return the integer hour parsed from every timestamp string in ``s``.

    Each element is split on '-'; the second piece has its leading character
    stripped and the remainder is converted with ``int``.
    (Presumably the strings look like 'D<day>-H<hour>' -- confirm against data.)
    """
    def _hour_of(stamp):
        hour_token = stamp.split('-')[1]
        return int(hour_token[1:])

    return s.apply(_hour_of)

# Add 'day', 'week' (day modulo 7) and 'hour' columns, parsed from 'dt',
# to the training frame first and then to the evaluation frame.
for frame in (train, test):
    frame['day'] = extract_day(frame['dt'])
    frame['week'] = frame['day'] % 7
    frame['hour'] = extract_hour(frame['dt'])

In [8]:
# Load the user table
user_cols = ['uid', 'gender', 'freq',
             'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5',
             'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
             'score', 'follow_topic', 'inter_topic']
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = user_cols
# The two topic columns are not used here; drop them before merging.
user = user.drop(columns=['follow_topic', 'inter_topic'])
logging.info("user %s", user.shape)

# Attach the user attributes to every invitation (left join on uid)
train = train.merge(user, on='uid', how='left')
test = test.merge(user, on='uid', how='left')

# Release the standalone user table.
del user
gc.collect()

[2019-12-13 03:58:47,459] INFO in <ipython-input-8-5451e5d8303c>: user (1931654, 14)


0

In [9]:
# Load the question table
ques_cols = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ques_cols
# Only the id and the creation timestamp are kept.
ques = ques.drop(columns=['title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic'])
logging.info("ques %s", ques.shape)

# Parse the creation timestamp into day / hour / week (day modulo 7).
ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
ques['q_week'] = ques['q_day'] % 7

ques = ques.drop(columns=['q_dt'])

# Attach the question attributes to every invitation (left join on qid)
train = train.merge(ques, on='qid', how='left')
test = test.merge(ques, on='qid', how='left')

# Release the standalone question table.
del ques
gc.collect()

[2019-12-13 04:00:00,995] INFO in <ipython-input-9-56a30ffdc19b>: ques (1829900, 2)


0

In [10]:
# Gap between the invitation and the question's creation, in days and in hours,
# computed on the training frame first and then on the evaluation frame.
for frame in (train, test):
    frame['diff_iq_day'] = frame['day'] - frame['q_day']
    hour_gap = frame['hour'] - frame['q_hour']
    frame['diff_iq_hour'] = frame['diff_iq_day'] * 24 + hour_gap

In [11]:
def diff_iq_day_map(x):
    """Clamp a day gap into [0, 31]: negatives become 0, values >= 31 become 31.

    Values that satisfy neither comparison (including NaN) are returned as-is.
    """
    if x < 0:
        return 0
    return 31 if x >= 31 else x

# Clamp the day gap on the training frame, then on the evaluation frame.
for frame in (train, test):
    frame['diff_iq_day'] = frame['diff_iq_day'].apply(diff_iq_day_map)

def diff_iq_hour_map(x):
    """Bucket an hour gap into 5-hour bins.

    Negative gaps map to 0, gaps above 200 map to 40, and anything else is
    floor-divided by 5.
    """
    if x < 0:
        bucket = 0
    elif x > 200:
        bucket = 40
    else:
        bucket = x // 5
    return bucket

# Bucket the hour gap on the training frame, then on the evaluation frame.
for frame in (train, test):
    frame['diff_iq_hour'] = frame['diff_iq_hour'].apply(diff_iq_hour_map)

In [12]:
def score_map(x):
    """Bucket a raw score into an ordinal bin between -1 and 7.

    Bins (upper bounds inclusive):
        x <= 280 -> -1,  (280, 300] -> 0,  (300, 350] -> 1,  (350, 400] -> 2,
        (400, 500] -> 3, (500, 600] -> 4,  (600, 700] -> 5,  (700, 800] -> 6,
        x > 800 -> 7.

    NaN is returned unchanged. It fails every ``<=`` comparison, so without
    the explicit check a missing score would silently fall through to the
    top bin (7).
    """
    # NaN is the only value that is not equal to itself.
    if x != x:
        return x

    upper_bounds = (280, 300, 350, 400, 500, 600, 700, 800)
    # Bin ids start at -1 for the lowest bound and grow by one per bound.
    for bin_id, bound in enumerate(upper_bounds, start=-1):
        if x <= bound:
            return bin_id
    return 7

# Replace the raw score with its bucket on the training frame, then on the evaluation frame.
for frame in (train, test):
    frame['score'] = frame['score'].apply(score_map)

In [13]:
# Load invite feature 2: intersection_ft_count, intersection_it_count
intersection_cols = ['intersection_ft_count', 'intersection_it_count']

# The feature files are appended column-wise, so rows are matched by index.
t1 = pd.read_csv(f'{feature_path}/train_invite_feature_2.txt', sep='\t',
                 usecols=intersection_cols)
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature_2.txt', sep='\t',
                 usecols=intersection_cols)
test = pd.concat([test, t1], axis=1)

# Bin intersection_ft_count
def to_bin_1(x):
    """Cap ``x`` at 3: values >= 3 become 3, everything else is returned as-is."""
    return 3 if x >= 3 else x

# Cap intersection_ft_count on the training frame, then on the evaluation frame.
for frame in (train, test):
    frame['intersection_ft_count'] = frame['intersection_ft_count'].apply(to_bin_1)

# Bin intersection_it_count
def to_bin_2(x):
    """Cap ``x`` at 4: values >= 4 become 4, everything else is returned as-is."""
    return 4 if x >= 4 else x

# Cap intersection_it_count on the training frame, then on the evaluation frame.
for frame in (train, test):
    frame['intersection_it_count'] = frame['intersection_it_count'].apply(to_bin_2)

In [14]:
# Load the kfold topic features (QU).
# The files live in the shared feature directory, so build the paths from
# `feature_path` (as the other feature loads do) instead of hard-coding
# './feature' in an f-string that has no placeholder.
t1 = pd.read_csv(f'{feature_path}/train_kfold_topic_feature.txt', sep='\t', 
                 usecols=['qu_topic_count_weight', 'qu_topic_count'])
# Appended column-wise: rows are matched by index.
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_kfold_topic_feature.txt', sep='\t', 
                 usecols=['qu_topic_count_weight', 'qu_topic_count'])
test = pd.concat([test, t1], axis=1)

def qu_weight_map(x):
    """Bucket a topic-count weight.

    * ``x <= 4`` is returned unchanged.
    * Above 4, the value falls into width-5 buckets: (4, 9] -> 5,
      (9, 14] -> 6, ... up to (74, 79] -> 19.
    * Anything above 79 maps to 20.

    NaN is returned unchanged. It fails every ``<=`` comparison, so without
    the explicit check a missing weight would silently land in the top
    bucket (20).
    """
    # NaN is the only value that is not equal to itself.
    if x != x:
        return x
    # The original separate `x <= 2` branch returned the same thing as this one.
    if x <= 4:
        return x
    for i in range(1, 16):
        if x <= 4 + 5 * i:
            return 4 + i
    return 20
# Bucket qu_topic_count_weight on the training frame, then on the evaluation frame.
for frame in (train, test):
    frame['qu_topic_count_weight'] = frame['qu_topic_count_weight'].apply(qu_weight_map)

def qu_count_map(x):
    """Map values >= 6 to 5; every other value is returned unchanged."""
    return 5 if x >= 6 else x
# Cap qu_topic_count on the training frame, then on the evaluation frame.
for frame in (train, test):
    frame['qu_topic_count'] = frame['qu_topic_count'].apply(qu_count_map)

In [15]:
# 4-fold statistics
def fold_fn(x):
    """Return the fold id (0-3) for day ``x``, or None when no range matches.

    Fold 0 covers days 3838-3846, fold 1 covers 3847-3853, fold 2 covers
    3854-3860 and fold 3 covers 3861-3867 (all bounds inclusive).
    """
    day_ranges = ((3838, 3846), (3847, 3853), (3854, 3860), (3861, 3867))
    for fold_id, (first_day, last_day) in enumerate(day_ranges):
        if first_day <= x <= last_day:
            return fold_id
    return None
    
# Tag every training row with its fold id derived from the invitation day;
# fold_fn returns nothing for days outside its ranges, so those rows get no fold.
train['fold'] = train['day'].apply(fold_fn)

In [16]:
# Display the columns of the assembled training frame.
train.columns

Index(['qid', 'uid', 'dt', 'label', 'day', 'week', 'hour', 'gender', 'freq',
       'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3',
       'uf_c4', 'uf_c5', 'score', 'q_day', 'q_hour', 'q_week', 'diff_iq_day',
       'diff_iq_hour', 'intersection_ft_count', 'intersection_it_count',
       'qu_topic_count_weight', 'qu_topic_count', 'fold'],
      dtype='object')

# 当天基础统计信息

In [17]:
# Same-day statistics on the training set
def day_stat(train_df, f):
    """Return, for every row, the mean label of its (``f``, 'day') group.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the columns ``f``, 'day' and 'label'.
    f : str
        Name of the column grouped together with 'day'.

    Returns
    -------
    pandas.Series
        Named ``f + '_day_labelrate'``, one value per row of ``train_df`` in
        the row order of the left merge, carrying the index the merge produces.
    """
    logging.info('day answer stat on: %s', f)
    extract_feat = f + '_day_labelrate'
    # Aggregate, then rename. Passing a {new_name: func} dict to
    # SeriesGroupBy.agg was removed in pandas 1.0 (it raises
    # SpecificationError), whereas mean() + rename works on every version.
    daily_rate = (
        train_df.groupby([f, 'day'], as_index=False)['label']
        .mean()
        .rename(columns={'label': extract_feat})
    )
    res = pd.merge(train_df, daily_rate, on=[f, 'day'], how='left')
    return res[extract_feat]

In [None]:
# Features for which a per-day label rate is computed and cached on disk.
single_feat = ['uid', 'qid', 'freq', 'gender', 'score', 'week', 'hour', 
               'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
               'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
               'diff_iq_day', 'diff_iq_hour', 
               'intersection_ft_count', 'intersection_it_count']

# Make sure the cache directory exists so the first dump cannot fail on a
# fresh checkout.
temp_label_dir = './temp_label_feat'
os.makedirs(temp_label_dir, exist_ok=True)

for feat in single_feat:
    t1 = day_stat(train, feat)
    logging.info('stat on %s by day, extract finished. shape: %s', feat, t1.shape)
    # Compress: the rates do not need 64-bit precision.
    t1 = t1.astype('float32')

    # The with-block closes (and therefore flushes) the file deterministically;
    # a bare open() passed to pickle.dump is never closed explicitly.
    with open(f'{temp_label_dir}/{feat}_day.pkl', 'wb') as fh:
        pickle.dump(t1, fh)

# Kfold 一阶

In [18]:
def kfold_1order_label_stat(train_df, test_df, single_feat_list, n_folds=4):
    """Per-feature mean/median of the per-day label rate, computed out-of-fold.

    For each ``feat`` in ``single_feat_list`` the column
    ``feat + '_day_labelrate'`` of ``train_df`` is aggregated (mean and
    median) per distinct value of ``feat``:

    * training rows of fold ``k`` receive the statistic computed on all rows
      whose 'fold' differs from ``k``;
    * test rows receive the statistic computed on the whole of ``train_df``.

    Values of ``feat`` that do not occur in the rows used for the statistic
    yield NaN, as do training rows whose 'fold' is not in
    ``range(n_folds)``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Needs 'fold', every ``feat`` and every ``feat + '_day_labelrate'``.
    test_df : pandas.DataFrame
        Needs every ``feat``.
    single_feat_list : list of str
    n_folds : int, default 4
        Fold ids ``0 .. n_folds - 1`` are processed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames holding only the generated columns
        ``labelrate_<feat>_kfold_mean`` / ``labelrate_<feat>_kfold_median``,
        ordered as in ``single_feat_list``.

    Raises
    ------
    AssertionError
        If a feature column of ``train_df`` contains missing values.
    """
    # Fail fast: the key columns must be fully populated.  Checking up front
    # avoids running the expensive fold loop before discovering the problem.
    n_rows = len(train_df)
    for feat in single_feat_list:
        assert n_rows == train_df[feat].count()

    extract_feat = []
    for feat in single_feat_list:
        extract_feat += ['labelrate_'+feat+'_kfold_mean', 'labelrate_'+feat+'_kfold_median']

    # Only the generated columns are returned, so build them in small result
    # frames instead of copying both (potentially very large) input frames.
    train_out = pd.DataFrame(float('nan'), index=train_df.index, columns=extract_feat)
    test_out = pd.DataFrame(index=test_df.index)

    # train
    for fold_ in range(n_folds):
        logging.info('in train, fold: %s', fold_)
        in_fold = train_df['fold'] == fold_
        if not in_fold.any():
            # Nothing to fill for an empty fold; its columns stay NaN.
            continue
        out_fold = ~in_fold
        for feat in single_feat_list:
            f = feat + '_day_labelrate'
            colname1 = 'labelrate_' + feat + '_kfold_mean'
            colname2 = 'labelrate_' + feat + '_kfold_median'
            # Statistics come from every row outside the current fold; the
            # groupby object is shared by the mean and the median.
            grouped = train_df.loc[out_fold, [feat, f]].groupby(feat)[f]
            val_keys = train_df.loc[in_fold, feat]
            train_out.loc[in_fold, colname1] = val_keys.map(grouped.mean())
            train_out.loc[in_fold, colname2] = val_keys.map(grouped.median())

    # test
    for feat in single_feat_list:
        logging.info('in test, feat: %s', feat)
        f = feat + '_day_labelrate'
        colname1 = 'labelrate_' + feat + '_kfold_mean'
        colname2 = 'labelrate_' + feat + '_kfold_median'
        grouped = train_df.groupby(feat)[f]
        test_out[colname1] = test_df[feat].map(grouped.mean())
        test_out[colname2] = test_df[feat].map(grouped.median())

    return train_out[extract_feat], test_out[extract_feat]

In [19]:
# Features whose cached per-day label rates are attached to the training frame.
single_feat = ['uid', 'qid', 'freq', 'gender', 'score', 'week', 'hour', 
               'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
               'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
               'diff_iq_day', 'diff_iq_hour', 
               'intersection_ft_count', 'intersection_it_count']

# Collect the cached series first and concatenate once: calling pd.concat
# inside the loop re-copies the whole (growing) frame on every iteration.
day_rate_series = []
for feat in single_feat:
    logging.info('concat %s', feat)
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # caches written by this notebook.  The with-block closes the handle.
    with open(f'./temp_label_feat/{feat}_day.pkl', 'rb') as fh:
        day_rate_series.append(pickle.load(fh))

t1 = pd.concat([train] + day_rate_series, axis=1)
logging.info('t1 shape: %s', t1.shape)

[2019-12-13 04:04:40,269] INFO in <ipython-input-19-7cd45ae0eb12>: concat uid
[2019-12-13 04:04:45,619] INFO in <ipython-input-19-7cd45ae0eb12>: concat qid
[2019-12-13 04:05:02,516] INFO in <ipython-input-19-7cd45ae0eb12>: concat freq
[2019-12-13 04:05:18,065] INFO in <ipython-input-19-7cd45ae0eb12>: concat gender
[2019-12-13 04:05:32,838] INFO in <ipython-input-19-7cd45ae0eb12>: concat score
[2019-12-13 04:05:47,518] INFO in <ipython-input-19-7cd45ae0eb12>: concat week
[2019-12-13 04:06:03,049] INFO in <ipython-input-19-7cd45ae0eb12>: concat hour
[2019-12-13 04:06:17,633] INFO in <ipython-input-19-7cd45ae0eb12>: concat uf_b1
[2019-12-13 04:06:25,619] INFO in <ipython-input-19-7cd45ae0eb12>: concat uf_b2
[2019-12-13 04:06:41,279] INFO in <ipython-input-19-7cd45ae0eb12>: concat uf_b3
[2019-12-13 04:06:54,191] INFO in <ipython-input-19-7cd45ae0eb12>: concat uf_b4
[2019-12-13 04:07:08,762] INFO in <ipython-input-19-7cd45ae0eb12>: concat uf_b5
[2019-12-13 04:07:24,768] INFO in <ipython-inp

In [20]:
# Build the first-order out-of-fold label-rate features:
# tt1 holds the training columns, tt2 the matching test columns.
tt1, tt2 = kfold_1order_label_stat(t1, test, single_feat)
logging.info('train feature shape: %s, test feature shape: %s', tt1.shape, tt2.shape)

[2019-12-13 04:09:59,126] INFO in <ipython-input-18-ea4c60656de3>: in train, fold: 0
[2019-12-13 04:13:24,839] INFO in <ipython-input-18-ea4c60656de3>: in train, fold: 1
[2019-12-13 04:15:14,730] INFO in <ipython-input-18-ea4c60656de3>: in train, fold: 2
[2019-12-13 04:17:08,814] INFO in <ipython-input-18-ea4c60656de3>: in train, fold: 3
[2019-12-13 04:19:12,219] INFO in <ipython-input-18-ea4c60656de3>: in test, feat: uid
[2019-12-13 04:19:41,019] INFO in <ipython-input-18-ea4c60656de3>: in test, feat: qid
[2019-12-13 04:20:05,870] INFO in <ipython-input-18-ea4c60656de3>: in test, feat: freq
[2019-12-13 04:20:10,510] INFO in <ipython-input-18-ea4c60656de3>: in test, feat: gender
[2019-12-13 04:20:15,035] INFO in <ipython-input-18-ea4c60656de3>: in test, feat: score
[2019-12-13 04:20:16,434] INFO in <ipython-input-18-ea4c60656de3>: in test, feat: week
[2019-12-13 04:20:17,818] INFO in <ipython-input-18-ea4c60656de3>: in test, feat: hour
[2019-12-13 04:20:19,275] INFO in <ipython-input-1

In [21]:
# Drop every column whose non-null values are all identical
# (judged on the training features, removed from both frames).
constant_cols = [col for col in tt1.columns if len(tt1[col].value_counts()) == 1]
for col in constant_cols:
    print(col)
    del tt1[col], tt2[col]

In [22]:
# Compress: downcast 64-bit integer and float columns to 32 bits,
# first on the training features and then on the test features.
downcast_pairs = (('int64', 'int32'), ('float64', 'float32'))
for frame in (tt1, tt2):
    col_dtypes = frame.dtypes
    for wide, narrow in downcast_pairs:
        for col in col_dtypes[col_dtypes == wide].index:
            frame[col] = frame[col].astype(narrow)

In [23]:
# Persist the first-order features.  The with-blocks close (and flush) each
# file deterministically; a bare open() handed to pickle.dump is never
# closed explicitly.
with open(f'{feature_path}/train_kfold_1order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt1, fh)
with open(f'{feature_path}/test_kfold_1order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt2, fh)

# Kfold 二阶交叉

In [24]:
def kfold_2order_label_stat(train_df, test_df, base_feat, other_feat, n_folds=4):
    """Mean/median of each base column per value of each other column, out-of-fold.

    For every pair (``of`` in ``other_feat``, ``bf`` in ``base_feat``) the
    column ``bf`` of ``train_df`` is aggregated (mean and median) per
    distinct value of ``of``:

    * training rows of fold ``k`` receive the statistic computed on all rows
      whose 'fold' differs from ``k``;
    * test rows receive the statistic computed on the whole of ``train_df``.

    Values of ``of`` that do not occur in the rows used for the statistic
    yield NaN, as do training rows whose 'fold' is not in
    ``range(n_folds)``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Needs 'fold' plus every column named in ``base_feat`` / ``other_feat``.
    test_df : pandas.DataFrame
        Needs every column named in ``other_feat``.
    base_feat, other_feat : list of str
    n_folds : int, default 4
        Fold ids ``0 .. n_folds - 1`` are processed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames holding only the generated columns
        ``labelrate_<of>_<bf>_kfold_mean`` / ``..._kfold_median``.
    """
    # Only the generated columns are returned, so collect them in small
    # result frames instead of copying both input frames.
    train_out = pd.DataFrame(index=train_df.index)
    test_out = pd.DataFrame(index=test_df.index)
    extract_feat = []

    # Fold membership does not depend on the feature pair: compute it once
    # instead of re-filtering the full frame inside the innermost loop.
    fold_masks = [train_df['fold'] == fold_ for fold_ in range(n_folds)]

    for of in other_feat:
        logging.info('at %s', of)
        for bf in base_feat:
            colname1 = 'labelrate_' + of + '_' + bf + '_kfold_mean'
            colname2 = 'labelrate_' + of + '_' + bf + '_kfold_median'
            extract_feat += [colname1, colname2]
            train_out[colname1] = float('nan')
            train_out[colname2] = float('nan')
            # Select just the key and value columns (once, if they coincide).
            pair_cols = [of] if of == bf else [of, bf]

            # train
            for in_fold in fold_masks:
                if not in_fold.any():
                    # Nothing to fill for an empty fold; its rows stay NaN.
                    continue
                grouped = train_df.loc[~in_fold, pair_cols].groupby(of)[bf]
                val_keys = train_df.loc[in_fold, of]
                train_out.loc[in_fold, colname1] = val_keys.map(grouped.mean())
                train_out.loc[in_fold, colname2] = val_keys.map(grouped.median())

            # test
            grouped = train_df.groupby(of)[bf]
            test_out[colname1] = test_df[of].map(grouped.mean())
            test_out[colname2] = test_df[of].map(grouped.median())

    return train_out[extract_feat], test_out[extract_feat]

## uid 交叉

In [25]:
# Reload the cached per-day uid label rate and append it to the training frame.
# NOTE: pickle.load executes arbitrary code from the file; only load caches
# written by this notebook.  The with-block closes the handle afterwards.
with open('./temp_label_feat/uid_day.pkl', 'rb') as fh:
    t1 = pickle.load(fh)
t1 = pd.concat([train, t1], axis=1)
logging.info('t1 shape: %s', t1.shape)

[2019-12-13 04:22:16,703] INFO in <ipython-input-25-9eec0be8829a>: t1 shape: (9489162, 31)


In [26]:
# Column whose values are aggregated (the per-day uid label rate).
base = ['uid_day_labelrate']
# Columns whose distinct values define the groups for the aggregation.
other = ['week', 'hour', 'diff_iq_day', 'diff_iq_hour', 'intersection_ft_count', 'intersection_it_count',
        'qu_topic_count', 'qu_topic_count_weight']
# tt1 holds the generated training columns, tt2 the matching test columns.
tt1, tt2 = kfold_2order_label_stat(t1, test, base, other)
logging.info('tt1 shape: %s, tt2 shape: %s', tt1.shape, tt2.shape)

[2019-12-13 04:22:27,276] INFO in <ipython-input-24-58a7414427f1>: at week
[2019-12-13 04:23:05,506] INFO in <ipython-input-24-58a7414427f1>: at hour
[2019-12-13 04:23:43,973] INFO in <ipython-input-24-58a7414427f1>: at diff_iq_day
[2019-12-13 04:24:21,552] INFO in <ipython-input-24-58a7414427f1>: at diff_iq_hour
[2019-12-13 04:24:58,535] INFO in <ipython-input-24-58a7414427f1>: at intersection_ft_count
[2019-12-13 04:25:34,448] INFO in <ipython-input-24-58a7414427f1>: at intersection_it_count
[2019-12-13 04:26:14,388] INFO in <ipython-input-24-58a7414427f1>: at qu_topic_count
[2019-12-13 04:26:54,415] INFO in <ipython-input-24-58a7414427f1>: at qu_topic_count_weight
[2019-12-13 04:27:40,970] INFO in <ipython-input-26-bd55177bfa65>: tt1 shape: (9489162, 16), tt2 shape: (1141683, 16)


In [27]:
# Drop every column whose non-null values are all identical
# (judged on the training features, removed from both frames).
constant_cols = [col for col in tt1.columns if len(tt1[col].value_counts()) == 1]
for col in constant_cols:
    print(col)
    del tt1[col], tt2[col]

labelrate_week_uid_day_labelrate_kfold_median
labelrate_hour_uid_day_labelrate_kfold_median
labelrate_diff_iq_day_uid_day_labelrate_kfold_median
labelrate_diff_iq_hour_uid_day_labelrate_kfold_median
labelrate_intersection_ft_count_uid_day_labelrate_kfold_median
labelrate_intersection_it_count_uid_day_labelrate_kfold_median


In [28]:
# Compress: downcast 64-bit integer and float columns to 32 bits,
# first on the training features and then on the test features.
downcast_pairs = (('int64', 'int32'), ('float64', 'float32'))
for frame in (tt1, tt2):
    col_dtypes = frame.dtypes
    for wide, narrow in downcast_pairs:
        for col in col_dtypes[col_dtypes == wide].index:
            frame[col] = frame[col].astype(narrow)

In [29]:
# Persist the uid second-order features.  The with-blocks close (and flush)
# each file deterministically; a bare open() handed to pickle.dump is never
# closed explicitly.
with open(f'{feature_path}/train_kfold_uid_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt1, fh)
with open(f'{feature_path}/test_kfold_uid_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt2, fh)

## qid 交叉

In [30]:
# Reload the cached per-day qid label rate and append it to the training frame.
# NOTE: pickle.load executes arbitrary code from the file; only load caches
# written by this notebook.  The with-block closes the handle afterwards.
with open('./temp_label_feat/qid_day.pkl', 'rb') as fh:
    t1 = pickle.load(fh)
t1 = pd.concat([train, t1], axis=1)
logging.info('t1 shape: %s', t1.shape)

[2019-12-13 04:28:01,454] INFO in <ipython-input-30-cb9d4300239c>: t1 shape: (9489162, 31)


In [31]:
# Column whose values are aggregated (the per-day qid label rate).
base = ['qid_day_labelrate']
# Columns whose distinct values define the groups for the aggregation.
other = ['gender', 'freq', 'score', 
         'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c5', 
         'diff_iq_day', 'diff_iq_hour', 'qu_topic_count', 'qu_topic_count_weight']
# tt1 holds the generated training columns, tt2 the matching test columns.
tt1, tt2 = kfold_2order_label_stat(t1, test, base, other)
logging.info('tt1 shape: %s, tt2 shape: %s', tt1.shape, tt2.shape)

[2019-12-13 04:28:14,441] INFO in <ipython-input-24-58a7414427f1>: at gender
[2019-12-13 04:29:02,696] INFO in <ipython-input-24-58a7414427f1>: at freq
[2019-12-13 04:29:58,384] INFO in <ipython-input-24-58a7414427f1>: at score
[2019-12-13 04:30:35,746] INFO in <ipython-input-24-58a7414427f1>: at uf_b1
[2019-12-13 04:31:12,610] INFO in <ipython-input-24-58a7414427f1>: at uf_b2
[2019-12-13 04:31:50,238] INFO in <ipython-input-24-58a7414427f1>: at uf_b3
[2019-12-13 04:32:26,850] INFO in <ipython-input-24-58a7414427f1>: at uf_b4
[2019-12-13 04:33:04,261] INFO in <ipython-input-24-58a7414427f1>: at uf_b5
[2019-12-13 04:33:42,460] INFO in <ipython-input-24-58a7414427f1>: at uf_c5
[2019-12-13 04:34:37,259] INFO in <ipython-input-24-58a7414427f1>: at diff_iq_day
[2019-12-13 04:35:16,552] INFO in <ipython-input-24-58a7414427f1>: at diff_iq_hour
[2019-12-13 04:35:58,573] INFO in <ipython-input-24-58a7414427f1>: at qu_topic_count
[2019-12-13 04:36:42,049] INFO in <ipython-input-24-58a7414427f1>:

In [32]:
# Drop every column whose non-null values are all identical
# (judged on the training features, removed from both frames).
constant_cols = [col for col in tt1.columns if len(tt1[col].value_counts()) == 1]
for col in constant_cols:
    print(col)
    del tt1[col], tt2[col]
gc.collect()

0

In [33]:
# Compress: downcast 64-bit integer and float columns to 32 bits,
# first on the training features and then on the test features.
downcast_pairs = (('int64', 'int32'), ('float64', 'float32'))
for frame in (tt1, tt2):
    col_dtypes = frame.dtypes
    for wide, narrow in downcast_pairs:
        for col in col_dtypes[col_dtypes == wide].index:
            frame[col] = frame[col].astype(narrow)

In [34]:
# Persist the qid second-order features.  The with-blocks close (and flush)
# each file deterministically; a bare open() handed to pickle.dump is never
# closed explicitly.
with open(f'{feature_path}/train_kfold_qid_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt1, fh)
with open(f'{feature_path}/test_kfold_qid_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt2, fh)

## 二分类交叉

In [None]:
# Reload the cached per-day uf_b2 label rate and append it to the training frame.
# NOTE: pickle.load executes arbitrary code from the file; only load caches
# written by this notebook.  The with-block closes the handle afterwards.
with open('./temp_label_feat/uf_b2_day.pkl', 'rb') as fh:
    t1 = pickle.load(fh)
t1 = pd.concat([train, t1], axis=1)
logging.info('t1 shape: %s', t1.shape)

In [None]:
# Column whose values are aggregated (the per-day uf_b2 label rate).
base = ['uf_b2_day_labelrate']
# Columns whose distinct values define the groups for the aggregation.
other = ['uf_b1', 'uf_b3']
# tt1 holds the generated training columns, tt2 the matching test columns.
tt1, tt2 = kfold_2order_label_stat(t1, test, base, other)
logging.info('tt1 shape: %s, tt2 shape: %s', tt1.shape, tt2.shape)

In [None]:
# Drop every column whose non-null values are all identical
# (judged on the training features, removed from both frames).
constant_cols = [col for col in tt1.columns if len(tt1[col].value_counts()) == 1]
for col in constant_cols:
    print(col)
    del tt1[col], tt2[col]

In [None]:
# Compress: downcast 64-bit integer and float columns to 32 bits,
# first on the training features and then on the test features.
downcast_pairs = (('int64', 'int32'), ('float64', 'float32'))
for frame in (tt1, tt2):
    col_dtypes = frame.dtypes
    for wide, narrow in downcast_pairs:
        for col in col_dtypes[col_dtypes == wide].index:
            frame[col] = frame[col].astype(narrow)

In [None]:
# Persist the uf_b2 second-order features under their own file names.
# The original wrote to '..._kfold_uid_2order_label_feature.pkl', which
# would overwrite the uid cross features exported earlier in this notebook.
# NOTE(review): any downstream loader of these features must use the
# 'uf_b2' file names below.
# The with-blocks close (and flush) each file deterministically.
with open(f'{feature_path}/train_kfold_uf_b2_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt1, fh)
with open(f'{feature_path}/test_kfold_uf_b2_2order_label_feature.pkl', 'wb') as fh:
    pickle.dump(tt2, fh)