In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
import logging
import multiprocessing
import traceback
import pickle
import gc

In [2]:
# Log line layout: timestamp, level, originating module, then the message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
# Configure the root logger with that layout at INFO level.
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [3]:
# Directory the raw input tables are read from.
base_path = './data'
# Directory derived feature files are read from and written to.
feature_path = './feature'

In [4]:
# Labelled invitation table (tab-separated, no header row).
train = pd.read_csv(
    f'{base_path}/invite_info_0926.txt',
    sep='\t',
    header=None,
    names=['qid', 'uid', 'dt', 'label'],
)
logging.info("invite %s", train.shape)

# Evaluation invitation table: same layout without the label column.
test = pd.read_csv(
    f'{base_path}/invite_info_evaluate_0926.txt',
    sep='\t',
    header=None,
    names=['qid', 'uid', 'dt'],
)
logging.info("test %s", test.shape)

[2019-12-10 09:13:00,413] INFO in <ipython-input-4-af34e33feb4f>: invite (9489162, 4)
[2019-12-10 09:13:02,602] INFO in <ipython-input-4-af34e33feb4f>: test (1141683, 3)


In [5]:
def extract_day(s):
    """Return, per element, the integer encoded in the part before the first '-'.

    Each string is split on '-'; the first piece has its leading character
    dropped and the remainder is parsed as an int
    (e.g. 'D3865-H22' -> 3865, assuming that is the timestamp layout).
    """
    def _day_of(stamp):
        day_token = stamp.split('-')[0]
        return int(day_token[1:])

    return s.apply(_day_of)


def extract_hour(s):
    """Return, per element, the integer encoded in the part after the first '-'.

    Each string is split on '-'; the second piece has its leading character
    dropped and the remainder is parsed as an int
    (e.g. 'D3865-H22' -> 22, assuming that is the timestamp layout).
    """
    def _hour_of(stamp):
        hour_token = stamp.split('-')[1]
        return int(hour_token[1:])

    return s.apply(_hour_of)

# Derive day / week / hour columns on both frames, then drop the raw 'dt' string.
for frame in (train, test):
    frame['day'] = extract_day(frame['dt'])
    frame['week'] = frame['day'] % 7
    frame['hour'] = extract_hour(frame['dt'])
    del frame['dt']

In [6]:
# Load the user (member) table
user = pd.read_csv(
    f'{base_path}/member_info_0926.txt',
    sep='\t',
    header=None,
    names=[
        'uid', 'gender', 'freq',
        'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5',
        'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
        'score', 'follow_topic', 'inter_topic',
    ],
)

# The two topic columns are not used in this notebook.
user = user.drop(columns=['follow_topic', 'inter_topic'])
logging.info("user %s", user.shape)

[2019-12-10 09:13:39,852] INFO in <ipython-input-6-cbb4bab8955e>: user (1931654, 14)


In [7]:
# Attach the user columns to every invitation row (left join on uid).
train = train.merge(user, how='left', on='uid')
test = test.merge(user, how='left', on='uid')

In [8]:
# Load the question table
ques = pd.read_csv(
    f'{base_path}/question_info_0926.txt',
    sep='\t',
    header=None,
    names=['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic'],
)
# Only the id and the timestamp are kept.
ques = ques.drop(columns=['title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic'])
logging.info("ques %s", ques.shape)

# Split the timestamp into day / hour / week columns, then drop the raw string.
ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
ques['q_week'] = ques['q_day'] % 7

ques = ques.drop(columns=['q_dt'])

[2019-12-10 09:14:50,655] INFO in <ipython-input-8-3053b1f3a755>: ques (1829900, 2)


In [None]:
# Attach the question time columns to every invitation row (left join on qid).
train = train.merge(ques, how='left', on='qid')
test = test.merge(ques, how='left', on='qid')

In [None]:
# Gap between the invitation and the question timestamps, in days and in hours.
for frame in (train, test):
    day_gap = frame['day'] - frame['q_day']
    hour_gap = frame['hour'] - frame['q_hour']
    frame['diff_iq_day'] = day_gap
    frame['diff_iq_hour'] = frame['diff_iq_day'] * 24 + hour_gap

In [None]:
def diff_iq_day_map(x):
    """Clamp a day gap to the range [0, 31].

    Negative values become 0, values of 31 or more become 31, and anything
    in between is returned unchanged.
    """
    if x < 0:
        return 0
    return 31 if x >= 31 else x

# Clamp the day gap on both frames.
for frame in (train, test):
    frame['diff_iq_day'] = frame['diff_iq_day'].apply(diff_iq_day_map)

def diff_iq_hour_map(x):
    """Bucket an hour gap into 5-hour bins.

    Negative values map to 0, values above 200 map to 40, and everything
    else maps to ``x // 5``.
    """
    if x > 200:
        return 40
    elif x < 0:
        return 0
    else:
        return x // 5
# Bucket the hour gap on BOTH frames with the same mapping.
# Fix: the test frame used to re-apply the day mapping to 'diff_iq_day' here,
# which left test['diff_iq_hour'] unbucketed and inconsistent with train.
train['diff_iq_hour'] = train['diff_iq_hour'].apply(diff_iq_hour_map)
test['diff_iq_hour'] = test['diff_iq_hour'].apply(diff_iq_hour_map)

In [None]:
def score_map(x):
    """Map a raw score to an ordinal bucket from -1 to 7.

    Buckets are defined by inclusive upper bounds:
    <=280 -> -1, <=300 -> 0, <=350 -> 1, <=400 -> 2, <=500 -> 3,
    <=600 -> 4, <=700 -> 5, <=800 -> 6, anything else -> 7.
    A value that compares false against every bound (e.g. NaN) also yields 7.
    """
    upper_bounds = (280, 300, 350, 400, 500, 600, 700, 800)
    for bucket, bound in enumerate(upper_bounds, start=-1):
        if x <= bound:
            return bucket
    return 7

# Bucket the score on both frames.
# Fix: only train used to be bucketed. 'score' is one of the columns whose
# label statistics are computed on train and then joined onto the test frame
# by value, so the test column has to use the same buckets.
train['score'] = train['score'].apply(score_map)
test['score'] = test['score'].apply(score_map)

In [None]:
# Load invite feature 2: intersection_ft_count, intersection_it_count.
# The columns are appended side by side (axis=1), so the feature files are
# presumably row-aligned with train / test -- TODO confirm.
t1 = pd.read_csv(
    f'{feature_path}/train_invite_feature_2.txt',
    usecols=['intersection_ft_count', 'intersection_it_count'],
    sep='\t',
)
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(
    f'{feature_path}/test_invite_feature_2.txt',
    usecols=['intersection_ft_count', 'intersection_it_count'],
    sep='\t',
)
test = pd.concat([test, t1], axis=1)

In [None]:
# Bin intersection_ft_count
def to_bin_1(x):
    """Cap a count at 3: values of 3 or more become 3, others pass through."""
    return 3 if x >= 3 else x

# Cap intersection_ft_count on both frames.
for frame in (train, test):
    frame['intersection_ft_count'] = frame['intersection_ft_count'].apply(to_bin_1)

# Bin intersection_it_count
def to_bin_2(x):
    """Cap a count at 4: values of 4 or more become 4, others pass through."""
    return 4 if x >= 4 else x

# Cap intersection_it_count on both frames.
for frame in (train, test):
    frame['intersection_it_count'] = frame['intersection_it_count'].apply(to_bin_2)

In [None]:
# 4-fold statistics
def fold_fn(x):
    """Return the fold index (0-3) whose inclusive day range contains `x`.

    Ranges: 3838-3846 -> 0, 3847-3853 -> 1, 3854-3860 -> 2, 3861-3867 -> 3.
    Any other value returns None.
    """
    day_ranges = ((3838, 3846), (3847, 3853), (3854, 3860), (3861, 3867))
    for fold_id, (first_day, last_day) in enumerate(day_ranges):
        if first_day <= x <= last_day:
            return fold_id
    return None

In [None]:
# Assign every training row to a fold based on its 'day' value (see fold_fn).
train['fold'] = train['day'].apply(fold_fn)

In [None]:
# First-order (single-feature) out-of-fold label statistics for the training set
def single_train_feat(df_, feat):
    """Compute out-of-fold label statistics of `feat` for every training row.

    For each fold i in 0..3, the count / mean / sum / std of 'label' are
    computed per value of `feat` on the rows whose 'fold' differs from i and
    written onto the rows of fold i. Groups with fewer than 5 rows keep their
    count and sum but get NaN for mean and std.

    Parameters
    ----------
    df_ : DataFrame with at least the columns `feat`, 'fold' and 'label'.
        It is copied and never modified.
    feat : name of the column to group by.

    Returns
    -------
    DataFrame with the same index as `df_` and four columns:
    `{feat}_kfold_count` and `{feat}_label_sum` (int32, missing values filled
    with 0) and `{feat}_label_mean` and `{feat}_label_std` (float32).
    Rows whose 'fold' is not one of 0..3 keep the placeholder -10000.

    NOTE(review): the test-side counterpart also emits per-hour and per-week
    variants; they are not produced here, so train and test end up with a
    different number of columns per feature -- confirm this is intended.
    """
    df = df_.copy()
    extract_feat_1 = [feat+'_kfold_count', feat+'_label_mean', feat+'_label_sum', feat+'_label_std']
    # Fix: `extract_feat` was used below without ever being defined, so every
    # call raised NameError. Only the plain per-feature statistics are produced.
    extract_feat = extract_feat_1
    for c in extract_feat:
        # Float placeholder: the statistics written below are floats, so the
        # columns never need an implicit int -> float upcast.
        df[c] = -10000.0
    for i in range(4):
        in_fold = df['fold'] == i
        # Statistics come from every row that is NOT in fold i.
        t1 = df[df['fold'] != i].groupby(feat)['label'].agg(['count', 'mean', 'sum', 'std']).reset_index()
        # Mean / std of very small groups are treated as missing.
        t1.loc[t1['count'] < 5, ['mean', 'std']] = np.nan
        t1.columns = [feat] + extract_feat_1
        # The left merge keeps the row order of the fold, so positional
        # assignment through .values lines up with df.loc[in_fold].
        df.loc[in_fold, extract_feat_1] = pd.merge(df.loc[in_fold, feat], t1, on=feat,
                                                   how='left')[extract_feat_1].values
    # Data compression: even positions (count, sum) -> int32, odd (mean, std) -> float32
    for c in range(0, len(extract_feat), 2):
        df[extract_feat[c]] = df[extract_feat[c]].fillna(0).astype('int32')
    for c in range(1, len(extract_feat), 2):
        df[extract_feat[c]] = df[extract_feat[c]].astype('float32')

    return df[extract_feat]

In [None]:
# Columns for which single-feature label statistics are computed.
single_targets = ['uid', 'qid', 'freq', 'score', 
                  'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                  'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
                  'diff_iq_day', 'diff_iq_hour', 
                  'intersection_ft_count', 'intersection_it_count']

In [None]:
# Pool size: one worker process per target feature.
n_proc = len(single_targets)

def kfold_worker_1(df, feat):
    """Compute the training label statistics for one feature and pickle them.

    Runs ``single_train_feat(df, feat)`` and writes the result to
    ``{feature_path}/single_kfold_feat/train_{feat}.pkl``.

    Any exception is logged with its traceback and then suppressed, so one
    failing feature does not take the other features down with it.
    """
    try:
        t1 = single_train_feat(df, feat)
        logging.info('%s, feature shape: %s', feat, t1.shape)

        # The context manager guarantees the file is flushed and closed.
        with open(f'{feature_path}/single_kfold_feat/train_{feat}.pkl', 'wb') as fout:
            pickle.dump(t1, fout)
        logging.info('%s feature saved!', feat)
        del t1
        gc.collect()
    except Exception:
        # Fix: the handler used to call traceback.print_exct(), which does not
        # exist, so the handler itself raised and the real error was lost.
        logging.exception('kfold_worker_1 failed for feature %s', feat)

def multi_proc_train(df, feat_list):
    """Run kfold_worker_1 for every feature in `feat_list` on a process pool.

    One task per feature is submitted asynchronously to a pool of `n_proc`
    processes. The call returns once the pool has been closed and joined.
    The async results are not inspected.
    """
    workers = multiprocessing.Pool(processes=n_proc)
    for feature_name in feat_list:
        workers.apply_async(kfold_worker_1, args=(df, feature_name))
    workers.close()
    workers.join()


In [None]:
# Compute and pickle the training-side label statistics for every target column.
multi_proc_train(train, single_targets)

In [None]:
# First-order (single-feature) label statistics for the test set
def single_test_feat(df, feat):
    """Aggregate 'label' over the whole of `df` by `feat`, (feat, hour), (feat, week).

    For each grouping the count / mean / sum / std of 'label' are computed.
    Groups with fewer than 5 rows get NaN for mean and std. Count and sum are
    then scaled by 23/30 and truncated to int32; mean and std become float32.

    Returns a tuple of three DataFrames ``(t1, t2, t3)`` keyed by ``[feat]``,
    ``[feat, 'hour']`` and ``[feat, 'week']`` respectively, each followed by
    its four statistic columns.
    """
    stat_names = ['count', 'mean', 'sum', 'std']
    base_cols = [feat + tail for tail in
                 ('_kfold_count', '_label_mean', '_label_sum', '_label_std')]
    hour_cols = [feat + tail for tail in
                 ('_kfold_hour_count', '_label_hour_mean', '_label_hour_sum', '_label_hour_std')]
    week_cols = [feat + tail for tail in
                 ('_kfold_week_count', '_label_week_mean', '_label_week_sum', '_label_week_std')]

    def _aggregate(by, key_cols, out_cols):
        # One grouped statistics table, renamed and compressed.
        stats = df.groupby(by)['label'].agg(stat_names).reset_index()
        stats.loc[stats['count'] < 5, ['mean', 'std']] = np.nan
        stats.columns = key_cols + out_cols

        count_col, mean_col, sum_col, std_col = out_cols
        # Data compression: scaled count / sum -> int32, mean / std -> float32
        for name in (count_col, sum_col):
            stats[name] = (stats[name] * 23 / 30).astype('int32')
        for name in (mean_col, std_col):
            stats[name] = stats[name].astype('float32')
        return stats

    by_feat = _aggregate(feat, [feat], base_cols)
    by_feat_hour = _aggregate([feat, 'hour'], [feat, 'hour'], hour_cols)
    by_feat_week = _aggregate([feat, 'week'], [feat, 'week'], week_cols)
    return by_feat, by_feat_hour, by_feat_week

In [None]:
# Pool size: one worker process per target feature (same expression as the earlier cell).
n_proc = len(single_targets)

def kfold_worker_2(train_df, feat):
    """Compute the test-side label statistics for one feature and pickle them.

    Runs ``single_test_feat(train_df, feat)`` and writes the three returned
    tables to ``{feature_path}/single_kfold_feat/test_{feat}_t1.pkl``,
    ``..._t2.pkl`` and ``..._t3.pkl``.

    Any exception is logged with its traceback and then suppressed, so one
    failing feature does not take the other features down with it.
    """
    try:
        t1, t2, t3 = single_test_feat(train_df, feat)
        logging.info('%s, feature shape: %s', feat, t1.shape)

        # Context managers guarantee each file is flushed and closed.
        with open(f'{feature_path}/single_kfold_feat/test_{feat}_t1.pkl', 'wb') as fout:
            pickle.dump(t1, fout)
        with open(f'{feature_path}/single_kfold_feat/test_{feat}_t2.pkl', 'wb') as fout:
            pickle.dump(t2, fout)
        with open(f'{feature_path}/single_kfold_feat/test_{feat}_t3.pkl', 'wb') as fout:
            pickle.dump(t3, fout)
        logging.info('%s feature saved!', feat)
        del t1, t2, t3
        gc.collect()
    except Exception:
        # Fix: the handler used to call traceback.print_exct(), which does not
        # exist, so the handler itself raised and the real error was lost.
        logging.exception('kfold_worker_2 failed for feature %s', feat)

def multi_proc_test(train_df, feat_list):
    """Run kfold_worker_2 for every feature in `feat_list` on a process pool.

    One task per feature is submitted asynchronously to a pool of `n_proc`
    processes. The call returns once the pool has been closed and joined.
    The async results are not inspected.
    """
    workers = multiprocessing.Pool(processes=n_proc)
    for feature_name in feat_list:
        workers.apply_async(kfold_worker_2, args=(train_df, feature_name))
    workers.close()
    workers.join()

In [None]:
# Compute and pickle the full-train aggregates that are later merged onto the test set.
multi_proc_test(train, single_targets)

In [None]:
def my_merge(test_df, feat_df_list, feat):
    """Left-join the three statistics tables of `feat` onto `test_df`.

    Parameters
    ----------
    test_df : DataFrame containing the columns `feat`, 'hour' and 'week'.
    feat_df_list : sequence of three DataFrames keyed by ``[feat]``,
        ``[feat, 'hour']`` and ``[feat, 'week']``, in that order.
    feat : name of the feature column.

    Returns
    -------
    DataFrame with one row per row of `test_df` and the twelve statistic
    columns only. Count / sum columns are int32 with missing values filled
    with 0; mean / std columns are float32 and keep NaN for unmatched keys.
    """
    extract_feat_1 = [feat+'_kfold_count', feat+'_label_mean', feat+'_label_sum', feat+'_label_std']
    extract_feat_2 = [feat+'_kfold_hour_count', feat+'_label_hour_mean',
                      feat+'_label_hour_sum', feat+'_label_hour_std']
    extract_feat_3 = [feat+'_kfold_week_count', feat+'_label_week_mean',
                      feat+'_label_week_sum', feat+'_label_week_std']
    extract_feat = extract_feat_1 + extract_feat_2 + extract_feat_3
    # Fix: merge onto the frame that was passed in. The notebook-global `test`
    # was used here, so the `test_df` argument was silently ignored.
    t1 = pd.merge(test_df, feat_df_list[0], on=[feat], how='left')
    t1 = pd.merge(t1, feat_df_list[1], on=[feat, 'hour'], how='left')
    t1 = pd.merge(t1, feat_df_list[2], on=[feat, 'week'], how='left')
    # Even positions are count / sum columns, odd positions are mean / std.
    for i in range(0, len(extract_feat), 2):
        t1[extract_feat[i]] = t1[extract_feat[i]].fillna(0).astype('int32')
    for i in range(1, len(extract_feat), 2):
        t1[extract_feat[i]] = t1[extract_feat[i]].astype('float32')

    return t1[extract_feat]

In [None]:
def merge_worker(test_df, feat):
    """Merge the pickled statistics tables of `feat` onto `test_df` and save the result.

    Loads ``test_{feat}_t1.pkl``, ``test_{feat}_t2.pkl`` and
    ``test_{feat}_t3.pkl`` from ``{feature_path}/single_kfold_feat``, joins
    them onto `test_df` with my_merge, and writes the merged columns to
    ``test_{feat}_merged.pkl`` in the same directory.

    Failures are logged with their traceback and re-raised. When run through
    apply_async without inspecting the result, the log line is the only
    visible trace of an error.
    """
    try:
        l1 = []
        # NOTE: pickle must only be loaded from trusted files; the code here
        # reads the files this notebook's own workers write.
        for suffix in ('t1', 't2', 't3'):
            with open(f'{feature_path}/single_kfold_feat/test_{feat}_{suffix}.pkl', 'rb') as fin:
                l1.append(pickle.load(fin))
        t1 = my_merge(test_df, l1, feat)
        logging.info('merged %s feature, shape: %s', feat, t1.shape)
        # The context manager guarantees the file is flushed and closed.
        with open(f'{feature_path}/single_kfold_feat/test_{feat}_merged.pkl', 'wb') as fout:
            pickle.dump(t1, fout)
    except Exception:
        logging.exception('merge_worker failed for feature %s', feat)
        raise

def multi_proc_merge(test_df, feat_list):
    """Run merge_worker for every feature in `feat_list` on a process pool.

    One task per feature is submitted asynchronously to a pool of `n_proc`
    processes. The call returns once the pool has been closed and joined.
    The async results are not inspected.
    """
    workers = multiprocessing.Pool(processes=n_proc)
    for feature_name in feat_list:
        workers.apply_async(merge_worker, args=(test_df, feature_name))
    workers.close()
    workers.join()

In [None]:
# Reference copy of the target list (commented out; the live definition is in an earlier cell):
# single_targets = ['uid', 'qid', 'freq', 'score', 
#                   'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
#                   'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
#                   'diff_iq_day', 'diff_iq_hour', 
#                   'intersection_ft_count', 'intersection_it_count']
# Merge the pickled statistics onto the test rows for every target column.
multi_proc_merge(test, single_targets)

In [None]:
# Assemble the per-feature pickles into one train and one test feature matrix.
def _load_pickle(path):
    """Unpickle one object from `path`, closing the file afterwards.

    NOTE: pickle must only be loaded from trusted files; the files read here
    are the ones written by this notebook's own workers.
    """
    with open(path, 'rb') as fin:
        return pickle.load(fin)


# Fix: build every path from feature_path (the first two loads hard-coded
# 'feature/...') and close every file handle that is opened.
t1 = _load_pickle(f'{feature_path}/single_kfold_feat/train_uid.pkl')
t2 = _load_pickle(f'{feature_path}/single_kfold_feat/test_uid_merged.pkl')
# Remaining target columns; 'uid' is already loaded above.
t3 = ['qid', 'freq', 'score', 'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5',
      'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'diff_iq_day', 'diff_iq_hour',
      'intersection_ft_count', 'intersection_it_count']
for f in t3:
    logging.info('adding kfold label feature, at: %s', f)

    # Columns are appended side by side; rows are matched on the index.
    tt1 = _load_pickle(f'{feature_path}/single_kfold_feat/train_{f}.pkl')
    t1 = pd.concat([t1, tt1], axis=1)
    logging.info('train shape: %s', t1.shape)

    tt1 = _load_pickle(f'{feature_path}/single_kfold_feat/test_{f}_merged.pkl')
    t2 = pd.concat([t2, tt1], axis=1)
    logging.info('test shape: %s', t2.shape)

with open(f'{feature_path}/train_kfold_label_feature.pkl', 'wb') as fout:
    pickle.dump(t1, fout)
with open(f'{feature_path}/test_kfold_label_feature.pkl', 'wb') as fout:
    pickle.dump(t2, fout)

In [None]:
# Pairs of feature columns -- presumably for second-order (pairwise) label
# statistics; their use is outside this view (TODO confirm).
pair_feat = [['uf_b1', 'uf_b2'], ['uf_b2', 'uf_b3'],]