In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
import logging
import multiprocessing
import traceback
import pickle
import gc

In [3]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [4]:
base_path = './data'
feature_path = './feature'

In [5]:
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

[2019-12-09 14:30:21,837] INFO in <ipython-input-5-af34e33feb4f>: invite (9489162, 4)
[2019-12-09 14:30:23,302] INFO in <ipython-input-5-af34e33feb4f>: test (1141683, 3)


In [6]:
def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))


def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

train['day'] = extract_day(train['dt'])
train['week'] = train['day'] % 7
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['week'] = test['day'] % 7
test['hour'] = extract_hour(test['dt'])

del train['dt'], test['dt']

In [7]:
# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'freq', 'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',  'score', 'follow_topic', 'inter_topic']

del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

[2019-12-09 14:31:10,187] INFO in <ipython-input-7-cbb4bab8955e>: user (1931654, 14)


In [8]:
# merge user
train = pd.merge(train, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')

In [9]:
# 加载问题
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2'], ques['topic']
logging.info("ques %s", ques.shape)

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
ques['q_week'] = ques['q_day'] % 7

del ques['q_dt']

[2019-12-09 14:31:48,056] INFO in <ipython-input-9-3053b1f3a755>: ques (1829900, 2)


In [10]:
# merge ques
train = pd.merge(train, ques, on='qid', how='left')
test = pd.merge(test, ques, on='qid', how='left')

In [11]:
train['diff_iq_day'] = train['day'] - train['q_day']
train['diff_iq_hour'] = train['diff_iq_day'] * 24 + (train['hour'] - train['q_hour'])

test['diff_iq_day'] = test['day'] - test['q_day']
test['diff_iq_hour'] = test['diff_iq_day'] * 24 + (test['hour'] - test['q_hour'])

In [12]:
# 加载 invete feature 2: intersection_ft_count, intersection_it_count
t1 = pd.read_csv(f'{feature_path}/train_invite_feature_2.txt', sep='\t', 
                 usecols=['intersection_ft_count', 'intersection_it_count'])
train = pd.concat([train, t1], axis=1)

t1 = pd.read_csv(f'{feature_path}/test_invite_feature_2.txt', sep='\t', 
                 usecols=['intersection_ft_count', 'intersection_it_count'])
test = pd.concat([test, t1], axis=1)

In [13]:
# 划分 intersection_ft_count
def to_bin_1(x):
    if x>=3:
        return 3
    return x

train['intersection_ft_count'] = train['intersection_ft_count'].apply(to_bin_1)
test['intersection_ft_count'] = test['intersection_ft_count'].apply(to_bin_1)

# 划分 intersection_it_count
def to_bin_2(x):
    if x>=4:
        return 4
    return x

train['intersection_it_count'] = train['intersection_it_count'].apply(to_bin_2)
test['intersection_it_count'] = test['intersection_it_count'].apply(to_bin_2)

In [14]:
train['label'].mean()

0.1773515933229931

In [15]:
# intersection_ft_count 有0~5，划分为0~3 
for i in range(0, 3):
    print(train[train['intersection_ft_count']==i]['label'].mean())
print(train[train['intersection_ft_count']>=3]['label'].mean())

0.17087539518378606
0.19507707344646075
0.22500282617325626
0.2665859302869339


In [16]:
# intersection_ft_count 有0~5，划分为0~4
for i in range(0, 4):
    print(train[train['intersection_it_count']==i]['label'].mean())
print(train[train['intersection_it_count']>=4]['label'].mean())

0.18145724765232774
0.13238402139873304
0.1459849618021744
0.19836384254009395
0.2152317880794702


In [17]:
# 4折统计
def fold_fn(x):
    if 3838<=x<=3846:
        return 0
    if 3847<=x<=3853:
        return 1
    if 3854<=x<=3860:
        return 2
    if 3861<=x<=3867:
        return 3

In [18]:
train['fold'] = train['day'].apply(fold_fn)

In [130]:
# train 一阶
def single_train_feat(df_, feat):
    df = df_.copy()
    extract_feat_1 = [feat+'_kfold_count', feat+'_label_mean', feat+'_label_sum', feat+'_label_std']
    extract_feat_2 = [feat+'_kfold_hour_count', feat+'_label_hour_mean', 
                      feat+'_label_hour_sum', feat+'_label_hour_std']
    extract_feat_3 = [feat+'_kfold_week_count', feat+'_label_week_mean', 
                      feat+'_label_week_sum', feat+'_label_week_std']
    extract_feat = extract_feat_1 + extract_feat_2 + extract_feat_3
    for c in extract_feat:
        df[c] = -10000
    for i in range(4):
        t1 = df[df['fold']!=i].groupby(feat)['label'].agg(['count', 'mean', 'sum', 'std']).reset_index()
        t1.columns = [feat] + extract_feat_1
        df.loc[df['fold']==i, extract_feat_1] = pd.merge(df.loc[df['fold']==i, feat], t1, on=feat, 
                                                         how='left')[extract_feat_1].values
        # 某小时
        t1 = df[df['fold']!=i].groupby([feat, 'hour'])['label'].agg(['count', 
                                                                     'mean', 'sum', 'std']).reset_index()
        t1.columns = [feat, 'hour'] + extract_feat_2
        df.loc[df['fold']==i, extract_feat_2] = pd.merge(df.loc[df['fold']==i, [feat, 'hour']], 
                                                         t1, on=[feat, 'hour'], 
                                                         how='left')[extract_feat_2].values
        # 一周的某一天
        t1 = df[df['fold']!=i].groupby([feat, 'week'])['label'].agg(['count', 
                                                                     'mean', 'sum', 'std']).reset_index()
        t1.columns = [feat, 'week'] + extract_feat_3
        df.loc[df['fold']==i, extract_feat_3] = pd.merge(df.loc[df['fold']==i, [feat, 'week']], 
                                                         t1, on=[feat, 'week'], 
                                                         how='left')[extract_feat_3].values
    # 数据压缩
    for c in range(0, len(extract_feat), 2):
        df[extract_feat[c]] = df[extract_feat[c]].fillna(0).astype('int32')
    for c in range(1, len(extract_feat), 2):
        df[extract_feat[c]] = df[extract_feat[c]].astype('float32')

    return df[extract_feat]

In [19]:
single_targets = ['uid', 'qid', 'freq', 'score', 
                  'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
                  'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
                  'diff_iq_day', 'diff_iq_hour', 
                  'intersection_ft_count', 'intersection_it_count']

In [132]:
n_proc = len(single_targets)

def kfold_worker_1(df, feat):
    try:
        t1 = single_train_feat(df, feat)
        logging.info('%s, feature shape: %s', feat, t1.shape)
        
        pickle.dump(t1, open(f'{feature_path}/single_kfold_feat/train_{feat}.pkl', 'wb'))
        logging.info('%s feature saved!', feat)
        del t1
        gc.collect()
    except:
        print(traceback.print_exct())

def multi_proc_train(df, feat_list):
    pool = multiprocessing.Pool(processes=n_proc)
    for f in feat_list:
        pool.apply_async(kfold_worker_1, (df, f))
    pool.close()
    pool.join()


In [133]:
multi_proc_train(train, single_targets)

[2019-12-09 11:42:28,118] INFO in <ipython-input-132-b9d848fe07d8>: freq, feature shape: (9489162, 12)
[2019-12-09 11:42:29,621] INFO in <ipython-input-132-b9d848fe07d8>: freq feature saved!
[2019-12-09 11:43:02,866] INFO in <ipython-input-132-b9d848fe07d8>: score, feature shape: (9489162, 12)
[2019-12-09 11:43:04,536] INFO in <ipython-input-132-b9d848fe07d8>: score feature saved!
[2019-12-09 11:45:00,648] INFO in <ipython-input-132-b9d848fe07d8>: qid, feature shape: (9489162, 12)
[2019-12-09 11:45:02,699] INFO in <ipython-input-132-b9d848fe07d8>: qid feature saved!
[2019-12-09 11:45:13,097] INFO in <ipython-input-132-b9d848fe07d8>: uf_b1, feature shape: (9489162, 12)
[2019-12-09 11:45:14,609] INFO in <ipython-input-132-b9d848fe07d8>: uf_b1 feature saved!
[2019-12-09 11:46:02,426] INFO in <ipython-input-132-b9d848fe07d8>: uid, feature shape: (9489162, 12)
[2019-12-09 11:46:04,782] INFO in <ipython-input-132-b9d848fe07d8>: uid feature saved!
[2019-12-09 11:46:21,583] INFO in <ipython-in

In [33]:
# test 一阶
def single_test_feat(df, feat):
    extract_feat_1 = [feat+'_kfold_count', feat+'_label_mean', feat+'_label_sum', feat+'_label_std']
    extract_feat_2 = [feat+'_kfold_hour_count', feat+'_label_hour_mean', 
                      feat+'_label_hour_sum', feat+'_label_hour_std']
    extract_feat_3 = [feat+'_kfold_week_count', feat+'_label_week_mean', 
                      feat+'_label_week_sum', feat+'_label_week_std']
    extract_feat = extract_feat_1 + extract_feat_2 + extract_feat_3
    
    t1 = df.groupby(feat)['label'].agg(['count', 'mean', 'sum', 'std']).reset_index()
    t1.columns = [feat] + extract_feat_1
    
    t2 = df.groupby([feat, 'hour'])['label'].agg(['count', 'mean', 'sum', 'std']).reset_index()
    t2.columns = [feat, 'hour'] + extract_feat_2
    
    t3 = df.groupby([feat, 'week'])['label'].agg(['count', 'mean', 'sum', 'std']).reset_index()
    t3.columns = [feat, 'week'] + extract_feat_3
    
    # 数据压缩
    for c in range(0, 4, 2):
        t1[extract_feat_1[c]] = ((t1[extract_feat_1[c]])*23/30).astype('int32')
        t2[extract_feat_2[c]] = ((t2[extract_feat_2[c]])*23/30).astype('int32')
        t3[extract_feat_3[c]] = ((t3[extract_feat_3[c]])*23/30).astype('int32')
    for c in range(1, 4, 2):
        t1[extract_feat_1[c]] = t1[extract_feat_1[c]].astype('float32')
        t2[extract_feat_2[c]] = t2[extract_feat_2[c]].astype('float32')
        t3[extract_feat_3[c]] = t3[extract_feat_3[c]].astype('float32')
    
    return t1, t2, t3

In [35]:
l1, l2, l3 = single_test_feat(train[:10000], 'freq')

In [37]:
n_proc = len(single_targets)

def kfold_worker_2(train_df, feat):
    try:
        t1, t2, t3 = single_test_feat(train_df, feat)
        logging.info('%s, feature shape: %s', feat, t1.shape)
        
        pickle.dump(t1, open(f'{feature_path}/single_kfold_feat/test_{feat}_t1.pkl', 'wb'))
        pickle.dump(t2, open(f'{feature_path}/single_kfold_feat/test_{feat}_t2.pkl', 'wb'))
        pickle.dump(t3, open(f'{feature_path}/single_kfold_feat/test_{feat}_t3.pkl', 'wb'))
        logging.info('%s feature saved!', feat)
        del t1, t2, t3
        gc.collect()
    except:
        print(traceback.print_exct())

def multi_proc_test(train_df, feat_list):
    pool = multiprocessing.Pool(processes=n_proc)
    for f in feat_list:
        pool.apply_async(kfold_worker_2, (train_df, f))
    pool.close()
    pool.join()

In [40]:
multi_proc_test(train, single_targets)

[2019-12-09 14:51:38,532] INFO in <ipython-input-37-503dffd499d1>: qid, feature shape: (926203, 5)
[2019-12-09 14:51:40,116] INFO in <ipython-input-37-503dffd499d1>: uid, feature shape: (1358213, 5)
[2019-12-09 14:51:40,139] INFO in <ipython-input-37-503dffd499d1>: qid feature saved!
[2019-12-09 14:51:44,223] INFO in <ipython-input-37-503dffd499d1>: uid feature saved!
[2019-12-09 14:52:01,043] INFO in <ipython-input-37-503dffd499d1>: freq, feature shape: (5, 5)
[2019-12-09 14:52:01,051] INFO in <ipython-input-37-503dffd499d1>: freq feature saved!
[2019-12-09 14:52:09,769] INFO in <ipython-input-37-503dffd499d1>: score, feature shape: (724, 5)
[2019-12-09 14:52:09,778] INFO in <ipython-input-37-503dffd499d1>: score feature saved!
[2019-12-09 14:52:33,871] INFO in <ipython-input-37-503dffd499d1>: uf_b1, feature shape: (2, 5)
[2019-12-09 14:52:33,877] INFO in <ipython-input-37-503dffd499d1>: uf_b1 feature saved!
[2019-12-09 14:52:56,658] INFO in <ipython-input-37-503dffd499d1>: uf_b2, fea

In [42]:
def my_merge(test_df, feat_df_list, feat):
    extract_feat_1 = [feat+'_kfold_count', feat+'_label_mean', feat+'_label_sum', feat+'_label_std']
    extract_feat_2 = [feat+'_kfold_hour_count', feat+'_label_hour_mean', 
                      feat+'_label_hour_sum', feat+'_label_hour_std']
    extract_feat_3 = [feat+'_kfold_week_count', feat+'_label_week_mean', 
                      feat+'_label_week_sum', feat+'_label_week_std']
    extract_feat = extract_feat_1 + extract_feat_2 + extract_feat_3
    t1 = pd.merge(test, feat_df_list[0], on=[feat], how='left')
    t1 = pd.merge(t1, feat_df_list[1], on=[feat, 'hour'], how='left')
    t1 = pd.merge(t1, feat_df_list[2], on=[feat, 'week'], how='left')
    for i in range(0, len(extract_feat), 2):
        t1[extract_feat[i]] = t1[extract_feat[i]].fillna(0).astype('int32')
    for i in range(1, len(extract_feat), 2):
        t1[extract_feat[i]] = t1[extract_feat[i]].astype('float32')

    return t1[extract_feat]

In [43]:
l2 = []
l2.append(pickle.load(open(f'{feature_path}/single_kfold_feat/test_freq_t1.pkl', 'rb')))
l2.append(pickle.load(open(f'{feature_path}/single_kfold_feat/test_freq_t2.pkl', 'rb')))
l2.append(pickle.load(open(f'{feature_path}/single_kfold_feat/test_freq_t3.pkl', 'rb')))
t1 = my_merge(test[:60], l2, 'freq')

In [44]:
t1

Unnamed: 0,freq_kfold_count,freq_label_mean,freq_label_sum,freq_label_std,freq_kfold_hour_count,freq_label_hour_mean,freq_label_hour_sum,freq_label_hour_std,freq_kfold_week_count,freq_label_week_mean,freq_label_week_sum,freq_label_week_std
0,2941221,0.168021,494187,0.373885,188374,0.188355,35481,0.390996,379473,0.174408,66183,0.379461
1,3147340,0.159146,500885,0.365812,121117,0.128796,15599,0.334975,423071,0.151811,64226,0.358838
2,2941221,0.168021,494187,0.373885,179718,0.160438,28833,0.367012,473041,0.166582,78800,0.372603
3,115848,0.330607,38300,0.470433,563,0.489796,276,0.500236,17831,0.313685,5593,0.464000
4,2941221,0.168021,494187,0.373885,200781,0.134217,26948,0.340886,406345,0.157120,63844,0.363914
...,...,...,...,...,...,...,...,...,...,...,...,...
1141678,2941221,0.168021,494187,0.373885,119087,0.165896,19756,0.371989,413991,0.167266,69246,0.373214
1141679,856464,0.228575,195766,0.419915,41938,0.201400,8446,0.401050,125974,0.219275,27623,0.413757
1141680,3147340,0.159146,500885,0.365812,167377,0.147436,24677,0.354541,434280,0.153933,66850,0.360885
1141681,2941221,0.168021,494187,0.373885,279252,0.169203,47250,0.374932,408130,0.162868,66471,0.369246


In [46]:
def merge_worker(test_df, feat):
    l1 = []
    l1.append(pickle.load(open(f'{feature_path}/single_kfold_feat/test_{feat}_t1.pkl', 'rb')))
    l1.append(pickle.load(open(f'{feature_path}/single_kfold_feat/test_{feat}_t2.pkl', 'rb')))
    l1.append(pickle.load(open(f'{feature_path}/single_kfold_feat/test_{feat}_t3.pkl', 'rb')))
    t1 = my_merge(test_df, l1, feat)
    logging.info('merged %s feature, shape: %s', feat, t1.shape)
    pickle.dump(t1, open(f'{feature_path}/single_kfold_feat/test_{feat}_merged.pkl', 'wb'))

def multi_proc_merge(test_df, feat_list):
    pool = multiprocessing.Pool(processes=n_proc)
    for f in feat_list:
        pool.apply_async(merge_worker, (test_df, f))
    pool.close()
    pool.join()

In [47]:
# single_targets = ['uid', 'qid', 'freq', 'score', 
#                   'uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 
#                   'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5',
#                   'diff_iq_day', 'diff_iq_hour', 
#                   'intersection_ft_count', 'intersection_it_count']
multi_proc_merge(test, single_targets)

[2019-12-09 17:04:05,466] INFO in <ipython-input-46-ac86bc0b42a2>: merged freq feature, shape: (1141683, 12)
[2019-12-09 17:04:08,366] INFO in <ipython-input-46-ac86bc0b42a2>: merged score feature, shape: (1141683, 12)
[2019-12-09 17:04:10,151] INFO in <ipython-input-46-ac86bc0b42a2>: merged qid feature, shape: (1141683, 12)
[2019-12-09 17:04:11,779] INFO in <ipython-input-46-ac86bc0b42a2>: merged uf_b1 feature, shape: (1141683, 12)
[2019-12-09 17:04:12,513] INFO in <ipython-input-46-ac86bc0b42a2>: merged uf_b2 feature, shape: (1141683, 12)
[2019-12-09 17:04:14,657] INFO in <ipython-input-46-ac86bc0b42a2>: merged uid feature, shape: (1141683, 12)
[2019-12-09 17:04:16,954] INFO in <ipython-input-46-ac86bc0b42a2>: merged uf_b3 feature, shape: (1141683, 12)
[2019-12-09 17:04:19,655] INFO in <ipython-input-46-ac86bc0b42a2>: merged uf_b4 feature, shape: (1141683, 12)
[2019-12-09 17:04:21,284] INFO in <ipython-input-46-ac86bc0b42a2>: merged uf_b5 feature, shape: (1141683, 12)
[2019-12-09 17: