In [70]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [109]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

# import warnings
# warnings.filterwarnings('ignore')

In [110]:
base_path = './data'

In [111]:
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

[2019-11-29 09:34:13,046] INFO in <ipython-input-111-af34e33feb4f>: invite (9489162, 4)
[2019-11-29 09:34:14,521] INFO in <ipython-input-111-af34e33feb4f>: test (1141683, 3)


In [112]:
def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))


def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])

del train['dt'], test['dt']

In [113]:
# 加载问题
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
logging.info("ques %s", ques.shape)

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])

del ques['q_dt']

[2019-11-29 09:35:08,215] INFO in <ipython-input-113-9063dae39e17>: ques (1829900, 3)


In [114]:
# 加载回答
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
del ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

ans = pd.merge(ans, ques, on='qid', how='left')
del ques

# 回答距提问的天数
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']


[2019-11-29 09:36:28,525] INFO in <ipython-input-114-cdaa443c2e7e>: ans (4513735, 18)


In [115]:
# 4折统计
def fold_fn(x):
    if 3838<=x<=3846:
        return 0
    if 3847<=x<=3853:
        return 1
    if 3854<=x<=3860:
        return 2
    if 3861<=x<=3867:
        return 3
    else:
        return -1     # 更前的一个月

In [116]:
def extract_kfold_train_feature(data_df_, ans_df_): 
    train_df = data_df_.copy()
    ans_df = ans_df_.copy()
    logging.info("ans_df shape %s", ans_df.shape)
    
    train_df['fold'] = train_df['day'].apply(fold_fn)
    train_df_copy = train_df.copy()
    
    # 给 ans 加 fold 信息
    ans_df['fold'] = ans_df['a_day'].apply(fold_fn)
    logging.info("ans_df shape %s", ans_df.shape)
    
    extract_q_feat = ['q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count']
    extract_u_feat = ['u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count']
    a_feat = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
              'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
              'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']
    
    extract_a_feat = ['q_ans_kfold_count', 'u_ans_kfold_count']
    for col in a_feat:
        extract_a_feat += [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean', 
                           f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        
    extract_feat = extract_q_feat + extract_u_feat + extract_a_feat
    
    for feat in extract_feat:
        train_df[feat] = -10000
        
    for fold_ in range(4):
        logging.info("fold %s", fold_)
        
        log_trn = train_df_copy.loc[train_df_copy['fold'] != fold_]    # 提这些记录里的信息
        logging.info("log_trn shape %s", log_trn.shape)
        val_df = train_df_copy.loc[train_df_copy['fold'] == fold_]
        logging.info("val_df shape %s", val_df.shape)
        log_ans = ans_df.loc[ans_df['fold'] != fold_]  # 排除掉当前 fold 的 ans
        logging.info("log_ans shape %s", log_ans.shape)
        
        # ques
        logging.info("question info")
        t1 = log_trn.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
        t1.columns = ['qid'] + extract_q_feat   # 回答率,回答次数,标准差,邀请次数
        train_df.loc[train_df['fold']==fold_, extract_q_feat] = pd.merge(val_df, t1, on='qid', 
                                                                         how='left')[extract_q_feat].values

        # user
        logging.info("user info")
        t1 = log_trn.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
        t1.columns = ['uid'] + extract_u_feat
        train_df.loc[train_df['fold']==fold_, extract_u_feat] = pd.merge(val_df, t1, on='uid', 
                                                                         how='left')[extract_u_feat].values
        
        # ans
        ans_q_group = log_ans.groupby('qid')
        ans_u_group = log_ans.groupby('uid')
        
        logging.info("ans: q_ans_kfold_count")
        t1 = ans_q_group['aid'].count().reset_index()
        t1.columns = ['qid', 'q_ans_kfold_count']          # 某问题在 answer_info 中的回答次数
        train_df.loc[train_df['fold']==fold_, ['q_ans_kfold_count']] = pd.merge(val_df, t1, on='qid', 
                                                                                how='left')['q_ans_kfold_count'].values
        
        logging.info("ans: u_ans_kfold_count")
        t1 = ans_u_group['aid'].count().reset_index()
        t1.columns = ['uid', 'u_ans_kfold_count']          # 某用户在 answer_info 中的回答次数
        train_df.loc[train_df['fold']==fold_, ['u_ans_kfold_count']] = pd.merge(val_df, t1, on='uid', 
                                                                                how='left')['u_ans_kfold_count'].values
        
        for col in a_feat:
            logging.info("ans: %s sum max mean", col)
            
            t1 = ans_q_group[col].agg(['sum', 'max', 'mean']).reset_index()
            f_name = [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
            t1.columns = ['qid'] + f_name
            train_df.loc[train_df['fold']==fold_, f_name] = pd.merge(val_df, t1, on='qid', 
                                                                     how='left')[f_name].values
            
            t1 = ans_u_group[col].agg(['sum', 'max', 'mean']).reset_index()
            f_name = [f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
            t1.columns = ['uid'] + f_name
            train_df.loc[train_df['fold']==fold_, f_name] = pd.merge(val_df, t1, on='uid', 
                                                                     how='left')[f_name].values
            
    for feat in extract_feat:
        assert len(train_df[train_df[feat]==-10000]) == 0
    del train_df['fold']
    return train_df

In [117]:
train_kfold = extract_kfold_train_feature(train, ans)

[2019-11-29 09:36:52,089] INFO in <ipython-input-116-b81cb14d6795>: ans_df shape (4513735, 23)
[2019-11-29 09:37:06,690] INFO in <ipython-input-116-b81cb14d6795>: ans_df shape (4513735, 24)
[2019-11-29 09:39:11,751] INFO in <ipython-input-116-b81cb14d6795>: fold 0
[2019-11-29 09:39:12,750] INFO in <ipython-input-116-b81cb14d6795>: log_trn shape (7070010, 6)
[2019-11-29 09:39:13,126] INFO in <ipython-input-116-b81cb14d6795>: val_df shape (2419152, 6)
[2019-11-29 09:39:15,070] INFO in <ipython-input-116-b81cb14d6795>: log_ans shape (3796380, 24)
[2019-11-29 09:39:15,071] INFO in <ipython-input-116-b81cb14d6795>: question info
[2019-11-29 09:41:15,368] INFO in <ipython-input-116-b81cb14d6795>: user info
[2019-11-29 09:41:45,443] INFO in <ipython-input-116-b81cb14d6795>: ans: q_ans_kfold_count
[2019-11-29 09:41:56,927] INFO in <ipython-input-116-b81cb14d6795>: ans: u_ans_kfold_count
[2019-11-29 09:42:05,313] INFO in <ipython-input-116-b81cb14d6795>: ans: is_good sum max mean
[2019-11-29 09

[2019-11-29 10:03:53,843] INFO in <ipython-input-116-b81cb14d6795>: ans: reci_tks sum max mean
[2019-11-29 10:04:11,306] INFO in <ipython-input-116-b81cb14d6795>: ans: reci_xxx sum max mean
[2019-11-29 10:04:28,724] INFO in <ipython-input-116-b81cb14d6795>: ans: reci_no_help sum max mean
[2019-11-29 10:04:46,151] INFO in <ipython-input-116-b81cb14d6795>: ans: reci_dis sum max mean
[2019-11-29 10:05:03,631] INFO in <ipython-input-116-b81cb14d6795>: ans: diff_qa_days sum max mean


In [56]:
def extract_kfold_test_feature(test_df_, data_df_, ans_df_): 
    train_df = data_df_.copy()
    logging.info("train_df shape %s", train_df.shape)
    test_df = test_df_.copy()
    logging.info("test_df shape %s", test_df.shape)
    ans_df = ans_df_.copy()
    logging.info("ans_df shape %s", ans_df.shape)
    
    extract_q_feat = ['q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count']
    extract_u_feat = ['u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count']
#     a_feat = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
#               'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
#               'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']
    a_feat = ['diff_qa_days']
    
    extract_a_feat = ['q_ans_kfold_count', 'u_ans_kfold_count']
    for col in a_feat:
        extract_a_feat += [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean', 
                           f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        
    extract_feat = extract_q_feat + extract_u_feat + extract_a_feat

    # ques
    logging.info("question info")
    t1 = train_df.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['qid'] + extract_q_feat   # 回答率,回答次数,标准差,邀请次数
    test_df = pd.merge(test_df, t1, on='qid', how='left')

    # user
    logging.info("user info")
    t1 = train_df.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['uid'] + extract_u_feat
    test_df = pd.merge(test_df, t1, on='uid', how='left')

    # ans
    ans_q_group = ans_df.groupby('qid')
    ans_u_group = ans_df.groupby('uid')
    
    logging.info("ans: q_ans_kfold_count")
    t1 = ans_q_group['aid'].count().reset_index()
    t1.columns = ['qid', 'q_ans_kfold_count']          # 某问题在 answer_info 中的回答次数
    test_df = pd.merge(test_df, t1, on='qid', how='left')

    logging.info("ans: u_ans_kfold_count")
    t1 = ans_u_group['aid'].count().reset_index()
    t1.columns = ['uid', 'u_ans_kfold_count']          # 某用户在 answer_info 中的回答次数
    test_df = pd.merge(test_df, t1, on='uid', how='left')

    for col in a_feat:
        logging.info("ans: %s sum max mean", col)
        
        t1 = ans_q_group[col].agg(['sum', 'max', 'mean']).reset_index()
        f_name = [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
        t1.columns = ['qid'] + f_name
        test_df = pd.merge(test_df, t1, on='qid', how='left')

        t1 = ans_u_group[col].agg(['sum', 'max', 'mean']).reset_index()
        f_name = [f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        t1.columns = ['uid'] + f_name
        test_df = pd.merge(test_df, t1, on='uid', how='left')

    return test_df

In [57]:
test_kfold = extract_kfold_test_feature(test, train[['uid', 'qid', 'day', 'hour', 'label']], ans)

[2019-11-29 07:05:34,210] INFO in <ipython-input-56-a98a4be4d7c5>: train_df shape (9489162, 5)
[2019-11-29 07:05:34,293] INFO in <ipython-input-56-a98a4be4d7c5>: test_df shape (1141683, 4)
[2019-11-29 07:05:35,103] INFO in <ipython-input-56-a98a4be4d7c5>: ans_df shape (4513735, 23)
[2019-11-29 07:05:35,104] INFO in <ipython-input-56-a98a4be4d7c5>: question info
[2019-11-29 07:05:43,066] INFO in <ipython-input-56-a98a4be4d7c5>: user info
[2019-11-29 07:05:53,488] INFO in <ipython-input-56-a98a4be4d7c5>: ans: q_ans_kfold_count
[2019-11-29 07:06:01,030] INFO in <ipython-input-56-a98a4be4d7c5>: ans: u_ans_kfold_count
[2019-11-29 07:06:05,716] INFO in <ipython-input-56-a98a4be4d7c5>: ans: is_good sum max mean
[2019-11-29 07:06:11,178] INFO in <ipython-input-56-a98a4be4d7c5>: ans: is_rec sum max mean
[2019-11-29 07:06:16,381] INFO in <ipython-input-56-a98a4be4d7c5>: ans: is_dest sum max mean
[2019-11-29 07:06:21,467] INFO in <ipython-input-56-a98a4be4d7c5>: ans: has_img sum max mean
[2019-11

In [121]:
del train_kfold['uid'], train_kfold['qid'], train_kfold['label']

In [None]:
del test_kfold['uid'], test_kfold['qid']

In [122]:
# 压缩数据
t = train_kfold.dtypes
for x in t[t == 'int64'].index:
    train_kfold[x] = train_kfold[x].astype('int32')

for x in t[t == 'float64'].index:
    train_kfold[x] = train_kfold[x].astype('float32')

t = test_kfold.dtypes
for x in t[t == 'int64'].index:
    test_kfold[x] = test_kfold[x].astype('int32')

for x in t[t == 'float64'].index:
    test_kfold[x] = test_kfold[x].astype('float32')

In [123]:
train_kfold.to_csv('feature/train_kfold_feature.txt', index=False, sep='\t')

In [None]:
test_kfold.to_csv('feature/test_kfold_feature.txt', index=False, sep='\t')