In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [4]:
# Configure the root logger once for the whole notebook: every record shows
# timestamp, level, originating module and message, at INFO level and above.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

# Optional: silence library warnings (currently disabled).
# import warnings
# warnings.filterwarnings('ignore')

In [5]:
# Directory containing the raw tab-separated input files read by the cells below.
base_path = './data'

In [6]:
# Invitation records used for training (tab-separated, no header row).
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

# First evaluation set: loading is currently disabled.
# test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
# test.columns = ['qid', 'uid', 'dt']
# logging.info("test %s", test.shape)

# Second evaluation set: same columns as `train` minus 'label'.
test2 = pd.read_csv(f'{base_path}/invite_info_evaluate_2_0926.txt', sep='\t', header=None)
test2.columns = ['qid', 'uid', 'dt']
logging.info("test2 %s", test2.shape)

[2019-12-17 08:01:25,936] INFO in <ipython-input-6-7433c7333c71>: invite (9489162, 4)
[2019-12-17 08:01:27,095] INFO in <ipython-input-6-7433c7333c71>: test2 (1141718, 3)


In [7]:
def extract_day(s):
    """Parse the day number out of every timestamp string in Series `s`.

    For each value, the text before the first '-' is taken, its leading
    character is dropped and the remainder is converted to int
    (presumably values look like 'D3865-H22' -- format inferred from the parsing).
    """
    def _day_of(stamp):
        day_token = stamp.split('-')[0]
        return int(day_token[1:])

    return s.apply(_day_of)


def extract_hour(s):
    """Parse the hour number out of every timestamp string in Series `s`.

    For each value, the second '-'-separated field is taken, its leading
    character is dropped and the remainder is converted to int
    (presumably values look like 'D3865-H22' -- format inferred from the parsing).
    """
    def _hour_of(stamp):
        hour_token = stamp.split('-')[1]
        return int(hour_token[1:])

    return s.apply(_hour_of)

# Split the raw 'dt' string of each invitation frame into integer 'day' and
# 'hour' columns; the raw column is dropped once both have been derived.
train['day'], train['hour'] = extract_day(train['dt']), extract_hour(train['dt'])

# First evaluation set (disabled together with its loading code):
# test['day'] = extract_day(test['dt'])
# test['hour'] = extract_hour(test['dt'])
# del train['dt'], test['dt']

test2['day'], test2['hour'] = extract_day(test2['dt']), extract_hour(test2['dt'])

del train['dt']
del test2['dt']

In [8]:
# Load questions
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
# The title / description columns are not used in this notebook.
ques = ques.drop(columns=['title_t1', 'title_t2', 'desc_t1', 'desc_t2'])
logging.info("ques %s", ques.shape)

# Replace the raw creation timestamp by integer day / hour columns.
ques = ques.assign(
    q_day=extract_day(ques['q_dt']),
    q_hour=extract_hour(ques['q_dt']),
).drop(columns=['q_dt'])

[2019-12-17 08:02:32,443] INFO in <ipython-input-8-9063dae39e17>: ques (1829900, 3)


In [9]:
# Load answers
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
# The two answer-text columns are not used in this notebook.
ans = ans.drop(columns=['ans_t1', 'ans_t2'])
logging.info("ans %s", ans.shape)

# Replace the raw answer timestamp by integer day / hour columns.
ans = ans.assign(
    a_day=extract_day(ans['ans_dt']),
    a_hour=extract_hour(ans['ans_dt']),
).drop(columns=['ans_dt'])

# Attach the question columns (topic, q_day, q_hour) to every answer; the
# standalone question frame is released afterwards.
ans = ans.merge(ques, on='qid', how='left')
del ques

# Number of days between the question and the answer
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']


[2019-12-17 08:04:45,241] INFO in <ipython-input-9-cdaa443c2e7e>: ans (4513735, 18)


In [None]:
# 4-fold statistics
def fold_fn(x):
    """Map a day number to its fold id.

    Days 3838-3846 -> 0, 3847-3853 -> 1, 3854-3860 -> 2, 3861-3867 -> 3.
    Any other value -> -1 (original note: the earlier month).
    """
    fold_windows = ((3838, 3846), (3847, 3853), (3854, 3860), (3861, 3867))
    for fold_id, (first_day, last_day) in enumerate(fold_windows):
        if first_day <= x <= last_day:
            return fold_id
    return -1

In [None]:
def _fill_fold_features(target_df, fold_rows, fold_keys, stats, key, feat_names):
    """Left-join per-`key` statistics onto one fold and store them in `target_df`.

    target_df  : frame receiving the values (modified in place).
    fold_rows  : boolean mask selecting the fold's rows in `target_df`.
    fold_keys  : key columns of exactly those rows, in the same order.
    stats      : groupby result indexed by `key` (Series or DataFrame).
    key        : join column name ('qid' or 'uid').
    feat_names : list of output column names, one per statistic in `stats`.

    Keys that have no statistics receive NaN.
    """
    lookup = stats.reset_index()
    lookup.columns = [key] + feat_names
    # `lookup` has one row per key, so the left merge keeps the fold's row
    # order and count; the values therefore line up with the mask positions.
    joined = pd.merge(fold_keys[[key]], lookup, on=key, how='left')
    target_df.loc[fold_rows, feat_names] = joined[feat_names].values


def extract_kfold_train_feature(data_df_, ans_df_):
    """Compute out-of-fold statistical features for the training invitations.

    Every row of `data_df_` is assigned to one of 4 folds by `fold_fn(day)`.
    For the rows of fold k the features are computed only from records that are
    NOT in fold k:
      * invitations: mean / sum / std / count of `label` per qid and per uid;
      * answers (`ans_df_`, folded by `a_day`): answer count per qid and per
        uid, plus sum / max / mean of each answer column per qid and per uid.

    Parameters
    ----------
    data_df_ : DataFrame with at least 'qid', 'uid', 'day', 'label'.
    ans_df_  : DataFrame with 'aid', 'qid', 'uid', 'a_day' and the aggregated
               answer columns listed in `a_feat` below.

    Returns
    -------
    A copy of `data_df_` with the feature columns appended. Feature columns
    are always float64; keys unseen outside the fold yield NaN. The inputs are
    not modified.

    Raises
    ------
    AssertionError if a row's day falls outside every fold window (such a row
    could never receive features), or if a placeholder value survives.
    """
    n_folds = 4
    # Float placeholder: the statistics written over it are floats / NaN, so an
    # integer placeholder column would have to be silently upcast on assignment
    # (deprecated since pandas 2.1).
    placeholder = -10000.0

    train_df = data_df_.copy()
    ans_df = ans_df_.copy()
    logging.info("ans_df shape %s", ans_df.shape)

    train_df['fold'] = train_df['day'].apply(fold_fn)

    # Fail fast: rows outside the fold windows are never filled, which would
    # otherwise only be detected by the final check after all folds were computed.
    n_unassigned = int((~train_df['fold'].isin(list(range(n_folds)))).sum())
    assert n_unassigned == 0, (
        f"{n_unassigned} rows have a day outside the {n_folds} fold windows"
    )

    # Narrow snapshot of the columns needed for grouping / joining, taken before
    # the feature columns are added.
    inv_keys = train_df[['qid', 'uid', 'label', 'fold']]

    # Add fold info to the answers
    ans_df['fold'] = ans_df['a_day'].apply(fold_fn)
    logging.info("ans_df shape %s", ans_df.shape)

    extract_q_feat = ['q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count']
    extract_u_feat = ['u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count']
    a_feat = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
              'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
              'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']

    extract_a_feat = ['q_ans_kfold_count', 'u_ans_kfold_count']
    for col in a_feat:
        extract_a_feat += [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean',
                           f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']

    extract_feat = extract_q_feat + extract_u_feat + extract_a_feat

    for feat in extract_feat:
        train_df[feat] = placeholder

    label_stats = ['mean', 'sum', 'std', 'count']
    ans_stats = ['sum', 'max', 'mean']

    for fold_ in range(n_folds):
        logging.info("fold %s", fold_)

        in_fold = inv_keys['fold'] == fold_
        log_trn = inv_keys.loc[inv_keys['fold'] != fold_]    # records the statistics are drawn from
        logging.info("log_trn shape %s", log_trn.shape)
        val_df = inv_keys.loc[in_fold]                       # rows that receive the statistics
        logging.info("val_df shape %s", val_df.shape)
        log_ans = ans_df.loc[ans_df['fold'] != fold_]        # exclude the current fold's answers
        logging.info("log_ans shape %s", log_ans.shape)

        # ques: answer rate, number of answers, std, number of invitations
        logging.info("question info")
        _fill_fold_features(train_df, in_fold, val_df,
                            log_trn.groupby('qid')['label'].agg(label_stats),
                            'qid', extract_q_feat)

        # user
        logging.info("user info")
        _fill_fold_features(train_df, in_fold, val_df,
                            log_trn.groupby('uid')['label'].agg(label_stats),
                            'uid', extract_u_feat)

        # ans
        ans_q_group = log_ans.groupby('qid')
        ans_u_group = log_ans.groupby('uid')

        # number of answers to a question in answer_info
        logging.info("ans: q_ans_kfold_count")
        _fill_fold_features(train_df, in_fold, val_df,
                            ans_q_group['aid'].count(), 'qid', ['q_ans_kfold_count'])

        # number of answers by a user in answer_info
        logging.info("ans: u_ans_kfold_count")
        _fill_fold_features(train_df, in_fold, val_df,
                            ans_u_group['aid'].count(), 'uid', ['u_ans_kfold_count'])

        for col in a_feat:
            logging.info("ans: %s sum max mean", col)

            _fill_fold_features(train_df, in_fold, val_df,
                                ans_q_group[col].agg(ans_stats), 'qid',
                                [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean'])

            _fill_fold_features(train_df, in_fold, val_df,
                                ans_u_group[col].agg(ans_stats), 'uid',
                                [f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean'])

    # Every row must have been overwritten by exactly one fold.
    for feat in extract_feat:
        assert not (train_df[feat] == placeholder).any(), \
            f"feature {feat!r} still contains the placeholder value"
    del train_df['fold']
    return train_df

In [None]:
# Out-of-fold statistical features for the training invitations.
train_kfold = extract_kfold_train_feature(train, ans)

In [10]:
def _merge_group_stats(frame, stats, key, feat_names):
    """Left-join per-`key` statistics (a groupby result) onto `frame` as `feat_names`."""
    lookup = stats.reset_index()
    lookup.columns = [key] + list(feat_names)
    return pd.merge(frame, lookup, on=key, how='left')


def extract_kfold_test_feature(test_df_, data_df_, ans_df_, a_feat=None):
    """Compute statistical features for an evaluation set from the full history.

    Unlike the train-side extraction there is no fold split: statistics are
    computed from ALL rows of `data_df_` and `ans_df_` and left-joined onto
    `test_df_`:
      * invitations: mean / sum / std / count of `label` per qid and per uid;
      * answers: answer count per qid and per uid, plus sum / max / mean of each
        column in `a_feat` per qid and per uid.

    Parameters
    ----------
    test_df_ : DataFrame with at least 'qid' and 'uid'.
    data_df_ : labelled invitations with 'qid', 'uid', 'label'.
    ans_df_  : answers with 'aid', 'qid', 'uid' and every column in `a_feat`.
    a_feat   : answer columns to aggregate. Defaults to ['diff_qa_days'].
               NOTE(review): `extract_kfold_train_feature` aggregates 15 answer
               columns; pass the same list here if train and test must have the
               same feature columns.

    Returns
    -------
    A new DataFrame: the columns of `test_df_` followed by the feature columns.
    Keys without history yield NaN. None of the inputs is modified, so no
    defensive copies are taken.
    """
    a_feat = ['diff_qa_days'] if a_feat is None else list(a_feat)

    train_df = data_df_
    logging.info("train_df shape %s", train_df.shape)
    test_df = test_df_
    logging.info("test_df shape %s", test_df.shape)
    ans_df = ans_df_
    logging.info("ans_df shape %s", ans_df.shape)

    extract_q_feat = ['q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count']
    extract_u_feat = ['u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count']
    label_stats = ['mean', 'sum', 'std', 'count']
    ans_stats = ['sum', 'max', 'mean']

    # ques: answer rate, number of answers, std, number of invitations
    logging.info("question info")
    test_df = _merge_group_stats(test_df, train_df.groupby('qid')['label'].agg(label_stats),
                                 'qid', extract_q_feat)

    # user
    logging.info("user info")
    test_df = _merge_group_stats(test_df, train_df.groupby('uid')['label'].agg(label_stats),
                                 'uid', extract_u_feat)

    # ans
    ans_q_group = ans_df.groupby('qid')
    ans_u_group = ans_df.groupby('uid')

    # number of answers to a question in answer_info
    logging.info("ans: q_ans_kfold_count")
    test_df = _merge_group_stats(test_df, ans_q_group['aid'].count(), 'qid', ['q_ans_kfold_count'])

    # number of answers by a user in answer_info
    logging.info("ans: u_ans_kfold_count")
    test_df = _merge_group_stats(test_df, ans_u_group['aid'].count(), 'uid', ['u_ans_kfold_count'])

    for col in a_feat:
        logging.info("ans: %s sum max mean", col)

        test_df = _merge_group_stats(test_df, ans_q_group[col].agg(ans_stats), 'qid',
                                     [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean'])

        test_df = _merge_group_stats(test_df, ans_u_group[col].agg(ans_stats), 'uid',
                                     [f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean'])

    return test_df

In [None]:
# The first evaluation set (`test`) only exists when its loading code is
# enabled; skip this step instead of failing with NameError so the notebook
# still runs top to bottom.
if 'test' in globals():
    test_kfold = extract_kfold_test_feature(test, train[['uid', 'qid', 'day', 'hour', 'label']], ans)
else:
    logging.info("skip test_kfold: first evaluation set is not loaded")

In [11]:
# Features for the second evaluation set, computed from all labelled
# invitations and all answers (no fold split on the test side).
test_kfold_2 = extract_kfold_test_feature(test2, train[['uid', 'qid', 'day', 'hour', 'label']], ans)

[2019-12-17 08:05:30,220] INFO in <ipython-input-10-28b7c91e998f>: train_df shape (9489162, 5)
[2019-12-17 08:05:30,383] INFO in <ipython-input-10-28b7c91e998f>: test_df shape (1141718, 4)
[2019-12-17 08:05:33,421] INFO in <ipython-input-10-28b7c91e998f>: ans_df shape (4513735, 23)
[2019-12-17 08:05:33,423] INFO in <ipython-input-10-28b7c91e998f>: question info
[2019-12-17 08:05:48,712] INFO in <ipython-input-10-28b7c91e998f>: user info
[2019-12-17 08:06:08,240] INFO in <ipython-input-10-28b7c91e998f>: ans: q_ans_kfold_count
[2019-12-17 08:06:22,613] INFO in <ipython-input-10-28b7c91e998f>: ans: u_ans_kfold_count
[2019-12-17 08:06:31,500] INFO in <ipython-input-10-28b7c91e998f>: ans: diff_qa_days sum max mean


In [None]:
# Drop the identifier and label columns so that only the remaining input
# columns and the engineered features are kept in the training feature frame.
del train_kfold['uid'], train_kfold['qid'], train_kfold['label']

In [None]:
# `test_kfold` only exists when the first evaluation set has been loaded and
# processed; skip instead of failing with NameError otherwise.
if 'test_kfold' in globals():
    del test_kfold['uid'], test_kfold['qid']

In [12]:
# Compress data: downcast 64-bit numeric columns to their 32-bit counterparts.
# (The train_kfold / test_kfold variants below are disabled.)
# t = train_kfold.dtypes
# for x in t[t == 'int64'].index:
#     train_kfold[x] = train_kfold[x].astype('int32')

# for x in t[t == 'float64'].index:
#     train_kfold[x] = train_kfold[x].astype('float32')

# t = test_kfold.dtypes
# for x in t[t == 'int64'].index:
#     test_kfold[x] = test_kfold[x].astype('int32')

# for x in t[t == 'float64'].index:
#     test_kfold[x] = test_kfold[x].astype('float32')

# New test set
t = test_kfold_2.dtypes  # dtypes snapshot taken once, before any column is converted
for wide_dtype, narrow_dtype in (('int64', 'int32'), ('float64', 'float32')):
    for x in t[t == wide_dtype].index:
        test_kfold_2[x] = test_kfold_2[x].astype(narrow_dtype)

In [None]:
# Persist the training features as a tab-separated file. The output directory
# is created first so the write does not fail when 'feature/' is missing.
os.makedirs('feature', exist_ok=True)
train_kfold.to_csv('feature/train_kfold_feature.txt', index=False, sep='\t')

In [None]:
# Persist the first evaluation set's features, if they were computed.
# `test_kfold` only exists when the first evaluation set has been loaded; the
# output directory is created first so the write does not fail when it is missing.
if 'test_kfold' in globals():
    os.makedirs('feature', exist_ok=True)
    test_kfold.to_csv('feature/test_kfold_feature.txt', index=False, sep='\t')

In [13]:
# Persist the second evaluation set's features as a tab-separated file. The
# output directory is created first so the write does not fail when it is missing.
# NOTE(review): unlike train_kfold / test_kfold, no visible cell drops 'uid' / 'qid'
# from test_kfold_2 before saving -- confirm that keeping them is intended.
os.makedirs('feature', exist_ok=True)
test_kfold_2.to_csv('feature/test2_kfold_feature.txt', index=False, sep='\t')