In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [2]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

# import warnings
# warnings.filterwarnings('ignore')

In [3]:
def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))


def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

In [4]:
base_path = './data'

In [5]:
# 加载邀请回答数据

train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

sub = test.copy()

sub_size = len(sub)

train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])

del train['dt'], test['dt']

[2019-11-28 12:22:14,036] INFO in <ipython-input-5-d8d667ebe145>: invite (9489162, 4)
[2019-11-28 12:22:15,656] INFO in <ipython-input-5-d8d667ebe145>: test (1141683, 3)


In [6]:
# 加载问题
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
logging.info("ques %s", ques.shape)

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])

del ques['q_dt']

[2019-11-28 12:23:10,829] INFO in <ipython-input-6-9063dae39e17>: ques (1829900, 3)


In [7]:
# 加载回答
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
del ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

ans = pd.merge(ans, ques, on='qid', how='left')
del ques

# 回答距提问的天数
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']


[2019-11-28 12:24:27,491] INFO in <ipython-input-7-cdaa443c2e7e>: ans (4513735, 18)


In [8]:
len(ans)

4513735

In [10]:
def extract_kfold_train_feature(data_df_, ans_df_): 
    train_df = data_df_.copy()
    train_df_copy = train_df.copy()
    ans_df = ans_df_.copy()
    logging.info("ans_df shape %s", ans_df.shape)
    
    train_df['fold'] = None
    folds = KFold(n_splits=5, shuffle=True, random_state=1989)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df)):
        train_df.loc[val_idx, 'fold'] = fold_
    
    # 给 ans 加 fold 信息
    t1 = train_df[['uid', 'qid', 'fold']].drop_duplicates(subset=['uid', 'qid'], keep='last', inplace=False)
    ans_df = pd.merge(ans_df, t1, on=['uid', 'qid'], how='left')
    logging.info("ans_df shape %s", ans_df.shape)
    
    extract_q_feat = ['q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count']
    extract_u_feat = ['u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count']
    a_feat = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
              'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
              'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']
    
    extract_a_feat = ['q_ans_kfold_count', 'u_ans_kfold_count']
    for col in a_feat:
        extract_a_feat += [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean', 
                           f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        
    extract_feat = extract_q_feat + extract_u_feat + extract_a_feat
    
    for feat in extract_feat:
        train_df[feat] = -10000
        
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df)):
        logging.info("fold %s", fold_)
        
        log_trn = train_df_copy.loc[trn_idx]    # 提这些记录里的信息
        logging.info("log_trn shape %s", log_trn.shape)
        val_df = train_df_copy.loc[val_idx]
        logging.info("val_df shape %s", val_df.shape)
        log_ans = ans_df.loc[ans_df['fold'] != fold_]  # 排除掉当前 fold 的 ans
        logging.info("log_ans shape %s", log_ans.shape)
        
        # ques
        logging.info("question info")
        t1 = log_trn.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
        t1.columns = ['qid'] + extract_q_feat   # 回答率,回答次数,标准差,邀请次数
        train_df.loc[val_idx, extract_q_feat] = pd.merge(val_df, t1, on='qid', how='left')[extract_q_feat].values

        # user
        logging.info("user info")
        t1 = log_trn.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
        t1.columns = ['uid'] + extract_u_feat
        train_df.loc[val_idx, extract_u_feat] = pd.merge(val_df, t1, on='uid', how='left')[extract_u_feat].values
        
        # ans
        ans_q_group = log_ans.groupby('qid')
        ans_u_group = log_ans.groupby('uid')
        
        logging.info("ans: q_ans_kfold_count")
        t1 = ans_q_group['aid'].count().reset_index()
        t1.columns = ['qid', 'q_ans_kfold_count']          # 某问题在 answer_info 中的回答次数
        train_df.loc[val_idx, ['q_ans_kfold_count']] = pd.merge(val_df, t1, 
                                                                on='qid', how='left')['q_ans_kfold_count'].values
        
        logging.info("ans: u_ans_kfold_count")
        t1 = ans_u_group['aid'].count().reset_index()
        t1.columns = ['uid', 'u_ans_kfold_count']          # 某用户在 answer_info 中的回答次数
        train_df.loc[val_idx, ['u_ans_kfold_count']] = pd.merge(val_df, t1, 
                                                                on='uid', how='left')['u_ans_kfold_count'].values
        
        for col in a_feat:
            logging.info("ans: %s sum max mean", col)
            
            t1 = ans_q_group[col].agg(['sum', 'max', 'mean']).reset_index()
            f_name = [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
            t1.columns = ['qid'] + f_name
            train_df.loc[val_idx, f_name] = pd.merge(val_df, t1, on='qid', how='left')[f_name].values
            
            t1 = ans_u_group[col].agg(['sum', 'max', 'mean']).reset_index()
            f_name = [f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
            t1.columns = ['uid'] + f_name
            train_df.loc[val_idx, f_name] = pd.merge(val_df, t1, on='uid', how='left')[f_name].values
            
    for feat in extract_feat:
        assert len(train_df[train_df[feat]==-10000]) == 0
    del train_df['fold']
    return train_df

In [11]:
train_kfold = extract_kfold_train_feature(train, ans)

[2019-11-28 12:33:53,044] INFO in <ipython-input-10-19cc53cc0914>: ans_df shape (4513735, 23)
[2019-11-28 12:34:23,808] INFO in <ipython-input-10-19cc53cc0914>: ans_df shape (4513735, 24)
[2019-11-28 12:34:33,036] INFO in <ipython-input-10-19cc53cc0914>: fold 0
[2019-11-28 12:34:33,887] INFO in <ipython-input-10-19cc53cc0914>: log_trn shape (7591329, 5)
[2019-11-28 12:34:34,178] INFO in <ipython-input-10-19cc53cc0914>: val_df shape (1897833, 5)
[2019-11-28 12:34:36,273] INFO in <ipython-input-10-19cc53cc0914>: log_ans shape (4214053, 24)
[2019-11-28 12:34:36,274] INFO in <ipython-input-10-19cc53cc0914>: question info
[2019-11-28 12:35:10,465] INFO in <ipython-input-10-19cc53cc0914>: user info
[2019-11-28 12:35:38,393] INFO in <ipython-input-10-19cc53cc0914>: ans: q_ans_kfold_count
[2019-11-28 12:35:49,341] INFO in <ipython-input-10-19cc53cc0914>: ans: u_ans_kfold_count
[2019-11-28 12:35:57,906] INFO in <ipython-input-10-19cc53cc0914>: ans: is_good sum max mean
[2019-11-28 12:36:31,191]

[2019-11-28 12:55:20,608] INFO in <ipython-input-10-19cc53cc0914>: ans: reci_tks sum max mean
[2019-11-28 12:55:34,359] INFO in <ipython-input-10-19cc53cc0914>: ans: reci_xxx sum max mean
[2019-11-28 12:55:48,449] INFO in <ipython-input-10-19cc53cc0914>: ans: reci_no_help sum max mean
[2019-11-28 12:56:02,269] INFO in <ipython-input-10-19cc53cc0914>: ans: reci_dis sum max mean
[2019-11-28 12:56:16,299] INFO in <ipython-input-10-19cc53cc0914>: ans: diff_qa_days sum max mean
[2019-11-28 12:56:30,205] INFO in <ipython-input-10-19cc53cc0914>: fold 4
[2019-11-28 12:56:31,515] INFO in <ipython-input-10-19cc53cc0914>: log_trn shape (7591330, 5)
[2019-11-28 12:56:31,800] INFO in <ipython-input-10-19cc53cc0914>: val_df shape (1897832, 5)
[2019-11-28 12:56:32,725] INFO in <ipython-input-10-19cc53cc0914>: log_ans shape (4213254, 24)
[2019-11-28 12:56:32,726] INFO in <ipython-input-10-19cc53cc0914>: question info
[2019-11-28 12:56:46,513] INFO in <ipython-input-10-19cc53cc0914>: user info
[2019-11

In [12]:
def extract_kfold_test_feature(test_df_, data_df_, ans_df_): 
    train_df = data_df_.copy()
    logging.info("train_df shape %s", train_df.shape)
    test_df = test_df_.copy()
    logging.info("test_df shape %s", test_df.shape)
    ans_df = ans_df_.copy()
    logging.info("ans_df shape %s", ans_df.shape)
    
    extract_q_feat = ['q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count']
    extract_u_feat = ['u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count']
    a_feat = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
              'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
              'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']
    
    extract_a_feat = ['q_ans_kfold_count', 'u_ans_kfold_count']
    for col in a_feat:
        extract_a_feat += [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean', 
                           f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        
    extract_feat = extract_q_feat + extract_u_feat + extract_a_feat

    # ques
    logging.info("question info")
    t1 = train_df.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['qid'] + extract_q_feat   # 回答率,回答次数,标准差,邀请次数
    test_df = pd.merge(test_df, t1, on='qid', how='left')

    # user
    logging.info("user info")
    t1 = train_df.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['uid'] + extract_u_feat
    test_df = pd.merge(test_df, t1, on='uid', how='left')

    # ans
    ans_q_group = ans_df.groupby('qid')
    ans_u_group = ans_df.groupby('uid')
    
    logging.info("ans: q_ans_kfold_count")
    t1 = ans_q_group['aid'].count().reset_index()
    t1.columns = ['qid', 'q_ans_kfold_count']          # 某问题在 answer_info 中的回答次数
    test_df = pd.merge(test_df, t1, on='qid', how='left')

    logging.info("ans: u_ans_kfold_count")
    t1 = ans_u_group['aid'].count().reset_index()
    t1.columns = ['uid', 'u_ans_kfold_count']          # 某用户在 answer_info 中的回答次数
    test_df = pd.merge(test_df, t1, on='uid', how='left')

    for col in a_feat:
        logging.info("ans: %s sum max mean", col)
        
        t1 = ans_q_group[col].agg(['sum', 'max', 'mean']).reset_index()
        f_name = [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
        t1.columns = ['qid'] + f_name
        test_df = pd.merge(test_df, t1, on='qid', how='left')

        t1 = ans_u_group[col].agg(['sum', 'max', 'mean']).reset_index()
        f_name = [f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        t1.columns = ['uid'] + f_name
        test_df = pd.merge(test_df, t1, on='uid', how='left')

    return test_df

In [13]:
test_kfold = extract_kfold_test_feature(test, train[['uid', 'qid', 'day', 'hour', 'label']], ans)

[2019-11-28 13:00:45,992] INFO in <ipython-input-12-a98a4be4d7c5>: train_df shape (9489162, 5)
[2019-11-28 13:00:46,062] INFO in <ipython-input-12-a98a4be4d7c5>: test_df shape (1141683, 4)
[2019-11-28 13:00:46,398] INFO in <ipython-input-12-a98a4be4d7c5>: ans_df shape (4513735, 23)
[2019-11-28 13:00:46,399] INFO in <ipython-input-12-a98a4be4d7c5>: question info
[2019-11-28 13:00:54,155] INFO in <ipython-input-12-a98a4be4d7c5>: user info
[2019-11-28 13:01:04,022] INFO in <ipython-input-12-a98a4be4d7c5>: ans: q_ans_kfold_count
[2019-11-28 13:01:10,880] INFO in <ipython-input-12-a98a4be4d7c5>: ans: u_ans_kfold_count
[2019-11-28 13:01:15,302] INFO in <ipython-input-12-a98a4be4d7c5>: ans: is_good sum max mean
[2019-11-28 13:01:20,339] INFO in <ipython-input-12-a98a4be4d7c5>: ans: is_rec sum max mean
[2019-11-28 13:01:25,172] INFO in <ipython-input-12-a98a4be4d7c5>: ans: is_dest sum max mean
[2019-11-28 13:01:30,013] INFO in <ipython-input-12-a98a4be4d7c5>: ans: has_img sum max mean
[2019-11

In [22]:
# todo: 考虑在训练集中,人为补 nan
# test_kfold['q_inv_kfold_mean'].count()

354695

In [53]:
# origin
origin = len(train.qid.drop_duplicates())
print(origin)
origin2 = len(test.qid.drop_duplicates())
print(origin2)
temp = pd.concat([train.qid,test.qid])
sum2 = len(temp.drop_duplicates())
print(sum2)
print(sum2-origin)

926203
237167
1083727
157524


In [58]:
t1 = train.groupby('qid')['day'].agg(['mean', 'std', 'count'])
t1[:60]

Unnamed: 0_level_0,mean,std,count
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q1000002524,3861.0,0.0,2
Q1000006560,3839.0,,1
Q1000007604,3840.5,1.772811,8
Q1000008276,3863.142857,2.734262,7
Q1000010200,3857.0,,1
Q1000025766,3864.0,,1
Q100002777,3846.461538,1.126601,13
Q1000036156,3842.0,,1
Q1000041691,3838.0,,1
Q1000049860,3866.0,1.0,3


In [59]:
t2 = train.groupby('uid')['day'].agg(['mean', 'std', 'count'])
t2[:60]

Unnamed: 0_level_0,mean,std,count
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M1000000382,3855.75,9.982127,8
M1000000983,3853.071429,5.341626,14
M1000003304,3852.0,,1
M1000008978,3854.333333,5.809475,9
M1000009571,3865.0,,1
M1000009786,3853.6,2.408319,5
M1000010662,3842.0,,1
M1000020034,3857.666667,6.350853,3
M1000022555,3853.2,8.700575,5
M1000024720,3843.619048,4.779918,21


In [23]:
# test_kfold['u_inv_kfold_mean'].count()

1057029

In [27]:
del train_kfold['uid'], train_kfold['qid'], train_kfold['label']
del test_kfold['uid'], test_kfold['qid']

In [37]:
# 压缩数据
t = train_kfold.dtypes
for x in t[t == 'int64'].index:
    train_kfold[x] = train_kfold[x].astype('int32')
    test_kfold[x] = test_kfold[x].astype('int32')

for x in t[t == 'float64'].index:
    train_kfold[x] = train_kfold[x].astype('float32')
    test_kfold[x] = test_kfold[x].astype('float32')

In [41]:
train_kfold.to_csv('feature/train_kfold_feature.txt', index=False, sep='\t')
test_kfold.to_csv('feature/test_kfold_feature.txt', index=False, sep='\t')

In [42]:
train_kfold.columns

Index(['day', 'hour', 'q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std',
       'q_inv_kfold_count', 'u_inv_kfold_mean', 'u_inv_kfold_sum',
       'u_inv_kfold_std', 'u_inv_kfold_count',
       ...
       'q_reci_dis_mean', 'u_reci_dis_sum', 'u_reci_dis_max',
       'u_reci_dis_mean', 'q_diff_qa_days_sum', 'q_diff_qa_days_max',
       'q_diff_qa_days_mean', 'u_diff_qa_days_sum', 'u_diff_qa_days_max',
       'u_diff_qa_days_mean'],
      dtype='object', length=102)