In [70]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [71]:
# Log line layout: timestamp, level, originating module, then the message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)

# import warnings
# warnings.filterwarnings('ignore')

In [72]:
# Directory the raw tab-separated input files are read from in the cells below.
base_path = './data'

In [73]:
# Labelled invitations: question id, user id, invitation time, label.
train = pd.read_csv(
    f'{base_path}/invite_info_0926.txt',
    sep='\t',
    header=None,
    names=['qid', 'uid', 'dt', 'label'],
)
logging.info("invite %s", train.shape)

# Invitations to score: same layout without the label column.
test = pd.read_csv(
    f'{base_path}/invite_info_evaluate_0926.txt',
    sep='\t',
    header=None,
    names=['qid', 'uid', 'dt'],
)
logging.info("test %s", test.shape)

[2019-11-29 08:15:57,625] INFO in <ipython-input-73-af34e33feb4f>: invite (9489162, 4)
[2019-11-29 08:15:59,626] INFO in <ipython-input-73-af34e33feb4f>: test (1141683, 3)


In [74]:
def extract_day(s):
    """Return the integer day encoded in each timestamp string of ``s``.

    The day is the part before the first '-' with its leading character
    dropped, converted to ``int``.
    """
    def _day_of(stamp):
        day_token = stamp.split('-')[0]
        return int(day_token[1:])

    return s.apply(_day_of)


def extract_hour(s):
    """Return the integer hour encoded in each timestamp string of ``s``.

    The hour is the second '-'-separated part with its leading character
    dropped, converted to ``int``.
    """
    def _hour_of(stamp):
        hour_token = stamp.split('-')[1]
        return int(hour_token[1:])

    return s.apply(_hour_of)

# Split the raw timestamp into numeric day/hour columns, then drop it.
for frame in (train, test):
    frame['day'] = extract_day(frame['dt'])
    frame['hour'] = extract_hour(frame['dt'])

for frame in (train, test):
    del frame['dt']

In [75]:
# Load questions.
# The title/description columns are never used, so they are skipped at parse
# time (usecols) instead of being read into memory and deleted afterwards.
ques = pd.read_csv(
    f'{base_path}/question_info_0926.txt',
    header=None,
    sep='\t',
    names=['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic'],
    usecols=['qid', 'q_dt', 'topic'],
)
logging.info("ques %s", ques.shape)

# Question creation time as numeric day / hour.
ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])

del ques['q_dt']

[2019-11-29 08:17:17,347] INFO in <ipython-input-75-9063dae39e17>: ques (1829900, 3)


In [76]:
# Load answers.
# The two answer-text columns are never used, so they are skipped at parse
# time (usecols) instead of being read into memory and deleted afterwards.
ans_columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
ans = pd.read_csv(
    f'{base_path}/answer_info_0926.txt',
    header=None,
    sep='\t',
    names=ans_columns,
    usecols=[c for c in ans_columns if c not in ('ans_t1', 'ans_t2')],
)
logging.info("ans %s", ans.shape)

# Answer creation time as numeric day / hour.
ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

# Attach the question's topic and creation time to each answer; the question
# table is not needed on its own afterwards.
ans = pd.merge(ans, ques, on='qid', how='left')
del ques

# Days between the question being asked and the answer being written.
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']


[2019-11-29 08:19:00,549] INFO in <ipython-input-76-cdaa443c2e7e>: ans (4513735, 18)


In [78]:
# Number of training invitations per day, ordered by day.
t = train['day'].value_counts().sort_index()
t

3838    270391
3839    264301
3840    278456
3841    275574
3842    284752
3843    268168
3844    263967
3845    235093
3846    278450
3847    329741
3848    335248
3849    267149
3850    339722
3851    325129
3852    319622
3853    315867
3854    318553
3855    323729
3856    316613
3857    336136
3858    317410
3859    306501
3860    324921
3861    362485
3862    383377
3863    368015
3864    373872
3865    403985
3866    353989
3867    347946
Name: day, dtype: int64

In [91]:
# Running total of invitations over the fold windows checked so far.
t1 = 0
# Invitations on days 3838-3846 (label slice, both ends inclusive).
t2 = t.loc[3838:3846].sum()
t1 += t2
t2

2419152

In [92]:
# Invitations on days 3847-3853 (inclusive), added to the running total.
t2 = t.loc[3847:3853].sum()
t1 += t2
t2

2232478

In [93]:
# Invitations on days 3854-3860 (inclusive), added to the running total.
t2 = t.loc[3854:3860].sum()
t1 += t2
t2

2243863

In [94]:
# Invitations on days 3861-3867 (inclusive), added to the running total.
t2 = t.loc[3861:3867].sum()
t1 += t2
t2

2593669

In [95]:
# Sum over the four windows; it should match the number of rows in train.
t1

9489162

In [96]:
# 4-fold statistics: map an invitation day to its time-based fold.
def fold_fn(x):
    """Return the fold index (0-3) of invitation day ``x``.

    Fold 0 covers days 3838-3846; folds 1, 2 and 3 cover the windows
    3847-3853, 3854-3860 and 3861-3867 (all bounds inclusive).

    Raises:
        ValueError: if ``x`` falls in none of the windows. The message names
            the offending value so it can be located in the data.
    """
    if 3838 <= x <= 3846:
        return 0
    if 3847 <= x <= 3853:
        return 1
    if 3854 <= x <= 3860:
        return 2
    if 3861 <= x <= 3867:
        return 3
    raise ValueError(f"day {x!r} does not fall in any fold window (3838-3867)")

In [97]:
def extract_kfold_train_feature(data_df_, ans_df_, fold_by_day=None):
    """Compute out-of-fold invitation and answer statistics for training rows.

    Every invitation is assigned to a fold from its ``day``. The features of
    the rows in one fold are aggregated only from invitations of the other
    folds, and from answers that are not linked (by ``uid``/``qid``) to an
    invitation of that fold, so a row's own label never feeds its features.

    Args:
        data_df_: invitations with columns ``qid``, ``uid``, ``day`` and
            ``label``; other columns are passed through. Not modified.
        ans_df_: answers with columns ``aid``, ``qid``, ``uid`` and every
            column listed in ``a_feat`` below. Not modified.
        fold_by_day: optional callable mapping a day to a fold id. Defaults to
            the notebook-level ``fold_fn``.

    Returns:
        A new frame: the columns of ``data_df_`` followed by the feature
        columns (float). Keys with no data outside the row's fold get NaN.
    """
    if fold_by_day is None:
        fold_by_day = fold_fn

    # Float sentinel: fold-wise writes are float/NaN arrays, so the columns
    # never need an int -> float upcast while being filled.
    sentinel = -10000.0

    logging.info("ans_df shape %s", ans_df_.shape)

    # Narrow frame (input columns + fold) used for grouping and key look-ups.
    invites = data_df_.assign(fold=data_df_['day'].apply(fold_by_day))
    folds = sorted(invites['fold'].unique())

    # Tag each answer with the fold of its invitation (NaN when the answer
    # matches no invitation). merge returns a new frame, so no copy is needed.
    pair_fold = invites[['uid', 'qid', 'fold']].drop_duplicates(subset=['uid', 'qid'], keep='last')
    ans_df = pd.merge(ans_df_, pair_fold, on=['uid', 'qid'], how='left')
    logging.info("ans_df shape %s", ans_df.shape)

    extract_q_feat = ['q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count']
    extract_u_feat = ['u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count']
    a_feat = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
              'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
              'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']

    extract_a_feat = ['q_ans_kfold_count', 'u_ans_kfold_count']
    for col in a_feat:
        extract_a_feat += [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean',
                           f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']

    extract_feat = extract_q_feat + extract_u_feat + extract_a_feat

    # Append all feature columns at once, pre-filled with the sentinel.
    placeholder = pd.DataFrame(sentinel, index=invites.index, columns=extract_feat)
    train_df = pd.concat([invites, placeholder], axis=1)

    inv_stats = ['mean', 'sum', 'std', 'count']   # answer rate, answers, std, invitations
    ans_stats = ['sum', 'max', 'mean']

    for fold_ in folds:
        logging.info("fold %s", fold_)

        in_fold = (invites['fold'] == fold_).to_numpy()   # rows receiving features
        log_trn = invites.loc[invites['fold'] != fold_]   # rows the statistics come from
        logging.info("log_trn shape %s", log_trn.shape)
        val_df = invites.loc[in_fold]
        logging.info("val_df shape %s", val_df.shape)
        log_ans = ans_df.loc[ans_df['fold'] != fold_]     # drop answers of the current fold
        logging.info("log_ans shape %s", log_ans.shape)

        q_keys = val_df['qid'].to_numpy()
        u_keys = val_df['uid'].to_numpy()

        # Each aggregate is indexed by a unique key, so reindexing by the
        # rows' keys is a left join: one output row per input row, NaN when
        # the key is absent.

        # ques
        logging.info("question info")
        stats = log_trn.groupby('qid')['label'].agg(inv_stats)
        train_df.loc[in_fold, extract_q_feat] = stats.reindex(q_keys).to_numpy()

        # user
        logging.info("user info")
        stats = log_trn.groupby('uid')['label'].agg(inv_stats)
        train_df.loc[in_fold, extract_u_feat] = stats.reindex(u_keys).to_numpy()

        # ans
        ans_q_group = log_ans.groupby('qid')
        ans_u_group = log_ans.groupby('uid')

        # Number of answers a question has in the answer table.
        logging.info("ans: q_ans_kfold_count")
        counts = ans_q_group['aid'].count()
        train_df.loc[in_fold, 'q_ans_kfold_count'] = counts.reindex(q_keys).to_numpy()

        # Number of answers a user has in the answer table.
        logging.info("ans: u_ans_kfold_count")
        counts = ans_u_group['aid'].count()
        train_df.loc[in_fold, 'u_ans_kfold_count'] = counts.reindex(u_keys).to_numpy()

        for col in a_feat:
            logging.info("ans: %s sum max mean", col)

            f_name = [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
            stats = ans_q_group[col].agg(ans_stats)
            train_df.loc[in_fold, f_name] = stats.reindex(q_keys).to_numpy()

            f_name = [f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
            stats = ans_u_group[col].agg(ans_stats)
            train_df.loc[in_fold, f_name] = stats.reindex(u_keys).to_numpy()

    # Every cell must have been overwritten by exactly one fold.
    for feat in extract_feat:
        assert not (train_df[feat] == sentinel).any(), feat
    del train_df['fold']
    return train_df

In [None]:
# Out-of-fold features for the training invitations (long-running, see the log below).
train_kfold = extract_kfold_train_feature(train, ans)

[2019-11-29 08:28:06,957] INFO in <ipython-input-97-38aad0c5001c>: ans_df shape (4513735, 23)
[2019-11-29 08:28:56,752] INFO in <ipython-input-97-38aad0c5001c>: ans_df shape (4513735, 24)
[2019-11-29 08:29:35,375] INFO in <ipython-input-97-38aad0c5001c>: fold 0
[2019-11-29 08:29:36,621] INFO in <ipython-input-97-38aad0c5001c>: log_trn shape (7070010, 6)
[2019-11-29 08:29:36,990] INFO in <ipython-input-97-38aad0c5001c>: val_df shape (2419152, 6)
[2019-11-29 08:29:40,168] INFO in <ipython-input-97-38aad0c5001c>: log_ans shape (4058804, 24)
[2019-11-29 08:29:40,174] INFO in <ipython-input-97-38aad0c5001c>: question info
[2019-11-29 08:31:34,780] INFO in <ipython-input-97-38aad0c5001c>: user info
[2019-11-29 08:32:12,087] INFO in <ipython-input-97-38aad0c5001c>: ans: q_ans_kfold_count
[2019-11-29 08:32:27,172] INFO in <ipython-input-97-38aad0c5001c>: ans: u_ans_kfold_count
[2019-11-29 08:32:38,270] INFO in <ipython-input-97-38aad0c5001c>: ans: is_good sum max mean
[2019-11-29 08:33:20,371]

In [56]:
def extract_kfold_test_feature(test_df_, data_df_, ans_df_):
    """Attach invitation and answer statistics to the test invitations.

    The statistics are aggregated over all rows of ``data_df_`` and all rows
    of ``ans_df_``, then looked up by each test row's ``qid`` / ``uid``.

    Args:
        test_df_: rows to featurise; needs columns ``qid`` and ``uid``.
        data_df_: labelled invitations with columns ``qid``, ``uid``, ``label``.
        ans_df_: answers with columns ``aid``, ``qid``, ``uid`` and every
            column listed in ``a_feat`` below.

    Returns:
        A new frame with a fresh 0..n-1 index: the columns of ``test_df_``
        followed by the feature columns. Keys that never occur in the source
        tables get NaN. None of the inputs is modified.
    """
    logging.info("train_df shape %s", data_df_.shape)
    logging.info("test_df shape %s", test_df_.shape)
    logging.info("ans_df shape %s", ans_df_.shape)

    extract_q_feat = ['q_inv_kfold_mean', 'q_inv_kfold_sum', 'q_inv_kfold_std', 'q_inv_kfold_count']
    extract_u_feat = ['u_inv_kfold_mean', 'u_inv_kfold_sum', 'u_inv_kfold_std', 'u_inv_kfold_count']
    a_feat = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
              'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
              'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']

    inv_stats = ['mean', 'sum', 'std', 'count']   # answer rate, answers, std, invitations
    ans_stats = ['sum', 'max', 'mean']

    q_keys = test_df_['qid'].to_numpy()
    u_keys = test_df_['uid'].to_numpy()

    def rows_for(keys, agg, names):
        """Pick the aggregate row of every key (NaN if absent), labelled ``names``.

        ``agg`` is indexed by a unique key, so this is a left join that keeps
        one output row per entry of ``keys``, in order.
        """
        picked = agg.reindex(keys).reset_index(drop=True)
        if picked.ndim == 1:
            picked = picked.to_frame()
        picked.columns = names
        return picked

    # Feature blocks are collected and concatenated once at the end instead of
    # re-merging (and re-copying) the growing test frame for every block.
    pieces = [test_df_.reset_index(drop=True)]

    # ques
    logging.info("question info")
    pieces.append(rows_for(q_keys, data_df_.groupby('qid')['label'].agg(inv_stats), extract_q_feat))

    # user
    logging.info("user info")
    pieces.append(rows_for(u_keys, data_df_.groupby('uid')['label'].agg(inv_stats), extract_u_feat))

    # ans
    ans_q_group = ans_df_.groupby('qid')
    ans_u_group = ans_df_.groupby('uid')

    # Number of answers a question has in the answer table.
    logging.info("ans: q_ans_kfold_count")
    pieces.append(rows_for(q_keys, ans_q_group['aid'].count(), ['q_ans_kfold_count']))

    # Number of answers a user has in the answer table.
    logging.info("ans: u_ans_kfold_count")
    pieces.append(rows_for(u_keys, ans_u_group['aid'].count(), ['u_ans_kfold_count']))

    for col in a_feat:
        logging.info("ans: %s sum max mean", col)

        f_name = [f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
        pieces.append(rows_for(q_keys, ans_q_group[col].agg(ans_stats), f_name))

        f_name = [f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        pieces.append(rows_for(u_keys, ans_u_group[col].agg(ans_stats), f_name))

    return pd.concat(pieces, axis=1)

In [57]:
# Test-set features, aggregated over every labelled invitation and all answers.
test_kfold = extract_kfold_test_feature(test, train[['uid', 'qid', 'day', 'hour', 'label']], ans)

[2019-11-29 07:05:34,210] INFO in <ipython-input-56-a98a4be4d7c5>: train_df shape (9489162, 5)
[2019-11-29 07:05:34,293] INFO in <ipython-input-56-a98a4be4d7c5>: test_df shape (1141683, 4)
[2019-11-29 07:05:35,103] INFO in <ipython-input-56-a98a4be4d7c5>: ans_df shape (4513735, 23)
[2019-11-29 07:05:35,104] INFO in <ipython-input-56-a98a4be4d7c5>: question info
[2019-11-29 07:05:43,066] INFO in <ipython-input-56-a98a4be4d7c5>: user info
[2019-11-29 07:05:53,488] INFO in <ipython-input-56-a98a4be4d7c5>: ans: q_ans_kfold_count
[2019-11-29 07:06:01,030] INFO in <ipython-input-56-a98a4be4d7c5>: ans: u_ans_kfold_count
[2019-11-29 07:06:05,716] INFO in <ipython-input-56-a98a4be4d7c5>: ans: is_good sum max mean
[2019-11-29 07:06:11,178] INFO in <ipython-input-56-a98a4be4d7c5>: ans: is_rec sum max mean
[2019-11-29 07:06:16,381] INFO in <ipython-input-56-a98a4be4d7c5>: ans: is_dest sum max mean
[2019-11-29 07:06:21,467] INFO in <ipython-input-56-a98a4be4d7c5>: ans: has_img sum max mean
[2019-11

In [65]:
# TODO: consider artificially injecting NaN into the training-set features.
# Number of test rows whose question has invitation statistics (non-null count).
test_kfold['q_inv_kfold_mean'].count()

354695

In [67]:
# Number of test rows whose user has invitation statistics (non-null count).
test_kfold['u_inv_kfold_mean'].count()

1057029

In [66]:
# Number of training rows whose question has out-of-fold invitation statistics.
train_kfold['q_inv_kfold_mean'].count()

5299600

In [69]:
# Number of training rows whose user has out-of-fold invitation statistics.
train_kfold['u_inv_kfold_mean'].count()

8858584

In [None]:
# origin
# Distinct questions in train, in test, and in their union; the last line is
# the number of questions that occur only in test.
origin = train.qid.nunique(dropna=False)
print(origin)
origin2 = test.qid.nunique(dropna=False)
print(origin2)
temp = pd.concat([train.qid, test.qid])
sum2 = temp.nunique(dropna=False)
print(sum2)
print(sum2 - origin)

In [None]:
# Per question: mean, spread and number of invitation days (first 60 shown).
t1 = train.groupby('qid')['day'].agg(['mean', 'std', 'count'])
t1.head(60)

In [None]:
# Per user: mean, spread and number of invitation days (first 60 shown).
t2 = train.groupby('uid')['day'].agg(['mean', 'std', 'count'])
t2.head(60)

In [None]:
# test_kfold['u_inv_kfold_mean'].count()

In [60]:
# Keep only feature columns: drop the join keys (and the label on the train side).
for key_col in ('uid', 'qid', 'label'):
    del train_kfold[key_col]
for key_col in ('uid', 'qid'):
    del test_kfold[key_col]

In [61]:
# Compress the data: narrow 64-bit numeric columns to 32 bits in both frames.
# Columns are selected by their dtype in train_kfold.
t = train_kfold.dtypes
for wide, narrow in (('int64', 'int32'), ('float64', 'float32')):
    for x in t[t == wide].index:
        train_kfold[x] = train_kfold[x].astype(narrow)
        test_kfold[x] = test_kfold[x].astype(narrow)

In [62]:
# Write both feature tables as tab-separated text. The output directory is
# created first so the save does not fail when it does not exist yet.
os.makedirs('feature', exist_ok=True)
train_kfold.to_csv('feature/train_kfold_feature_3.txt', index=False, sep='\t')
test_kfold.to_csv('feature/test_kfold_feature_3.txt', index=False, sep='\t')

In [63]:
# Preview the last 60 training feature rows.
train_kfold.tail(60)

Unnamed: 0,day,hour,q_inv_kfold_mean,q_inv_kfold_sum,q_inv_kfold_std,q_inv_kfold_count,u_inv_kfold_mean,u_inv_kfold_sum,u_inv_kfold_std,u_inv_kfold_count,...,q_reci_dis_mean,u_reci_dis_sum,u_reci_dis_max,u_reci_dis_mean,q_diff_qa_days_sum,q_diff_qa_days_max,q_diff_qa_days_mean,u_diff_qa_days_sum,u_diff_qa_days_max,u_diff_qa_days_mean
9489102,3849,8,0.0,0.0,,1.0,0.0,0.0,0.0,7.0,...,,,,,,,,,,
9489103,3851,9,0.166667,1.0,0.408248,6.0,0.076923,1.0,0.27735,13.0,...,0.0,0.0,0.0,0.0,14.0,14.0,14.0,12.0,12.0,3.0
9489104,3845,18,0.0625,4.0,0.243975,64.0,,,,,...,0.0,,,,366.0,49.0,19.263159,,,
9489105,3867,16,,,,,0.0,0.0,0.0,8.0,...,,0.0,0.0,0.0,,,,91.0,91.0,91.0
9489106,3846,9,1.0,1.0,,1.0,,,,,...,0.142857,,,,39.0,24.0,5.571429,,,
9489107,3861,10,,,,,0.166667,1.0,0.408248,6.0,...,,0.0,0.0,0.0,,,,3.0,3.0,1.5
9489108,3841,12,,,,,,,,,...,0.014925,,,,1214.0,54.0,18.119404,,,
9489109,3848,7,0.333333,2.0,0.516398,6.0,0.25,3.0,0.452267,12.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.5,359.0,167.0,27.615385
9489110,3846,7,0.166667,1.0,0.408248,6.0,0.25,3.0,0.452267,12.0,...,0.0,0.0,0.0,0.0,2.0,2.0,1.0,359.0,167.0,27.615385
9489111,3847,18,,,,,0.25,3.0,0.452267,12.0,...,,0.0,0.0,0.0,,,,359.0,167.0,27.615385


In [64]:
# Preview the first 60 test feature rows.
test_kfold.head(60)

Unnamed: 0,day,hour,q_inv_kfold_mean,q_inv_kfold_sum,q_inv_kfold_std,q_inv_kfold_count,u_inv_kfold_mean,u_inv_kfold_sum,u_inv_kfold_std,u_inv_kfold_count,...,q_reci_dis_mean,u_reci_dis_sum,u_reci_dis_max,u_reci_dis_mean,q_diff_qa_days_sum,q_diff_qa_days_max,q_diff_qa_days_mean,u_diff_qa_days_sum,u_diff_qa_days_max,u_diff_qa_days_mean
0,3870,9,0.0,0.0,0.0,2.0,0.125,1.0,0.353553,8.0,...,,0.0,0.0,0.0,,,,1087.0,1085.0,543.5
1,3872,22,,,,,0.0,0.0,0.0,4.0,...,,,,,,,,,,
2,3874,15,,,,,0.1,1.0,0.316228,10.0,...,,32.0,32.0,8.0,,,,1125.0,694.0,281.25
3,3873,4,0.5,3.0,0.547723,6.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,12.0,11.0,3.0,807.0,807.0,807.0
4,3872,19,,,,,0.0,0.0,0.0,4.0,...,,,,,,,,,,
5,3871,13,,,,,0.142857,1.0,0.377964,7.0,...,,0.0,0.0,0.0,,,,4.0,2.0,2.0
6,3873,14,,,,,,,,,...,,,,,,,,,,
7,3873,9,,,,,0.25,3.0,0.452267,12.0,...,,2.0,2.0,0.4,,,,68.0,56.0,13.6
8,3872,18,,,,,1.0,8.0,0.0,8.0,...,,0.0,0.0,0.0,,,,79.0,29.0,11.285714
9,3874,11,,,,,0.133333,2.0,0.351866,15.0,...,,4.0,2.0,0.285714,,,,1545.0,470.0,110.35714
