In [1]:
# %load 102_user_smooth.py
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm


# Settings forwarded to BayesianSmoothing.update() in the loop further down.
# NOTE(review): presumably the iteration cap and the convergence tolerance of
# the fitting procedure -- confirm against utils.BayesianSmoothing.
iter_num, epsilon = 1000, 0.001

# Load the pickled train / test frames.
train = load_pickle('../data/train.pkl')
test = load_pickle('../data/test.pkl')

# Row-wise stack of train followed by test.
# NOTE(review): `df` is not referenced again in the visible part of this
# script -- confirm whether it is still needed before removing it.
df = pd.concat([train, test], axis=0)

# Outline of the script (English summary of the note kept verbatim below):
#   1. define the variables that get a smoothed rate,
#   2. for every day, take all records from before that day,
#   3. count the clicks and the purchases per value of the variable.
'''
1. 定义需要计算平滑点击率的变量
2. 对于每一天，找出在这之前的所有点击行为
3. 统计该变量的点击次数和购买次数
'''

# Columns that each receive a `<col>_smooth_rate` feature.
smooth_cols = ['shop_id', 'shop_review_num_level', 'shop_star_level']

# Frames holding the final result. They start as the feature columns plus the
# row key (`instance_id`) and `day`; `day` and the feature columns serve as
# join keys when the per-day rates are merged in.
_base_cols = smooth_cols + ['instance_id', 'day']
smooth_train = train[_base_cols]
smooth_test = test[_base_cols]

# For every feature column, compute a Bayesian-smoothed historical trade rate
# per (feature value, day) and attach it to the train / test result frames.
for col in tqdm(smooth_cols):
    # Names of the derived columns for this feature.
    col_I = '{}_I'.format(col)  # number of rows seen before the day
    col_C = '{}_C'.format(col)  # sum of `is_trade` before the day
    col_smooth_rate = '{}_smooth_rate'.format(col)

    # One statistics frame per day. They are concatenated once after the loop
    # instead of growing a frame with repeated pd.concat calls.
    daily_frames = []
    for day in tqdm(range(19, 26)):
        # Only rows strictly before `day` contribute, so the rate attached to
        # a row never includes that row's own day.
        history = train[train.day < day]

        # Count and sum `is_trade` per feature value in a single pass. Both
        # statistics come from the same groupby, so they are aligned by key
        # rather than by row position.
        CVR = history.groupby(col)['is_trade'].agg(['count', 'sum']).reset_index()
        CVR.columns = [col, col_I, col_C]

        # Columns of CVR from here on: [col, col_I, col_C, 'day'].
        CVR['day'] = day

        # Fit the smoothing parameters on this day's history.
        # NOTE(review): BayesianSmoothing(1, 1) presumably sets the initial
        # alpha / beta of a Beta prior -- confirm against utils.
        smooth = BayesianSmoothing(1, 1)
        smooth.update(CVR[col_I].values, CVR[col_C].values, iter_num, epsilon)
        alpha = smooth.alpha
        beta = smooth.beta
        CVR[col_smooth_rate] = (CVR[col_C] + alpha) / (CVR[col_I] + alpha + beta)

        daily_frames.append(CVR)

    # Stack the per-day statistics for this feature.
    CVR_all = pd.concat(daily_frames, axis=0)

    # Left join on (feature value, day): every row of the result frames is
    # kept, and rows whose key has no match receive NaN for the new column.
    rates = CVR_all[[col, 'day', col_smooth_rate]]
    smooth_train = pd.merge(smooth_train, rates, on=[col, 'day'], how='left')
    smooth_test = pd.merge(smooth_test, rates, on=[col, 'day'], how='left')


# Drop `day` and the raw feature columns from both result frames; they were
# only needed as join keys for the merges.
_helper_cols = ['day'] + smooth_cols
for _frame in (smooth_train, smooth_test):
    _frame.drop(_helper_cols, axis=1, inplace=True)

# Report the remaining columns and the shape of both frames.
print(smooth_train.columns)
print('the shape of train {}'.format(smooth_train.shape))
print('the shape of test {}'.format(smooth_test.shape))

# Persist the features, train first, then test.
dump_pickle(smooth_train, path='../data/train_feature/202_smooth_features.pkl')
dump_pickle(smooth_test, path='../data/test_feature/202_smooth_features.pkl')




  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:11<01:06, 11.12s/it][A
 29%|██▊       | 2/7 [00:23<00:58, 11.80s/it][A
 43%|████▎     | 3/7 [00:36<00:48, 12.14s/it][A
 57%|█████▋    | 4/7 [00:49<00:37, 12.41s/it][A
 71%|███████▏  | 5/7 [01:03<00:25, 12.63s/it][A
 86%|████████▌ | 6/7 [01:16<00:12, 12.74s/it][A
100%|██████████| 7/7 [01:30<00:00, 12.90s/it][A
 33%|███▎      | 1/3 [01:30<03:00, 90.41s/it]
  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:00<00:00,  8.70it/s][A
 29%|██▊       | 2/7 [00:00<00:00,  7.14it/s][A
 43%|████▎     | 3/7 [00:00<00:00,  6.37it/s][A
 57%|█████▋    | 4/7 [00:00<00:00,  5.71it/s][A
 71%|███████▏  | 5/7 [00:00<00:00,  5.16it/s][A
 86%|████████▌ | 6/7 [00:01<00:00,  4.86it/s][A
100%|██████████| 7/7 [00:01<00:00,  4.54it/s][A
 67%|██████▋   | 2/3 [01:32<00:46, 46.02s/it]
  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:00<00:00,  9.38it/s][A
 29%|██▊    

Index(['instance_id', 'shop_id_smooth_rate',
       'shop_review_num_level_smooth_rate', 'shop_star_level_smooth_rate'],
      dtype='object')
the shape of train (478087, 4)
the shape of test (42888, 4)
