In [4]:
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path, valid_data_path
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load the pickled train and test frames (paths are relative to the notebook).
train, test = (
    load_pickle(pkl_path)
    for pkl_path in ('../data/train.pkl', '../data/test.pkl')
)
# Row-wise stack: train rows first, test rows after, original indices kept.
# NOTE(review): `df` is not read by any of the cells visible below — confirm
# whether it is still needed.
df = pd.concat([train, test])

In [5]:
# Smoothed conversion-rate features for item attributes.
#
# 1. Choose the columns that receive a smoothed rate.
# 2. For each day, take every training row from strictly earlier days.
# 3. Per column value, count those rows and sum `is_trade` over them, then
#    smooth the ratio with the alpha/beta exposed by BayesianSmoothing.

# Both values are forwarded unchanged to BayesianSmoothing.update().
# NOTE(review): presumably max iterations / convergence tolerance — confirm in utils.
iter_num = 200
epsilon = 0.001

smooth_cols = ['item_id', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level','item_collected_level','item_pv_level']
key_cols = ['instance_id', 'day']

# Days for which a rate table is built.
# NOTE(review): the merges below are left joins on [col, 'day'], so any row whose
# `day` falls outside this range ends up with NaN — confirm the days present in
# `test` are covered.
smooth_days = range(19, 25)

smooth_train = train[smooth_cols + key_cols]
smooth_test = test[smooth_cols + key_cols]

for col in tqdm(smooth_cols):
    col_I = '{}_I'.format(col)                        # number of earlier rows per value
    col_C = '{}_C'.format(col)                        # sum of is_trade over those rows
    col_smooth_rate = '{}_smooth_rate'.format(col)    # resulting feature column

    daily_rates = []
    for day in tqdm(smooth_days):
        # Only rows from days strictly before `day` contribute, so a row's own
        # day never feeds its feature.
        history = train.loc[train.day < day, [col, 'is_trade']]

        # One pass over the history yields both aggregates, aligned by key.
        CVR = history.groupby(col)['is_trade'].agg(['count', 'sum']).reset_index()
        CVR.columns = [col, col_I, col_C]
        CVR['day'] = day

        # Start from (1, 1) and let update() adjust alpha/beta from the counts.
        # NOTE(review): assumes update() stores the fitted values on
        # .alpha / .beta — verify against utils.BayesianSmoothing.
        smooth = BayesianSmoothing(1, 1)
        smooth.update(CVR[col_I].values, CVR[col_C].values, iter_num, epsilon)
        alpha = smooth.alpha
        beta = smooth.beta
        CVR[col_smooth_rate] = (CVR[col_C] + alpha) / (CVR[col_I] + alpha + beta)

        daily_rates.append(CVR[[col, 'day', col_smooth_rate]])

    # Single concat instead of growing a frame inside the loop.
    CVR_all = pd.concat(daily_rates, axis=0, ignore_index=True)

    # (col, day) is unique in CVR_all, so the left joins keep one row per input row.
    smooth_train = pd.merge(smooth_train, CVR_all, on=[col, 'day'], how='left')
    smooth_test = pd.merge(smooth_test, CVR_all, on=[col, 'day'], how='left')


  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:01<00:06,  1.39s/it][A
 33%|███▎      | 2/6 [00:02<00:05,  1.49s/it][A
 50%|█████     | 3/6 [00:04<00:04,  1.60s/it][A
 67%|██████▋   | 4/6 [00:06<00:03,  1.73s/it][A
 83%|████████▎ | 5/6 [00:09<00:01,  1.82s/it][A
100%|██████████| 6/6 [00:11<00:00,  1.91s/it][A
 14%|█▍        | 1/7 [00:11<01:09, 11.60s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:00<00:02,  1.75it/s][A
 33%|███▎      | 2/6 [00:01<00:02,  1.94it/s][A
 50%|█████     | 3/6 [00:01<00:01,  1.91it/s][A
 67%|██████▋   | 4/6 [00:02<00:01,  1.84it/s][A
 83%|████████▎ | 5/6 [00:02<00:00,  1.77it/s][A
100%|██████████| 6/6 [00:03<00:00,  1.70it/s][A
 29%|██▊       | 2/7 [00:15<00:38,  7.61s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:00<00:01,  4.45it/s][A
 33%|███▎      | 2/6 [00:00<00:00,  5.54it/s][A
 50%|█████     | 3/6 [00:00<00:00,  5.92it/s][A
 67%|██████▋

In [6]:
# Strip the join keys that are no longer needed (`day` and the raw item columns),
# leaving `instance_id` plus the generated *_smooth_rate columns.
# The drop is in place, so re-running this cell without re-running the previous
# one raises KeyError.
for frame in (smooth_train, smooth_test):
    frame.drop(['day'] + smooth_cols, axis=1, inplace=True)

# Quick sanity report of what will be written out.
print(smooth_train.columns)
print('the shape of train {}'.format(smooth_train.shape))
print('the shape of test {}'.format(smooth_test.shape))

Index(['instance_id', 'item_id_smooth_rate', 'item_brand_id_smooth_rate',
       'item_city_id_smooth_rate', 'item_price_level_smooth_rate',
       'item_sales_level_smooth_rate', 'item_collected_level_smooth_rate',
       'item_pv_level_smooth_rate'],
      dtype='object')
the shape of train (478087, 8)
the shape of test (42888, 8)


In [7]:
# Persist the train and test feature frames under the same file name in their
# respective feature directories.
for features, out_path in (
    (smooth_train, '../data/train_feature/301_smooth_item_features.pkl'),
    (smooth_test, '../data/test_feature/301_smooth_item_features.pkl'),
):
    dump_pickle(features, path=out_path)