In [1]:
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load the three pickled frames in one pass (same files, same order).
train, test, df = map(
    load_pickle,
    ['../data/train.pkl', '../data/test.pkl', '../data/df.pkl'],
)

# Show how many rows each split has per day.
# train and test stay separate frames; they are not concatenated here.
print(train['day'].value_counts(), test['day'].value_counts(), sep='\n')


18    78261
21    71193
19    70926
20    68377
22    68309
23    63610
24    57411
Name: day, dtype: int64
25    18371
Name: day, dtype: int64


In [2]:

# Settings forwarded to BayesianSmoothing.update() for every fit below.
iter_num = 1
epsilon = 0.001

# Plan:
# 1. Define the columns that get a smoothed rate feature.
# 2. For each day, take every train row recorded strictly before that day.
# 3. Per column value, count those rows (clicks) and sum is_trade (purchases).
smooth_cols = ['item_id', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level','item_collected_level','item_pv_level']

smooth_train = train[smooth_cols + ['instance_id', 'day']]
smooth_test = test[smooth_cols + ['instance_id','day']]
for col in tqdm(smooth_cols):
    col_I = '{}_I'.format(col)
    col_C = '{}_C'.format(col)
    col_smooth_rate = '{}_smooth_rate'.format(col)

    # One statistics frame per target day; concatenated once after the loop
    # instead of re-concatenating the growing frame on every iteration.
    # (The input `train` frame is no longer given a placeholder -1 column:
    # nothing read it, and it only polluted the source data.)
    daily_rates = []
    for day in tqdm(range(19, 26)):
        # Only rows from earlier days contribute, so a day's feature never
        # uses labels from that day or later.
        history = train[train.day < day]

        # A single groupby pass yields both the row count and the is_trade sum
        # for every value of `col` (columns come back as count, then sum).
        CVR = history.groupby(col)['is_trade'].agg(['count', 'sum']).reset_index()
        CVR.columns = [col, col_I, col_C]
        CVR['day'] = day

        # NOTE(review): BayesianSmoothing comes from utils; presumably update()
        # fits the alpha/beta prior from the (count, sum) pairs - confirm there.
        smooth = BayesianSmoothing(1, 1)
        smooth.update(CVR[col_I].values, CVR[col_C].values, iter_num, epsilon)
        alpha = smooth.alpha
        beta = smooth.beta
        CVR[col_smooth_rate] = (CVR[col_C] + alpha) / (CVR[col_I] + alpha + beta)
        daily_rates.append(CVR)

    CVR_all = pd.concat(daily_rates, axis=0)

    # Left merge on (value, day): rows whose day is outside range(19, 26), or
    # whose value was never seen on an earlier day, get NaN for this feature.
    smooth_train = pd.merge(smooth_train, CVR_all[[col, 'day', col_smooth_rate]], on=[col, 'day'], how='left')
    smooth_test = pd.merge(smooth_test, CVR_all[[col, 'day', col_smooth_rate]], on=[col, 'day'], how='left')


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]
  0%|                                                                                            | 0/7 [00:00<?, ?it/s]
 14%|████████████                                                                        | 1/7 [00:00<00:02,  2.61it/s]
 29%|████████████████████████                                                            | 2/7 [00:00<00:01,  3.23it/s]
 43%|████████████████████████████████████                                                | 3/7 [00:00<00:01,  3.16it/s]
 57%|████████████████████████████████████████████████                                    | 4/7 [00:01<00:00,  3.09it/s]
 71%|████████████████████████████████████████████████████████████                        | 5/7 [00:01<00:00,  2.87it/s]
 86%|████████████████████████████████████████████████████████████████████████            | 6/7 [00:02<00:00,  2.67it/s]
100%|███████████████████████████████████

In [3]:
# Share of train rows that received no smoothed item_pv_level rate
# (i.e. the left merge found no matching (value, day) statistics).
missing_mask = smooth_train['item_pv_level_smooth_rate'].isnull()
total = len(smooth_train)
nan = missing_mask.sum()
nan_rate = nan / total
print(nan_rate)

0.163698239023


In [4]:
# Preview the first 20 rows of day 19 to eyeball the merged smoothed rates.
smooth_train.loc[smooth_train['day'] == 19].head(20)

Unnamed: 0,item_id,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,instance_id,day,item_id_smooth_rate,item_brand_id_smooth_rate,item_city_id_smooth_rate,item_price_level_smooth_rate,item_sales_level_smooth_rate,item_collected_level_smooth_rate,item_pv_level_smooth_rate
149454,3804,453,50,3,4.0,4,14,8020414902236616944,19,0.015667,0.011903,0.017658,0.049884,0.012354,0.026035,0.019471
149455,3804,453,50,3,4.0,4,14,4859094599505998036,19,0.015667,0.011903,0.017658,0.049884,0.012354,0.026035,0.019471
149456,3804,453,50,3,4.0,4,14,5558126838111646808,19,0.015667,0.011903,0.017658,0.049884,0.012354,0.026035,0.019471
149457,3804,453,50,3,4.0,4,14,847092414099368634,19,0.015667,0.011903,0.017658,0.049884,0.012354,0.026035,0.019471
149458,3804,453,50,3,4.0,4,14,8795328164245101288,19,0.015667,0.011903,0.017658,0.049884,0.012354,0.026035,0.019471
149459,339,2030,9,8,9.0,8,13,7948373161476559870,19,0.071709,0.045063,0.02113,0.013548,0.015204,0.016197,0.018396
149460,5777,1267,9,8,9.0,10,16,1475383245996038639,19,0.011236,0.007153,0.02113,0.013548,0.015204,0.020677,0.019638
149461,339,2030,9,8,9.0,8,13,7275916460297517487,19,0.071709,0.045063,0.02113,0.013548,0.015204,0.016197,0.018396
149462,5777,1267,9,8,9.0,10,16,8027968860188555291,19,0.011236,0.007153,0.02113,0.013548,0.015204,0.020677,0.019638
149463,339,2030,9,8,9.0,8,13,393517416808742415,19,0.071709,0.045063,0.02113,0.013548,0.015204,0.016197,0.018396


In [5]:
# Keep only the join key and the generated smoothed-rate features.
# Selecting the wanted columns (instead of dropping 'day' and the raw columns
# in place) yields the same columns in the same order, but the cell can be
# re-run safely and fails loudly if an expected feature column is missing.
feature_cols = ['instance_id'] + ['{}_smooth_rate'.format(col) for col in smooth_cols]
smooth_train = smooth_train[feature_cols]
smooth_test = smooth_test[feature_cols]
print(smooth_train.columns)
print('the shape of train {}'.format(smooth_train.shape))
print('the shape of test {}'.format(smooth_test.shape))

Index(['instance_id', 'item_id_smooth_rate', 'item_brand_id_smooth_rate',
       'item_city_id_smooth_rate', 'item_price_level_smooth_rate',
       'item_sales_level_smooth_rate', 'item_collected_level_smooth_rate',
       'item_pv_level_smooth_rate'],
      dtype='object')
the shape of train (478087, 8)
the shape of test (18371, 8)


In [6]:
# Persist the train and test feature frames under the same file name in their
# respective feature directories.
for features, out_path in (
    (smooth_train, '../data/train_feature/301_smooth_item_features.pkl'),
    (smooth_test, '../data/test_feature/301_smooth_item_features.pkl'),
):
    dump_pickle(features, path=out_path)