In [1]:
# item 
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import time

# Load the cached full, train and test frames (same order as before: df, train, test).
df, train, test = (
    load_pickle(pickle_path)
    for pickle_path in ('../data/df.pkl', '../data/train.pkl', '../data/test.pkl')
)



In [2]:
# Sanity check: (rows, columns) of the train and test frames, printed on one line.
print(*(frame.shape for frame in (train, test)))

(478087, 36) (18371, 36)


In [3]:
def item_time_count(df, mode):
    """Build four item-level time/count feature tables from ``df`` and pickle them.

    Each table is keyed by ``instance_id`` and written with ``dump_pickle`` to
    ``raw_data_path + '<mode>_feature/'``:

    * ``302_item_time_count.pkl``   - number of rows sharing the same
      (item feature, hour) and (item feature, day).
    * ``302_item_last_query.pkl``   - ``context_timestamp`` minus the previous
      ``context_timestamp`` of the same ``item_id`` (NaN for the first row of
      an item).
    * ``302_item_diff_max_min.pkl`` - distance from each row's timestamp to the
      latest / earliest timestamp of the same (item_id | item_brand_id, day).
    * ``302_item_last_day_count.pkl`` - number of rows that had the same
      feature value on ``day - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``instance_id``, ``hour``, ``day``, ``context_timestamp``
        and the item columns listed below. Any index is accepted.
    mode : str
        Used in the log lines and in the output directory name
        (``'<mode>_feature/'``), e.g. ``'train'`` or ``'test'``.

    Returns
    -------
    None

    Notes
    -----
    * The first three tables keep ``df``'s row order and index; the
      previous-day table keeps ``df``'s row order with a fresh default index
      (it is the result of left merges).
    * Previous-day counts are computed from ``df`` alone, for every day present
      in it. A frame that holds a single day therefore gets only NaN there.
    * "Previous day" is ``day - 1``; this assumes ``day`` is numeric and does
      not wrap (e.g. across a month boundary) - TODO confirm against the
      upstream preprocessing.
    """
    time_features = ['hour', 'day']
    item_features = ['item_id', 'item_brand_id', 'item_city_id', 'item_price_level',
                     'item_sales_level', 'item_collected_level', 'item_pv_level']
    feature_dir = raw_data_path + '{}_feature/'.format(mode)

    # ===== rows per item feature on the same day / in the same hour =====
    item_time_df = _item_group_sizes(df, item_features, time_features)
    print(item_time_df.columns)
    print('the shape of {} {}'.format(mode, item_time_df.shape))
    dump_pickle(item_time_df, path=feature_dir + '302_item_time_count.pkl')

    # ===== time elapsed since the previous row of the same item =====
    final_feat = _item_time_since_last_query(df)
    print(final_feat.columns)
    print('the shape of {} {}'.format(mode, final_feat.shape))
    dump_pickle(final_feat, path=feature_dir + '302_item_last_query.pkl')

    # ===== same-day distance to the latest / earliest row (item, brand) =====
    final_feat = _same_day_time_span(df, ['item_id', 'item_brand_id'])
    print(final_feat.columns)
    print('the shape of {} {}'.format(mode, final_feat.shape))
    dump_pickle(final_feat, path=feature_dir + '302_item_diff_max_min.pkl')

    # ===== row count of the same feature value on the previous day =====
    count_features = ['item_brand_id', 'item_city_id', 'item_price_level',
                      'item_sales_level', 'item_collected_level', 'item_pv_level']
    final_feat = _previous_day_counts(df, count_features)
    print(final_feat.columns)
    print('the shape of {} {}'.format(mode, final_feat.shape))
    dump_pickle(final_feat, path=feature_dir + '302_item_last_day_count.pkl')


def _item_group_sizes(df, item_features, time_features):
    """Return, per row, the size of its (item feature, time feature) group."""
    feat = pd.DataFrame()
    feat['instance_id'] = df['instance_id']
    for item_feature in tqdm(item_features):
        for time_feature in time_features:
            # transform keeps df's row order; .values makes the assignment
            # positional so a non-default index on df cannot misalign it.
            group_size = df.groupby([item_feature, time_feature])['instance_id'].transform('size')
            feat['{}_{}_search'.format(item_feature, time_feature)] = group_size.values
    return feat


def _item_time_since_last_query(df):
    """Return, per row, the timestamp gap to the previous row of the same item."""
    # Work on a positional index so the original row order can be restored
    # after sorting, whatever index df carries.
    ordered = df[['instance_id', 'item_id', 'context_timestamp']].reset_index(drop=True)
    ordered = ordered.sort_values(['item_id', 'context_timestamp'])
    previous_ts = ordered.groupby('item_id')['context_timestamp'].shift(1)
    ordered['item_time_diff_last_query'] = ordered['context_timestamp'] - previous_ts

    feat = ordered[['instance_id', 'item_time_diff_last_query']].sort_index()
    feat.index = df.index
    return feat


def _same_day_time_span(df, cols):
    """Return, per row, the gap to the max / min timestamp of its (col, day) group."""
    feat = pd.DataFrame()
    feat['instance_id'] = df['instance_id']
    timestamps = df['context_timestamp'].values
    for col in tqdm(cols):
        grouped_ts = df.groupby([col, 'day'])['context_timestamp']
        feat['diff_maxtime_{}'.format(col)] = grouped_ts.transform('max') - timestamps
        feat['diff_mintime_{}'.format(col)] = timestamps - grouped_ts.transform('min')
    return feat


def _previous_day_counts(df, count_features):
    """Return, per row, how many rows had the same feature value on ``day - 1``."""
    feat = df[count_features + ['instance_id', 'day']]
    for col in count_features:
        count_name = '{}_lastday_count'.format(col)
        daily_counts = df.groupby([col, 'day'])['instance_id'].count().reset_index(name=count_name)
        # Re-label each day's counts with the following day, so the left merge
        # attaches "yesterday's" count to every row - for all days in df.
        daily_counts['day'] = daily_counts['day'] + 1
        feat = pd.merge(feat, daily_counts, on=[col, 'day'], how='left')
    return feat.drop(count_features + ['day'], axis=1)

In [4]:
# Generate and persist the item features for both splits, timing the whole run.
start = time.time()
item_time_count(df=train, mode='train')
item_time_count(df=test, mode='test')
end = time.time()
print('time elapsed {}'.format(end - start))

100%|██████████| 7/7 [00:05<00:00,  1.32it/s]


Index(['instance_id', 'item_id_hour_search', 'item_id_day_search',
       'item_brand_id_hour_search', 'item_brand_id_day_search',
       'item_city_id_hour_search', 'item_city_id_day_search',
       'item_price_level_hour_search', 'item_price_level_day_search',
       'item_sales_level_hour_search', 'item_sales_level_day_search',
       'item_collected_level_hour_search', 'item_collected_level_day_search',
       'item_pv_level_hour_search', 'item_pv_level_day_search'],
      dtype='object')
the shape of train (478087, 15)


  0%|          | 0/2 [00:00<?, ?it/s]

Index(['instance_id', 'item_time_diff_last_query'], dtype='object')
the shape of train (478087, 2)


100%|██████████| 2/2 [00:00<00:00, 12.93it/s]


Index(['instance_id', 'diff_maxtime_item_id', 'diff_mintime_item_id',
       'diff_maxtime_item_brand_id', 'diff_mintime_item_brand_id'],
      dtype='object')
the shape of train (478087, 5)


  0%|          | 0/7 [00:00<?, ?it/s]

Index(['instance_id', 'item_brand_id_lastday_count',
       'item_city_id_lastday_count', 'item_price_level_lastday_count',
       'item_sales_level_lastday_count', 'item_collected_level_lastday_count',
       'item_pv_level_lastday_count'],
      dtype='object')
the shape of train (478087, 7)


100%|██████████| 7/7 [00:00<00:00, 23.69it/s]
100%|██████████| 2/2 [00:00<00:00, 137.02it/s]


Index(['instance_id', 'item_id_hour_search', 'item_id_day_search',
       'item_brand_id_hour_search', 'item_brand_id_day_search',
       'item_city_id_hour_search', 'item_city_id_day_search',
       'item_price_level_hour_search', 'item_price_level_day_search',
       'item_sales_level_hour_search', 'item_sales_level_day_search',
       'item_collected_level_hour_search', 'item_collected_level_day_search',
       'item_pv_level_hour_search', 'item_pv_level_day_search'],
      dtype='object')
the shape of test (18371, 15)
Index(['instance_id', 'item_time_diff_last_query'], dtype='object')
the shape of test (18371, 2)
Index(['instance_id', 'diff_maxtime_item_id', 'diff_mintime_item_id',
       'diff_maxtime_item_brand_id', 'diff_mintime_item_brand_id'],
      dtype='object')
the shape of test (18371, 5)
Index(['instance_id', 'item_brand_id_lastday_count',
       'item_city_id_lastday_count', 'item_price_level_lastday_count',
       'item_sales_level_lastday_count', 'item_collected_level