In [1]:
# %load 103_user_time_count.py
# 计算出至今为止用户买的产品数
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import time


train = load_pickle('../data/train.pkl')
test = load_pickle('../data/test.pkl')
df = pd.concat([train, test], axis=0)

def user_time_count(df):
    # ========================= 当天搜索次数和当前小时搜索次数 =========================================
    time_features = ['hour', 'day']
    user_features = ['user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']

    user_time_df = pd.DataFrame()
    user_time_df = df[['instance_id', 'day']]

    for user_feature in tqdm(user_features):
        for time_feature in time_features:
            search_group = df.groupby([user_feature, time_feature]).count().reset_index()
            tmp_df = df[[user_feature, time_feature]]
            user_day_search = pd.merge(tmp_df, search_group, on=[user_feature, time_feature], how='left').iloc[:, -1]
            user_time_df['{}_{}_search'.format(user_feature, time_feature)] = user_day_search
            
    train_feat = user_time_df[(user_time_df['day'] >= 18) & (user_time_df['day'] <= 24)].drop('day', axis=1)
    test_feat = user_time_df[user_time_df['day']==25].drop('day', axis=1)
    dump_pickle(train_feat, path='../data/train_feature/103_user_time_count.pkl')
    dump_pickle(test_feat, path='../data/test_feature/103_user_time_count.pkl')


    # ======================== 用户当前搜索距离上次的时间 ================================================
    df_tmp = df[['instance_id', 'user_id',  'context_timestamp', 'day']].copy()
    df_tmp.sort_values(['user_id', 'context_timestamp'], inplace=True)


    df_tmp['t-1_context_timestamp'] = df_tmp.groupby('user_id')['context_timestamp'].shift(1)
    df_tmp['time_diff_last_query'] = np.log1p(df_tmp['context_timestamp'] - df_tmp['t-1_context_timestamp'])

    train_feat = df_tmp[(df_tmp['day'] >= 18) & (df_tmp['day'] <= 24)].drop('day', axis=1)   
    train_feat = train_feat[['instance_id', 'time_diff_last_query']]
    
    test_feat = df_tmp[df_tmp['day']==25].drop('day', axis=1)   
    test_feat = test_feat[['instance_id', 'time_diff_last_query']]
    
    dump_pickle(train_feat, path='../data/train_feature/103_feature_last_query.pkl')
    dump_pickle(test_feat, path='../data/test_feature/103_feature_last_query.pkl')


    # ========================= 当日用户当前搜索距离上次的时间(商品，商店，商标) ==========================================
    final_feat = pd.DataFrame()
    final_feat = df[['instance_id', 'day']]
    cols = ['item_id', 'shop_id', 'item_brand_id']
    for col in tqdm(cols):
        df_select = df[['user_id', col,'day','context_timestamp']]
        df_group =  df_select.groupby(['user_id', col,'day'])
        group_max = df_group['context_timestamp'].transform('max')
        group_min = df_group['context_timestamp'].transform('min')
        final_feat['diff_maxtime_{}'.format(col)] = group_max - df['context_timestamp'].values
        final_feat['diff_mintime_{}'.format(col)] = df['context_timestamp'].values -group_min
    
    train_feat = final_feat[(final_feat['day'] >= 18) & (final_feat['day'] <= 24)].drop('day', axis=1)       
    test_feat = final_feat[final_feat['day']==25].drop('day', axis=1)   

    dump_pickle(train_feat, path='../data/train_feature/103_diff_max_min.pkl')
    dump_pickle(test_feat, path='../data/test_feature/103_diff_max_min.pkl')


    # ================================= 当前日期前一天的cnt ===========================================
    count_features = ['user_id', 'item_id', 'shop_id']
    final_feat = df[count_features+['instance_id', 'day']]
    for col in count_features:
        count_name = '{}_lastday_count'.format(col)
        count_all = None
        for d in range(18, 24):
            col_cnt = df[df['day'] == d - 1].groupby(by=col)['instance_id'].count().reset_index()
            col_cnt.columns = [col, count_name]
            col_cnt['day'] = d
            count_all = pd.concat([count_all, col_cnt], axis=0)
        final_feat = pd.merge(final_feat, count_all, on=[col, 'day'], how='left')
#     final_feat = final_feat.drop(count_features+['day'], axis=1)
    
    train_feat = final_feat[(final_feat['day'] >= 18) & (final_feat['day'] <= 24)]  
    test_feat = final_feat[final_feat['day']==25]
    train_feat = train_feat.drop(count_features+['day'], axis=1)
    test_feat = test_feat.drop(count_features+['day'], axis=1)
    
    dump_pickle(train_feat, path='../data/train_feature/103_last_day_count.pkl')
    dump_pickle(test_feat, path='../data/test_feature/103_last_day_count.pkl')


start = time.time()
user_time_count(df)
end = time.time()
print('time elapsed {}'.format(end-start))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 5/5 [00:05<00:00,  1.05s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 3/3 [00:01<00:00,  2.65it/s]


time elapsed 8.361943006515503
