In [2]:
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path, valid_data_path

# Load the pickled train / test frames and stack them row-wise; the combined
# frame is what the per-item statistics further down are computed on.
train = pd.read_pickle('../data/train.pkl')
test = pd.read_pickle('../data/test.pkl')
df = pd.concat((train, test), axis=0)

# Each feature table starts from the row key plus the column used for joins.
feat_train, feat_test = (
    split[['instance_id', 'item_id']].copy() for split in (train, test)
)
def _count_per_group(frame, group_col, count_col, out_col):
    """Count rows (or distinct ``count_col`` values) of ``frame`` per ``group_col``."""
    grouped = frame.groupby(group_col)
    if count_col is None:
        counts = grouped.size()
    else:
        counts = grouped[count_col].nunique()
    counts = counts.reset_index()
    counts.columns = [group_col, out_col]
    return counts


def slide_count(group_col, count_col=None, days=range(19, 26)):
    """Build per-day historical counts and trade ratios for ``group_col``.

    For every ``day`` in ``days`` only the rows of the module-level ``train``
    frame with ``train.day < day`` are aggregated, so the statistics attached
    to a row never include that row's own day or any later day.

    Parameters
    ----------
    group_col : str
        Column of ``train`` / ``test`` to group by.
    count_col : str, optional
        When ``None`` the rows of each group are counted; otherwise the number
        of distinct ``count_col`` values in each group is counted.
    days : iterable of int, optional
        Days to compute statistics for. Defaults to ``range(19, 26)``.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``train`` and ``test`` reduced to ``[group_col, 'instance_id', 'day']``
        and left-joined on ``[group_col, 'day']`` with three columns:
        ``proportion_trade_<count_col>_shop``, ``num_<count_col>_shop`` and
        ``num_trade_<count_col>_shop``. Rows whose (group, day) pair has no
        statistics hold NaN in those columns.

    Notes
    -----
    ``count_col`` is formatted into the column names verbatim, so the default
    produces names such as ``num_None_shop``.
    """
    col_I = 'num_{}_shop'.format(count_col)
    col_C = 'num_trade_{}_shop'.format(count_col)
    col_smooth_rate = 'proportion_trade_{}_shop'.format(count_col)

    daily_stats = []
    for day in tqdm(days):
        # Filter once per day; the trade subset is taken from this frame so
        # the boolean mask is always aligned with the frame it indexes.
        history = train[train.day < day]
        totals = _count_per_group(history, group_col, count_col, col_I)
        trades = _count_per_group(
            history[history.is_trade == 1], group_col, count_col, col_C)

        # Groups without any trade are absent from ``trades``; they count as 0.
        stats = pd.merge(totals, trades, on=group_col, how='left')
        stats[col_C] = stats[col_C].fillna(0)
        stats['day'] = day
        stats[col_smooth_rate] = stats[col_C] / stats[col_I]
        daily_stats.append(stats)

    CVR_all = pd.concat(daily_stats, axis=0)

    key_cols = [group_col, 'instance_id', 'day']
    stat_cols = [group_col, 'day', col_smooth_rate, col_I, col_C]
    smooth_train = pd.merge(train[key_cols], CVR_all[stat_cols],
                            on=[group_col, 'day'], how='left')
    smooth_test = pd.merge(test[key_cols], CVR_all[stat_cols],
                           on=[group_col, 'day'], how='left')
    return smooth_train, smooth_test


def get_stats_target(df, group_column, target_column, drop_raw_col=False):
    """Return mean / median / max / min of ``target_column`` per ``group_column``.

    Parameters
    ----------
    df : DataFrame
        Input frame; it is only read, never modified.
    group_column : str
        Name of the column to group by.
    target_column : str
        Name of the column to aggregate.
    drop_raw_col : bool, optional
        Accepted for backward compatibility only. It never influenced the
        returned frame and has no effect.

    Returns
    -------
    DataFrame
        One row per distinct ``group_column`` value with the columns
        ``group_column``, ``_<target>_groupby_<group>_mean``,
        ``_<target>_groupby_<group>_median``, ``_<target>_groupby_<group>__max``
        and ``_<target>_groupby_<group>__min``.
    """
    the_stats = (
        df.groupby(group_column)[target_column]
        .agg(['mean', 'median', 'max', 'min'])
        .reset_index()
    )
    prefix = '_%s_groupby_%s' % (target_column, group_column)
    # The double underscore before max/min is part of the established
    # feature names and is kept as is.
    the_stats.columns = [
        group_column,
        prefix + '_mean',
        prefix + '_median',
        prefix + '__max',
        prefix + '__min',
    ]
    return the_stats
    

# 1. Per-item history (feature names carry a "shop" suffix):
#    row count, traded-row count and their ratio over earlier days.
f1_train, f1_test = slide_count('item_id')
f1_train.drop(['day', 'item_id'], axis=1, inplace=True)
f1_test.drop(['day', 'item_id'], axis=1, inplace=True)

# 2. Per-item user history:
#    distinct users, distinct users with a trade, and their ratio.
f2_train, f2_test = slide_count('item_id', 'user_id')
f2_train.drop(['day', 'item_id'], axis=1, inplace=True)
f2_test.drop(['day', 'item_id'], axis=1, inplace=True)

# user_star_level statistics per item (mean / median / max / min).
df_tmp = df[['instance_id', 'item_id', 'user_star_level']]
k21 = get_stats_target(df_tmp, 'item_id', 'user_star_level')

# user_age_level statistics per item (mean / median / max / min).
df_tmp = df[['instance_id', 'item_id', 'user_age_level']]
k22 = get_stats_target(df_tmp, 'item_id', 'user_age_level')

# Per-item medians of user attributes.
k23 = df.groupby('item_id')['user_gender_id'].median().reset_index()
# The name used to be garbled by a stray paste
# ('median_user_gget_stats_targetender_shop'); this is the intended name.
k23.columns = ['item_id', 'median_user_gender_shop']
# NOTE(review): holds the same values as k22's median column; kept so the
# set of emitted features stays the same.
k24 = df.groupby('item_id')['user_age_level'].median().reset_index()
k24.columns = ['item_id', 'median_user_age_shop']
k25 = df.groupby('item_id')['user_occupation_id'].median().reset_index()
k25.columns = ['item_id', 'median_user_occupation_shop']


# Item-level statistics are joined on item_id.
merge_list1 = [k21, k22, k23, k24, k25]
for feat in merge_list1:
    feat_train = pd.merge(feat_train, feat, on='item_id', how='left')
    feat_test = pd.merge(feat_test, feat, on='item_id', how='left')

# Row-level sliding-window features are joined on instance_id.
# NOTE(review): assumes instance_id is unique within each split; duplicated
# ids would multiply rows in these merges -- confirm against the data.
merge_list2 = [f1_train, f2_train]
for feat in merge_list2:
    feat_train = pd.merge(feat_train, feat, on='instance_id', how='left')

merge_list3 = [f1_test, f2_test]
for feat in merge_list3:
    feat_test = pd.merge(feat_test, feat, on='instance_id', how='left')

# -1 marks "no statistics available" (e.g. no earlier day to aggregate).
feat_train = feat_train.fillna(-1)
feat_test = feat_test.fillna(-1)

# item_id was only needed as a join key.
feat_train = feat_train.drop('item_id', axis=1)
feat_test = feat_test.drop('item_id', axis=1)

dump_pickle(feat_train, raw_data_path + 'train_feature/' + '201_user_based_statistics.pkl')
dump_pickle(feat_test, raw_data_path + 'test_feature/' + '201_user_based_statistics.pkl')





100%|██████████| 7/7 [00:01<00:00,  6.00it/s]
100%|██████████| 7/7 [00:01<00:00,  4.63it/s]
