In [42]:
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path

# Load the pickled frames used throughout the notebook. The paths are relative,
# so the notebook has to be run from its own directory.
# NOTE(review): `df` is presumably train and test combined (it is used below for
# per-shop statistics that are merged into both) — confirm against the cell
# that writes df.pkl.
train = pd.read_pickle('../data/train.pkl')
test = pd.read_pickle('../data/test.pkl')
df = pd.read_pickle('../data/df.pkl')


In [115]:
"""
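Shop features:
Number of times the shop was viewed
Number of times the shop was purchased from
Shop purchase count / shop view count
Number of distinct users who viewed the shop
Number of distinct users who purchased from the shop
Number of distinct items in the shop
Number of distinct items purchased in the shop
Average item price level of the shop's items
Average item sales level of the shop's items
Average item collected count of the shop's items
Purchase rate of shops grouped by review-count level
Purchase rate of shops grouped by star level
Shop review-count level
Shop star level id
Shop positive-review rate
Shop service-attitude score
Shop delivery-service score
Shop description-match score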
"""
# Start both feature tables from the two key columns only: `instance_id`
# identifies the row and `shop_id` is the join key for the per-shop features.
feat_train = train.loc[:, ['instance_id', 'shop_id']].copy()
feat_test = test.loc[:, ['instance_id', 'shop_id']].copy()
def slide_count(group_col, count_col=None, days=range(19, 26)):
    """Build sliding-window count / trade-rate features keyed by ``group_col``.

    For every ``day`` in ``days`` the statistics are computed only from the
    rows of the module-level ``train`` frame with ``train.day < day``; they are
    then attached to the ``train`` and ``test`` rows whose ``day`` equals that
    value. A row therefore never sees statistics from its own day or later.

    Parameters
    ----------
    group_col : str
        Column to group by (e.g. ``'shop_id'``).
    count_col : str or None, default None
        If ``None`` the statistics are plain row counts per group; otherwise
        they are the number of distinct ``count_col`` values per group.
    days : iterable of int, default ``range(19, 26)``
        Days for which to compute the statistics. The default reproduces the
        window that used to be hard-coded.

    Returns
    -------
    (smooth_train, smooth_test) : tuple of pandas.DataFrame
        One row per row of ``train`` / ``test`` (same order), with columns
        ``group_col``, ``instance_id``, ``day``,
        ``proportion_trade_<count_col>_shop``, ``num_<count_col>_shop`` and
        ``num_trade_<count_col>_shop``. The names embed ``str(count_col)``, so
        they contain the literal ``None`` when ``count_col`` is omitted; this is
        kept as-is because saved feature files use these names. Rows whose
        ``(group_col, day)`` pair has no history hold NaN in the three
        statistic columns.

    Raises
    ------
    ValueError
        If ``days`` is empty.
    """
    col_I = 'num_{}_shop'.format(count_col)
    col_C = 'num_trade_{}_shop'.format(count_col)
    col_smooth_rate = 'proportion_trade_{}_shop'.format(count_col)

    def _aggregate(frame, out_col):
        # Row count, or distinct-value count of `count_col`, per group.
        grouped = frame.groupby(group_col)
        if count_col is None:
            counts = grouped.size()
        else:
            counts = grouped[count_col].nunique()
        return counts.reset_index(name=out_col)

    # De-duplicate while keeping order: a repeated day would duplicate rows in
    # the final left merges.
    days = list(dict.fromkeys(days))
    if not days:
        raise ValueError('days must contain at least one day')

    daily_stats = []
    for day in tqdm(days):
        # Only strictly earlier days are visible to rows of `day`.
        history = train[train.day < day]
        # Build the trade mask on the filtered frame so mask and frame share
        # the same index (no implicit reindexing of the boolean key).
        traded = history[history.is_trade == 1]

        stats = pd.merge(
            _aggregate(history, col_I),
            _aggregate(traded, col_C),
            on=group_col,
            how='left',
        )
        # Groups that were seen but never traded have no row in the trade
        # aggregate; their trade count is 0, not missing.
        stats[col_C] = stats[col_C].fillna(0)
        stats['day'] = day
        stats[col_smooth_rate] = stats[col_C] / stats[col_I]
        daily_stats.append(stats)

    # Concatenate once instead of growing a frame inside the loop.
    CVR_all = pd.concat(daily_stats, axis=0)[
        [group_col, 'day', col_smooth_rate, col_I, col_C]
    ]

    key_cols = [group_col, 'instance_id', 'day']
    smooth_train = pd.merge(train[key_cols], CVR_all, on=[group_col, 'day'], how='left')
    smooth_test = pd.merge(test[key_cols], CVR_all, on=[group_col, 'day'], how='left')
    return smooth_train, smooth_test


def get_stats_target(df, group_column, target_column, drop_raw_col=False):
    """Return per-group mean / median / max / min of ``target_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``group_column`` and ``target_column``. It is only
        read, never modified.
    group_column : str
        Column to group by.
    target_column : str
        Column whose statistics are computed.
    drop_raw_col : bool, default False
        Kept only for backward compatibility. It never affected the returned
        frame (it used to drop a column from a discarded internal copy), so it
        is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_column`` value, with columns
        ``group_column``, ``_<target>_groupby_<group>_mean``,
        ``_<target>_groupby_<group>_median``, ``_<target>_groupby_<group>__max``
        and ``_<target>_groupby_<group>__min``. The double underscore before
        ``max`` / ``min`` is intentional: saved feature files use these names.
    """
    # No defensive copy: groupby/agg does not modify its input, and copying the
    # whole frame only cost time and memory.
    the_stats = (
        df.groupby(group_column)[target_column]
        .agg(['mean', 'median', 'max', 'min'])
        .reset_index()
    )
    the_stats.columns = [
        group_column,
        '_%s_groupby_%s_mean' % (target_column, group_column),
        '_%s_groupby_%s_median' % (target_column, group_column),
        '_%s_groupby_%s__max' % (target_column, group_column),
        '_%s_groupby_%s__min' % (target_column, group_column),
    ]
    return the_stats
    

# 1. Shop impressions
# Sliding-window impression count, trade count and trade rate per shop.
f1_train, f1_test = slide_count('shop_id')
f1_train = f1_train.drop(['day', 'shop_id'], axis=1)
f1_test = f1_test.drop(['day', 'shop_id'], axis=1)

# 2. Shop users
# Sliding-window distinct-user count, distinct trading-user count and their ratio.
f2_train, f2_test = slide_count('shop_id', 'user_id')
f2_train = f2_train.drop(['day', 'shop_id'], axis=1)
f2_test = f2_test.drop(['day', 'shop_id'], axis=1)

# Per-shop statistics (mean / median / max / min) of the visiting users' levels.
# Only the two columns needed are passed, so the helper never touches the rest of `df`.
k21 = get_stats_target(df[['shop_id', 'user_star_level']], 'shop_id', 'user_star_level')
k22 = get_stats_target(df[['shop_id', 'user_age_level']], 'shop_id', 'user_age_level')

# Per-shop medians of user attributes.
# NOTE(review): `median_user_age_shop` (k24) repeats the median column already
# produced by k22; it is kept so the saved feature schema does not change.
k23 = df.groupby('shop_id')['user_gender_id'].median().reset_index(name='median_user_gender_shop')
k24 = df.groupby('shop_id')['user_age_level'].median().reset_index(name='median_user_age_shop')
k25 = df.groupby('shop_id')['user_occupation_id'].median().reset_index(name='median_user_occupation_shop')


# 3. Shop items
# Sliding-window distinct-item count, distinct traded-item count and their ratio.
f3_train, f3_test = slide_count('shop_id', 'item_id')
f3_train = f3_train.drop(['day', 'shop_id'], axis=1)
f3_test = f3_test.drop(['day', 'shop_id'], axis=1)

# Per-shop statistics of the item level columns.
k31 = get_stats_target(df[['shop_id', 'item_price_level']], 'shop_id', 'item_price_level')
k32 = get_stats_target(df[['shop_id', 'item_sales_level']], 'shop_id', 'item_sales_level')
k33 = get_stats_target(df[['shop_id', 'item_collected_level']], 'shop_id', 'item_collected_level')
k34 = get_stats_target(df[['shop_id', 'item_pv_level']], 'shop_id', 'item_pv_level')

# Per-shop medians of item attributes.
# NOTE(review): these are medians of identifier columns; confirm they are
# meaningful features and not leftovers.
k35 = df.groupby('shop_id')['item_id'].median().reset_index(name='median_item_id_shop')
k36 = df.groupby('shop_id')['item_brand_id'].median().reset_index(name='median_item_brand_shop')
k37 = df.groupby('shop_id')['item_city_id'].median().reset_index(name='median_item_city_shop')


# Static per-shop features: one row per shop, joined on shop_id.
merge_list1 = [k21, k22, k23, k24, k25, k31, k32, k33, k34, k35, k36, k37]
for feat in merge_list1:
    feat_train = pd.merge(feat_train, feat, on='shop_id', how='left')
    feat_test = pd.merge(feat_test, feat, on='shop_id', how='left')

# Sliding-window features: one row per instance, joined on instance_id.
# The shop-item features (f3) were computed above but previously never merged,
# so they were silently missing from the saved feature files.
merge_list2 = [f1_train, f2_train, f3_train]
for feat in merge_list2:
    feat_train = pd.merge(feat_train, feat, on='instance_id', how='left')

merge_list3 = [f1_test, f2_test, f3_test]
for feat in merge_list3:
    feat_test = pd.merge(feat_test, feat, on='instance_id', how='left')

# -1 marks "no history available" (e.g. rows before the first window day, or
# shops not seen in earlier days).
feat_train = feat_train.fillna(-1)
feat_test = feat_test.fillna(-1)

# shop_id was only needed as a join key.
feat_train = feat_train.drop('shop_id', axis=1)
feat_test = feat_test.drop('shop_id', axis=1)

# NOTE(review): these are shop features but the file is named
# '201_user_based_statistics.pkl' — confirm this does not clash with or
# overwrite a user-feature file. The name is left unchanged because
# downstream loaders may depend on it.
dump_pickle(feat_train, raw_data_path + 'train_feature/' + '201_user_based_statistics.pkl')
dump_pickle(feat_test, raw_data_path + 'test_feature/' + '201_user_based_statistics.pkl')

# 4. 

# l1 = np.intersect1d(train[train.day==18].shop_id, train[train.day==19].shop_id)
# float(len([x for x in train[train.day==18].shop_id if x in l1]))/train[train.day==18].shape[0]


        
# shop_df = train.copy()

# # 1.1 店铺 被浏览 总数
# f_11 = shop_df['shop_id'].value_counts().reset_index()
# f_11.columns = ['shop_id', 'visit_count_shop']

# # 1.2 店铺 被购买 总数
# f_12 = shop_df[shop_df.is_trade==1]['shop_id'].value_counts().reset_index()
# f_12.columns = ['shop_id', 'trade_count_shop']
# f_12 = pd.merge(f_11, f_12, on='shop_id', how='left')
# f_12.fillna(0, inplace=True)
# f_12.drop(['visit_count_shop'], inplace=True, axis=1)

# # 1.3 店铺 被购买占被浏览的比例
# f_13 = pd.merge(f_11, f_12, on='shop_id', how='left')
# f_13['proportion_trade_visit'] = f_13['trade_count_shop']/f_13['visit_count_shop']
# f_13.drop(['visit_count_shop', 'trade_count_shop'], inplace=True, axis=1)


# # 2.1 店铺 浏览的 不同用户数
# f_21 = shop_df.groupby('shop_id')['user_id'].nunique().reset_index()
# f_21.columns = ['shop_id', 'num_users_shop']

# # 2.2 店铺 消费的 不同用户数
# f_22 = shop_df[shop_df.is_trade==1].groupby('shop_id')['user_id'].nunique().reset_index()
# f_22.columns =  ['shop_id', 'num_trade_users_shop']
# f_22 = pd.merge(f_21, f_22, on='shop_id', how='left')
# f_22.fillna(0, inplace=True)
# f_22.drop(['num_users_shop'], inplace=True, axis=1)

# # 2.3 店铺 消费用户占用户的比例(用户数要大于10)
# f_23 = pd.merge(f_21, f_22, on='shop_id', how='left')
# f_23['proportion_trade_users'] = f_23['num_trade_users_shop']/f_23['num_users_shop']
# f_23.drop(['num_trade_users_shop', 'num_users_shop'], inplace=True, axis=1)


# # 3.1 店铺 商品 总数
# f_31 = shop_df.groupby('shop_id')['item_id'].nunique().reset_index()
# f_31.columns =  ['shop_id', 'num_items_shop']

# # 3.2 店铺 被购买过的商品 总数
# f_32 = shop_df[shop_df.is_trade==1].groupby('shop_id')['item_id'].nunique().reset_index()
# f_32.columns =  ['shop_id', 'num_trade_items_shop']
# f_32 = pd.merge(f_31, f_32, on='shop_id', how='left')
# f_32.fillna(0, inplace=True)
# f_32.drop(['num_items_shop'], inplace=True, axis=1)

# # 3.3 店铺  被购买过的商品占商品总数 的比例
# f_33 = pd.merge(f_31, f_32, on='shop_id', how='left')
# f_33['proportion_trade_items'] = f_33['num_trade_items_shop']/f_33['num_items_shop']
# f_33.drop(['num_items_shop', 'num_trade_items_shop'], inplace=True, axis=1)




100%|██████████| 7/7 [00:01<00:00,  6.34it/s]
100%|██████████| 7/7 [00:01<00:00,  4.07it/s]
100%|██████████| 7/7 [00:01<00:00,  5.08it/s]


In [113]:
# Count missing values per feature column.
# NOTE(review): the recorded output still lists `shop_id`, which the feature
# cell drops, and the execution count is out of order — re-run after the
# feature cell to refresh it.
feat_train.isna().sum()

instance_id                                         0
shop_id                                             0
_user_star_level_groupby_shop_id_mean               0
_user_star_level_groupby_shop_id_median             0
_user_star_level_groupby_shop_id__max               0
_user_star_level_groupby_shop_id__min               0
_user_age_level_groupby_shop_id_mean                0
_user_age_level_groupby_shop_id_median              0
_user_age_level_groupby_shop_id__max                0
_user_age_level_groupby_shop_id__min                0
median_user_gender_shop                             0
median_user_age_shop                                0
median_user_occupation_shop                         0
_item_price_level_groupby_shop_id_mean              0
_item_price_level_groupby_shop_id_median            0
_item_price_level_groupby_shop_id__max              0
_item_price_level_groupby_shop_id__min              0
_item_sales_level_groupby_shop_id_mean              0
_item_sales_level_groupby_sh

In [67]:
# Quick look at the raw user_star_level column of the training frame.
train['user_star_level']

0         3
1         6
2         4
3         6
4         1
5         2
6         0
7         2
8         7
9         6
10        3
11        7
12        5
13        6
14        2
15        6
16        7
17        7
18        7
19        3
20        8
21        7
22        6
23        3
24        7
25        8
26        6
27        6
28        8
29        3
         ..
478057    2
478058    3
478059    6
478060    6
478061    7
478062    7
478063    4
478064    5
478065    6
478066    6
478067    6
478068    3
478069    8
478070    3
478071    6
478072    7
478073    8
478074    7
478075    6
478076    6
478077    9
478078    6
478079    8
478080    3
478081    7
478082    5
478083    2
478084    3
478085    3
478086    5
Name: user_star_level, Length: 478087, dtype: int64