In [1]:
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path

# Locations of the pickled input frames, relative to the notebook directory.
TRAIN_PKL = '../data/train.pkl'
TEST_PKL = '../data/test.pkl'
DF_PKL = '../data/df.pkl'

# Load the three frames used by the feature cells below.
# NOTE(review): `df` is presumably the combined train+test frame -- confirm.
train = pd.read_pickle(TRAIN_PKL)
test = pd.read_pickle(TEST_PKL)
df = pd.read_pickle(DF_PKL)


In [53]:
"""
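Shop features:
number of times the shop was viewed
number of times the shop was purchased from
shop purchase count / shop view count
number of distinct users who viewed the shop
number of distinct users who purchased from the shop
number of distinct items in the shop
number of distinct purchased items in the shop
average item price level of the shop's items
average item sales level of the shop's items
average item collected count of the shop's items
purchase rate of shops grouped by review-count level
purchase rate of shops grouped by star level
shop review-count level
shop star level
shop positive-review rate
shop service-attitude score
shop delivery-service score
shop description-match score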
"""
def _as_shop_frame(per_shop, value_col):
    """Convert a Series indexed by shop_id into a frame ['shop_id', value_col].

    Labels are assigned positionally, so the result does not depend on
    whatever column names ``reset_index()`` generates.
    """
    frame = per_shop.reset_index()
    frame.columns = ['shop_id', value_col]
    return frame


def _trade_counts_and_share(total, traded, total_col, traded_col, share_col):
    """Align per-shop "traded" counts with per-shop totals and derive the share.

    Parameters
    ----------
    total : DataFrame with columns ['shop_id', total_col], one row per shop.
    traded : DataFrame with columns ['shop_id', traded_col]; shops without any
        trade may be absent.
    total_col, traded_col, share_col : column names to read / create.

    Returns
    -------
    (counts, share)
        counts : ['shop_id', traded_col], same rows and order as ``total``,
                 with 0 for shops that never appear in ``traded``.
        share  : ['shop_id', share_col] where share = traded_col / total_col.
    """
    # Left join keeps every shop of `total`; shops with no trade get NaN -> 0.
    merged = pd.merge(total, traded, on='shop_id', how='left').fillna(0)
    counts = merged.drop([total_col], axis=1)
    share = merged[['shop_id']].copy()
    share[share_col] = merged[traded_col] / merged[total_col]
    return counts, share


# Work on a copy so that `train` itself is never modified by this cell.
shop_df = train.copy()
# Rows that ended in a purchase; shared by every "traded" statistic below.
shop_trades = shop_df[shop_df.is_trade == 1]

# 1.1 Number of rows (views) per shop, 1.2 number of purchase rows per shop,
# 1.3 purchases / views.
# value_counts() orders shops by descending count, so f_11..f_13 share that order.
f_11 = _as_shop_frame(shop_df['shop_id'].value_counts(), 'visit_count_shop')
f_12, f_13 = _trade_counts_and_share(
    f_11,
    _as_shop_frame(shop_trades['shop_id'].value_counts(), 'trade_count_shop'),
    'visit_count_shop', 'trade_count_shop', 'proportion_trade_visit',
)

# 2.1 Distinct users per shop, 2.2 distinct purchasing users per shop,
# 2.3 purchasing users / users.
# NOTE(review): the original note on 2.3 says the user count should be greater
# than 10, but no such filter is applied here -- confirm whether it is needed.
f_21 = _as_shop_frame(shop_df.groupby('shop_id')['user_id'].nunique(), 'num_users_shop')
f_22, f_23 = _trade_counts_and_share(
    f_21,
    _as_shop_frame(shop_trades.groupby('shop_id')['user_id'].nunique(), 'num_trade_users_shop'),
    'num_users_shop', 'num_trade_users_shop', 'proportion_trade_users',
)

# 3.1 Distinct items per shop, 3.2 distinct purchased items per shop,
# 3.3 purchased items / items.
f_31 = _as_shop_frame(shop_df.groupby('shop_id')['item_id'].nunique(), 'num_items_shop')
f_32, f_33 = _trade_counts_and_share(
    f_31,
    _as_shop_frame(shop_trades.groupby('shop_id')['item_id'].nunique(), 'num_trade_items_shop'),
    'num_items_shop', 'num_trade_items_shop', 'proportion_trade_items',
)

# 3.4 / 3.5 Mean and median of item_price_level per shop.
# Both are taken over all rows of the shop, not over its distinct items, so an
# item that appears in many rows weighs more.
price_level_by_shop = shop_df.groupby('shop_id')['item_price_level']
f_34 = _as_shop_frame(price_level_by_shop.mean(), 'item_average_shop')
f_35 = _as_shop_frame(price_level_by_shop.median(), 'item_median_shop')




In [54]:
# Display the per-shop mean of item_price_level (columns: shop_id, item_average_shop).
f_34


Unnamed: 0,shop_id,item_average_shop
0,1543559655939246,7.400000
1,6515295840140210,5.044444
2,9230153951388399,8.433333
3,10653466129289268,7.000000
4,12057125006548226,8.000000
5,13902855225429796,7.939597
6,17074386468227768,7.000000
7,17705093901583386,5.000000
8,18720162083472464,8.000000
9,20482721174257713,6.194271


In [49]:
# Display the per-shop share of items that were purchased at least once.
# The frame holding the `proportion_trade_items` column is `f_33`; the bare
# name `proportion_trade_items` is not defined by any cell, so it would raise
# NameError on a fresh kernel.
f_33

Unnamed: 0,shop_id,proportion_trade_items
0,1543559655939246,0.000000
1,6515295840140210,0.666667
2,9230153951388399,0.333333
3,10653466129289268,0.000000
4,12057125006548226,1.000000
5,13902855225429796,0.200000
6,17074386468227768,0.000000
7,17705093901583386,1.000000
8,18720162083472464,1.000000
9,20482721174257713,0.500000
