# Part 2. Feature Engineering

## 2.3 Double 11 Feature

In [2]:
#import packages
import numpy as np
import pandas as pd

In [3]:
user_data = pd.read_csv('cleaned_raw/user_data.csv')

In [4]:
def Categorize(n):
    if n == 1111:
        return 2
    elif n >= 1104:
        return 1
    else:
        return 0

In [6]:
user_data['sort_time'] = user_data.time_stamp.map(Categorize)

In [7]:
user_data.tail()

Unnamed: 0.1,Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,sort_time
54925325,54925325,208016,107662,898,1346,7995.0,1110,0,1
54925326,54925326,208016,1058313,898,1346,7995.0,1110,0,1
54925327,54925327,208016,449814,898,983,7995.0,1110,0,1
54925328,54925328,208016,634856,898,1346,7995.0,1110,0,1
54925329,54925329,208016,272094,898,1346,7995.0,1111,0,2


In [8]:
user_data['user_id'].unique().size

424170

In [9]:
user_data['seller_id'].unique().size

4995

### A. User & Merchant action count at Double 11，one week before Double 11, and before double 11

**User**

In [12]:
user_actions = user_data[['user_id', 'action_type', 'sort_time']]
user_actions.head()

Unnamed: 0,user_id,action_type,sort_time
0,328862,0,0
1,328862,0,0
2,328862,0,0
3,328862,0,0
4,328862,0,0


In [20]:
#annotated Double 11 day as event, one week before Double 11 day as pre, before Double 11 day as before.
user_event = user_actions[user_actions['sort_time'] == 2]
print(user_event.shape)

(10582633, 3)


In [21]:
user_event.head()

Unnamed: 0,user_id,action_type,sort_time
171,328862,0,2
172,328862,0,2
173,328862,0,2
174,328862,2,2
175,328862,0,2


In [22]:
user_event_dummies = pd.get_dummies(user_event['action_type'], prefix='action')
user_event = pd.concat([user_event.user_id, user_event_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

In [23]:
user_event.rename(columns={"action_0":"click", "action_1":"cart", "action_2":"buy", "action_3":"fav"},inplace=True)
user_event['ueventTotalAction'] = user_event.apply(lambda x: x["click"] + x["cart"] + x["buy"] + x["fav"], axis=1)

In [24]:
user_event.shape

(424170, 6)

In [25]:
user_event.head()

Unnamed: 0,user_id,click,cart,buy,fav,ueventTotalAction
0,1,13.0,0.0,4.0,0.0,17.0
1,2,0.0,0.0,7.0,1.0,8.0
2,3,7.0,0.0,1.0,0.0,8.0
3,4,7.0,0.0,1.0,0.0,8.0
4,5,28.0,0.0,3.0,1.0,32.0


In [26]:
ls = ['click', 'cart', 'buy', 'fav']
for i in ls:
    col_name = i+'_ratio'
    user_event[col_name] = user_event[i] / user_event['ueventTotalAction']
    
user_event.head()

Unnamed: 0,user_id,click,cart,buy,fav,ueventTotalAction,click_ratio,cart_ratio,buy_ratio,fav_ratio
0,1,13.0,0.0,4.0,0.0,17.0,0.764706,0.0,0.235294,0.0
1,2,0.0,0.0,7.0,1.0,8.0,0.0,0.0,0.875,0.125
2,3,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0
3,4,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0
4,5,28.0,0.0,3.0,1.0,32.0,0.875,0.0,0.09375,0.03125


In [27]:
# Because some users do not buy things during this particular tim interval,we have to add them and fill 0 for future merge.
all_userid = pd.DataFrame({'user_id':range(1, 424171 ,1)})
user_event = pd.merge(all_userid,user_event,how='left',on=['user_id']).fillna(0)
user_event.head()

Unnamed: 0,user_id,click,cart,buy,fav,ueventTotalAction,click_ratio,cart_ratio,buy_ratio,fav_ratio
0,1,13.0,0.0,4.0,0.0,17.0,0.764706,0.0,0.235294,0.0
1,2,0.0,0.0,7.0,1.0,8.0,0.0,0.0,0.875,0.125
2,3,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0
3,4,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0
4,5,28.0,0.0,3.0,1.0,32.0,0.875,0.0,0.09375,0.03125


In [28]:
user_event.shape

(424170, 10)

In [29]:
user_event.isnull().sum()

user_id              0
click                0
cart                 0
buy                  0
fav                  0
ueventTotalAction    0
click_ratio          0
cart_ratio           0
buy_ratio            0
fav_ratio            0
dtype: int64

In [30]:
user_event.rename(columns={'click': "click_11", 'cart':"cart_11",
                                   "buy":"buy_11", "fav":"fav_11", 
                                   "ueventTotalAction":"ueventTotalAction_11", 
                                   "click_ratio":"click_ratio_11", "cart_ratio":"cart_ratio_11", 
                                   "buy_ratio":"buy_ratio_11",
                                   "fav_ratio":"fav_ratio_11"},inplace=True)
user_event.head()

Unnamed: 0,user_id,click_11,cart_11,buy_11,fav_11,ueventTotalAction_11,click_ratio_11,cart_ratio_11,buy_ratio_11,fav_ratio_11
0,1,13.0,0.0,4.0,0.0,17.0,0.764706,0.0,0.235294,0.0
1,2,0.0,0.0,7.0,1.0,8.0,0.0,0.0,0.875,0.125
2,3,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0
3,4,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0
4,5,28.0,0.0,3.0,1.0,32.0,0.875,0.0,0.09375,0.03125


In [32]:
user_pre = user_actions[user_actions['sort_time'] == 1]
print(user_pre.shape)

(8254892, 3)


In [33]:
user_pre.head()

Unnamed: 0,user_id,action_type,sort_time
427,234512,0,1
460,234512,2,1
461,234512,0,1
462,234512,0,1
463,234512,0,1


In [34]:
user_pre_dummies = pd.get_dummies(user_pre['action_type'], prefix='action')
user_pre = pd.concat([user_pre.user_id, user_pre_dummies], axis=1).groupby(['user_id'], as_index=False).sum()
user_pre.head()

Unnamed: 0,user_id,action_0,action_1,action_2,action_3
0,2,0.0,0.0,0.0,1.0
1,3,23.0,0.0,0.0,1.0
2,4,2.0,0.0,0.0,0.0
3,6,27.0,0.0,1.0,1.0
4,7,0.0,0.0,1.0,0.0


In [35]:
user_pre.rename(columns={"action_0":"click_pre", "action_1":"cart_pre", "action_2":"buy_pre", "action_3":"fav_pre"},inplace=True)
user_pre['ueventTotalAction_pre'] = user_pre.apply(lambda x: x["click_pre"] + x["cart_pre"] + x["buy_pre"] + x["fav_pre"], axis=1)
user_pre.shape

(314764, 6)

In [36]:
user_pre.head()

Unnamed: 0,user_id,click_pre,cart_pre,buy_pre,fav_pre,ueventTotalAction_pre
0,2,0.0,0.0,0.0,1.0,1.0
1,3,23.0,0.0,0.0,1.0,24.0
2,4,2.0,0.0,0.0,0.0,2.0
3,6,27.0,0.0,1.0,1.0,29.0
4,7,0.0,0.0,1.0,0.0,1.0


In [37]:
ls = ['click_pre', 'cart_pre', 'buy_pre', 'fav_pre']
for i in ls:
    col_name = i+'_ratio'
    user_pre[col_name] = user_pre[i] / user_pre['ueventTotalAction_pre']
    
user_pre.head()

Unnamed: 0,user_id,click_pre,cart_pre,buy_pre,fav_pre,ueventTotalAction_pre,click_pre_ratio,cart_pre_ratio,buy_pre_ratio,fav_pre_ratio
0,2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,3,23.0,0.0,0.0,1.0,24.0,0.958333,0.0,0.0,0.041667
2,4,2.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
3,6,27.0,0.0,1.0,1.0,29.0,0.931034,0.0,0.034483,0.034483
4,7,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [38]:
user_pre = pd.merge(all_userid,user_pre,how='left',on=['user_id']).fillna(0)
user_pre.shape

(424170, 10)

In [39]:
user_pre.head()

Unnamed: 0,user_id,click_pre,cart_pre,buy_pre,fav_pre,ueventTotalAction_pre,click_pre_ratio,cart_pre_ratio,buy_pre_ratio,fav_pre_ratio
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,3,23.0,0.0,0.0,1.0,24.0,0.958333,0.0,0.0,0.041667
3,4,2.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
user_before = user_actions[user_actions['sort_time'] == 0]
print(user_before.shape)

(36087805, 3)


In [42]:
user_before.head()

Unnamed: 0,user_id,action_type,sort_time
0,328862,0,0
1,328862,0,0
2,328862,0,0
3,328862,0,0
4,328862,0,0


In [43]:
user_before_dummies = pd.get_dummies(user_before['action_type'], prefix='action')
user_before = pd.concat([user_before.user_id, user_before_dummies], axis=1).groupby(['user_id'], as_index=False).sum()
user_before.head()

Unnamed: 0,user_id,action_0,action_1,action_2,action_3
0,1,14.0,0.0,2.0,0.0
1,2,47.0,0.0,7.0,0.0
2,3,33.0,0.0,3.0,0.0
3,4,40.0,0.0,0.0,0.0
4,5,122.0,0.0,10.0,9.0


In [44]:
user_before.shape

(416162, 5)

In [45]:
user_before.rename(columns={"action_0":"click_before", "action_1":"cart_before", "action_2":"buy_before", "action_3":"fav_before"},inplace=True)
user_before['ueventTotalAction_before'] = user_before.apply(lambda x: x["click_before"] + x["cart_before"] + x["buy_before"] + x["fav_before"], axis=1)
user_before.shape

(416162, 6)

In [46]:
user_before.head()

Unnamed: 0,user_id,click_before,cart_before,buy_before,fav_before,ueventTotalAction_before
0,1,14.0,0.0,2.0,0.0,16.0
1,2,47.0,0.0,7.0,0.0,54.0
2,3,33.0,0.0,3.0,0.0,36.0
3,4,40.0,0.0,0.0,0.0,40.0
4,5,122.0,0.0,10.0,9.0,141.0


In [47]:
ls = ['click_before', 'cart_before', 'buy_before', 'fav_before']
for i in ls:
    col_name = i+'_ratio'
    user_before[col_name] = user_before[i] / user_before['ueventTotalAction_before']
    
user_before.head()

Unnamed: 0,user_id,click_before,cart_before,buy_before,fav_before,ueventTotalAction_before,click_before_ratio,cart_before_ratio,buy_before_ratio,fav_before_ratio
0,1,14.0,0.0,2.0,0.0,16.0,0.875,0.0,0.125,0.0
1,2,47.0,0.0,7.0,0.0,54.0,0.87037,0.0,0.12963,0.0
2,3,33.0,0.0,3.0,0.0,36.0,0.916667,0.0,0.083333,0.0
3,4,40.0,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0
4,5,122.0,0.0,10.0,9.0,141.0,0.865248,0.0,0.070922,0.06383


In [48]:
user_before = pd.merge(all_userid,user_before,how='left',on=['user_id']).fillna(0)
user_before.shape

(424170, 10)

In [49]:
user_event.shape, user_pre.shape, user_before.shape

((424170, 10), (424170, 10), (424170, 10))

In [50]:
user_double11 = pd.merge(user_event,user_pre,on=['user_id'])
user_double11.head()

Unnamed: 0,user_id,click_11,cart_11,buy_11,fav_11,ueventTotalAction_11,click_ratio_11,cart_ratio_11,buy_ratio_11,fav_ratio_11,click_pre,cart_pre,buy_pre,fav_pre,ueventTotalAction_pre,click_pre_ratio,cart_pre_ratio,buy_pre_ratio,fav_pre_ratio
0,1,13.0,0.0,4.0,0.0,17.0,0.764706,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,7.0,1.0,8.0,0.0,0.0,0.875,0.125,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,3,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0,23.0,0.0,0.0,1.0,24.0,0.958333,0.0,0.0,0.041667
3,4,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0,2.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
4,5,28.0,0.0,3.0,1.0,32.0,0.875,0.0,0.09375,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
user_double11 = pd.merge(user_double11,user_before,on=['user_id'])
user_double11.head()

Unnamed: 0,user_id,click_11,cart_11,buy_11,fav_11,ueventTotalAction_11,click_ratio_11,cart_ratio_11,buy_ratio_11,fav_ratio_11,...,fav_pre_ratio,click_before,cart_before,buy_before,fav_before,ueventTotalAction_before,click_before_ratio,cart_before_ratio,buy_before_ratio,fav_before_ratio
0,1,13.0,0.0,4.0,0.0,17.0,0.764706,0.0,0.235294,0.0,...,0.0,14.0,0.0,2.0,0.0,16.0,0.875,0.0,0.125,0.0
1,2,0.0,0.0,7.0,1.0,8.0,0.0,0.0,0.875,0.125,...,1.0,47.0,0.0,7.0,0.0,54.0,0.87037,0.0,0.12963,0.0
2,3,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0,...,0.041667,33.0,0.0,3.0,0.0,36.0,0.916667,0.0,0.083333,0.0
3,4,7.0,0.0,1.0,0.0,8.0,0.875,0.0,0.125,0.0,...,0.0,40.0,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0
4,5,28.0,0.0,3.0,1.0,32.0,0.875,0.0,0.09375,0.03125,...,0.0,122.0,0.0,10.0,9.0,141.0,0.865248,0.0,0.070922,0.06383


In [52]:
user_double11.shape

(424170, 28)

In [53]:
user_double11.to_csv('data_output/Double11_user.csv', index=False)

**Merchant**

In [54]:
seller_actions = user_data[['seller_id', 'action_type', 'sort_time']]
seller_actions.head()

Unnamed: 0,seller_id,action_type,sort_time
0,2882,0,0
1,2882,0,0
2,2882,0,0
3,2882,0,0
4,1253,0,0


In [55]:
#annotated Double 11 day as event, one week before Double 11 day as pre, before Double 11 day as before.
seller_event = seller_actions[seller_actions['sort_time'] == 2]
print(seller_event.shape)

(10582633, 3)


In [56]:
seller_event_dummies = pd.get_dummies(seller_event['action_type'], prefix='action')
seller_event = pd.concat([seller_event.seller_id, seller_event_dummies], axis=1).groupby(['seller_id'], as_index=False).sum()
seller_event.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3
0,1,73496.0,122.0,7537.0,816.0
1,2,514.0,1.0,125.0,5.0
2,3,414.0,1.0,33.0,5.0
3,4,104.0,0.0,3.0,3.0
4,5,1133.0,2.0,63.0,34.0


In [57]:
seller_event.rename(columns={"action_0":"click_11", "action_1":"cart_11", "action_2":"buy_11", "action_3":"fav_11"},inplace=True)
seller_event['seventTotalAction_11'] = seller_event.apply(lambda x: x["click_11"] + x["cart_11"] + x["buy_11"] + x["fav_11"], axis=1)
seller_event.shape

(4993, 6)

In [59]:
ls = ['click_11', 'cart_11', 'buy_11', 'fav_11']
for i in ls:
    col_name = i+'_ratio'
    seller_event[col_name] = seller_event[i] / seller_event['seventTotalAction_11']
    
seller_event.head()

Unnamed: 0,seller_id,click_11,cart_11,buy_11,fav_11,seventTotalAction_11,click_11_ratio,cart_11_ratio,buy_11_ratio,fav_11_ratio
0,1,73496.0,122.0,7537.0,816.0,81971.0,0.89661,0.001488,0.091947,0.009955
1,2,514.0,1.0,125.0,5.0,645.0,0.796899,0.00155,0.193798,0.007752
2,3,414.0,1.0,33.0,5.0,453.0,0.913907,0.002208,0.072848,0.011038
3,4,104.0,0.0,3.0,3.0,110.0,0.945455,0.0,0.027273,0.027273
4,5,1133.0,2.0,63.0,34.0,1232.0,0.919643,0.001623,0.051136,0.027597


In [60]:
# Because some sellers do not have any actions done by usrs during this particular time interval
# we have to add them and fill 0 for future merge.
all_sellerid = pd.DataFrame({'seller_id':range(1, 4996 ,1)})
seller_event = pd.merge(all_sellerid,seller_event,how='left',on=['seller_id']).fillna(0)
seller_event.head()

Unnamed: 0,seller_id,click_11,cart_11,buy_11,fav_11,seventTotalAction_11,click_11_ratio,cart_11_ratio,buy_11_ratio,fav_11_ratio
0,1,73496.0,122.0,7537.0,816.0,81971.0,0.89661,0.001488,0.091947,0.009955
1,2,514.0,1.0,125.0,5.0,645.0,0.796899,0.00155,0.193798,0.007752
2,3,414.0,1.0,33.0,5.0,453.0,0.913907,0.002208,0.072848,0.011038
3,4,104.0,0.0,3.0,3.0,110.0,0.945455,0.0,0.027273,0.027273
4,5,1133.0,2.0,63.0,34.0,1232.0,0.919643,0.001623,0.051136,0.027597


In [61]:
seller_event.shape

(4995, 10)

In [62]:
#annotated Double 11 day as event, one week before Double 11 day as pre, before Double 11 day as before.
seller_pre = seller_actions[seller_actions['sort_time'] == 1]
print(seller_pre.shape)

(8254892, 3)


In [63]:
seller_pre_dummies = pd.get_dummies(seller_pre['action_type'], prefix='action')
seller_pre = pd.concat([seller_pre.seller_id, seller_pre_dummies], axis=1).groupby(['seller_id'], as_index=False).sum()
seller_pre.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3
0,1,53067.0,188.0,34.0,2652.0
1,2,370.0,5.0,5.0,22.0
2,3,533.0,2.0,2.0,20.0
3,4,159.0,0.0,18.0,7.0
4,5,1366.0,3.0,2.0,90.0


In [64]:
seller_pre.shape

(4995, 5)

In [65]:
seller_pre.rename(columns={"action_0":"click_pre", "action_1":"cart_pre", "action_2":"buy_pre", "action_3":"fav_pre"},inplace=True)
seller_pre['seventTotalAction_pre'] = seller_pre.apply(lambda x: x["click_pre"] + x["cart_pre"] + x["buy_pre"] + x["fav_pre"], axis=1)
seller_pre.shape

(4995, 6)

In [66]:
seller_pre.head()

Unnamed: 0,seller_id,click_pre,cart_pre,buy_pre,fav_pre,seventTotalAction_pre
0,1,53067.0,188.0,34.0,2652.0,55941.0
1,2,370.0,5.0,5.0,22.0,402.0
2,3,533.0,2.0,2.0,20.0,557.0
3,4,159.0,0.0,18.0,7.0,184.0
4,5,1366.0,3.0,2.0,90.0,1461.0


In [68]:
ls = ['click_pre', 'cart_pre', 'buy_pre', 'fav_pre']
for i in ls:
    col_name = i+'_ratio'
    seller_pre[col_name] = seller_pre[i] / seller_pre['seventTotalAction_pre']
    
seller_pre.head()

Unnamed: 0,seller_id,click_pre,cart_pre,buy_pre,fav_pre,seventTotalAction_pre,click_pre_ratio,cart_pre_ratio,buy_pre_ratio,fav_pre_ratio
0,1,53067.0,188.0,34.0,2652.0,55941.0,0.948624,0.003361,0.000608,0.047407
1,2,370.0,5.0,5.0,22.0,402.0,0.920398,0.012438,0.012438,0.054726
2,3,533.0,2.0,2.0,20.0,557.0,0.956912,0.003591,0.003591,0.035907
3,4,159.0,0.0,18.0,7.0,184.0,0.86413,0.0,0.097826,0.038043
4,5,1366.0,3.0,2.0,90.0,1461.0,0.934976,0.002053,0.001369,0.061602


In [69]:
#each seller in one week before Double 11 have actions
seller_pre.shape

(4995, 10)

In [71]:
#annotated Double 11 day as event, one week before Double 11 day as pre, before Double 11 day as before.
seller_before = seller_actions[seller_actions['sort_time'] == 0]
print(seller_before.shape)

(36087805, 3)


In [72]:
seller_before_dummies = pd.get_dummies(seller_before['action_type'], prefix='action')
seller_before = pd.concat([seller_before.seller_id, seller_before_dummies], axis=1).groupby(['seller_id'], as_index=False).sum()
seller_before.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3
0,1,181673.0,134.0,10134.0,9287.0
1,2,1146.0,2.0,59.0,117.0
2,3,1452.0,1.0,32.0,150.0
3,4,2383.0,2.0,273.0,154.0
4,5,4984.0,4.0,79.0,432.0


In [73]:
seller_before.rename(columns={"action_0":"click_before", "action_1":"cart_before", "action_2":"buy_before", "action_3":"fav_before"},inplace=True)
seller_before['seventTotalAction_before'] = seller_before.apply(lambda x: x["click_before"] + x["cart_before"] + x["buy_before"] + x["fav_before"], axis=1)
seller_before.shape

(4991, 6)

In [74]:
ls = ['click_before', 'cart_before', 'buy_before', 'fav_before']
for i in ls:
    col_name = i+'_ratio'
    seller_before[col_name] = seller_before[i] / seller_before['seventTotalAction_before']
    
seller_before.head()

Unnamed: 0,seller_id,click_before,cart_before,buy_before,fav_before,seventTotalAction_before,click_before_ratio,cart_before_ratio,buy_before_ratio,fav_before_ratio
0,1,181673.0,134.0,10134.0,9287.0,201228.0,0.902822,0.000666,0.050361,0.046152
1,2,1146.0,2.0,59.0,117.0,1324.0,0.865559,0.001511,0.044562,0.088369
2,3,1452.0,1.0,32.0,150.0,1635.0,0.888073,0.000612,0.019572,0.091743
3,4,2383.0,2.0,273.0,154.0,2812.0,0.84744,0.000711,0.097084,0.054765
4,5,4984.0,4.0,79.0,432.0,5499.0,0.906347,0.000727,0.014366,0.07856


In [75]:
seller_before = pd.merge(all_sellerid,seller_before,how='left',on=['seller_id']).fillna(0)
seller_before.head()

Unnamed: 0,seller_id,click_before,cart_before,buy_before,fav_before,seventTotalAction_before,click_before_ratio,cart_before_ratio,buy_before_ratio,fav_before_ratio
0,1,181673.0,134.0,10134.0,9287.0,201228.0,0.902822,0.000666,0.050361,0.046152
1,2,1146.0,2.0,59.0,117.0,1324.0,0.865559,0.001511,0.044562,0.088369
2,3,1452.0,1.0,32.0,150.0,1635.0,0.888073,0.000612,0.019572,0.091743
3,4,2383.0,2.0,273.0,154.0,2812.0,0.84744,0.000711,0.097084,0.054765
4,5,4984.0,4.0,79.0,432.0,5499.0,0.906347,0.000727,0.014366,0.07856


In [76]:
seller_event.shape, seller_pre.shape, seller_before.shape

((4995, 10), (4995, 10), (4995, 10))

In [77]:
seller_double11 = pd.merge(seller_event,seller_pre,on=['seller_id'])
seller_double11.head()

Unnamed: 0,seller_id,click_11,cart_11,buy_11,fav_11,seventTotalAction_11,click_11_ratio,cart_11_ratio,buy_11_ratio,fav_11_ratio,click_pre,cart_pre,buy_pre,fav_pre,seventTotalAction_pre,click_pre_ratio,cart_pre_ratio,buy_pre_ratio,fav_pre_ratio
0,1,73496.0,122.0,7537.0,816.0,81971.0,0.89661,0.001488,0.091947,0.009955,53067.0,188.0,34.0,2652.0,55941.0,0.948624,0.003361,0.000608,0.047407
1,2,514.0,1.0,125.0,5.0,645.0,0.796899,0.00155,0.193798,0.007752,370.0,5.0,5.0,22.0,402.0,0.920398,0.012438,0.012438,0.054726
2,3,414.0,1.0,33.0,5.0,453.0,0.913907,0.002208,0.072848,0.011038,533.0,2.0,2.0,20.0,557.0,0.956912,0.003591,0.003591,0.035907
3,4,104.0,0.0,3.0,3.0,110.0,0.945455,0.0,0.027273,0.027273,159.0,0.0,18.0,7.0,184.0,0.86413,0.0,0.097826,0.038043
4,5,1133.0,2.0,63.0,34.0,1232.0,0.919643,0.001623,0.051136,0.027597,1366.0,3.0,2.0,90.0,1461.0,0.934976,0.002053,0.001369,0.061602


In [78]:
seller_double11 = pd.merge(seller_double11,seller_before,on=['seller_id'])
seller_double11.head()

Unnamed: 0,seller_id,click_11,cart_11,buy_11,fav_11,seventTotalAction_11,click_11_ratio,cart_11_ratio,buy_11_ratio,fav_11_ratio,...,fav_pre_ratio,click_before,cart_before,buy_before,fav_before,seventTotalAction_before,click_before_ratio,cart_before_ratio,buy_before_ratio,fav_before_ratio
0,1,73496.0,122.0,7537.0,816.0,81971.0,0.89661,0.001488,0.091947,0.009955,...,0.047407,181673.0,134.0,10134.0,9287.0,201228.0,0.902822,0.000666,0.050361,0.046152
1,2,514.0,1.0,125.0,5.0,645.0,0.796899,0.00155,0.193798,0.007752,...,0.054726,1146.0,2.0,59.0,117.0,1324.0,0.865559,0.001511,0.044562,0.088369
2,3,414.0,1.0,33.0,5.0,453.0,0.913907,0.002208,0.072848,0.011038,...,0.035907,1452.0,1.0,32.0,150.0,1635.0,0.888073,0.000612,0.019572,0.091743
3,4,104.0,0.0,3.0,3.0,110.0,0.945455,0.0,0.027273,0.027273,...,0.038043,2383.0,2.0,273.0,154.0,2812.0,0.84744,0.000711,0.097084,0.054765
4,5,1133.0,2.0,63.0,34.0,1232.0,0.919643,0.001623,0.051136,0.027597,...,0.061602,4984.0,4.0,79.0,432.0,5499.0,0.906347,0.000727,0.014366,0.07856


In [79]:
seller_double11.shape

(4995, 28)

In [80]:
seller_double11.to_csv('data_output/Double11_seller.csv', index=False)

### B. Seller Features
#### 1). get unique count for item/cat/brand for each seller

In [177]:
# seller and item numbers
item_count = user_datasample[['seller_id','item_id']]
item_count = item_count.groupby(['seller_id'])['item_id'].nunique().reset_index()
item_count.head()
item_count.columns = ['seller_id', 'item_count']
item_count.head()

Unnamed: 0,seller_id,item_count
0,1,2977
1,2,154
2,3,171
3,4,155
4,5,660


In [178]:
# seller and categories count
cat_count = user_datasample[['seller_id','cat_id']]
cat_count = cat_count.groupby(['seller_id'])['cat_id'].nunique().reset_index()
cat_count.head()
cat_count.columns = ['seller_id', 'cat_count']
cat_count.head()

Unnamed: 0,seller_id,cat_count
0,1,44
1,2,10
2,3,4
3,4,7
4,5,23


In [179]:
# seller and brand numbers
brand_count = user_datasample[['seller_id','brand_id']]
brand_count = brand_count.groupby(['seller_id'])['brand_id'].nunique().reset_index()
brand_count.head()
brand_count.columns = ['seller_id', 'brand_count']
brand_count.head()

Unnamed: 0,seller_id,brand_count
0,1,2
1,2,1
2,3,1
3,4,2
4,5,1


#### 2). get repeat purchase user for every seller before double 11 (Nov. 11th); we could also get a label for users: whether or not they are repeat buyers (regardless of the seller)

In [180]:
repeat_purchase = user_datasample[(user_datasample['action_type'] == 2) & (user_datasample['time_stamp'] < 1111)]
repeat_purchase = repeat_purchase.groupby(['seller_id'])['user_id'].value_counts().to_frame()
repeat_purchase.columns = ['purchase_count']
repeat_purchase.reset_index(inplace=True)
repeat_purchase.head()

Unnamed: 0,seller_id,user_id,purchase_count
0,1,406,21
1,1,56832,21
2,1,180072,20
3,1,339584,15
4,1,88181,14


In [181]:
repeat_purchase = repeat_purchase[repeat_purchase['purchase_count'] > 1]
repeat_purchase = repeat_purchase.groupby(['seller_id'])['user_id'].nunique().reset_index()

In [182]:
repeat_purchase.describe()

Unnamed: 0,seller_id,user_id
count,4888.0,4888.0
mean,2492.389321,75.873159
std,1441.626649,151.651987
min,1.0,1.0
25%,1244.75,11.0
50%,2487.5,29.0
75%,3740.25,75.0
max,4995.0,2748.0


In [183]:
# Because some sellers without repeated purchases are excluded,we have to add them and fill 0 for future merge.
all_sellerid = pd.DataFrame({'seller_id':range(1, 4996 ,1)})
repeat_purchase = pd.merge(all_sellerid,repeat_purchase,how='left',on=['seller_id']).fillna(0)
repeat_purchase.head()

Unnamed: 0,seller_id,user_id
0,1,2214.0
1,2,4.0
2,3,2.0
3,4,48.0
4,5,14.0


In [184]:
repeat_purchase.columns = ['seller_id', 'repeat_users_count']
repeat_purchase.head()

Unnamed: 0,seller_id,repeat_users_count
0,1,2214.0
1,2,4.0
2,3,2.0
3,4,48.0
4,5,14.0


#### 3). seller analysis: 
I: count each action click/add/buy/save/total for each seller and proportion/mean

II: count each action done by each user in each seller

In [185]:
# seller_id and action_type
actions = user_datasample[['seller_id', 'action_type']]
actions.head()

Unnamed: 0,seller_id,action_type
0,2882,0
1,2882,0
2,2882,0
3,2882,0
4,1253,0


In [186]:
# extract each action type
dummy_action = pd.get_dummies(actions['action_type'], prefix='action')
dummy_action.head()

Unnamed: 0,action_0,action_1,action_2,action_3
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [187]:
# count the number of each actions for each seller
seller_actions = pd.concat([actions,dummy_action], axis=1).groupby(['seller_id'], as_index=False).sum()
#.drop('action_type', axis=1, inplace=True)

In [188]:
seller_actions.head()

Unnamed: 0,seller_id,action_type,action_0,action_1,action_2,action_3
0,1,74119,308236.0,444.0,17705.0,12755.0
1,2,818,2030.0,8.0,189.0,144.0
2,3,663,2399.0,4.0,67.0,175.0
3,4,1082,2646.0,2.0,294.0,164.0
4,5,1965,7483.0,9.0,144.0,556.0


In [189]:
seller_actions.drop('action_type',1, inplace=True)

In [190]:
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3
0,1,308236.0,444.0,17705.0,12755.0
1,2,2030.0,8.0,189.0,144.0
2,3,2399.0,4.0,67.0,175.0
3,4,2646.0,2.0,294.0,164.0
4,5,7483.0,9.0,144.0,556.0


In [191]:
seller_actions['sellerTotalAction'] = seller_actions.apply(lambda x: x["action_0"] + x["action_1"] + x["action_2"] + x["action_3"], axis=1)
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3,sellerTotalAction
0,1,308236.0,444.0,17705.0,12755.0,339140.0
1,2,2030.0,8.0,189.0,144.0,2371.0
2,3,2399.0,4.0,67.0,175.0,2645.0
3,4,2646.0,2.0,294.0,164.0,3106.0
4,5,7483.0,9.0,144.0,556.0,8192.0


In [192]:
#Clicked to buy conversion rate, added to shopping cart purchase conversion rate, purchased conversion rate
seller_actions['click_to_buy_ratio'] = np.log1p(seller_actions['action_2']) - np.log1p(seller_actions['action_0'])
# add-in-cart to purchase conversion rate, purchased conversion rate
seller_actions['cart_to_buy_ratio'] = np.log1p(seller_actions['action_2']) - np.log1p(seller_actions['action_1'])
# add-in-fav to purchase conversion rate
seller_actions['fav_to_buy_ratio'] = np.log1p(seller_actions['action_2']) - np.log1p(seller_actions['action_3'])

In [193]:
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3,sellerTotalAction,click_to_buy_ratio,cart_to_buy_ratio,fav_to_buy_ratio
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831


In [194]:
# user_id, seller_id and action_type
user_actions = user_datasample[['seller_id', 'user_id', 'action_type']]
user_actions.head()

Unnamed: 0,seller_id,user_id,action_type
0,2882,328862,0
1,2882,328862,0
2,2882,328862,0
3,2882,328862,0
4,1253,328862,0


In [195]:
dummy_user = pd.get_dummies(user_actions['action_type'], prefix='seller_user_action')
dummy_user.head()

Unnamed: 0,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [196]:
su_action = pd.concat([user_actions, dummy_user], axis=1)
su_action.drop('action_type',1, inplace=True)
su_action.head()

Unnamed: 0,seller_id,user_id,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,2882,328862,1,0,0,0
1,2882,328862,1,0,0,0
2,2882,328862,1,0,0,0
3,2882,328862,1,0,0,0
4,1253,328862,1,0,0,0


In [197]:
# drop duplicates so that we can count the number of users who made actions for each seller
su_action.drop_duplicates(inplace=True)

In [198]:
su_action.head()

Unnamed: 0,seller_id,user_id,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,2882,328862,1,0,0,0
4,1253,328862,1,0,0,0
7,883,328862,1,0,0,0
10,420,328862,1,0,0,0
11,4605,328862,1,0,0,0


In [199]:
dummy_user_1 = su_action.groupby(['seller_id'])['seller_user_action_0'].sum()
dummy_user_2 = su_action.groupby(['seller_id'])['seller_user_action_1'].sum()
dummy_user_3 = su_action.groupby(['seller_id'])['seller_user_action_2'].sum()
dummy_user_4 = su_action.groupby(['seller_id'])['seller_user_action_3'].sum()

In [200]:
su_action = pd.concat([dummy_user_1, dummy_user_2, dummy_user_3, dummy_user_4], axis=1).reset_index()
su_action.head()

Unnamed: 0,seller_id,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,1,29251.0,265.0,7666.0,4965.0
1,2,902.0,6.0,161.0,127.0
2,3,1103.0,4.0,65.0,150.0
3,4,1384.0,2.0,201.0,153.0
4,5,3535.0,9.0,120.0,458.0


In [201]:
seller_actions = pd.merge(seller_actions,item_count,on=['seller_id'])
seller_actions = pd.merge(seller_actions,cat_count,on=['seller_id'])
seller_actions = pd.merge(seller_actions,brand_count,on=['seller_id'])
seller_actions = pd.merge(seller_actions,repeat_purchase,on=['seller_id'])
seller_actions = pd.merge(seller_actions,su_action,on=['seller_id'])

In [202]:
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3,sellerTotalAction,click_to_buy_ratio,cart_to_buy_ratio,fav_to_buy_ratio,item_count,cat_count,brand_count,repeat_users_count,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902,2977,44,2,2214.0,29251.0,265.0,7666.0,4965.0
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029,154,10,1,4.0,902.0,6.0,161.0,127.0
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976,171,4,1,2.0,1103.0,4.0,65.0,150.0
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103,155,7,2,48.0,1384.0,2.0,201.0,153.0
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831,660,23,1,14.0,3535.0,9.0,120.0,458.0


In [203]:
# calculate the ratio of item, category, brand of each seller of total to see scale of each seller
seller_actions.item_ratio = seller_actions.item_count/user_datasample.item_id.nunique()
seller_actions.cat_ratio = seller_actions.cat_count/user_datasample.cat_id.nunique()
seller_actions.brand_ratio = seller_actions.brand_count/user_datasample.brand_id.nunique()

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [204]:
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3,sellerTotalAction,click_to_buy_ratio,cart_to_buy_ratio,fav_to_buy_ratio,item_count,cat_count,brand_count,repeat_users_count,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902,2977,44,2,2214.0,29251.0,265.0,7666.0,4965.0
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029,154,10,1,4.0,902.0,6.0,161.0,127.0
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976,171,4,1,2.0,1103.0,4.0,65.0,150.0
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103,155,7,2,48.0,1384.0,2.0,201.0,153.0
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831,660,23,1,14.0,3535.0,9.0,120.0,458.0


In [205]:
# calculate the proportion of clicks, add-to-cart, purchase, add-to-fav of each seller of total of each activities 
# in the whole market
seller_actions.su_click_ratio = seller_actions.seller_user_action_0/user_datasample[user_datasample.action_type == 0]['user_id'].nunique()
seller_actions.su_cart_ratio = seller_actions.seller_user_action_1/user_datasample[user_datasample.action_type == 1]['user_id'].nunique()
seller_actions.su_buy_ratio = seller_actions.seller_user_action_2/user_datasample[user_datasample.action_type == 2]['user_id'].nunique()
seller_actions.su_fav_ratio = seller_actions.seller_user_action_3/user_datasample[user_datasample.action_type == 3]['user_id'].nunique()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  


In [206]:
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3,sellerTotalAction,click_to_buy_ratio,cart_to_buy_ratio,fav_to_buy_ratio,item_count,cat_count,brand_count,repeat_users_count,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902,2977,44,2,2214.0,29251.0,265.0,7666.0,4965.0
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029,154,10,1,4.0,902.0,6.0,161.0,127.0
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976,171,4,1,2.0,1103.0,4.0,65.0,150.0
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103,155,7,2,48.0,1384.0,2.0,201.0,153.0
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831,660,23,1,14.0,3535.0,9.0,120.0,458.0


In [207]:
seller_actions.rename(columns={'seller_user_action_0': "su_click", 'seller_user_action_1':"su_cart",
                                   "seller_user_action_2":"su_buy", "seller_user_action_3":"su_fav", 
                                   "action_0":"click", "action_1":"cart", "action_2":"buy", "action_3":"fav"},inplace=True)

In [208]:
seller_actions.head()

Unnamed: 0,seller_id,click,cart,buy,fav,sellerTotalAction,click_to_buy_ratio,cart_to_buy_ratio,fav_to_buy_ratio,item_count,cat_count,brand_count,repeat_users_count,su_click,su_cart,su_buy,su_fav
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902,2977,44,2,2214.0,29251.0,265.0,7666.0,4965.0
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029,154,10,1,4.0,902.0,6.0,161.0,127.0
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976,171,4,1,2.0,1103.0,4.0,65.0,150.0
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103,155,7,2,48.0,1384.0,2.0,201.0,153.0
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831,660,23,1,14.0,3535.0,9.0,120.0,458.0


In [209]:
seller_actions.shape

(4995, 17)

In [210]:
seller_actions = pd.merge(seller_actions,seller_double11,on=['seller_id'])
seller_actions.shape

(4995, 44)

In [211]:
seller_actions.to_csv('seller_features.csv', index=False)