# Part 2. Feature Engineering

## 2.3 Market share feature

### 2.3.1 get unique count for item/cat/brand for each seller

In [1]:
#import packages
import numpy as np
import pandas as pd

In [2]:
user_data = pd.read_csv('cleaned_raw/user_data.csv')

In [12]:
def Categorize(n):
    if n == 1111:
        return 2
    elif n >= 1104:
        return 1
    else:
        return 0
user_data['sort_time'] = user_data.time_stamp.map(Categorize)

In [7]:
def unique_count_seller(id_name):
    col_id = id_name + '_id'
    col_name = id_name + '_count'
    
    count = user_data[['seller_id',col_id]]
    count = count.groupby(['seller_id'])[col_id].nunique().reset_index()
    count.columns = ['seller_id', col_name]
    return (count)

In [11]:
item_count = unique_count_seller('item')
cat_count = unique_count_seller('cat')
brand_count = unique_count_seller('brand')

### 2.3.2 get repeat purchase user for every seller before double 11 (Nov. 11th); we could also get a label for users: whether or not they are repeat buyers (regardless of the seller)

In [14]:
repeat_purchase = user_data[(user_data['action_type'] == 2) & (user_data['time_stamp'] < 1111)]
repeat_purchase = repeat_purchase.groupby(['seller_id'])['user_id'].value_counts().to_frame()
repeat_purchase.columns = ['purchase_count']
repeat_purchase.reset_index(inplace=True)
repeat_purchase.head()

Unnamed: 0,seller_id,user_id,purchase_count
0,1,406,21
1,1,56832,21
2,1,180072,20
3,1,339584,15
4,1,88181,14


In [15]:
repeat_purchase = repeat_purchase[repeat_purchase['purchase_count'] > 1]
repeat_purchase = repeat_purchase.groupby(['seller_id'])['user_id'].nunique().reset_index()

In [16]:
repeat_purchase.describe()

Unnamed: 0,seller_id,user_id
count,4888.0,4888.0
mean,2492.389321,75.873159
std,1441.626649,151.651987
min,1.0,1.0
25%,1244.75,11.0
50%,2487.5,29.0
75%,3740.25,75.0
max,4995.0,2748.0


In [17]:
# Because some sellers without repeated purchases are excluded,we have to add them and fill 0 for future merge.
all_sellerid = pd.DataFrame({'seller_id':range(1, 4996 ,1)})
repeat_purchase = pd.merge(all_sellerid,repeat_purchase,how='left',on=['seller_id']).fillna(0)
repeat_purchase.head()

Unnamed: 0,seller_id,user_id
0,1,2214.0
1,2,4.0
2,3,2.0
3,4,48.0
4,5,14.0


In [18]:
repeat_purchase.columns = ['seller_id', 'repeat_users_count']
repeat_purchase.head()

Unnamed: 0,seller_id,repeat_users_count
0,1,2214.0
1,2,4.0
2,3,2.0
3,4,48.0
4,5,14.0


### 2.3.2 seller analysis
I: count each action click/add/buy/save/total for each seller and proportion/mean  
II: count each action done by each user in each seller

In [20]:
# seller_id and action_type
actions = user_data[['seller_id', 'action_type']]
actions.head()

Unnamed: 0,seller_id,action_type
0,2882,0
1,2882,0
2,2882,0
3,2882,0
4,1253,0


In [21]:
# extract each action type
dummy_action = pd.get_dummies(actions['action_type'], prefix='action')
dummy_action.head()

Unnamed: 0,action_0,action_1,action_2,action_3
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [22]:
# count the number of each actions for each seller
seller_actions = pd.concat([actions,dummy_action], axis=1).groupby(['seller_id'], as_index=False).sum()
#.drop('action_type', axis=1, inplace=True)

In [24]:
seller_actions.drop('action_type',1, inplace=True)
seller_actions['sellerTotalAction'] = seller_actions.apply(lambda x: x["action_0"] + x["action_1"] + x["action_2"] + x["action_3"], axis=1)
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3,sellerTotalAction
0,1,308236.0,444.0,17705.0,12755.0,339140.0
1,2,2030.0,8.0,189.0,144.0,2371.0
2,3,2399.0,4.0,67.0,175.0,2645.0
3,4,2646.0,2.0,294.0,164.0,3106.0
4,5,7483.0,9.0,144.0,556.0,8192.0


In [25]:
#Clicked to buy conversion rate, added to shopping cart purchase conversion rate, purchased conversion rate
seller_actions['click_to_buy_ratio'] = np.log1p(seller_actions['action_2']) - np.log1p(seller_actions['action_0'])
# add-in-cart to purchase conversion rate, purchased conversion rate
seller_actions['cart_to_buy_ratio'] = np.log1p(seller_actions['action_2']) - np.log1p(seller_actions['action_1'])
# add-in-fav to purchase conversion rate
seller_actions['fav_to_buy_ratio'] = np.log1p(seller_actions['action_2']) - np.log1p(seller_actions['action_3'])

In [26]:
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3,sellerTotalAction,click_to_buy_ratio,cart_to_buy_ratio,fav_to_buy_ratio
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831


In [27]:
# user_id, seller_id and action_type
user_actions = user_data[['seller_id', 'user_id', 'action_type']]
dummy_user = pd.get_dummies(user_actions['action_type'], prefix='seller_user_action')
dummy_user.head()

Unnamed: 0,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [28]:
su_action = pd.concat([user_actions, dummy_user], axis=1)
su_action.drop('action_type',1, inplace=True)
su_action.head()

Unnamed: 0,seller_id,user_id,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,2882,328862,1,0,0,0
1,2882,328862,1,0,0,0
2,2882,328862,1,0,0,0
3,2882,328862,1,0,0,0
4,1253,328862,1,0,0,0


In [29]:
# drop duplicates so that we can count the number of users who made actions for each seller
su_action.drop_duplicates(inplace=True)
dummy_user_1 = su_action.groupby(['seller_id'])['seller_user_action_0'].sum()
dummy_user_2 = su_action.groupby(['seller_id'])['seller_user_action_1'].sum()
dummy_user_3 = su_action.groupby(['seller_id'])['seller_user_action_2'].sum()
dummy_user_4 = su_action.groupby(['seller_id'])['seller_user_action_3'].sum()

In [30]:
su_action = pd.concat([dummy_user_1, dummy_user_2, dummy_user_3, dummy_user_4], axis=1).reset_index()
su_action.head()

Unnamed: 0,seller_id,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,1,29251.0,265.0,7666.0,4965.0
1,2,902.0,6.0,161.0,127.0
2,3,1103.0,4.0,65.0,150.0
3,4,1384.0,2.0,201.0,153.0
4,5,3535.0,9.0,120.0,458.0


In [31]:
seller_actions = pd.merge(seller_actions,item_count,on=['seller_id'])
seller_actions = pd.merge(seller_actions,cat_count,on=['seller_id'])
seller_actions = pd.merge(seller_actions,brand_count,on=['seller_id'])
seller_actions = pd.merge(seller_actions,repeat_purchase,on=['seller_id'])
seller_actions = pd.merge(seller_actions,su_action,on=['seller_id'])

In [32]:
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3,sellerTotalAction,click_to_buy_ratio,cart_to_buy_ratio,fav_to_buy_ratio,item_count,cat_count,brand_count,repeat_users_count,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902,2977,44,2,2214.0,29251.0,265.0,7666.0,4965.0
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029,154,10,1,4.0,902.0,6.0,161.0,127.0
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976,171,4,1,2.0,1103.0,4.0,65.0,150.0
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103,155,7,2,48.0,1384.0,2.0,201.0,153.0
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831,660,23,1,14.0,3535.0,9.0,120.0,458.0


In [34]:
# calculate the ratio of item, category, brand of each seller of total to see scale of each seller
seller_actions.item_ratio = seller_actions.item_count/user_data.item_id.nunique()
seller_actions.cat_ratio = seller_actions.cat_count/user_data.cat_id.nunique()
seller_actions.brand_ratio = seller_actions.brand_count/user_data.brand_id.nunique()

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [35]:
# calculate the proportion of clicks, add-to-cart, purchase, add-to-fav of each seller of total of each activities 
# in the whole market
seller_actions.su_click_ratio = seller_actions.seller_user_action_0/user_data[user_data.action_type == 0]['user_id'].nunique()
seller_actions.su_cart_ratio = seller_actions.seller_user_action_1/user_data[user_data.action_type == 1]['user_id'].nunique()
seller_actions.su_buy_ratio = seller_actions.seller_user_action_2/user_data[user_data.action_type == 2]['user_id'].nunique()
seller_actions.su_fav_ratio = seller_actions.seller_user_action_3/user_data[user_data.action_type == 3]['user_id'].nunique()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  


In [36]:
seller_actions.head()

Unnamed: 0,seller_id,action_0,action_1,action_2,action_3,sellerTotalAction,click_to_buy_ratio,cart_to_buy_ratio,fav_to_buy_ratio,item_count,cat_count,brand_count,repeat_users_count,seller_user_action_0,seller_user_action_1,seller_user_action_2,seller_user_action_3
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902,2977,44,2,2214.0,29251.0,265.0,7666.0,4965.0
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029,154,10,1,4.0,902.0,6.0,161.0,127.0
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976,171,4,1,2.0,1103.0,4.0,65.0,150.0
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103,155,7,2,48.0,1384.0,2.0,201.0,153.0
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831,660,23,1,14.0,3535.0,9.0,120.0,458.0


In [37]:
seller_actions.rename(columns={'seller_user_action_0': "su_click", 'seller_user_action_1':"su_cart",
                                   "seller_user_action_2":"su_buy", "seller_user_action_3":"su_fav", 
                                   "action_0":"click", "action_1":"cart", "action_2":"buy", "action_3":"fav"},inplace=True)

In [38]:
seller_actions.head()

Unnamed: 0,seller_id,click,cart,buy,fav,sellerTotalAction,click_to_buy_ratio,cart_to_buy_ratio,fav_to_buy_ratio,item_count,cat_count,brand_count,repeat_users_count,su_click,su_cart,su_buy,su_fav
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902,2977,44,2,2214.0,29251.0,265.0,7666.0,4965.0
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029,154,10,1,4.0,902.0,6.0,161.0,127.0
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976,171,4,1,2.0,1103.0,4.0,65.0,150.0
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103,155,7,2,48.0,1384.0,2.0,201.0,153.0
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831,660,23,1,14.0,3535.0,9.0,120.0,458.0


In [40]:
seller_double11 = pd.read_csv('data_output/Double11_seller.csv')

In [41]:
seller_actions = pd.merge(seller_actions,seller_double11,on=['seller_id'])
seller_actions.shape

(4995, 44)

In [42]:
seller_actions.to_csv('data_output/seller_features.csv', index=False)