# Part 2 Feature Engineering

## 2.2 Product diversity

In [1]:
import pandas as pd
import numpy as np

In [2]:
user_data = pd.read_csv('cleaned_raw/user_data.csv')

In [3]:
user_data.head()

Unnamed: 0.1,Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,0,328862,323294,833,2882,2661.0,829,0
1,1,328862,844400,1271,2882,2661.0,829,0
2,2,328862,575153,1271,2882,2661.0,829,0
3,3,328862,996875,1271,2882,2661.0,829,0
4,4,328862,1086186,1271,1253,1049.0,829,0


### 2.2.1 merchant
for each merchant, obtain the count of the 4 different actions and sum of them for unique item/brand/category

In [8]:
#merchant action dataframe prep
mc_action = user_data[['seller_id','brand_id','cat_id','item_id','action_type']]
action_dummies = pd.get_dummies(mc_action['action_type'])
mc_action = pd.concat([mc_action, action_dummies], axis=1)
mc_action.rename(columns={0: "click", 1:"add", 2:"buy", 3:"save"},inplace=True)
del action_dummies
mc_action.head()

Unnamed: 0,seller_id,brand_id,cat_id,item_id,time_stamp,action_type,click,add,buy,save
0,2882,2661.0,833,323294,829,0,1,0,0,0
1,2882,2661.0,1271,844400,829,0,1,0,0,0
2,2882,2661.0,1271,575153,829,0,1,0,0,0
3,2882,2661.0,1271,996875,829,0,1,0,0,0
4,1253,1049.0,1271,1086186,829,0,1,0,0,0


In [16]:
# for each unique item, get the overall count
mc_item_overall = mc_action.groupby(['seller_id','item_id'], as_index=False)['click','add','buy','save'].sum()

In [24]:
mc_item_overall['total_action'] = mc_item_overall['click'] + mc_item_overall['add'] + mc_item_overall['buy'] + mc_item_overall['save']
mc_item_overall.head()

Unnamed: 0,seller_id,item_id,click,add,buy,save,total_action
0,1,20,43.0,1.0,1.0,2.0,47.0
1,1,230,43.0,0.0,0.0,2.0,45.0
2,1,1353,2.0,0.0,0.0,0.0,2.0
3,1,1521,465.0,1.0,10.0,25.0,501.0
4,1,1571,205.0,0.0,18.0,2.0,225.0


In [20]:
# for each unique cat, get the overall count
mc_cat_overall = mc_action.groupby(['seller_id','cat_id'], as_index=False)['click','add','buy','save'].sum()

In [25]:
mc_cat_overall['total_action'] = mc_cat_overall['click'] + mc_cat_overall['add'] + mc_cat_overall['buy'] + mc_cat_overall['save']
mc_cat_overall.head()

Unnamed: 0,seller_id,cat_id,click,add,buy,save,total_action
0,1,27,136.0,0.0,4.0,5.0,145.0
1,1,28,0.0,1.0,0.0,0.0,1.0
2,1,35,31503.0,12.0,1971.0,1272.0,34758.0
3,1,103,31.0,0.0,2.0,2.0,35.0
4,1,125,50.0,0.0,4.0,2.0,56.0


In [27]:
# for each unique brand, get the overall count
mc_brand_overall = mc_action.groupby(['seller_id','brand_id'], as_index=False)['click','add','buy','save'].sum()

In [28]:
mc_brand_overall['total_action'] = mc_brand_overall['click'] + mc_brand_overall['buy'] + mc_brand_overall['save']
mc_brand_overall.head()

Unnamed: 0,seller_id,brand_id,click,add,buy,save,total_action
0,1,1104.0,6420.0,0.0,326.0,326.0,7072.0
1,1,1662.0,301816.0,444.0,17379.0,12429.0,331624.0
2,2,2921.0,2030.0,8.0,189.0,144.0,2363.0
3,3,970.0,2399.0,4.0,67.0,175.0,2641.0
4,4,1944.0,2632.0,2.0,294.0,164.0,3090.0


### 2.2.2 user
for each merchant, obtain the count of the 4 different actions and sum of them for unique item/brand/category

In [39]:
#user action dataframe prep
user_action = user_data[['seller_id','user_id','brand_id','cat_id','item_id','action_type']]
action_dummies = pd.get_dummies(user_action['action_type'])
user_action = pd.concat([user_action, action_dummies], axis=1)
user_action.rename(columns={0: "click", 1:"add", 2:"buy", 3:"save"},inplace=True)
del action_dummies
user_action.head()

Unnamed: 0,seller_id,user_id,brand_id,cat_id,item_id,time_stamp,action_type,click,add,buy,save
0,2882,328862,2661.0,833,323294,829,0,1,0,0,0
1,2882,328862,2661.0,1271,844400,829,0,1,0,0,0
2,2882,328862,2661.0,1271,575153,829,0,1,0,0,0
3,2882,328862,2661.0,1271,996875,829,0,1,0,0,0
4,1253,328862,1049.0,1271,1086186,829,0,1,0,0,0


In [40]:
# for each unique item, get the overall count
user_item_overall = user_action.groupby(['user_id','item_id'], as_index=False)['click','add','buy','save'].sum()
user_item_overall['total_action'] = user_item_overall['click'] + user_item_overall['add'] + user_item_overall['buy'] + user_item_overall['save']
user_item_overall.head()

Unnamed: 0,user_id,item_id,click,add,buy,save,total_action
0,1,112203,4.0,0.0,1.0,0.0,5.0
1,1,181459,1.0,0.0,0.0,0.0,1.0
2,1,411984,1.0,0.0,0.0,0.0,1.0
3,1,452339,1.0,0.0,0.0,0.0,1.0
4,1,452837,1.0,0.0,0.0,0.0,1.0


In [41]:
# for each unique cat, get the overall count
user_cat_overall = user_action.groupby(['user_id','cat_id'], as_index=False)['click','add','buy','save'].sum()
user_cat_overall['total_action'] = user_cat_overall['click'] + user_cat_overall['add'] + user_cat_overall['buy'] + user_cat_overall['save']
user_cat_overall.head()

Unnamed: 0,user_id,cat_id,click,add,buy,save,total_action
0,1,276,5.0,0.0,0.0,0.0,5.0
1,1,389,2.0,0.0,0.0,0.0,2.0
2,1,992,10.0,0.0,4.0,0.0,14.0
3,1,1023,3.0,0.0,1.0,0.0,4.0
4,1,1252,6.0,0.0,1.0,0.0,7.0


In [42]:
# for each unique brand, get the overall count
user_brand_overall = user_action.groupby(['user_id','brand_id'], as_index=False)['click','add','buy','save'].sum()
user_brand_overall['total_action'] = user_cat_overall['click'] + user_cat_overall['add'] + user_cat_overall['buy'] + user_cat_overall['save']
user_brand_overall.head()

Unnamed: 0,user_id,brand_id,click,add,buy,save,total_action
0,1,649.0,1.0,0.0,0.0,0.0,5.0
1,1,1469.0,4.0,0.0,1.0,0.0,2.0
2,1,1960.0,1.0,0.0,0.0,0.0,14.0
3,1,3431.0,1.0,0.0,0.0,0.0,4.0
4,1,3862.0,1.0,0.0,0.0,0.0,7.0


### 2.2.3 <merchant, user>
for each unique <merchant,user>, obtain the count of the 4 different actions and sum of them for unique item/brand/category

In [43]:
# for each unique item, get the overall count
mu_item_overall = user_action.groupby(['seller_id','user_id','item_id'], as_index=False)['click','add','buy','save'].sum()
mu_item_overall['total_action'] = mu_item_overall['click'] + mu_item_overall['add'] + mu_item_overall['buy'] + mu_item_overall['save']
mu_item_overall.head()

Unnamed: 0,seller_id,user_id,item_id,click,add,buy,save,total_action
0,1,5,993997,4.0,0.0,0.0,1.0,5.0
1,1,20,307038,1.0,0.0,0.0,0.0,1.0
2,1,20,325001,1.0,0.0,0.0,0.0,1.0
3,1,23,116611,1.0,0.0,0.0,0.0,1.0
4,1,23,421707,1.0,0.0,0.0,0.0,1.0


In [44]:
# for each unique cat, get the overall count
mu_cat_overall = user_action.groupby(['seller_id','user_id','cat_id'], as_index=False)['click','add','buy','save'].sum()
mu_cat_overall['total_action'] = mu_cat_overall['click'] + mu_cat_overall['add'] + mu_cat_overall['buy'] + mu_cat_overall['save']
mu_cat_overall.head()

Unnamed: 0,seller_id,user_id,cat_id,click,add,buy,save,total_action
0,1,5,1028,4.0,0.0,0.0,1.0,5.0
1,1,20,1028,1.0,0.0,0.0,0.0,1.0
2,1,20,1130,1.0,0.0,0.0,0.0,1.0
3,1,23,629,2.0,0.0,0.0,0.0,2.0
4,1,23,993,2.0,0.0,0.0,0.0,2.0


In [46]:
# for each unique brand, get the overall count
mu_brand_overall = user_action.groupby(['seller_id','user_id','brand_id'], as_index=False)['click','add','buy','save'].sum()
mu_brand_overall['total_action'] = mu_brand_overall['click'] + mu_brand_overall['add'] + mu_brand_overall['buy'] + mu_brand_overall['save']
mu_brand_overall.head()

Unnamed: 0,seller_id,user_id,brand_id,click,add,buy,save,total_action
0,1,5,1662.0,4.0,0.0,0.0,1.0,5.0
1,1,20,1662.0,2.0,0.0,0.0,0.0,2.0
2,1,23,1662.0,6.0,0.0,0.0,0.0,6.0
3,1,34,1662.0,6.0,0.0,1.0,0.0,7.0
4,1,72,1662.0,1.0,0.0,1.0,0.0,2.0
