# Part I. Data cleaning

In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the user activity log CSV into `user_datasample` and preview its first rows.
user_datasample = pd.read_csv('data_format1/user_log_format1.csv')
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [3]:
# Count the missing values in each column of the user log.
user_datasample.isnull().sum()

user_id            0
item_id            0
cat_id             0
seller_id          0
brand_id       91015
time_stamp         0
action_type        0
dtype: int64

In [4]:
# Fill missing brand_id values with the most frequent brand_id of the same seller.
# `missing` holds the row labels whose brand_id is null before imputation.
missing = user_datasample.index[user_datasample['brand_id'].isnull()]

# Most frequent known brand per seller. Rows with a null brand are dropped first so a
# seller whose brands are all missing simply has no entry here (instead of raising when
# an empty mode() result is indexed). mode() is sorted, so ties resolve to the smallest id.
seller = (
    user_datasample.dropna(subset=['brand_id'])
    .groupby('seller_id')['brand_id']
    .agg(lambda brands: brands.mode().iloc[0])
)

# Look up each missing row's seller; the result keeps the original row labels, so it
# aligns with `missing` without any manual index re-assignment.
get_brand = user_datasample.loc[missing, 'seller_id'].map(seller)
user_datasample.loc[missing, 'brand_id'] = get_brand
# NOTE: rows whose seller has no known brand at all remain NaN after this step.

In [5]:
# Re-count nulls per column to confirm that brand_id no longer has missing values.
user_datasample.isnull().sum()

user_id        0
item_id        0
cat_id         0
seller_id      0
brand_id       0
time_stamp     0
action_type    0
dtype: int64

In [6]:
# Load the user profile CSV into `user_info` and preview its first rows.
user_info = pd.read_csv('data_format1/user_info_format1.csv')
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [7]:
# Count the missing values in each column of the user profile table.
user_info.isnull().sum()

user_id         0
age_range    2217
gender       6436
dtype: int64

In [8]:
# Impute missing profile values: age_range gets the column median, gender gets the code 2
# (later labelled 'unknown' when gender is one-hot encoded).
# Assign the filled column back to the frame rather than calling fillna(inplace=True) on
# `user_info.age_range`: that is a chained in-place update on a column selection, which
# does not modify `user_info` under pandas Copy-on-Write.
user_info['age_range'] = user_info['age_range'].fillna(user_info['age_range'].median())
user_info['gender'] = user_info['gender'].fillna(2)
# Check that no nulls remain in the user profile table.
user_info.isnull().sum()

user_id      0
age_range    0
gender       0
dtype: int64

In [9]:
# Keep only the profile rows whose age_range is not 0.
user_info = user_info.loc[user_info['age_range'].ne(0)]

In [10]:
# Preview the user profile table after imputation and filtering.
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [11]:
# Preview the user log after brand_id imputation.
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [12]:
# Number of distinct user_id values that appear in the user log.
user_datasample['user_id'].nunique()

424170

In [13]:
# Build the <user, seller> pair key `su_id` as "<user_id>_<seller_id>".
# Both id columns are converted to strings first (and stay strings afterwards).
for key_col in ('user_id', 'seller_id'):
    user_datasample[key_col] = user_datasample[key_col].astype(str)
user_datasample['su_id'] = user_datasample['user_id'] + '_' + user_datasample['seller_id']

In [14]:
# Build the <brand, seller> pair key `sb_id` as "<brand_id>_<seller_id>".
# brand_id is stored as a float, so its string form keeps the trailing ".0"
# (e.g. "2661.0_2882").
for key_col in ('brand_id', 'seller_id'):
    user_datasample[key_col] = user_datasample[key_col].astype(str)
user_datasample['sb_id'] = user_datasample['brand_id'] + '_' + user_datasample['seller_id']

## Merge all user info

In [14]:
# One-hot encode age_range (columns prefixed "age") and gender (codes 0/1/2 renamed to
# female/male/unknown), then rebuild `user_info` as user_id plus the indicator columns.
# NOTE: this overwrites `user_info`, so the cell cannot be re-run without reloading it.
age_dummies = pd.get_dummies(user_info['age_range'], prefix='age')
gender_dummies = pd.get_dummies(user_info['gender']).rename(
    columns={0: 'female', 1: 'male', 2: 'unknown'}
)
user_info = pd.concat([user_info['user_id'], age_dummies, gender_dummies], axis=1)
del age_dummies, gender_dummies

In [15]:
# Confirm the one-hot encoded user profile table contains no nulls.
user_info.isnull().sum()

user_id    0
age_1.0    0
age_2.0    0
age_3.0    0
age_4.0    0
age_5.0    0
age_6.0    0
age_7.0    0
age_8.0    0
female     0
male       0
unknown    0
dtype: int64

In [16]:
# Attach the one-hot user profile columns to every log row (left join on user_id).
# user_datasample['user_id'] is converted to str when su_id is built, while
# user_info['user_id'] is still numeric; merging keys of different dtypes fails (or
# matches nothing), so both sides are normalised to str here without mutating the inputs.
tot_user_df = user_datasample.assign(
    user_id=user_datasample['user_id'].astype(str)
).merge(
    user_info.assign(user_id=user_info['user_id'].astype(str)),
    on="user_id",
    how="left",
)

In [17]:
# Preview the user log joined with the user profile indicators.
tot_user_df.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_6.0,age_7.0,age_8.0,female,male,unknown
0,328862,323294,833,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,328862,844400,1271,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,328862,575153,1271,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,328862,996875,1271,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,328862,1086186,1271,1253,1049.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


# Part II. Feature Engineering:
### A. merchant features:
#### 1) Active user analysis: total/click/added to shopping cart/purchase/save to favourite

In [17]:
# Per-seller count of each action type, plus the total number of actions.
# action_type codes are mapped as 0 -> click, 1 -> cart, 2 -> buy, 3 -> fav.
seller_actions = user_datasample[['seller_id', 'action_type']]
dummy_seller_action = pd.get_dummies(seller_actions['action_type'])
seller_actions = (
    pd.concat([seller_actions.seller_id, dummy_seller_action], axis=1)
    .groupby(['seller_id'], as_index=False)
    .sum()
    .rename(columns={0: "seller_click_count_", 1: "seller_cart_count_",
                     2: "seller_buy_count_", 3: "seller_fav_count_"})
)
# Vectorized column sum instead of a Python-level row-wise apply.
seller_actions['sellerTotalAction'] = seller_actions[
    ["seller_click_count_", "seller_cart_count_", "seller_buy_count_", "seller_fav_count_"]
].sum(axis=1)

In [18]:
# Log-scale "X to buy" ratios per seller: log1p(buy count) - log1p(X count)
# for X in click, cart, fav.
log_buy_count = np.log1p(seller_actions['seller_buy_count_'])
for source in ('click', 'cart', 'fav'):
    ratio_col = 'seller_' + source + '_to_buy_ratio'
    count_col = 'seller_' + source + '_count_'
    seller_actions[ratio_col] = log_buy_count - np.log1p(seller_actions[count_col])

In [19]:
# Preview the per-seller action counts and log ratios.
seller_actions.head()

Unnamed: 0,seller_id,seller_click_count_,seller_cart_count_,seller_buy_count_,seller_fav_count_,sellerTotalAction,seller_click_to_buy_ratio,seller_cart_to_buy_ratio,seller_fav_to_buy_ratio
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831


In [20]:
# Split time_stamp into month (the value divided by 100, floored) and day (the remainder).
user_datasample['month'] = (user_datasample['time_stamp'] // 100).astype(int)
user_datasample['day'] = user_datasample['time_stamp'].mod(100)
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,month,day
0,328862,323294,833,2882,2661.0,829,0,8,29
1,328862,844400,1271,2882,2661.0,829,0,8,29
2,328862,575153,1271,2882,2661.0,829,0,8,29
3,328862,996875,1271,2882,2661.0,829,0,8,29
4,328862,1086186,1271,1253,1049.0,829,0,8,29


In [21]:
# Stat: for each seller, the number of logged actions (all types) on each (month, day).
time_diff = user_datasample.loc[:, ['seller_id', 'month', 'day', 'action_type', 'time_stamp']]
time_diff_stat = (
    time_diff.groupby(['seller_id', 'month', 'day'])['action_type']
    .count()
    .to_frame()
)
time_diff_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,action_type
seller_id,month,day,Unnamed: 3_level_1
1,5,11,105
1,5,12,134
1,5,13,149
1,5,14,114
1,5,15,131


In [22]:
# Turn the (seller_id, month, day) index levels back into ordinary columns.
time_diff_stat = time_diff_stat.reset_index()

In [23]:
# One-hot encode action_type into action_type_<code> indicator columns.
time_diff = pd.get_dummies(data=time_diff, columns=["action_type"])

In [24]:
# Give the one-hot action columns readable names.
action_labels = dict(zip(
    ["action_type_0", "action_type_1", "action_type_2", "action_type_3"],
    ["click", "add", "buy", "save"],
))
time_diff = time_diff.rename(columns=action_labels)

In [25]:
# Per seller: number of distinct time_stamp values (dates) with at least one click.
click = time_diff.loc[time_diff["click"].eq(1)]
click_day = (
    click.groupby(["seller_id"])["time_stamp"]
    .nunique()
    .rename("seller_click_day_count")
    .reset_index()
)
click_day.head()

Unnamed: 0,seller_id,seller_click_day_count
0,1,176
1,2,169
2,3,168
3,4,176
4,5,176


In [26]:
# Per seller: number of distinct time_stamp values (dates) with at least one add-to-cart.
add = time_diff.loc[time_diff["add"].eq(1)]
add_day = (
    add.groupby(["seller_id"])["time_stamp"]
    .nunique()
    .rename("seller_add_day_count")
    .reset_index()
)
add_day.head()

Unnamed: 0,seller_id,seller_add_day_count
0,1,55
1,2,6
2,3,4
3,4,2
4,5,6


In [27]:
# Per seller: number of distinct time_stamp values (dates) with at least one purchase.
buy = time_diff.loc[time_diff["buy"].eq(1)]
buy_day = (
    buy.groupby(["seller_id"])["time_stamp"]
    .nunique()
    .rename("seller_buy_day_count")
    .reset_index()
)
buy_day.head()

Unnamed: 0,seller_id,seller_buy_day_count
0,1,185
1,2,54
2,3,31
3,4,117
4,5,64


In [28]:
# Per seller: number of distinct time_stamp values (dates) with at least one save-to-favourite.
save = time_diff.loc[time_diff["save"].eq(1)]
save_day = (
    save.groupby(["seller_id"])["time_stamp"]
    .nunique()
    .rename("seller_save_day_count")
    .reset_index()
)
save_day.head()

Unnamed: 0,seller_id,seller_save_day_count
0,1,185
1,2,84
2,3,79
3,4,99
4,5,154


In [29]:
# Combine the four per-seller active-day counts into one table. Outer joins keep sellers
# that lack some action type; their missing counts become 0.
day_count = (
    click_day
    .merge(add_day, how='outer', on='seller_id')
    .merge(buy_day, how='outer', on='seller_id')
    .merge(save_day, how='outer', on='seller_id')
    .fillna(0)
)

In [30]:
# Attach the active-day counts to the per-seller action features (left join on seller_id).
seller_overall = seller_actions.merge(day_count, how='left', on='seller_id')
seller_overall.head()

Unnamed: 0,seller_id,seller_click_count_,seller_cart_count_,seller_buy_count_,seller_fav_count_,sellerTotalAction,seller_click_to_buy_ratio,seller_cart_to_buy_ratio,seller_fav_to_buy_ratio,seller_click_day_count,seller_add_day_count,seller_buy_day_count,seller_save_day_count
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902,176,55.0,185,185
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029,169,6.0,54,84
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976,168,4.0,31,79
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103,176,2.0,117,99
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831,176,6.0,64,154


In [31]:
# (rows, columns) of the merchant feature table.
seller_overall.shape

(4995, 13)

#### 4). Conversion rate per merchant: click / saved to favourite / added to shopping cart -> purchase

In [32]:
# Conversion rate per seller on a log scale. The counts contain many zeros, so np.log1p
# is used to smooth them: conversion = log1p(buy count) - log1p(source count).
# For every source count two columns are added:
#   <source>_conversion      the log conversion itself
#   <source>conversion_diff  that conversion minus its mean over all sellers
# (the source names already end in "_", which is why the two suffixes look different).
conversion_ls = ['seller_click_count_', 'seller_cart_count_', 'seller_fav_count_']
log_buy_count = np.log1p(seller_overall['seller_buy_count_'])
for source_col in conversion_ls:
    conversion = log_buy_count - np.log1p(seller_overall[source_col])
    seller_overall[source_col + '_conversion'] = conversion
    seller_overall[source_col + 'conversion_diff'] = conversion - conversion.mean()

In [33]:
# Preview the merchant feature table including the conversion columns.
seller_overall.head()

Unnamed: 0,seller_id,seller_click_count_,seller_cart_count_,seller_buy_count_,seller_fav_count_,sellerTotalAction,seller_click_to_buy_ratio,seller_cart_to_buy_ratio,seller_fav_to_buy_ratio,seller_click_day_count,seller_add_day_count,seller_buy_day_count,seller_save_day_count,seller_click_count__conversion,seller_click_count_conversion_diff,seller_cart_count__conversion,seller_cart_count_conversion_diff,seller_fav_count__conversion,seller_fav_count_conversion_diff
0,1,308236.0,444.0,17705.0,12755.0,339140.0,-2.856965,3.683585,0.327902,176,55.0,185,185,-2.856965,-0.268369,3.683585,-0.01221,0.327902,0.107324
1,2,2030.0,8.0,189.0,144.0,2371.0,-2.369259,3.049799,0.27029,169,6.0,54,84,-2.369259,0.219337,3.049799,-0.645995,0.27029,0.049713
2,3,2399.0,4.0,67.0,175.0,2645.0,-3.563716,2.61007,-0.950976,168,4.0,31,79,-3.563716,-0.975119,2.61007,-1.085725,-0.950976,-1.171554
3,4,2646.0,2.0,294.0,164.0,3106.0,-2.194207,4.588363,0.58103,176,2.0,117,99,-2.194207,0.39439,4.588363,0.892568,0.58103,0.360452
4,5,7483.0,9.0,144.0,556.0,8192.0,-3.943789,2.674149,-1.345831,176,6.0,64,154,-3.943789,-1.355192,2.674149,-1.021646,-1.345831,-1.566409


In [35]:
# Write the merchant feature table to seller_overall.csv (path relative to the working dir).
# `index` is not disabled here, so the row index is also written as the first CSV column.
seller_overall.to_csv("seller_overall.csv")

### B. <user,merchant> features:
#### 1) Active user analysis: total/click/added to shopping cart/purchase/save to favourite

In [15]:
# Per <user, seller> pair (su_id): count of each action type plus the total number of actions.
# action_type codes are mapped as 0 -> click, 1 -> cart, 2 -> buy, 3 -> fav.
seller_user_action = user_datasample[['su_id', 'action_type']]
dummy_seller_user_action = pd.get_dummies(seller_user_action['action_type'])
# Only su_id and the indicator columns are aggregated. The raw action_type codes are left
# out, so there is no meaningless "sum of codes" column to drop afterwards (the old
# positional-axis drop() call is rejected by pandas 2.x). groupby already yields one row
# per su_id, so no de-duplication is needed.
seller_user_action = (
    pd.concat([seller_user_action[['su_id']], dummy_seller_user_action], axis=1)
    .groupby(['su_id'], as_index=False)
    .sum()
    .rename(columns={0: "su_click", 1: "su_cart", 2: "su_buy", 3: "su_fav"})
)
# Vectorized column sum instead of a Python-level row-wise apply.
seller_user_action['suTotalAction'] = seller_user_action[
    ["su_click", "su_cart", "su_buy", "su_fav"]
].sum(axis=1)

In [16]:
# Preview the per-<user, seller> action counts.
seller_user_action.head()

Unnamed: 0,su_id,su_click,su_cart,su_buy,su_fav,suTotalAction
0,100000_1042,1.0,0.0,0.0,0.0,1.0
1,100000_106,9.0,0.0,0.0,0.0,9.0
2,100000_1087,1.0,0.0,0.0,0.0,1.0
3,100000_1142,9.0,0.0,1.0,0.0,10.0
4,100000_1199,6.0,0.0,0.0,0.0,6.0


#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

In [17]:
# Share of each action type (click/cart/buy/fav) within a <user, seller> pair's total actions.
action_ls = ['su_click', 'su_cart', 'su_buy', 'su_fav']
su_total = seller_user_action['suTotalAction']
for action_col in action_ls:
    seller_user_action[action_col + '_ratio_'] = seller_user_action[action_col].div(su_total)

#### 3). Evaluate the level of activeness, count for the whole period & each month

In [18]:
# Split time_stamp into month (the value divided by 100, floored) and day (the remainder).
user_datasample['month'] = (user_datasample['time_stamp'] // 100).astype(int)
user_datasample['day'] = user_datasample['time_stamp'].mod(100)
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,su_id,sb_id,month,day
0,328862,323294,833,2882,2661.0,829,0,328862_2882,2661.0_2882,8,29
1,328862,844400,1271,2882,2661.0,829,0,328862_2882,2661.0_2882,8,29
2,328862,575153,1271,2882,2661.0,829,0,328862_2882,2661.0_2882,8,29
3,328862,996875,1271,2882,2661.0,829,0,328862_2882,2661.0_2882,8,29
4,328862,1086186,1271,1253,1049.0,829,0,328862_1253,1049.0_1253,8,29


In [23]:
# Stat: for each <user, seller> pair, the number of logged actions (all types) on each (month, day).
time_diff = user_datasample.loc[:, ['su_id', 'month', 'day', 'action_type', 'time_stamp']]
time_diff_stat = (
    time_diff.groupby(['su_id', 'month', 'day'])['action_type']
    .count()
    .to_frame()
)
time_diff_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,action_type
su_id,month,day,Unnamed: 3_level_1
100000_1042,8,27,1
100000_106,9,5,8
100000_106,9,7,1
100000_1087,9,27,1
100000_1142,9,27,6


In [24]:
# Flatten the grouped stats index, then one-hot encode action_type into
# action_type_<code> indicator columns.
time_diff_stat = time_diff_stat.reset_index()
time_diff = pd.get_dummies(data=time_diff, columns=["action_type"])
time_diff.head()

Unnamed: 0,su_id,month,day,time_stamp,action_type_0,action_type_1,action_type_2,action_type_3
0,328862_2882,8,29,829,1,0,0,0
1,328862_2882,8,29,829,1,0,0,0
2,328862_2882,8,29,829,1,0,0,0
3,328862_2882,8,29,829,1,0,0,0
4,328862_1253,8,29,829,1,0,0,0


In [25]:
# Give the one-hot action columns readable names.
action_labels = dict(zip(
    ["action_type_0", "action_type_1", "action_type_2", "action_type_3"],
    ["click", "add", "buy", "save"],
))
time_diff = time_diff.rename(columns=action_labels)

In [26]:
# Per <user, seller> pair: number of distinct time_stamp values (dates) with at least one click.
click = time_diff.loc[time_diff["click"].eq(1)]
click_day = (
    click.groupby(["su_id"])["time_stamp"]
    .nunique()
    .rename("su_click_day_count")
    .reset_index()
)
click_day.head()

Unnamed: 0,su_id,su_click_day_count
0,100000_1042,1
1,100000_106,2
2,100000_1087,1
3,100000_1142,4
4,100000_1199,2


In [27]:
# Per <user, seller> pair: number of distinct time_stamp values (dates) with at least one add-to-cart.
add = time_diff.loc[time_diff["add"].eq(1)]
add_day = (
    add.groupby(["su_id"])["time_stamp"]
    .nunique()
    .rename("su_add_day_count")
    .reset_index()
)
add_day.head()

Unnamed: 0,su_id,su_add_day_count
0,100005_4752,1
1,100005_4924,1
2,10002_1134,1
3,10002_1783,1
4,10002_1978,1


In [28]:
# Per <user, seller> pair: number of distinct time_stamp values (dates) with at least one purchase.
buy = time_diff.loc[time_diff["buy"].eq(1)]
buy_day = (
    buy.groupby(["su_id"])["time_stamp"]
    .nunique()
    .rename("su_buy_day_count")
    .reset_index()
)
buy_day.head()

Unnamed: 0,su_id,su_buy_day_count
0,100000_1142,1
1,100000_2337,1
2,100000_3050,1
3,100000_3200,2
4,100000_3286,1


In [29]:
# Per <user, seller> pair: number of distinct time_stamp values (dates) with at least one save-to-favourite.
save = time_diff.loc[time_diff["save"].eq(1)]
save_day = (
    save.groupby(["su_id"])["time_stamp"]
    .nunique()
    .rename("su_save_day_count")
    .reset_index()
)
save_day.head()

Unnamed: 0,su_id,su_save_day_count
0,100000_3183,1
1,100000_3200,1
2,100000_361,1
3,100000_4321,1
4,100002_915,1


In [30]:
# Start the active-day table: outer-join click days with add-to-cart days on su_id.
day_count = click_day.merge(add_day, how='outer', on='su_id')

In [32]:
# Add the purchase-day counts (outer join on su_id).
day_count = pd.merge(day_count, buy_day, how='outer', on='su_id')

In [34]:
# Add the save-to-favourite day counts (outer join on su_id).
day_count = pd.merge(day_count, save_day, how='outer', on='su_id')

In [36]:
# Pairs with no days for some action type get a count of 0 instead of NaN.
day_count = day_count.fillna(0)

In [38]:
# Attach the active-day counts to the per-<user, seller> action features (left join on su_id).
su_overall = seller_user_action.merge(day_count, how='left', on='su_id')
su_overall.head()

Unnamed: 0,su_id,su_click,su_cart,su_buy,su_fav,suTotalAction,su_click_ratio_,su_cart_ratio_,su_buy_ratio_,su_fav_ratio_,su_click_day_count,su_add_day_count,su_buy_day_count,su_save_day_count
0,100000_1042,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,100000_106,9.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
2,100000_1087,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,100000_1142,9.0,0.0,1.0,0.0,10.0,0.9,0.0,0.1,0.0,4.0,0.0,1.0,0.0
4,100000_1199,6.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [40]:
# Write the <user, seller> feature table to su_overall.csv (path relative to the working dir).
# `index` is not disabled here, so the row index is also written as the first CSV column.
su_overall.to_csv("su_overall.csv")

### C. <merchant, brand> features:
#### 1) Active user analysis: total/click/added to shopping cart/purchase/save to favourite

In [42]:
# Per <brand, seller> pair (sb_id): count of each action type plus the total number of actions.
# action_type codes are mapped as 0 -> click, 1 -> cart, 2 -> buy, 3 -> fav.
seller_brand_action = user_datasample[['sb_id', 'action_type']]
dummy_seller_brand_action = pd.get_dummies(seller_brand_action['action_type'])
# Only sb_id and the indicator columns are aggregated. The raw action_type codes are left
# out, so there is no meaningless "sum of codes" column to drop afterwards (the old
# positional-axis drop() call is rejected by pandas 2.x). groupby already yields one row
# per sb_id, so no de-duplication is needed.
seller_brand_action = (
    pd.concat([seller_brand_action[['sb_id']], dummy_seller_brand_action], axis=1)
    .groupby(['sb_id'], as_index=False)
    .sum()
    .rename(columns={0: "sb_click", 1: "sb_cart", 2: "sb_buy", 3: "sb_fav"})
)
# Vectorized column sum instead of a Python-level row-wise apply.
seller_brand_action['sbTotalAction'] = seller_brand_action[
    ["sb_click", "sb_cart", "sb_buy", "sb_fav"]
].sum(axis=1)

#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

In [44]:
# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['sb_click', 'sb_cart', 'sb_buy', 'sb_fav']
# Share of each action type within a <brand, seller> pair's total actions.
sb_total = seller_brand_action['sbTotalAction']
for action_col in action_ls:
    seller_brand_action[action_col + '_ratio_'] = seller_brand_action[action_col].div(sb_total)

#### 3). Evaluate the level of activeness, count for the whole period & each month

In [None]:
#extract month and day from time stamp
# user_datasample['month'] = np.floor(user_datasample['time_stamp']/100).astype(int)
# user_datasample['day'] = user_datasample['time_stamp']%100
# user_datasample.head()

In [46]:
# Stat: for each <brand, seller> pair, the number of logged actions (all types) on each (month, day).
time_diff = user_datasample.loc[:, ['sb_id', 'month', 'day', 'action_type', 'time_stamp']]
time_diff_stat = (
    time_diff.groupby(['sb_id', 'month', 'day'])['action_type']
    .count()
    .to_frame()
)
time_diff_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,action_type
sb_id,month,day,Unnamed: 3_level_1
1.0_1233,5,13,3
1.0_1233,5,14,2
1.0_1233,5,15,2
1.0_1233,5,17,2
1.0_1233,5,19,2


In [47]:
# Flatten the grouped stats index, then one-hot encode action_type into
# action_type_<code> indicator columns.
time_diff_stat = time_diff_stat.reset_index()
time_diff = pd.get_dummies(data=time_diff, columns=["action_type"])
time_diff.head()

Unnamed: 0,sb_id,month,day,time_stamp,action_type_0,action_type_1,action_type_2,action_type_3
0,2661.0_2882,8,29,829,1,0,0,0
1,2661.0_2882,8,29,829,1,0,0,0
2,2661.0_2882,8,29,829,1,0,0,0
3,2661.0_2882,8,29,829,1,0,0,0
4,1049.0_1253,8,29,829,1,0,0,0


In [48]:
# Give the one-hot action columns readable names.
action_labels = dict(zip(
    ["action_type_0", "action_type_1", "action_type_2", "action_type_3"],
    ["click", "add", "buy", "save"],
))
time_diff = time_diff.rename(columns=action_labels)

In [49]:
# Per <brand, seller> pair: number of distinct time_stamp values (dates) with at least one click.
click = time_diff.loc[time_diff["click"].eq(1)]
click_day = (
    click.groupby(["sb_id"])["time_stamp"]
    .nunique()
    .rename("sb_click_day_count")
    .reset_index()
)
click_day.head()

Unnamed: 0,sb_id,sb_click_day_count
0,1.0_1233,174
1,10.0_4883,2
2,100.0_2831,173
3,1000.0_4479,12
4,1001.0_4631,176


In [50]:
# Per <brand, seller> pair: number of distinct time_stamp values (dates) with at least one add-to-cart.
add = time_diff.loc[time_diff["add"].eq(1)]
add_day = (
    add.groupby(["sb_id"])["time_stamp"]
    .nunique()
    .rename("sb_add_day_count")
    .reset_index()
)
add_day.head()

Unnamed: 0,sb_id,sb_add_day_count
0,1.0_1233,6
1,100.0_2831,5
2,1001.0_4631,8
3,1002.0_3569,1
4,1003.0_1878,13


In [51]:
# Per <brand, seller> pair: number of distinct time_stamp values (dates) with at least one purchase.
buy = time_diff.loc[time_diff["buy"].eq(1)]
buy_day = (
    buy.groupby(["sb_id"])["time_stamp"]
    .nunique()
    .rename("sb_buy_day_count")
    .reset_index()
)
buy_day.head()

Unnamed: 0,sb_id,sb_buy_day_count
0,1.0_1233,85
1,100.0_2831,112
2,1000.0_4479,1
3,1001.0_4631,102
4,1002.0_3569,10


In [52]:
# Per <brand, seller> pair: number of distinct time_stamp values (dates) with at least one save-to-favourite.
save = time_diff.loc[time_diff["save"].eq(1)]
save_day = (
    save.groupby(["sb_id"])["time_stamp"]
    .nunique()
    .rename("sb_save_day_count")
    .reset_index()
)
save_day.head()

Unnamed: 0,sb_id,sb_save_day_count
0,1.0_1233,85
1,100.0_2831,115
2,1001.0_4631,149
3,1002.0_3569,13
4,1003.0_1878,127


In [53]:
# Start the active-day table: outer-join click days with add-to-cart days on sb_id.
day_count = click_day.merge(add_day, how='outer', on='sb_id')

In [54]:
# Add the purchase-day counts (outer join on sb_id).
day_count = pd.merge(day_count, buy_day, how='outer', on='sb_id')

In [55]:
# Add the save-to-favourite day counts (outer join on sb_id).
day_count = pd.merge(day_count, save_day, how='outer', on='sb_id')

In [56]:
# Pairs with no days for some action type get a count of 0 instead of NaN.
day_count = day_count.fillna(0)

In [58]:
# Attach the active-day counts to the per-<brand, seller> action features (left join on sb_id).
sb_overall = seller_brand_action.merge(day_count, how='left', on='sb_id')
sb_overall.head()

Unnamed: 0,sb_id,sb_click,sb_cart,sb_buy,sb_fav,sbTotalAction,sb_click_ratio_,sb_cart_ratio_,sb_buy_ratio_,sb_fav_ratio_,sb_click_day_count,sb_add_day_count,sb_buy_day_count,sb_save_day_count
0,1.0_1233,3882.0,6.0,316.0,153.0,4357.0,0.89098,0.001377,0.072527,0.035116,174.0,6.0,85.0,85.0
1,10.0_4883,2.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
2,100.0_2831,4120.0,8.0,361.0,276.0,4765.0,0.864638,0.001679,0.075761,0.057922,173.0,5.0,112.0,115.0
3,1000.0_4479,35.0,0.0,1.0,0.0,36.0,0.972222,0.0,0.027778,0.0,12.0,0.0,1.0,0.0
4,1001.0_4631,5806.0,9.0,236.0,399.0,6450.0,0.900155,0.001395,0.036589,0.06186,176.0,8.0,102.0,149.0


In [59]:
# Write the <brand, seller> feature table to sb_overall.csv (path relative to the working dir).
# `index` is not disabled here, so the row index is also written as the first CSV column.
sb_overall.to_csv("sb_overall.csv")