# User EDA

## Goal
- Activate user analysis: total/click/added to shopping cart/purchase/saved to favourite
- Individual user analysis: proportion/mean -> total/click/added to shopping cart/purchase/saved to favourite
- Conversion rate: click/saved to favourite/added to shopping cart -> purchase 

## Data cleaning

In [69]:
#import packages
import pandas as pd
import numpy as np

In [2]:
user_datasample = pd.read_csv('data_format1/user_log_format1.csv')
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [3]:
#find out the missing value in this file
user_datasample.isnull().sum()

user_id            0
item_id            0
cat_id             0
seller_id          0
brand_id       91015
time_stamp         0
action_type        0
dtype: int64

In [4]:
#get the mode brand id for all stores, fillin NA value in brain_id column
missing = user_datasample[user_datasample.brand_id.isnull()].index
seller = user_datasample.groupby(['seller_id']).apply(lambda x:x.brand_id.mode()[0]).reset_index()
get_brand = user_datasample.loc[missing]
get_brand = get_brand.merge(seller,how='left',on=['seller_id'])[0].astype('float32')
get_brand.index = missing
user_datasample.loc[missing,'brand_id'] = get_brand

In [5]:
#see whether null values are filled
user_datasample.isnull().sum()

user_id        0
item_id        0
cat_id         0
seller_id      0
brand_id       0
time_stamp     0
action_type    0
dtype: int64

In [6]:
#user info
user_info = pd.read_csv('data_format1/user_info_format1.csv')
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [7]:
#user info
user_info.isnull().sum()

user_id         0
age_range    2217
gender       6436
dtype: int64

In [8]:
user_info.age_range.fillna(user_info.age_range.median(),inplace=True)
user_info.gender.fillna(2,inplace=True)
#user info NA value exam
user_info.isnull().sum()

user_id      0
age_range    0
gender       0
dtype: int64

In [9]:
user_info = user_info[user_info['age_range'] != 0]

In [10]:
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [11]:
user_info.shape

(331256, 3)

In [12]:
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [13]:
user_datasample.shape

(54925330, 7)

In [14]:
#unique customer purchase history
user_datasample['user_id'].nunique()

424170

In [15]:
df_age = pd.get_dummies(user_info.age_range,prefix='age')# 对age进行哑编码
df_sex = pd.get_dummies(user_info.gender)# 对gender进行哑编码并改变列名
df_sex.rename(columns={0:'female',1:'male',2:'unknown'},inplace=True)
user_info = pd.concat([user_info.user_id, df_age, df_sex], axis=1)# 整合user信息

In [16]:
user_info.head()

Unnamed: 0,user_id,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_6.0,age_7.0,age_8.0,female,male,unknown
0,376517,0,0,0,0,0,1,0,0,0,1,0
1,234512,0,0,0,0,1,0,0,0,1,0,0
2,344532,0,0,0,0,1,0,0,0,1,0,0
3,186135,0,0,0,0,1,0,0,0,1,0,0
4,30230,0,0,0,0,1,0,0,0,1,0,0


## Merge all user info

In [17]:
tot_user = user_datasample
tot_user = tot_user.merge(user_info, on="user_id", how="left")

In [18]:
tot_user.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_6.0,age_7.0,age_8.0,female,male,unknown
0,328862,323294,833,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,328862,844400,1271,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,328862,575153,1271,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,328862,996875,1271,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,328862,1086186,1271,1253,1049.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Activate user analysis: total/click/added to shopping cart/purchase/save to favourite

In [96]:
total_action = tot_user[["user_id", "action_type"]]

In [97]:
df = pd.get_dummies(total_action['action_type'],prefix='userTotalAction')

In [98]:
total_action = pd.concat([total_action.user_id, df], axis=1).groupby(['user_id'], as_index=False).sum()

In [99]:
total_action.rename(columns={"userTotalAction_0": "click", "userTotalAction_1":"shopping_cart", 
                            "userTotalAction_2":"purchased", "userTotalAction_3":"save"},inplace=True)

In [101]:
# df['variance'] = df.apply(lambda x: x['budget'] + x['actual'], axis=1)
total_action['userTotalAction'] = total_action.apply(
lambda x: x["click"] + x["shopping_cart"] + x["purchased"] + x["save"], axis=1)

In [102]:
total_action.shape

(424170, 6)

In [103]:
total_action.head()

Unnamed: 0,user_id,click,shopping_cart,purchased,save,userTotalAction
0,1,27.0,0.0,6.0,0.0,33.0
1,2,47.0,0.0,14.0,2.0,63.0
2,3,63.0,0.0,4.0,1.0,68.0
3,4,49.0,0.0,1.0,0.0,50.0
4,5,150.0,0.0,13.0,10.0,173.0


## Individual user analysis: 
### proportion/mean -> total/click/add shopping cart/purchase/save to favourite

In [109]:
# Total number of action
tot_act = user_datasample.shape[0]
print('Total number of action: %f'%tot_act)
# Total number of users
tot_user = user_datasample['user_id'].nunique()
print('Total number of users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per user: %f'%avg_act)

Total number of action: 54925330.000000
Total number of users: 424170.000000
Average number of action per user: 129.488955


In [112]:
# Total number of clicked
tot_clicked = total_action.click.sum()
print("Total number of clicked: %f"%tot_clicked)
#Average number of clicked
avg_clicked = total_action.click.mean()
print("Average number of clicked: %f"%avg_clicked)
total_action['userTotalClickDiff'] = total_action.click-avg_clicked

Total number of clicked: 48550713.000000
Average number of clicked: 114.460506


In [114]:
# Total number of added
tot_added = total_action.shopping_cart.sum()
print("Total number of added: %f"%tot_added)
#Average number of added
avg_added = total_action.shopping_cart.mean()
print("Average number of added: %f"%avg_added)
total_action['userTotalAddDiff'] = total_action.shopping_cart-avg_added

Total number of added: 76750.000000
Average number of added: 0.180942


In [117]:
# Total number of purchased
tot_buy = total_action.purchased.sum()
print("Total number of purchased: %f"%tot_buy)
#Average number of purchased
avg_buy = total_action.purchased.mean()
print("Average number of purchased: %f"%avg_buy)
total_action['userTotalBuyDiff'] = total_action.purchased-avg_buy

Total number of purchased: 3292144.000000
Average number of purchased: 7.761379


In [118]:
# Total number of saved to favourite
tot_saved = total_action.save.sum()
print("Total number of saved to favourite: %f"%tot_saved)
# Average number of saved to favourite
avg_saved = total_action.save.mean()
print("Average number of saved to favourite: %f"%avg_saved)
total_action['useTotalSaveDiff'] = total_action.save - avg_saved

Total number of saved to favourite: 3005723.000000
Average number of saved to favourite: 7.086128


In [119]:
# proportion of click/add/purchased/save in terms of per individual user
total_action['Click_ratio_'] = total_action['click']/total_action['userTotalAction']
total_action['Add_ratio_'] = total_action['shopping_cart']/total_action['userTotalAction']
total_action['Buy_ratio_'] = total_action['purchased']/total_action['userTotalAction']
total_action['Save_ratio_'] = total_action['save']/total_action['userTotalAction']

## Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

In [122]:
# Converstion rate
total_action['Click_conversion'] = np.log1p(total_action['purchased']) - np.log1p(total_action['click'])
total_action['Click_conversion_diff'] = total_action['Click_ratio_'] - total_action['Click_ratio_'].mean()
total_action['Add_conversion'] = np.log1p(total_action['purchased']) - np.log1p(total_action['shopping_cart'])
total_action['Add_conversion_diff'] = total_action['Add_ratio_'] - total_action['Add_ratio_'].mean()
total_action['Save_conversion'] = np.log1p(total_action['purchased']) - np.log1p(total_action['save'])
total_action['Save_conversion_diff'] = total_action['Save_ratio_'] - total_action['Save_ratio_'].mean()
total_action.head()

Unnamed: 0,user_id,click,shopping_cart,purchased,save,userTotalAction,userTotalClickDiff,userTotalAddDiff,userTotalBuyDiff,useTotalSaveDiff,...,Add_ratio_,Buy_ratio_,Save_ratio_,userClick_ratio,Click_conversion,Click_conversion_diff,Add_conversion,Add_conversion_diff,Save_conversion,Save_conversion_diff
0,1,27.0,0.0,6.0,0.0,33.0,-87.460506,-0.180942,-1.761379,-7.086128,...,0.0,0.181818,0.0,-21.0,-1.386294,-0.031228,1.94591,-0.002712,1.94591,-0.049846
1,2,47.0,0.0,14.0,2.0,63.0,-67.460506,-0.180942,6.238621,-5.086128,...,0.0,0.222222,0.031746,-33.0,-1.163151,-0.103378,2.70805,-0.002712,1.609438,-0.0181
2,3,63.0,0.0,4.0,1.0,68.0,-51.460506,-0.180942,-3.761379,-6.086128,...,0.0,0.058824,0.014706,-59.0,-2.549445,0.077061,1.609438,-0.002712,0.916291,-0.03514
3,4,49.0,0.0,1.0,0.0,50.0,-65.460506,-0.180942,-6.761379,-7.086128,...,0.0,0.02,0.0,-48.0,-3.218876,0.13059,0.693147,-0.002712,0.693147,-0.049846
4,5,150.0,0.0,13.0,10.0,173.0,35.539494,-0.180942,5.238621,2.913872,...,0.0,0.075145,0.057803,-137.0,-2.378223,0.017642,2.639057,-0.002712,0.241162,0.007957


In [129]:
total_action.columns

Index(['user_id', 'click', 'shopping_cart', 'purchased', 'save',
       'userTotalAction', 'userTotalClickDiff', 'userTotalAddDiff',
       'userTotalBuyDiff', 'useTotalSaveDiff', 'Click_ratio_', 'Add_ratio_',
       'Buy_ratio_', 'Save_ratio_', 'userClick_ratio', 'Click_conversion',
       'Click_conversion_diff', 'Add_conversion', 'Add_conversion_diff',
       'Save_conversion', 'Save_conversion_diff'],
      dtype='object')