# User EDA

## Goal
- Activate user analysis: total/click/added to shopping cart/purchase/saved to favourite
- Individual user analysis: proportion/mean -> total/click/added to shopping cart/purchase/saved to favourite
- Conversion rate: click/saved to favourite/added to shopping cart -> purchase 

## Data cleaning

In [1]:
#import packages
import pandas as pd
import numpy as np

In [2]:
user_datasample = pd.read_csv('data_format1/user_log_format1.csv')
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [3]:
#find out the missing value in this file
user_datasample.isnull().sum()

user_id            0
item_id            0
cat_id             0
seller_id          0
brand_id       91015
time_stamp         0
action_type        0
dtype: int64

In [4]:
#get the mode brand id for all stores, fillin NA value in brain_id column
missing = user_datasample[user_datasample.brand_id.isnull()].index
seller = user_datasample.groupby(['seller_id']).apply(lambda x:x.brand_id.mode()[0]).reset_index()
get_brand = user_datasample.loc[missing]
get_brand = get_brand.merge(seller,how='left',on=['seller_id'])[0].astype('float32')
get_brand.index = missing
user_datasample.loc[missing,'brand_id'] = get_brand

In [5]:
#see whether null values are filled
user_datasample.isnull().sum()

user_id        0
item_id        0
cat_id         0
seller_id      0
brand_id       0
time_stamp     0
action_type    0
dtype: int64

In [6]:
#user info
user_info = pd.read_csv('data_format1/user_info_format1.csv')
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [7]:
#user info
user_info.isnull().sum()

user_id         0
age_range    2217
gender       6436
dtype: int64

In [8]:
user_info.age_range.fillna(user_info.age_range.median(),inplace=True)
user_info.gender.fillna(2,inplace=True)
#user info NA value exam
user_info.isnull().sum()

user_id      0
age_range    0
gender       0
dtype: int64

In [9]:
user_info = user_info[user_info['age_range'] != 0]

In [10]:
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [11]:
user_info.shape

(331256, 3)

In [12]:
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [13]:
user_datasample.shape

(54925330, 7)

In [14]:
#unique customer purchase history
user_datasample['user_id'].nunique()

424170

In [15]:
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


## Merge all user info

In [86]:
tot_user = user_datasample.merge(user_info, on="user_id", how="left")

In [87]:
tot_user.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender
0,328862,323294,833,2882,2661.0,829,0,6.0,1.0
1,328862,844400,1271,2882,2661.0,829,0,6.0,1.0
2,328862,575153,1271,2882,2661.0,829,0,6.0,1.0
3,328862,996875,1271,2882,2661.0,829,0,6.0,1.0
4,328862,1086186,1271,1253,1049.0,829,0,6.0,1.0


## Activate user analysis: total/click/added to shopping cart/purchase/save to favourite

In [88]:
#get dummies from tot_user.action_type, combine dummies with user_id, get the stat for each user_id
total_action = tot_user[["user_id", "action_type"]]
action_dummies = pd.get_dummies(total_action['action_type'])
total_action = pd.concat([total_action.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

In [89]:
#rename the columns
total_action.rename(columns={0: "click", 1:"add", 2:"buy", 3:"save"},inplace=True)

In [90]:
#calculate total action number for each unique user_id
total_action['userTotalAction'] = total_action.apply(lambda x: x["click"] + x["add"] + x["buy"] + x["save"], axis=1)

In [91]:
total_action.shape

(424170, 6)

In [92]:
total_action.head()

Unnamed: 0,user_id,click,add,buy,save,userTotalAction
0,1,27.0,0.0,6.0,0.0,33.0
1,2,47.0,0.0,14.0,2.0,63.0
2,3,63.0,0.0,4.0,1.0,68.0
3,4,49.0,0.0,1.0,0.0,50.0
4,5,150.0,0.0,13.0,10.0,173.0


## Individual user analysis: 
### proportion/mean -> total/click/add shopping cart/purchase/save to favourite

In [93]:
# Total number of action
tot_act = user_datasample.shape[0]
print('Total number of action: %f'%tot_act)
# Total number of users
tot_user = user_datasample['user_id'].nunique()
print('Total number of users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per user: %f'%avg_act)

Total number of action: 54925330.000000
Total number of users: 424170.000000
Average number of action per user: 129.000000


In [94]:
columns = ['click', 'add', 'buy','save']
for i in columns:
    #total number of each feature
    tot_feature = total_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = total_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

Total number of click is: 48550713.000000
Mean of click is: 114.460506 

Total number of add is: 76750.000000
Mean of add is: 0.180942 

Total number of buy is: 3292144.000000
Mean of buy is: 7.761379 

Total number of save is: 3005723.000000
Mean of save is: 7.086128 



In [95]:
# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['click','add','buy','save']
for i in action_ls:
    col_name = i + '_ratio_'
    total_action[col_name] = total_action[i] / total_action['userTotalAction']

In [96]:
total_action.head()

Unnamed: 0,user_id,click,add,buy,save,userTotalAction,click_ratio_,add_ratio_,buy_ratio_,save_ratio_
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706
3,4,49.0,0.0,1.0,0.0,50.0,0.98,0.0,0.02,0.0
4,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803


## Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

In [97]:
# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['click', 'add', 'save']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    total_action[conversion_name] = np.log1p(total_action['buy']) - np.log1p(total_action[i])
    total_action[conversion_diff_name] = total_action[conversion_name] - total_action[conversion_name].mean()


In [98]:
total_action.head()

Unnamed: 0,user_id,click,add,buy,save,userTotalAction,click_ratio_,add_ratio_,buy_ratio_,save_ratio_,click_conversion,clickconversion_diff,add_conversion,addconversion_diff,save_conversion,saveconversion_diff
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,-1.386294,0.88769,1.94591,0.126755,1.94591,1.11287
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,-1.163151,1.110834,2.70805,0.888895,1.609438,0.776398
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,-2.549445,-0.275461,1.609438,-0.209717,0.916291,0.08325
3,4,49.0,0.0,1.0,0.0,50.0,0.98,0.0,0.02,0.0,-3.218876,-0.944891,0.693147,-1.126008,0.693147,-0.139893
4,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,-2.378223,-0.104238,2.639057,0.819902,0.241162,-0.591878


In [99]:
total_action.columns

Index([u'user_id', u'click', u'add', u'buy', u'save', u'userTotalAction',
       u'click_ratio_', u'add_ratio_', u'buy_ratio_', u'save_ratio_',
       u'click_conversion', u'clickconversion_diff', u'add_conversion',
       u'addconversion_diff', u'save_conversion', u'saveconversion_diff'],
      dtype='object')