# Part I. Data cleaning

In [297]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [2]:
user_datasample = pd.read_csv('data_format1/user_log_format1.csv')
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [3]:
#find out the missing value in this file
user_datasample.isnull().sum()

user_id            0
item_id            0
cat_id             0
seller_id          0
brand_id       91015
time_stamp         0
action_type        0
dtype: int64

In [4]:
#get the mode brand id for all stores, fillin NA value in brain_id column
missing = user_datasample[user_datasample.brand_id.isnull()].index
seller = user_datasample.groupby(['seller_id']).apply(lambda x:x.brand_id.mode()[0]).reset_index()
get_brand = user_datasample.loc[missing]
get_brand = get_brand.merge(seller,how='left',on=['seller_id'])[0].astype('float32')
get_brand.index = missing
user_datasample.loc[missing,'brand_id'] = get_brand

In [5]:
#see whether null values are filled
user_datasample.isnull().sum()

user_id        0
item_id        0
cat_id         0
seller_id      0
brand_id       0
time_stamp     0
action_type    0
dtype: int64

In [6]:
#user info
user_info = pd.read_csv('data_format1/user_info_format1.csv')
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [7]:
#user info
user_info.isnull().sum()

user_id         0
age_range    2217
gender       6436
dtype: int64

In [8]:
user_info.age_range.fillna(user_info.age_range.median(),inplace=True)
user_info.gender.fillna(2,inplace=True)
#user info NA value exam
user_info.isnull().sum()

user_id      0
age_range    0
gender       0
dtype: int64

In [9]:
user_info = user_info[user_info['age_range'] != 0]

In [10]:
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [11]:
#unique customer purchase history
user_datasample['user_id'].nunique()

424170

## Merge all user info

In [12]:
df_age = pd.get_dummies(user_info.age_range,prefix='age')
df_gender = pd.get_dummies(user_info.gender)
df_gender.rename(columns={0:'female',1:'male',2:'unknown'},inplace=True)
user_info = pd.concat([user_info.user_id, df_age, df_gender], axis=1)
del df_age, df_gender

In [13]:
user_info.isnull().sum()

user_id    0
age_1.0    0
age_2.0    0
age_3.0    0
age_4.0    0
age_5.0    0
age_6.0    0
age_7.0    0
age_8.0    0
female     0
male       0
unknown    0
dtype: int64

In [14]:
tot_user_df = user_datasample.merge(user_info, on="user_id", how="left")

In [15]:
tot_user_df.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_6.0,age_7.0,age_8.0,female,male,unknown
0,328862,323294,833,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,328862,844400,1271,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,328862,575153,1271,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,328862,996875,1271,2882,2661.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,328862,1086186,1271,1253,1049.0,829,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


# Part II. Feature Engineering:
### A. user features: 
#### 1) Activate user analysis: total/click/added to shopping cart/purchase/save to favourite

In [16]:
#get dummies from tot_user.action_type, combine dummies with user_id, get the stat for each user_id
total_action = tot_user_df[["user_id", "action_type"]]
action_dummies = pd.get_dummies(total_action['action_type'])
total_action = pd.concat([total_action.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

In [17]:
#rename the columns
total_action.rename(columns={0: "click", 1:"add", 2:"buy", 3:"save"},inplace=True)

In [18]:
#calculate total action number for each unique user_id
total_action['userTotalAction'] = total_action.apply(lambda x: x["click"] + x["add"] + x["buy"] + x["save"], axis=1)

In [19]:
total_action.shape

(424170, 6)

In [20]:
total_action.head()

Unnamed: 0,user_id,click,add,buy,save,userTotalAction
0,1,27.0,0.0,6.0,0.0,33.0
1,2,47.0,0.0,14.0,2.0,63.0
2,3,63.0,0.0,4.0,1.0,68.0
3,4,49.0,0.0,1.0,0.0,50.0
4,5,150.0,0.0,13.0,10.0,173.0


#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

In [21]:
# Total number of action
tot_act = user_datasample.shape[0]
print('Total number of action: %f'%tot_act)
# Total number of users
tot_user = user_datasample['user_id'].nunique()
print('Total number of users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per user: %f'%avg_act)

Total number of action: 54925330.000000
Total number of users: 424170.000000
Average number of action per user: 129.488955


In [22]:
columns = ['click', 'add', 'buy','save']
for i in columns:
    #total number of each feature
    tot_feature = total_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = total_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

Total number of click is: 48550713.000000
Mean of click is: 114.460506 

Total number of add is: 76750.000000
Mean of add is: 0.180942 

Total number of buy is: 3292144.000000
Mean of buy is: 7.761379 

Total number of save is: 3005723.000000
Mean of save is: 7.086128 



In [23]:
# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['click','add','buy','save']
for i in action_ls:
    col_name = i + '_ratio_'
    total_action[col_name] = total_action[i] / total_action['userTotalAction']

In [24]:
total_action.head()

Unnamed: 0,user_id,click,add,buy,save,userTotalAction,click_ratio_,add_ratio_,buy_ratio_,save_ratio_
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706
3,4,49.0,0.0,1.0,0.0,50.0,0.98,0.0,0.02,0.0
4,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803


In [25]:
total_action.rename(columns = {"click":"click_count_", "add":"add_count_",
                              "buy":"buy_count_","save":"save_count_"}, inplace=True)
total_action.head()

Unnamed: 0,user_id,click_count_,add_count_,buy_count_,save_count_,userTotalAction,click_ratio_,add_ratio_,buy_ratio_,save_ratio_
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706
3,4,49.0,0.0,1.0,0.0,50.0,0.98,0.0,0.02,0.0
4,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803


In [27]:
total_action.shape

(424170, 10)

In [30]:
total_action.user_id.nunique()

424170

#### 3). Evaluate the level of activeness, count for the whole period & each month

In [26]:
#extract month and day from time stamp
user_datasample['month'] = np.floor(user_datasample['time_stamp']/100).astype(int)
user_datasample['day'] = user_datasample['time_stamp']%100
user_datasample.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,month,day
0,328862,323294,833,2882,2661.0,829,0,8,29
1,328862,844400,1271,2882,2661.0,829,0,8,29
2,328862,575153,1271,2882,2661.0,829,0,8,29
3,328862,996875,1271,2882,2661.0,829,0,8,29
4,328862,1086186,1271,1253,1049.0,829,0,8,29


In [41]:
month_df = user_datasample[["user_id","month"]]
month_dummies = pd.get_dummies(month_df['month'])
month_df = pd.concat([month_df.user_id, month_dummies], axis=1).groupby(['user_id'],
                                                                       as_index=False).sum()

In [42]:
month_df.rename(columns={5:"May", 6:"June", 7:"July", 8:"August", 9:"September", 10:"October",
                        11:"November"}, inplace=True)
month_df.head()

Unnamed: 0,user_id,May,June,July,August,September,October,November
0,1,0.0,0.0,0.0,0.0,0.0,16.0,17.0
1,2,2.0,26.0,1.0,20.0,0.0,5.0,9.0
2,3,2.0,3.0,1.0,3.0,22.0,5.0,32.0
3,4,12.0,0.0,12.0,0.0,7.0,3.0,16.0
4,5,16.0,21.0,3.0,37.0,12.0,52.0,32.0


In [43]:
total_action = total_action.merge(month_df, how="left", on="user_id")

In [46]:
#stat: for each user, in how many days he has action(all types)
time_diff = user_datasample[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,action_type
user_id,month,day,Unnamed: 3_level_1
1,10,9,5
1,10,11,4
1,10,18,5
1,10,21,2
1,11,11,17


In [48]:
time_diff.head()

Unnamed: 0,user_id,month,day,action_type,time_stamp
0,328862,8,29,0,829
1,328862,8,29,0,829
2,328862,8,29,0,829
3,328862,8,29,0,829
4,328862,8,29,0,829


In [53]:
time_diff_stat.reset_index(inplace=True)

In [55]:
time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

In [56]:
time_diff.head()

Unnamed: 0,user_id,month,day,time_stamp,action_type_0,action_type_1,action_type_2,action_type_3
0,328862,8,29,829,1,0,0,0
1,328862,8,29,829,1,0,0,0
2,328862,8,29,829,1,0,0,0
3,328862,8,29,829,1,0,0,0
4,328862,8,29,829,1,0,0,0


In [57]:
time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

In [58]:
time_diff.head()

Unnamed: 0,user_id,month,day,time_stamp,click,add,buy,save
0,328862,8,29,829,1,0,0,0
1,328862,8,29,829,1,0,0,0
2,328862,8,29,829,1,0,0,0
3,328862,8,29,829,1,0,0,0
4,328862,8,29,829,1,0,0,0


In [67]:
click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"click_day_count"},inplace=True)
click_day.head()

Unnamed: 0,user_id,click_day_count
0,1,5
1,2,5
2,3,12
3,4,10
4,5,29


In [70]:
click_day.shape

(423862, 2)

In [69]:
add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"add_day_count"}, inplace=True)
add_day.head()

Unnamed: 0,user_id,add_day_count
0,18,1
1,23,2
2,41,1
3,43,1
4,46,1


In [71]:
add_day.shape

(31044, 2)

In [72]:
buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"buy_day_count"}, inplace=True)
buy_day.head()

Unnamed: 0,user_id,buy_day_count
0,1,3
1,2,5
2,3,3
3,4,1
4,5,7


In [73]:
save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"save_day_count"}, inplace=True)
save_day.head()

Unnamed: 0,user_id,save_day_count
0,2,2
1,3,1
2,5,9
3,6,15
4,8,4


In [77]:
day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')

In [79]:
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')

In [80]:
day_count = day_count.merge(save_day, how='outer', on='user_id')

In [82]:
day_count.fillna(0, inplace=True)

In [86]:
user_overall = pd.merge(total_action, day_count, how = 'left', on='user_id')
user_overall.head()

Unnamed: 0,user_id,click_count_,add_count_,buy_count_,save_count_,userTotalAction,click_ratio_,add_ratio_,buy_ratio_,save_ratio_,...,June,July,August,September,October,November,click_day_count,add_day_count,buy_day_count,save_day_count
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,...,0.0,0.0,0.0,0.0,16.0,17.0,5.0,0.0,3,0.0
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,...,26.0,1.0,20.0,0.0,5.0,9.0,5.0,0.0,5,2.0
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,...,3.0,1.0,3.0,22.0,5.0,32.0,12.0,0.0,3,1.0
3,4,49.0,0.0,1.0,0.0,50.0,0.98,0.0,0.02,0.0,...,0.0,12.0,0.0,7.0,3.0,16.0,10.0,0.0,1,0.0
4,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,...,21.0,3.0,37.0,12.0,52.0,32.0,29.0,0.0,7,9.0


#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

In [90]:
# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['click_count_', 'add_count_', 'save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    user_overall[conversion_name] = np.log1p(user_overall['buy_count_']) - np.log1p(user_overall[i])
    user_overall[conversion_diff_name] = user_overall[conversion_name] - user_overall[conversion_name].mean()

In [92]:
user_overall.head()

Unnamed: 0,user_id,click_count_,add_count_,buy_count_,save_count_,userTotalAction,click_ratio_,add_ratio_,buy_ratio_,save_ratio_,...,click_day_count,add_day_count,buy_day_count,save_day_count,click_count__conversion,click_count_conversion_diff,add_count__conversion,add_count_conversion_diff,save_count__conversion,save_count_conversion_diff
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,...,5.0,0.0,3,0.0,-1.386294,0.88769,1.94591,0.126755,1.94591,1.11287
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,...,5.0,0.0,5,2.0,-1.163151,1.110834,2.70805,0.888895,1.609438,0.776398
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,...,12.0,0.0,3,1.0,-2.549445,-0.275461,1.609438,-0.209717,0.916291,0.08325
3,4,49.0,0.0,1.0,0.0,50.0,0.98,0.0,0.02,0.0,...,10.0,0.0,1,0.0,-3.218876,-0.944891,0.693147,-1.126008,0.693147,-0.139893
4,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,...,29.0,0.0,7,9.0,-2.378223,-0.104238,2.639057,0.819902,0.241162,-0.591878


### B. user age features: 
#### 1) Activate user analysis: total/click/added to shopping cart/purchase/save to favourite

In [223]:
tot_user_df.columns

Index(['user_id', 'item_id', 'cat_id', 'seller_id', 'brand_id', 'time_stamp',
       'action_type', 'age_1.0', 'age_2.0', 'age_3.0', 'age_4.0', 'age_5.0',
       'age_6.0', 'age_7.0', 'age_8.0', 'female', 'male', 'unknown'],
      dtype='object')

In [224]:
age1 = tot_user_df[tot_user_df['age_1.0'] == 1]

In [225]:
age1_action = age1[["user_id", "action_type"]]
action_dummies = pd.get_dummies(age1['action_type'])
age1_action = pd.concat([age1.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

In [226]:
age1_action.rename(columns={0: "age1_click_count_", 1:"age1_add_count_", 2:"age1_buy_count_", 3:"age1_save_count_"},
                   inplace=True)

In [227]:
age1_action['age1TotalAction'] = age1_action.apply(lambda x: x["age1_click_count_"] + x["age1_add_count_"] + x["age1_buy_count_"] + x["age1_save_count_"], axis=1)

In [228]:
age1_action.head()

Unnamed: 0,user_id,age1_click_count_,age1_add_count_,age1_buy_count_,age1_save_count_,age1TotalAction
0,10007,31,0,2,4,37
1,28694,57,0,4,0,61
2,30065,27,0,2,2,31
3,32372,41,0,3,0,44
4,40376,38,0,5,2,45


#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

In [229]:
# Total number of action
tot_act = age1.shape[0]
print('Total number of age1 action: %f'%tot_act)
# Total number of users
tot_user = age1['user_id'].nunique()
print('Total number of age1 users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per age1 user: %f'%avg_act)

Total number of age1 action: 1721.000000
Total number of age1 users: 24.000000
Average number of action per age1 user: 71.708333


In [230]:
columns = ['age1_click_count_', 'age1_add_count_', 'age1_buy_count_','age1_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = age1_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = age1_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

Total number of age1_click_count_ is: 1512.000000
Mean of age1_click_count_ is: 63.000000 

Total number of age1_add_count_ is: 2.000000
Mean of age1_add_count_ is: 0.083333 

Total number of age1_buy_count_ is: 96.000000
Mean of age1_buy_count_ is: 4.000000 

Total number of age1_save_count_ is: 111.000000
Mean of age1_save_count_ is: 4.625000 



In [231]:
# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['age1_click_count_','age1_add_count_','age1_buy_count_','age1_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    age1_action[col_name] = age1_action[i] / age1_action['age1TotalAction']

In [232]:
age1_action.head()

Unnamed: 0,user_id,age1_click_count_,age1_add_count_,age1_buy_count_,age1_save_count_,age1TotalAction,age1_click_count__ratio_,age1_add_count__ratio_,age1_buy_count__ratio_,age1_save_count__ratio_
0,10007,31,0,2,4,37,0.837838,0.0,0.054054,0.108108
1,28694,57,0,4,0,61,0.934426,0.0,0.065574,0.0
2,30065,27,0,2,2,31,0.870968,0.0,0.064516,0.064516
3,32372,41,0,3,0,44,0.931818,0.0,0.068182,0.0
4,40376,38,0,5,2,45,0.844444,0.0,0.111111,0.044444


#### 3). Evaluate the level of activeness, count for the whole period & each month

In [233]:
#extract month and day from time stamp
age1['month'] = np.floor(age1['time_stamp']/100).astype(int)
age1['day'] = age1['time_stamp']%100
age1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_6.0,age_7.0,age_8.0,female,male,unknown,month,day
3377039,202352,704631,1429,4629,2821.0,711,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7,11
3377040,202352,498966,1397,3649,7333.0,527,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5,27
3377041,202352,548821,1577,3649,7333.0,527,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5,27
3377042,202352,722125,737,459,2754.0,520,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5,20
3377043,202352,95233,1612,3623,2206.0,920,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,20


In [234]:
#stat: for each user, in how many days he has action(all types)
time_diff = age1[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,action_type
user_id,month,day,Unnamed: 3_level_1
10007,8,12,1
10007,8,14,5
10007,8,23,1
10007,9,4,2
10007,9,5,2


In [235]:
time_diff.head()

Unnamed: 0,user_id,month,day,action_type,time_stamp
3377039,202352,7,11,0,711
3377040,202352,5,27,0,527
3377041,202352,5,27,0,527
3377042,202352,5,20,3,520
3377043,202352,9,20,3,920


In [236]:
time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

Unnamed: 0,user_id,month,day,time_stamp,action_type_0,action_type_1,action_type_2,action_type_3
3377039,202352,7,11,711,1,0,0,0
3377040,202352,5,27,527,1,0,0,0
3377041,202352,5,27,527,1,0,0,0
3377042,202352,5,20,520,0,0,0,1
3377043,202352,9,20,920,0,0,0,1


In [237]:
time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

Unnamed: 0,user_id,month,day,time_stamp,click,add,buy,save
3377039,202352,7,11,711,1,0,0,0
3377040,202352,5,27,527,1,0,0,0
3377041,202352,5,27,527,1,0,0,0
3377042,202352,5,20,520,0,0,0,1
3377043,202352,9,20,920,0,0,0,1


In [238]:
click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"age1_click_day_count"},inplace=True)
click_day.head()

Unnamed: 0,user_id,age1_click_day_count
0,10007,11
1,28694,8
2,30065,6
3,32372,9
4,40376,10


In [239]:
add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"age1_add_day_count"}, inplace=True)
add_day.head()

Unnamed: 0,user_id,age1_add_day_count
0,96093,2


In [240]:
buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"age1_buy_day_count"}, inplace=True)
buy_day.head()

Unnamed: 0,user_id,age1_buy_day_count
0,10007,2
1,28694,1
2,30065,1
3,32372,3
4,40376,4


In [241]:
save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"age1_save_day_count"}, inplace=True)
save_day.head()

Unnamed: 0,user_id,age1_save_day_count
0,10007,3
1,30065,1
2,40376,1
3,47745,11
4,49071,3


In [242]:
day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

In [243]:
age1_overall = pd.merge(age1_action, day_count, how = 'left', on='user_id')
age1_overall.head()

Unnamed: 0,user_id,age1_click_count_,age1_add_count_,age1_buy_count_,age1_save_count_,age1TotalAction,age1_click_count__ratio_,age1_add_count__ratio_,age1_buy_count__ratio_,age1_save_count__ratio_,age1_click_day_count,age1_add_day_count,age1_buy_day_count,age1_save_day_count
0,10007,31,0,2,4,37,0.837838,0.0,0.054054,0.108108,11,0.0,2,3.0
1,28694,57,0,4,0,61,0.934426,0.0,0.065574,0.0,8,0.0,1,0.0
2,30065,27,0,2,2,31,0.870968,0.0,0.064516,0.064516,6,0.0,1,1.0
3,32372,41,0,3,0,44,0.931818,0.0,0.068182,0.0,9,0.0,3,0.0
4,40376,38,0,5,2,45,0.844444,0.0,0.111111,0.044444,10,0.0,4,1.0


#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

In [244]:
# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['age1_click_count_', 'age1_add_count_', 'age1_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    age1_overall[conversion_name] = np.log1p(age1_overall['age1_buy_count_']) - np.log1p(age1_overall[i])
    age1_overall[conversion_diff_name] = age1_overall[conversion_name] - age1_overall[conversion_name].mean()

In [245]:
age1_overall.head()

Unnamed: 0,user_id,age1_click_count_,age1_add_count_,age1_buy_count_,age1_save_count_,age1TotalAction,age1_click_count__ratio_,age1_add_count__ratio_,age1_buy_count__ratio_,age1_save_count__ratio_,age1_click_day_count,age1_add_day_count,age1_buy_day_count,age1_save_day_count,age1_click_count__conversion,age1_click_count_conversion_diff,age1_add_count__conversion,age1_add_count_conversion_diff,age1_save_count__conversion,age1_save_count_conversion_diff
0,10007,31,0,2,4,37,0.837838,0.0,0.054054,0.108108,11,0.0,2,3.0,-2.367188,0.001953,1.098633,-0.307617,-0.510742,-0.726074
1,28694,57,0,4,0,61,0.934426,0.0,0.065574,0.0,8,0.0,1,0.0,-2.449219,-0.080078,1.609375,0.203125,1.609375,1.393555
2,30065,27,0,2,2,31,0.870968,0.0,0.064516,0.064516,6,0.0,1,1.0,-2.234375,0.134766,1.098633,-0.307617,0.0,-0.215454
3,32372,41,0,3,0,44,0.931818,0.0,0.068182,0.0,9,0.0,3,0.0,-2.351562,0.017578,1.386719,-0.019531,1.386719,1.170898
4,40376,38,0,5,2,45,0.844444,0.0,0.111111,0.044444,10,0.0,4,1.0,-1.87207,0.49707,1.791992,0.385742,0.693359,0.478027


In [246]:
age1_overall.shape

(24, 20)

In [247]:
age2 = tot_user_df[tot_user_df['age_2.0'] == 1]
age2_action = age2[["user_id", "action_type"]]
action_dummies = pd.get_dummies(age2['action_type'])
age2_action = pd.concat([age2.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()
age2_action.rename(columns={0: "age2_click_count_", 1:"age2_add_count_", 2:"age2_buy_count_", 3:"age2_save_count_"},
                   inplace=True)
age2_action['age2TotalAction'] = age2_action.apply(lambda x: x["age2_click_count_"] + x["age2_add_count_"] + x["age2_buy_count_"] + x["age2_save_count_"], axis=1)
age2_action.head()

Unnamed: 0,user_id,age2_click_count_,age2_add_count_,age2_buy_count_,age2_save_count_,age2TotalAction
0,13,19.0,0.0,2.0,0.0,21.0
1,17,30.0,0.0,5.0,5.0,40.0
2,36,46.0,0.0,3.0,1.0,50.0
3,37,36.0,0.0,6.0,0.0,42.0
4,40,89.0,0.0,29.0,12.0,130.0


In [248]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite
# Total number of action
tot_act = age2.shape[0]
print('Total number of age2 action: %f'%tot_act)
# Total number of users
tot_user = age2['user_id'].nunique()
print('Total number of age2 users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per age2 user: %f'%avg_act)

columns = ['age2_click_count_', 'age2_add_count_', 'age2_buy_count_','age2_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = age2_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = age2_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['age2_click_count_','age2_add_count_','age2_buy_count_','age2_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    age2_action[col_name] = age2_action[i] / age2_action['age2TotalAction']

age2_action.head()

Total number of age2 action: 5385020.000000
Total number of age2 users: 52871.000000
Average number of action per age2 user: 101.852055
Total number of age2_click_count_ is: 4694010.000000
Mean of age2_click_count_ is: 88.782319 

Total number of age2_add_count_ is: 7230.000000
Mean of age2_add_count_ is: 0.136748 

Total number of age2_buy_count_ is: 295262.000000
Mean of age2_buy_count_ is: 5.584574 

Total number of age2_save_count_ is: 388518.000000
Mean of age2_save_count_ is: 7.348414 



Unnamed: 0,user_id,age2_click_count_,age2_add_count_,age2_buy_count_,age2_save_count_,age2TotalAction,age2_click_count__ratio_,age2_add_count__ratio_,age2_buy_count__ratio_,age2_save_count__ratio_
0,13,19.0,0.0,2.0,0.0,21.0,0.904762,0.0,0.095238,0.0
1,17,30.0,0.0,5.0,5.0,40.0,0.75,0.0,0.125,0.125
2,36,46.0,0.0,3.0,1.0,50.0,0.92,0.0,0.06,0.02
3,37,36.0,0.0,6.0,0.0,42.0,0.857143,0.0,0.142857,0.0
4,40,89.0,0.0,29.0,12.0,130.0,0.684615,0.0,0.223077,0.092308


In [249]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
age2['month'] = np.floor(age2['time_stamp']/100).astype(int)
age2['day'] = age2['time_stamp']%100
age2.head()

#stat: for each user, in how many days he has action(all types)
time_diff = age2[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"age2_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"age2_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"age2_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"age2_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

age2_overall = pd.merge(age2_action, day_count, how = 'left', on='user_id')
age2_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,age2_click_count_,age2_add_count_,age2_buy_count_,age2_save_count_,age2TotalAction,age2_click_count__ratio_,age2_add_count__ratio_,age2_buy_count__ratio_,age2_save_count__ratio_,age2_click_day_count,age2_add_day_count,age2_buy_day_count,age2_save_day_count
0,13,19.0,0.0,2.0,0.0,21.0,0.904762,0.0,0.095238,0.0,3.0,0.0,2,0.0
1,17,30.0,0.0,5.0,5.0,40.0,0.75,0.0,0.125,0.125,5.0,0.0,2,5.0
2,36,46.0,0.0,3.0,1.0,50.0,0.92,0.0,0.06,0.02,8.0,0.0,3,1.0
3,37,36.0,0.0,6.0,0.0,42.0,0.857143,0.0,0.142857,0.0,13.0,0.0,4,0.0
4,40,89.0,0.0,29.0,12.0,130.0,0.684615,0.0,0.223077,0.092308,18.0,0.0,21,12.0


In [250]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['age2_click_count_', 'age2_add_count_', 'age2_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    age2_overall[conversion_name] = np.log1p(age2_overall['age2_buy_count_']) - np.log1p(age2_overall[i])
    age2_overall[conversion_diff_name] = age2_overall[conversion_name] - age2_overall[conversion_name].mean()

age2_overall.head()

Unnamed: 0,user_id,age2_click_count_,age2_add_count_,age2_buy_count_,age2_save_count_,age2TotalAction,age2_click_count__ratio_,age2_add_count__ratio_,age2_buy_count__ratio_,age2_save_count__ratio_,age2_click_day_count,age2_add_day_count,age2_buy_day_count,age2_save_day_count,age2_click_count__conversion,age2_click_count_conversion_diff,age2_add_count__conversion,age2_add_count_conversion_diff,age2_save_count__conversion,age2_save_count_conversion_diff
0,13,19.0,0.0,2.0,0.0,21.0,0.904762,0.0,0.095238,0.0,3.0,0.0,2,0.0,-1.89712,0.357751,1.098612,-0.51575,1.098612,0.600959
1,17,30.0,0.0,5.0,5.0,40.0,0.75,0.0,0.125,0.125,5.0,0.0,2,5.0,-1.642228,0.612643,1.791759,0.177397,0.0,-0.497654
2,36,46.0,0.0,3.0,1.0,50.0,0.92,0.0,0.06,0.02,8.0,0.0,3,1.0,-2.463853,-0.208983,1.386294,-0.228068,0.693147,0.195493
3,37,36.0,0.0,6.0,0.0,42.0,0.857143,0.0,0.142857,0.0,13.0,0.0,4,0.0,-1.665008,0.589863,1.94591,0.331547,1.94591,1.448256
4,40,89.0,0.0,29.0,12.0,130.0,0.684615,0.0,0.223077,0.092308,18.0,0.0,21,12.0,-1.098612,1.156258,3.401197,1.786835,0.836248,0.338594


In [251]:
age2_overall.shape

(52871, 20)

In [252]:
age3 = tot_user_df[tot_user_df['age_3.0'] == 1]
age3_action = age3[["user_id", "action_type"]]
action_dummies = pd.get_dummies(age3['action_type'])
age3_action = pd.concat([age3.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

age3_action.rename(columns={0: "age3_click_count_", 1:"age3_add_count_", 2:"age3_buy_count_", 3:"age3_save_count_"},
                   inplace=True)

age3_action['age3TotalAction'] = age3_action.apply(lambda x: x["age3_click_count_"] + x["age3_add_count_"] + x["age3_buy_count_"] + x["age3_save_count_"], axis=1)

age3_action.head()

Unnamed: 0,user_id,age3_click_count_,age3_add_count_,age3_buy_count_,age3_save_count_,age3TotalAction
0,1,27.0,0.0,6.0,0.0,33.0
1,2,47.0,0.0,14.0,2.0,63.0
2,3,63.0,0.0,4.0,1.0,68.0
3,15,108.0,0.0,7.0,33.0,148.0
4,18,106.0,3.0,1.0,0.0,110.0


In [253]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

# Total number of action
tot_act = age3.shape[0]
print('Total number of age3 action: %f'%tot_act)
# Total number of users
tot_user = age3['user_id'].nunique()
print('Total number of age3 users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per age3 user: %f'%avg_act)

columns = ['age3_click_count_', 'age3_add_count_', 'age3_buy_count_','age3_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = age3_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = age3_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['age3_click_count_','age3_add_count_','age3_buy_count_','age3_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    age3_action[col_name] = age3_action[i] / age3_action['age3TotalAction']

age3_action.head()

Total number of age3 action: 14976860.000000
Total number of age3 users: 113871.000000
Average number of action per age3 user: 131.524796
Total number of age3_click_count_ is: 13193369.000000
Mean of age3_click_count_ is: 115.862414 

Total number of age3_add_count_ is: 19653.000000
Mean of age3_add_count_ is: 0.172590 

Total number of age3_buy_count_ is: 873912.000000
Mean of age3_buy_count_ is: 7.674579 

Total number of age3_save_count_ is: 889926.000000
Mean of age3_save_count_ is: 7.815212 



Unnamed: 0,user_id,age3_click_count_,age3_add_count_,age3_buy_count_,age3_save_count_,age3TotalAction,age3_click_count__ratio_,age3_add_count__ratio_,age3_buy_count__ratio_,age3_save_count__ratio_
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706
3,15,108.0,0.0,7.0,33.0,148.0,0.72973,0.0,0.047297,0.222973
4,18,106.0,3.0,1.0,0.0,110.0,0.963636,0.027273,0.009091,0.0


In [254]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
age3['month'] = np.floor(age3['time_stamp']/100).astype(int)
age3['day'] = age3['time_stamp']%100
age3.head()

#stat: for each user, in how many days he has action(all types)
time_diff = age3[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"age3_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"age3_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"age3_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"age3_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

age3_overall = pd.merge(age3_action, day_count, how = 'left', on='user_id')
age3_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,age3_click_count_,age3_add_count_,age3_buy_count_,age3_save_count_,age3TotalAction,age3_click_count__ratio_,age3_add_count__ratio_,age3_buy_count__ratio_,age3_save_count__ratio_,age3_click_day_count,age3_add_day_count,age3_buy_day_count,age3_save_day_count
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,5.0,0.0,3,0.0
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,5.0,0.0,5,2.0
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,12.0,0.0,3,1.0
3,15,108.0,0.0,7.0,33.0,148.0,0.72973,0.0,0.047297,0.222973,20.0,0.0,5,13.0
4,18,106.0,3.0,1.0,0.0,110.0,0.963636,0.027273,0.009091,0.0,5.0,1.0,1,0.0


In [255]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['age3_click_count_', 'age3_add_count_', 'age3_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    age3_overall[conversion_name] = np.log1p(age3_overall['age3_buy_count_']) - np.log1p(age3_overall[i])
    age3_overall[conversion_diff_name] = age3_overall[conversion_name] - age3_overall[conversion_name].mean()

age3_overall.head()

Unnamed: 0,user_id,age3_click_count_,age3_add_count_,age3_buy_count_,age3_save_count_,age3TotalAction,age3_click_count__ratio_,age3_add_count__ratio_,age3_buy_count__ratio_,age3_save_count__ratio_,age3_click_day_count,age3_add_day_count,age3_buy_day_count,age3_save_day_count,age3_click_count__conversion,age3_click_count_conversion_diff,age3_add_count__conversion,age3_add_count_conversion_diff,age3_save_count__conversion,age3_save_count_conversion_diff
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,5.0,0.0,3,0.0,-1.386294,0.904398,1.94591,0.115314,1.94591,1.196042
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,5.0,0.0,5,2.0,-1.163151,1.127541,2.70805,0.877454,1.609438,0.85957
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,12.0,0.0,3,1.0,-2.549445,-0.258753,1.609438,-0.221159,0.916291,0.166423
3,15,108.0,0.0,7.0,33.0,148.0,0.72973,0.0,0.047297,0.222973,20.0,0.0,5,13.0,-2.611906,-0.321214,2.079442,0.248845,-1.446919,-2.196787
4,18,106.0,3.0,1.0,0.0,110.0,0.963636,0.027273,0.009091,0.0,5.0,1.0,1,0.0,-3.979682,-1.68899,-0.693147,-2.523744,0.693147,-0.056721


In [256]:
age3_overall.shape

(113871, 20)

In [257]:
age4 = tot_user_df[tot_user_df['age_4.0'] == 1]

age4_action = age4[["user_id", "action_type"]]
action_dummies = pd.get_dummies(age4['action_type'])
age4_action = pd.concat([age4.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

age4_action.rename(columns={0: "age4_click_count_", 1:"age4_add_count_", 2:"age4_buy_count_", 3:"age4_save_count_"},
                   inplace=True)

age4_action['age4TotalAction'] = age4_action.apply(lambda x: x["age4_click_count_"] + x["age4_add_count_"] + x["age4_buy_count_"] + x["age4_save_count_"], axis=1)

age4_action.head()

Unnamed: 0,user_id,age4_click_count_,age4_add_count_,age4_buy_count_,age4_save_count_,age4TotalAction
0,6,217.0,0.0,17.0,15.0,249.0
1,7,6.0,0.0,8.0,0.0,14.0
2,8,61.0,0.0,23.0,7.0,91.0
3,10,56.0,0.0,7.0,1.0,64.0
4,11,28.0,0.0,4.0,0.0,32.0


In [258]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

# Total number of action
tot_act = age4.shape[0]
print('Total number of age4 action: %f'%tot_act)
# Total number of users
tot_user = age4['user_id'].nunique()
print('Total number of age4 users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per age4 user: %f'%avg_act)

columns = ['age4_click_count_', 'age4_add_count_', 'age4_buy_count_','age4_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = age4_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = age4_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['age4_click_count_','age4_add_count_','age4_buy_count_','age4_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    age4_action[col_name] = age4_action[i] / age4_action['age4TotalAction']

age4_action.head()

Total number of age4 action: 11802052.000000
Total number of age4 users: 79991.000000
Average number of action per age4 user: 147.542249
Total number of age4_click_count_ is: 10411758.000000
Mean of age4_click_count_ is: 130.161618 

Total number of age4_add_count_ is: 12386.000000
Mean of age4_add_count_ is: 0.154842 

Total number of age4_buy_count_ is: 769541.000000
Mean of age4_buy_count_ is: 9.620345 

Total number of age4_save_count_ is: 608367.000000
Mean of age4_save_count_ is: 7.605443 



Unnamed: 0,user_id,age4_click_count_,age4_add_count_,age4_buy_count_,age4_save_count_,age4TotalAction,age4_click_count__ratio_,age4_add_count__ratio_,age4_buy_count__ratio_,age4_save_count__ratio_
0,6,217.0,0.0,17.0,15.0,249.0,0.871486,0.0,0.068273,0.060241
1,7,6.0,0.0,8.0,0.0,14.0,0.428571,0.0,0.571429,0.0
2,8,61.0,0.0,23.0,7.0,91.0,0.67033,0.0,0.252747,0.076923
3,10,56.0,0.0,7.0,1.0,64.0,0.875,0.0,0.109375,0.015625
4,11,28.0,0.0,4.0,0.0,32.0,0.875,0.0,0.125,0.0


In [259]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
age4['month'] = np.floor(age4['time_stamp']/100).astype(int)
age4['day'] = age4['time_stamp']%100
age4.head()

#stat: for each user, in how many days he has action(all types)
time_diff = age4[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"age4_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"age4_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"age4_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"age4_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

age4_overall = pd.merge(age4_action, day_count, how = 'left', on='user_id')
age4_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,age4_click_count_,age4_add_count_,age4_buy_count_,age4_save_count_,age4TotalAction,age4_click_count__ratio_,age4_add_count__ratio_,age4_buy_count__ratio_,age4_save_count__ratio_,age4_click_day_count,age4_add_day_count,age4_buy_day_count,age4_save_day_count
0,6,217.0,0.0,17.0,15.0,249.0,0.871486,0.0,0.068273,0.060241,50.0,0.0,12,15.0
1,7,6.0,0.0,8.0,0.0,14.0,0.428571,0.0,0.571429,0.0,4.0,0.0,2,0.0
2,8,61.0,0.0,23.0,7.0,91.0,0.67033,0.0,0.252747,0.076923,20.0,0.0,10,4.0
3,10,56.0,0.0,7.0,1.0,64.0,0.875,0.0,0.109375,0.015625,12.0,0.0,2,1.0
4,11,28.0,0.0,4.0,0.0,32.0,0.875,0.0,0.125,0.0,12.0,0.0,3,0.0


In [260]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['age4_click_count_', 'age4_add_count_', 'age4_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    age4_overall[conversion_name] = np.log1p(age4_overall['age4_buy_count_']) - np.log1p(age4_overall[i])
    age4_overall[conversion_diff_name] = age4_overall[conversion_name] - age4_overall[conversion_name].mean()

age4_overall.head()

Unnamed: 0,user_id,age4_click_count_,age4_add_count_,age4_buy_count_,age4_save_count_,age4TotalAction,age4_click_count__ratio_,age4_add_count__ratio_,age4_buy_count__ratio_,age4_save_count__ratio_,age4_click_day_count,age4_add_day_count,age4_buy_day_count,age4_save_day_count,age4_click_count__conversion,age4_click_count_conversion_diff,age4_add_count__conversion,age4_add_count_conversion_diff,age4_save_count__conversion,age4_save_count_conversion_diff
0,6,217.0,0.0,17.0,15.0,249.0,0.871486,0.0,0.068273,0.060241,50.0,0.0,12,15.0,-2.494123,-0.265282,2.890372,0.872446,0.117783,-0.855068
1,7,6.0,0.0,8.0,0.0,14.0,0.428571,0.0,0.571429,0.0,4.0,0.0,2,0.0,0.251314,2.480155,2.197225,0.179299,2.197225,1.224373
2,8,61.0,0.0,23.0,7.0,91.0,0.67033,0.0,0.252747,0.076923,20.0,0.0,10,4.0,-0.949081,1.27976,3.178054,1.160128,1.098612,0.125761
3,10,56.0,0.0,7.0,1.0,64.0,0.875,0.0,0.109375,0.015625,12.0,0.0,2,1.0,-1.96361,0.265231,2.079442,0.061516,1.386294,0.413443
4,11,28.0,0.0,4.0,0.0,32.0,0.875,0.0,0.125,0.0,12.0,0.0,3,0.0,-1.757858,0.470983,1.609438,-0.408488,1.609438,0.636586


In [261]:
age4_overall.shape

(79991, 20)

In [262]:
age5 = tot_user_df[tot_user_df['age_5.0'] == 1]

age5_action = age5[["user_id", "action_type"]]
action_dummies = pd.get_dummies(age5['action_type'])
age5_action = pd.concat([age5.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

age5_action.rename(columns={0: "age5_click_count_", 1:"age5_add_count_", 2:"age5_buy_count_", 3:"age5_save_count_"},
                   inplace=True)

age5_action['age5TotalAction'] = age5_action.apply(lambda x: x["age5_click_count_"] + x["age5_add_count_"] + x["age5_buy_count_"] + x["age5_save_count_"], axis=1)

age5_action.head()

Unnamed: 0,user_id,age5_click_count_,age5_add_count_,age5_buy_count_,age5_save_count_,age5TotalAction
0,5,150.0,0.0,13.0,10.0,173.0
1,9,79.0,0.0,4.0,4.0,87.0
2,16,63.0,0.0,4.0,0.0,67.0
3,20,156.0,0.0,6.0,0.0,162.0
4,24,53.0,0.0,8.0,0.0,61.0


In [263]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

# Total number of action
tot_act = age5.shape[0]
print('Total number of age5 action: %f'%tot_act)
# Total number of users
tot_user = age5['user_id'].nunique()
print('Total number of age5 users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per age5 user: %f'%avg_act)

columns = ['age5_click_count_', 'age5_add_count_', 'age5_buy_count_','age5_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = age5_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = age5_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['age5_click_count_','age5_add_count_','age5_buy_count_','age5_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    age5_action[col_name] = age5_action[i] / age5_action['age5TotalAction']

age5_action.head()

Total number of age5 action: 6200000.000000
Total number of age5 users: 40777.000000
Average number of action per age5 user: 152.046497
Total number of age5_click_count_ is: 5509918.000000
Mean of age5_click_count_ is: 135.123182 

Total number of age5_add_count_ is: 6695.000000
Mean of age5_add_count_ is: 0.164186 

Total number of age5_buy_count_ is: 396683.000000
Mean of age5_buy_count_ is: 9.728107 

Total number of age5_save_count_ is: 286704.000000
Mean of age5_save_count_ is: 7.031022 



Unnamed: 0,user_id,age5_click_count_,age5_add_count_,age5_buy_count_,age5_save_count_,age5TotalAction,age5_click_count__ratio_,age5_add_count__ratio_,age5_buy_count__ratio_,age5_save_count__ratio_
0,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803
1,9,79.0,0.0,4.0,4.0,87.0,0.908046,0.0,0.045977,0.045977
2,16,63.0,0.0,4.0,0.0,67.0,0.940299,0.0,0.059701,0.0
3,20,156.0,0.0,6.0,0.0,162.0,0.962963,0.0,0.037037,0.0
4,24,53.0,0.0,8.0,0.0,61.0,0.868852,0.0,0.131148,0.0


In [264]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
age5['month'] = np.floor(age5['time_stamp']/100).astype(int)
age5['day'] = age5['time_stamp']%100
age5.head()

#stat: for each user, in how many days he has action(all types)
time_diff = age5[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"age5_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"age5_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"age5_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"age5_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

age5_overall = pd.merge(age5_action, day_count, how = 'left', on='user_id')
age5_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,age5_click_count_,age5_add_count_,age5_buy_count_,age5_save_count_,age5TotalAction,age5_click_count__ratio_,age5_add_count__ratio_,age5_buy_count__ratio_,age5_save_count__ratio_,age5_click_day_count,age5_add_day_count,age5_buy_day_count,age5_save_day_count
0,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,29.0,0.0,7,9.0
1,9,79.0,0.0,4.0,4.0,87.0,0.908046,0.0,0.045977,0.045977,17.0,0.0,3,4.0
2,16,63.0,0.0,4.0,0.0,67.0,0.940299,0.0,0.059701,0.0,5.0,0.0,4,0.0
3,20,156.0,0.0,6.0,0.0,162.0,0.962963,0.0,0.037037,0.0,33.0,0.0,4,0.0
4,24,53.0,0.0,8.0,0.0,61.0,0.868852,0.0,0.131148,0.0,16.0,0.0,7,0.0


In [265]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['age5_click_count_', 'age5_add_count_', 'age5_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    age5_overall[conversion_name] = np.log1p(age5_overall['age5_buy_count_']) - np.log1p(age5_overall[i])
    age5_overall[conversion_diff_name] = age5_overall[conversion_name] - age5_overall[conversion_name].mean()

age5_overall.head()

Unnamed: 0,user_id,age5_click_count_,age5_add_count_,age5_buy_count_,age5_save_count_,age5TotalAction,age5_click_count__ratio_,age5_add_count__ratio_,age5_buy_count__ratio_,age5_save_count__ratio_,age5_click_day_count,age5_add_day_count,age5_buy_day_count,age5_save_day_count,age5_click_count__conversion,age5_click_count_conversion_diff,age5_add_count__conversion,age5_add_count_conversion_diff,age5_save_count__conversion,age5_save_count_conversion_diff
0,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,29.0,0.0,7,9.0,-2.378223,-0.126329,2.639057,0.623355,0.241162,-0.819617
1,9,79.0,0.0,4.0,4.0,87.0,0.908046,0.0,0.045977,0.045977,17.0,0.0,3,4.0,-2.772589,-0.520696,1.609438,-0.406264,0.0,-1.06078
2,16,63.0,0.0,4.0,0.0,67.0,0.940299,0.0,0.059701,0.0,5.0,0.0,4,0.0,-2.549445,-0.297552,1.609438,-0.406264,1.609438,0.548658
3,20,156.0,0.0,6.0,0.0,162.0,0.962963,0.0,0.037037,0.0,33.0,0.0,4,0.0,-3.110336,-0.858443,1.94591,-0.069792,1.94591,0.885131
4,24,53.0,0.0,8.0,0.0,61.0,0.868852,0.0,0.131148,0.0,16.0,0.0,7,0.0,-1.791759,0.460134,2.197225,0.181522,2.197225,1.136445


In [266]:
age5_overall.shape

(40777, 20)

In [267]:
age6 = tot_user_df[tot_user_df['age_6.0'] == 1]

age6_action = age6[["user_id", "action_type"]]
action_dummies = pd.get_dummies(age6['action_type'])
age6_action = pd.concat([age6.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

age6_action.rename(columns={0: "age6_click_count_", 1:"age6_add_count_", 2:"age6_buy_count_", 3:"age6_save_count_"},
                   inplace=True)

age6_action['age6TotalAction'] = age6_action.apply(lambda x: x["age6_click_count_"] + x["age6_add_count_"] + x["age6_buy_count_"] + x["age6_save_count_"], axis=1)

age6_action.head()

Unnamed: 0,user_id,age6_click_count_,age6_add_count_,age6_buy_count_,age6_save_count_,age6TotalAction
0,14,952.0,0.0,52.0,30.0,1034.0
1,21,582.0,0.0,31.0,7.0,620.0
2,23,165.0,3.0,6.0,0.0,174.0
3,31,144.0,0.0,2.0,0.0,146.0
4,34,44.0,0.0,12.0,0.0,56.0


In [268]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

# Total number of action
tot_act = age6.shape[0]
print('Total number of age6 action: %f'%tot_act)
# Total number of users
tot_user = age6['user_id'].nunique()
print('Total number of age6 users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per age6 user: %f'%avg_act)

columns = ['age6_click_count_', 'age6_add_count_', 'age6_buy_count_','age6_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = age6_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = age6_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['age6_click_count_','age6_add_count_','age6_buy_count_','age6_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    age6_action[col_name] = age6_action[i] / age6_action['age6TotalAction']

age6_action.head()

Total number of age6 action: 5413716.000000
Total number of age6 users: 35464.000000
Average number of action per age6 user: 152.653846
Total number of age6_click_count_ is: 4843730.000000
Mean of age6_click_count_ is: 136.581604 

Total number of age6_add_count_ is: 6279.000000
Mean of age6_add_count_ is: 0.177053 

Total number of age6_buy_count_ is: 308718.000000
Mean of age6_buy_count_ is: 8.705109 

Total number of age6_save_count_ is: 254989.000000
Mean of age6_save_count_ is: 7.190080 



Unnamed: 0,user_id,age6_click_count_,age6_add_count_,age6_buy_count_,age6_save_count_,age6TotalAction,age6_click_count__ratio_,age6_add_count__ratio_,age6_buy_count__ratio_,age6_save_count__ratio_
0,14,952.0,0.0,52.0,30.0,1034.0,0.920696,0.0,0.05029,0.029014
1,21,582.0,0.0,31.0,7.0,620.0,0.93871,0.0,0.05,0.01129
2,23,165.0,3.0,6.0,0.0,174.0,0.948276,0.017241,0.034483,0.0
3,31,144.0,0.0,2.0,0.0,146.0,0.986301,0.0,0.013699,0.0
4,34,44.0,0.0,12.0,0.0,56.0,0.785714,0.0,0.214286,0.0


In [269]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
age6['month'] = np.floor(age6['time_stamp']/100).astype(int)
age6['day'] = age6['time_stamp']%100
age6.head()

#stat: for each user, in how many days he has action(all types)
time_diff = age6[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"age6_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"age6_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"age6_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"age6_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

age6_overall = pd.merge(age6_action, day_count, how = 'left', on='user_id')
age6_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,age6_click_count_,age6_add_count_,age6_buy_count_,age6_save_count_,age6TotalAction,age6_click_count__ratio_,age6_add_count__ratio_,age6_buy_count__ratio_,age6_save_count__ratio_,age6_click_day_count,age6_add_day_count,age6_buy_day_count,age6_save_day_count
0,14,952.0,0.0,52.0,30.0,1034.0,0.920696,0.0,0.05029,0.029014,63.0,0.0,24,12.0
1,21,582.0,0.0,31.0,7.0,620.0,0.93871,0.0,0.05,0.01129,68.0,0.0,21,7.0
2,23,165.0,3.0,6.0,0.0,174.0,0.948276,0.017241,0.034483,0.0,22.0,2.0,4,0.0
3,31,144.0,0.0,2.0,0.0,146.0,0.986301,0.0,0.013699,0.0,9.0,0.0,1,0.0
4,34,44.0,0.0,12.0,0.0,56.0,0.785714,0.0,0.214286,0.0,14.0,0.0,6,0.0


In [270]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['age6_click_count_', 'age6_add_count_', 'age6_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    age6_overall[conversion_name] = np.log1p(age6_overall['age6_buy_count_']) - np.log1p(age6_overall[i])
    age6_overall[conversion_diff_name] = age6_overall[conversion_name] - age6_overall[conversion_name].mean()

age6_overall.head()

Unnamed: 0,user_id,age6_click_count_,age6_add_count_,age6_buy_count_,age6_save_count_,age6TotalAction,age6_click_count__ratio_,age6_add_count__ratio_,age6_buy_count__ratio_,age6_save_count__ratio_,age6_click_day_count,age6_add_day_count,age6_buy_day_count,age6_save_day_count,age6_click_count__conversion,age6_click_count_conversion_diff,age6_add_count__conversion,age6_add_count_conversion_diff,age6_save_count__conversion,age6_save_count_conversion_diff
0,14,952.0,0.0,52.0,30.0,1034.0,0.920696,0.0,0.05029,0.029014,63.0,0.0,24,12.0,-2.889323,-0.546914,3.970292,2.061791,0.536305,-0.422206
1,21,582.0,0.0,31.0,7.0,620.0,0.93871,0.0,0.05,0.01129,68.0,0.0,21,7.0,-2.902451,-0.560043,3.465736,1.557235,1.386294,0.427783
2,23,165.0,3.0,6.0,0.0,174.0,0.948276,0.017241,0.034483,0.0,22.0,2.0,4,0.0,-3.166078,-0.823669,0.559616,-1.348885,1.94591,0.987399
3,31,144.0,0.0,2.0,0.0,146.0,0.986301,0.0,0.013699,0.0,9.0,0.0,1,0.0,-3.878121,-1.535713,1.098612,-0.809888,1.098612,0.140101
4,34,44.0,0.0,12.0,0.0,56.0,0.785714,0.0,0.214286,0.0,14.0,0.0,6,0.0,-1.241713,1.100695,2.564949,0.656449,2.564949,1.606438


In [271]:
age6_overall.shape

(35464, 20)

In [272]:
age7 = tot_user_df[tot_user_df['age_7.0'] == 1]

age7_action = age7[["user_id", "action_type"]]
action_dummies = pd.get_dummies(age7['action_type'])
age7_action = pd.concat([age7.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

age7_action.rename(columns={0: "age7_click_count_", 1:"age7_add_count_", 2:"age7_buy_count_", 3:"age7_save_count_"},
                   inplace=True)

age7_action['age7TotalAction'] = age7_action.apply(lambda x: x["age7_click_count_"] + x["age7_add_count_"] + x["age7_buy_count_"] + x["age7_save_count_"], axis=1)

age7_action.head()

Unnamed: 0,user_id,age7_click_count_,age7_add_count_,age7_buy_count_,age7_save_count_,age7TotalAction
0,70,85.0,0.0,3.0,0.0,88.0
1,74,153.0,0.0,7.0,0.0,160.0
2,152,277.0,0.0,14.0,0.0,291.0
3,391,389.0,0.0,59.0,7.0,455.0
4,400,99.0,0.0,8.0,3.0,110.0


In [273]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

# Total number of action
tot_act = age7.shape[0]
print('Total number of age7 action: %f'%tot_act)
# Total number of users
tot_user = age7['user_id'].nunique()
print('Total number of age7 users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per age7 user: %f'%avg_act)

columns = ['age7_click_count_', 'age7_add_count_', 'age7_buy_count_','age7_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = age7_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = age7_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['age7_click_count_','age7_add_count_','age7_buy_count_','age7_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    age7_action[col_name] = age7_action[i] / age7_action['age7TotalAction']

age7_action.head()

Total number of age7 action: 1052265.000000
Total number of age7 users: 6992.000000
Average number of action per age7 user: 150.495566
Total number of age7_click_count_ is: 945046.000000
Mean of age7_click_count_ is: 135.161041 

Total number of age7_add_count_ is: 1060.000000
Mean of age7_add_count_ is: 0.151602 

Total number of age7_buy_count_ is: 53932.000000
Mean of age7_buy_count_ is: 7.713387 

Total number of age7_save_count_ is: 52227.000000
Mean of age7_save_count_ is: 7.469537 



Unnamed: 0,user_id,age7_click_count_,age7_add_count_,age7_buy_count_,age7_save_count_,age7TotalAction,age7_click_count__ratio_,age7_add_count__ratio_,age7_buy_count__ratio_,age7_save_count__ratio_
0,70,85.0,0.0,3.0,0.0,88.0,0.965909,0.0,0.034091,0.0
1,74,153.0,0.0,7.0,0.0,160.0,0.95625,0.0,0.04375,0.0
2,152,277.0,0.0,14.0,0.0,291.0,0.95189,0.0,0.04811,0.0
3,391,389.0,0.0,59.0,7.0,455.0,0.854945,0.0,0.12967,0.015385
4,400,99.0,0.0,8.0,3.0,110.0,0.9,0.0,0.072727,0.027273


In [274]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
age7['month'] = np.floor(age7['time_stamp']/100).astype(int)
age7['day'] = age7['time_stamp']%100
age7.head()

#stat: for each user, in how many days he has action(all types)
time_diff = age7[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"age7_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"age7_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"age7_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"age7_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

age7_overall = pd.merge(age7_action, day_count, how = 'left', on='user_id')
age7_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,age7_click_count_,age7_add_count_,age7_buy_count_,age7_save_count_,age7TotalAction,age7_click_count__ratio_,age7_add_count__ratio_,age7_buy_count__ratio_,age7_save_count__ratio_,age7_click_day_count,age7_add_day_count,age7_buy_day_count,age7_save_day_count
0,70,85.0,0.0,3.0,0.0,88.0,0.965909,0.0,0.034091,0.0,13.0,0.0,2,0.0
1,74,153.0,0.0,7.0,0.0,160.0,0.95625,0.0,0.04375,0.0,19.0,0.0,4,0.0
2,152,277.0,0.0,14.0,0.0,291.0,0.95189,0.0,0.04811,0.0,47.0,0.0,10,0.0
3,391,389.0,0.0,59.0,7.0,455.0,0.854945,0.0,0.12967,0.015385,23.0,0.0,11,6.0
4,400,99.0,0.0,8.0,3.0,110.0,0.9,0.0,0.072727,0.027273,20.0,0.0,4,3.0


In [275]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['age7_click_count_', 'age7_add_count_', 'age7_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    age7_overall[conversion_name] = np.log1p(age7_overall['age7_buy_count_']) - np.log1p(age7_overall[i])
    age7_overall[conversion_diff_name] = age7_overall[conversion_name] - age7_overall[conversion_name].mean()

age7_overall.head()

Unnamed: 0,user_id,age7_click_count_,age7_add_count_,age7_buy_count_,age7_save_count_,age7TotalAction,age7_click_count__ratio_,age7_add_count__ratio_,age7_buy_count__ratio_,age7_save_count__ratio_,age7_click_day_count,age7_add_day_count,age7_buy_day_count,age7_save_day_count,age7_click_count__conversion,age7_click_count_conversion_diff,age7_add_count__conversion,age7_add_count_conversion_diff,age7_save_count__conversion,age7_save_count_conversion_diff
0,70,85.0,0.0,3.0,0.0,88.0,0.965909,0.0,0.034091,0.0,13.0,0.0,2,0.0,-3.068053,-0.623371,1.386294,-0.425427,1.386294,0.562835
1,74,153.0,0.0,7.0,0.0,160.0,0.95625,0.0,0.04375,0.0,19.0,0.0,4,0.0,-2.957511,-0.512829,2.079442,0.26772,2.079442,1.255982
2,152,277.0,0.0,14.0,0.0,291.0,0.95189,0.0,0.04811,0.0,47.0,0.0,10,0.0,-2.919571,-0.474889,2.70805,0.896329,2.70805,1.884591
3,391,389.0,0.0,59.0,7.0,455.0,0.854945,0.0,0.12967,0.015385,23.0,0.0,11,6.0,-1.871802,0.57288,4.094345,2.282623,2.014903,1.191443
4,400,99.0,0.0,8.0,3.0,110.0,0.9,0.0,0.072727,0.027273,20.0,0.0,4,3.0,-2.407946,0.036736,2.197225,0.385503,0.81093,-0.012529


In [276]:
age7_overall.shape

(6992, 20)

In [277]:
age8 = tot_user_df[tot_user_df['age_8.0'] == 1]

age8_action = age8[["user_id", "action_type"]]
action_dummies = pd.get_dummies(age8['action_type'])
age8_action = pd.concat([age8.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

age8_action.rename(columns={0: "age8_click_count_", 1:"age8_add_count_", 2:"age8_buy_count_", 3:"age8_save_count_"},
                   inplace=True)

age8_action['age8TotalAction'] = age8_action.apply(lambda x: x["age8_click_count_"] + x["age8_add_count_"] + x["age8_buy_count_"] + x["age8_save_count_"], axis=1)

age8_action.head()

Unnamed: 0,user_id,age8_click_count_,age8_add_count_,age8_buy_count_,age8_save_count_,age8TotalAction
0,241,27.0,0.0,10.0,1.0,38.0
1,342,13.0,0.0,1.0,0.0,14.0
2,358,331.0,0.0,2.0,0.0,333.0
3,1270,22.0,0.0,3.0,0.0,25.0
4,1546,10.0,0.0,1.0,0.0,11.0


In [278]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

# Total number of action
tot_act = age8.shape[0]
print('Total number of age8 action: %f'%tot_act)
# Total number of users
tot_user = age8['user_id'].nunique()
print('Total number of age8 users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per age8 user: %f'%avg_act)

columns = ['age8_click_count_', 'age8_add_count_', 'age8_buy_count_','age8_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = age8_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = age8_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['age8_click_count_','age8_add_count_','age8_buy_count_','age8_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    age8_action[col_name] = age8_action[i] / age8_action['age8TotalAction']

age8_action.head()

Total number of age8 action: 162534.000000
Total number of age8 users: 1266.000000
Average number of action per age8 user: 128.383886
Total number of age8_click_count_ is: 145456.000000
Mean of age8_click_count_ is: 114.894155 

Total number of age8_add_count_ is: 186.000000
Mean of age8_add_count_ is: 0.146919 

Total number of age8_buy_count_ is: 9631.000000
Mean of age8_buy_count_ is: 7.607425 

Total number of age8_save_count_ is: 7261.000000
Mean of age8_save_count_ is: 5.735387 



Unnamed: 0,user_id,age8_click_count_,age8_add_count_,age8_buy_count_,age8_save_count_,age8TotalAction,age8_click_count__ratio_,age8_add_count__ratio_,age8_buy_count__ratio_,age8_save_count__ratio_
0,241,27.0,0.0,10.0,1.0,38.0,0.710526,0.0,0.263158,0.026316
1,342,13.0,0.0,1.0,0.0,14.0,0.928571,0.0,0.071429,0.0
2,358,331.0,0.0,2.0,0.0,333.0,0.993994,0.0,0.006006,0.0
3,1270,22.0,0.0,3.0,0.0,25.0,0.88,0.0,0.12,0.0
4,1546,10.0,0.0,1.0,0.0,11.0,0.909091,0.0,0.090909,0.0


In [279]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
age8['month'] = np.floor(age8['time_stamp']/100).astype(int)
age8['day'] = age8['time_stamp']%100
age8.head()

#stat: for each user, in how many days he has action(all types)
time_diff = age8[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"age8_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"age8_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"age8_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"age8_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

age8_overall = pd.merge(age8_action, day_count, how = 'left', on='user_id')
age8_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,age8_click_count_,age8_add_count_,age8_buy_count_,age8_save_count_,age8TotalAction,age8_click_count__ratio_,age8_add_count__ratio_,age8_buy_count__ratio_,age8_save_count__ratio_,age8_click_day_count,age8_add_day_count,age8_buy_day_count,age8_save_day_count
0,241,27.0,0.0,10.0,1.0,38.0,0.710526,0.0,0.263158,0.026316,7,0.0,5,1.0
1,342,13.0,0.0,1.0,0.0,14.0,0.928571,0.0,0.071429,0.0,3,0.0,1,0.0
2,358,331.0,0.0,2.0,0.0,333.0,0.993994,0.0,0.006006,0.0,17,0.0,1,0.0
3,1270,22.0,0.0,3.0,0.0,25.0,0.88,0.0,0.12,0.0,4,0.0,1,0.0
4,1546,10.0,0.0,1.0,0.0,11.0,0.909091,0.0,0.090909,0.0,4,0.0,1,0.0


In [280]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['age8_click_count_', 'age8_add_count_', 'age8_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    age8_overall[conversion_name] = np.log1p(age8_overall['age8_buy_count_']) - np.log1p(age8_overall[i])
    age8_overall[conversion_diff_name] = age8_overall[conversion_name] - age8_overall[conversion_name].mean()

age8_overall.head()

Unnamed: 0,user_id,age8_click_count_,age8_add_count_,age8_buy_count_,age8_save_count_,age8TotalAction,age8_click_count__ratio_,age8_add_count__ratio_,age8_buy_count__ratio_,age8_save_count__ratio_,age8_click_day_count,age8_add_day_count,age8_buy_day_count,age8_save_day_count,age8_click_count__conversion,age8_click_count_conversion_diff,age8_add_count__conversion,age8_add_count_conversion_diff,age8_save_count__conversion,age8_save_count_conversion_diff
0,241,27.0,0.0,10.0,1.0,38.0,0.710526,0.0,0.263158,0.026316,7,0.0,5,1.0,-0.934309,1.371,2.397895,0.617091,1.704748,0.784638
1,342,13.0,0.0,1.0,0.0,14.0,0.928571,0.0,0.071429,0.0,3,0.0,1,0.0,-1.94591,0.359399,0.693147,-1.087657,0.693147,-0.226963
2,358,331.0,0.0,2.0,0.0,333.0,0.993994,0.0,0.006006,0.0,17,0.0,1,0.0,-4.706523,-2.401214,1.098612,-0.682192,1.098612,0.178502
3,1270,22.0,0.0,3.0,0.0,25.0,0.88,0.0,0.12,0.0,4,0.0,1,0.0,-1.7492,0.556109,1.386294,-0.39451,1.386294,0.466184
4,1546,10.0,0.0,1.0,0.0,11.0,0.909091,0.0,0.090909,0.0,4,0.0,1,0.0,-1.704748,0.600561,0.693147,-1.087657,0.693147,-0.226963


In [281]:
age8_overall.shape

(1266, 20)

In [333]:
age_overall = pd.DataFrame(index = user_overall.user_id)

In [334]:
age_overall.reset_index(inplace=True)

In [335]:
age_overall = age_overall.merge(age1_overall, on = 'user_id', how='outer')
age_overall = age_overall.merge(age2_overall, on = 'user_id', how='outer')
age_overall = age_overall.merge(age3_overall, on = 'user_id', how='outer')
age_overall = age_overall.merge(age4_overall, on = 'user_id', how='outer')
age_overall = age_overall.merge(age5_overall, on = 'user_id', how='outer')
age_overall = age_overall.merge(age6_overall, on = 'user_id', how='outer')
age_overall = age_overall.merge(age7_overall, on = 'user_id', how='outer')
age_overall = age_overall.merge(age8_overall, on = 'user_id', how='outer')

In [336]:
#fill na as 9999, means not applicable
age_overall.fillna(9999, inplace=True)

In [337]:
age_overall.head()

Unnamed: 0,user_id,age1_click_count_,age1_add_count_,age1_buy_count_,age1_save_count_,age1TotalAction,age1_click_count__ratio_,age1_add_count__ratio_,age1_buy_count__ratio_,age1_save_count__ratio_,age1_click_day_count,age1_add_day_count,age1_buy_day_count,age1_save_day_count,age1_click_count__conversion,age1_click_count_conversion_diff,age1_add_count__conversion,age1_add_count_conversion_diff,age1_save_count__conversion,age1_save_count_conversion_diff,age2_click_count_,age2_add_count_,age2_buy_count_,age2_save_count_,age2TotalAction,age2_click_count__ratio_,age2_add_count__ratio_,age2_buy_count__ratio_,age2_save_count__ratio_,age2_click_day_count,age2_add_day_count,age2_buy_day_count,age2_save_day_count,age2_click_count__conversion,age2_click_count_conversion_diff,age2_add_count__conversion,age2_add_count_conversion_diff,age2_save_count__conversion,age2_save_count_conversion_diff,age3_click_count_,age3_add_count_,age3_buy_count_,age3_save_count_,age3TotalAction,age3_click_count__ratio_,age3_add_count__ratio_,age3_buy_count__ratio_,age3_save_count__ratio_,age3_click_day_count,age3_add_day_count,age3_buy_day_count,age3_save_day_count,age3_click_count__conversion,age3_click_count_conversion_diff,age3_add_count__conversion,age3_add_count_conversion_diff,age3_save_count__conversion,age3_save_count_conversion_diff,age4_click_count_,age4_add_count_,age4_buy_count_,age4_save_count_,age4TotalAction,age4_click_count__ratio_,age4_add_count__ratio_,age4_buy_count__ratio_,age4_save_count__ratio_,age4_click_day_count,age4_add_day_count,age4_buy_day_count,age4_save_day_count,age4_click_count__conversion,age4_click_count_conversion_diff,age4_add_count__conversion,age4_add_count_conversion_diff,age4_save_count__conversion,age4_save_count_conversion_diff,age5_click_count_,age5_add_count_,age5_buy_count_,age5_save_count_,age5TotalAction,age5_click_count__ratio_,age5_add_count__ratio_,age5_buy_count__ratio_,age5_save_count__ratio_,age5_click_day_count,age5_add_day_count,age5_buy_day_count,age5_save_day_count,age5_click_count__conversion,age5_click_count_conversion_diff,age5_add_count__conversion,age5_add_count_conversion_diff,age5_save_count__conversion,age5_save_count_conversion_diff,age6_click_count_,age6_add_count_,age6_buy_count_,age6_save_count_,age6TotalAction,age6_click_count__ratio_,age6_add_count__ratio_,age6_buy_count__ratio_,age6_save_count__ratio_,age6_click_day_count,age6_add_day_count,age6_buy_day_count,age6_save_day_count,age6_click_count__conversion,age6_click_count_conversion_diff,age6_add_count__conversion,age6_add_count_conversion_diff,age6_save_count__conversion,age6_save_count_conversion_diff,age7_click_count_,age7_add_count_,age7_buy_count_,age7_save_count_,age7TotalAction,age7_click_count__ratio_,age7_add_count__ratio_,age7_buy_count__ratio_,age7_save_count__ratio_,age7_click_day_count,age7_add_day_count,age7_buy_day_count,age7_save_day_count,age7_click_count__conversion,age7_click_count_conversion_diff,age7_add_count__conversion,age7_add_count_conversion_diff,age7_save_count__conversion,age7_save_count_conversion_diff,age8_click_count_,age8_add_count_,age8_buy_count_,age8_save_count_,age8TotalAction,age8_click_count__ratio_,age8_add_count__ratio_,age8_buy_count__ratio_,age8_save_count__ratio_,age8_click_day_count,age8_add_day_count,age8_buy_day_count,age8_save_day_count,age8_click_count__conversion,age8_click_count_conversion_diff,age8_add_count__conversion,age8_add_count_conversion_diff,age8_save_count__conversion,age8_save_count_conversion_diff
0,1,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,5.0,0.0,3.0,0.0,-1.386294,0.904398,1.94591,0.115314,1.94591,1.196042,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
1,2,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,5.0,0.0,5.0,2.0,-1.163151,1.127541,2.70805,0.877454,1.609438,0.85957,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
2,3,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,12.0,0.0,3.0,1.0,-2.549445,-0.258753,1.609438,-0.221159,0.916291,0.166423,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
3,4,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
4,5,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,29.0,0.0,7.0,9.0,-2.378223,-0.126329,2.639057,0.623355,0.241162,-0.819617,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0


In [338]:
age_overall.shape

(424170, 153)

### C. user gender features: 
#### 1) Activate user analysis: total/click/added to shopping cart/purchase/save to favourite

In [302]:
female = tot_user_df[tot_user_df['female'] == 1]
female_action = female[["user_id", "action_type"]]
action_dummies = pd.get_dummies(female['action_type'])
female_action = pd.concat([female.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

female_action.rename(columns={0: "female_click_count_", 1:"female_add_count_", 2:"female_buy_count_", 3:"female_save_count_"},
                   inplace=True)

female_action['femaleTotalAction'] = female_action.apply(lambda x: x["female_click_count_"] + x["female_add_count_"] + x["female_buy_count_"] + x["female_save_count_"], axis=1)

female_action.head()

Unnamed: 0,user_id,female_click_count_,female_add_count_,female_buy_count_,female_save_count_,femaleTotalAction
0,2,47.0,0.0,14.0,2.0,63.0
1,3,63.0,0.0,4.0,1.0,68.0
2,5,150.0,0.0,13.0,10.0,173.0
3,6,217.0,0.0,17.0,15.0,249.0
4,7,6.0,0.0,8.0,0.0,14.0


In [303]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

# Total number of action
tot_act = female.shape[0]
print('Total number of female action: %f'%tot_act)
# Total number of users
tot_user = female['user_id'].nunique()
print('Total number of female users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per female user: %f'%avg_act)

columns = ['female_click_count_', 'female_add_count_', 'female_buy_count_','female_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = female_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = female_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['female_click_count_','female_add_count_','female_buy_count_','female_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    female_action[col_name] = female_action[i] / female_action['femaleTotalAction']

female_action.head()

Total number of female action: 32469221.000000
Total number of female users: 218829.000000
Average number of action per female user: 148.377139
Total number of female_click_count_ is: 28598079.000000
Mean of female_click_count_ is: 130.686879 

Total number of female_add_count_ is: 32539.000000
Mean of female_add_count_ is: 0.148696 

Total number of female_buy_count_ is: 1886475.000000
Mean of female_buy_count_ is: 8.620772 

Total number of female_save_count_ is: 1952128.000000
Mean of female_save_count_ is: 8.920792 



Unnamed: 0,user_id,female_click_count_,female_add_count_,female_buy_count_,female_save_count_,femaleTotalAction,female_click_count__ratio_,female_add_count__ratio_,female_buy_count__ratio_,female_save_count__ratio_
0,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746
1,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706
2,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803
3,6,217.0,0.0,17.0,15.0,249.0,0.871486,0.0,0.068273,0.060241
4,7,6.0,0.0,8.0,0.0,14.0,0.428571,0.0,0.571429,0.0


In [304]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
female['month'] = np.floor(female['time_stamp']/100).astype(int)
female['day'] = female['time_stamp']%100
female.head()

#stat: for each user, in how many days he has action(all types)
time_diff = female[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"female_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"female_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"female_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"female_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

female_overall = pd.merge(female_action, day_count, how = 'left', on='user_id')
female_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,female_click_count_,female_add_count_,female_buy_count_,female_save_count_,femaleTotalAction,female_click_count__ratio_,female_add_count__ratio_,female_buy_count__ratio_,female_save_count__ratio_,female_click_day_count,female_add_day_count,female_buy_day_count,female_save_day_count
0,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,5.0,0.0,5,2.0
1,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,12.0,0.0,3,1.0
2,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,29.0,0.0,7,9.0
3,6,217.0,0.0,17.0,15.0,249.0,0.871486,0.0,0.068273,0.060241,50.0,0.0,12,15.0
4,7,6.0,0.0,8.0,0.0,14.0,0.428571,0.0,0.571429,0.0,4.0,0.0,2,0.0


In [305]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['female_click_count_', 'female_add_count_', 'female_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    female_overall[conversion_name] = np.log1p(female_overall['female_buy_count_']) - np.log1p(female_overall[i])
    female_overall[conversion_diff_name] = female_overall[conversion_name] - female_overall[conversion_name].mean()

female_overall.head()

Unnamed: 0,user_id,female_click_count_,female_add_count_,female_buy_count_,female_save_count_,femaleTotalAction,female_click_count__ratio_,female_add_count__ratio_,female_buy_count__ratio_,female_save_count__ratio_,female_click_day_count,female_add_day_count,female_buy_day_count,female_save_day_count,female_click_count__conversion,female_click_count_conversion_diff,female_add_count__conversion,female_add_count_conversion_diff,female_save_count__conversion,female_save_count_conversion_diff
0,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,5.0,0.0,5,2.0,-1.163151,1.158035,2.70805,0.780892,1.609438,0.866011
1,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,12.0,0.0,3,1.0,-2.549445,-0.228259,1.609438,-0.31772,0.916291,0.172864
2,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,29.0,0.0,7,9.0,-2.378223,-0.057037,2.639057,0.711899,0.241162,-0.502265
3,6,217.0,0.0,17.0,15.0,249.0,0.871486,0.0,0.068273,0.060241,50.0,0.0,12,15.0,-2.494123,-0.172937,2.890372,0.963214,0.117783,-0.625644
4,7,6.0,0.0,8.0,0.0,14.0,0.428571,0.0,0.571429,0.0,4.0,0.0,2,0.0,0.251314,2.5725,2.197225,0.270067,2.197225,1.453798


In [306]:
female_overall.shape

(218829, 20)

In [307]:
male = tot_user_df[tot_user_df['male'] == 1]
male_action = male[["user_id", "action_type"]]
action_dummies = pd.get_dummies(male['action_type'])
male_action = pd.concat([male.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

male_action.rename(columns={0: "male_click_count_", 1:"male_add_count_", 2:"male_buy_count_", 3:"male_save_count_"},
                   inplace=True)

male_action['maleTotalAction'] = male_action.apply(lambda x: x["male_click_count_"] + x["male_add_count_"] + x["male_buy_count_"] + x["male_save_count_"], axis=1)

male_action.head()

Unnamed: 0,user_id,male_click_count_,male_add_count_,male_buy_count_,male_save_count_,maleTotalAction
0,1,27.0,0.0,6.0,0.0,33.0
1,11,28.0,0.0,4.0,0.0,32.0
2,16,63.0,0.0,4.0,0.0,67.0
3,17,30.0,0.0,5.0,5.0,40.0
4,18,106.0,3.0,1.0,0.0,110.0


In [308]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

# Total number of action
tot_act = male.shape[0]
print('Total number of male action: %f'%tot_act)
# Total number of users
tot_user = male['user_id'].nunique()
print('Total number of male users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per male user: %f'%avg_act)

columns = ['male_click_count_', 'male_add_count_', 'male_buy_count_','male_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = male_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = male_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['male_click_count_','male_add_count_','male_buy_count_','male_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    male_action[col_name] = male_action[i] / male_action['maleTotalAction']

male_action.head()

Total number of male action: 10382763.000000
Total number of male users: 99030.000000
Average number of action per male user: 104.844623
Total number of male_click_count_ is: 9251269.000000
Mean of male_click_count_ is: 93.418853 

Total number of male_add_count_ is: 17273.000000
Mean of male_add_count_ is: 0.174422 

Total number of male_buy_count_ is: 691451.000000
Mean of male_buy_count_ is: 6.982238 

Total number of male_save_count_ is: 422770.000000
Mean of male_save_count_ is: 4.269110 



Unnamed: 0,user_id,male_click_count_,male_add_count_,male_buy_count_,male_save_count_,maleTotalAction,male_click_count__ratio_,male_add_count__ratio_,male_buy_count__ratio_,male_save_count__ratio_
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0
1,11,28.0,0.0,4.0,0.0,32.0,0.875,0.0,0.125,0.0
2,16,63.0,0.0,4.0,0.0,67.0,0.940299,0.0,0.059701,0.0
3,17,30.0,0.0,5.0,5.0,40.0,0.75,0.0,0.125,0.125
4,18,106.0,3.0,1.0,0.0,110.0,0.963636,0.027273,0.009091,0.0


In [309]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
male['month'] = np.floor(male['time_stamp']/100).astype(int)
male['day'] = male['time_stamp']%100
male.head()

#stat: for each user, in how many days he has action(all types)
time_diff = male[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"male_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"male_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"male_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"male_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

male_overall = pd.merge(male_action, day_count, how = 'left', on='user_id')
male_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,male_click_count_,male_add_count_,male_buy_count_,male_save_count_,maleTotalAction,male_click_count__ratio_,male_add_count__ratio_,male_buy_count__ratio_,male_save_count__ratio_,male_click_day_count,male_add_day_count,male_buy_day_count,male_save_day_count
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,5.0,0.0,3,0.0
1,11,28.0,0.0,4.0,0.0,32.0,0.875,0.0,0.125,0.0,12.0,0.0,3,0.0
2,16,63.0,0.0,4.0,0.0,67.0,0.940299,0.0,0.059701,0.0,5.0,0.0,4,0.0
3,17,30.0,0.0,5.0,5.0,40.0,0.75,0.0,0.125,0.125,5.0,0.0,2,5.0
4,18,106.0,3.0,1.0,0.0,110.0,0.963636,0.027273,0.009091,0.0,5.0,1.0,1,0.0


In [310]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['male_click_count_', 'male_add_count_', 'male_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    male_overall[conversion_name] = np.log1p(male_overall['male_buy_count_']) - np.log1p(male_overall[i])
    male_overall[conversion_diff_name] = male_overall[conversion_name] - male_overall[conversion_name].mean()

male_overall.head()

Unnamed: 0,user_id,male_click_count_,male_add_count_,male_buy_count_,male_save_count_,maleTotalAction,male_click_count__ratio_,male_add_count__ratio_,male_buy_count__ratio_,male_save_count__ratio_,male_click_day_count,male_add_day_count,male_buy_day_count,male_save_day_count,male_click_count__conversion,male_click_count_conversion_diff,male_add_count__conversion,male_add_count_conversion_diff,male_save_count__conversion,male_save_count_conversion_diff
0,1,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,5.0,0.0,3,0.0,-1.386294,0.784473,1.94591,0.203838,1.94591,0.941063
1,11,28.0,0.0,4.0,0.0,32.0,0.875,0.0,0.125,0.0,12.0,0.0,3,0.0,-1.757858,0.412909,1.609438,-0.132634,1.609438,0.604591
2,16,63.0,0.0,4.0,0.0,67.0,0.940299,0.0,0.059701,0.0,5.0,0.0,4,0.0,-2.549445,-0.378678,1.609438,-0.132634,1.609438,0.604591
3,17,30.0,0.0,5.0,5.0,40.0,0.75,0.0,0.125,0.125,5.0,0.0,2,5.0,-1.642228,0.528539,1.791759,0.049688,0.0,-1.004847
4,18,106.0,3.0,1.0,0.0,110.0,0.963636,0.027273,0.009091,0.0,5.0,1.0,1,0.0,-3.979682,-1.808914,-0.693147,-2.435219,0.693147,-0.3117


In [311]:
male_overall.shape

(99030, 20)

In [313]:
no_gender = tot_user_df[tot_user_df['unknown'] == 1]
no_gender_action = no_gender[["user_id", "action_type"]]
action_dummies = pd.get_dummies(no_gender['action_type'])
no_gender_action = pd.concat([no_gender.user_id, action_dummies], axis=1).groupby(['user_id'], as_index=False).sum()

no_gender_action.rename(columns={0: "no_gender_click_count_", 1:"no_gender_add_count_", 2:"no_gender_buy_count_", 3:"no_gender_save_count_"},
                   inplace=True)

no_gender_action['no_genderTotalAction'] = no_gender_action.apply(lambda x: x["no_gender_click_count_"] + x["no_gender_add_count_"] + x["no_gender_buy_count_"] + x["no_gender_save_count_"], axis=1)

no_gender_action.head()

Unnamed: 0,user_id,no_gender_click_count_,no_gender_add_count_,no_gender_buy_count_,no_gender_save_count_,no_genderTotalAction
0,45,19.0,0.0,1.0,0.0,20.0
1,59,14.0,0.0,2.0,0.0,16.0
2,94,334.0,2.0,9.0,15.0,360.0
3,128,22.0,0.0,4.0,0.0,26.0
4,135,68.0,0.0,9.0,0.0,77.0


In [314]:
#### 2). Individual user analysis: proportion/mean -> total/click/add shopping cart/purchase/save to favourite

# Total number of action
tot_act = no_gender.shape[0]
print('Total number of no_gender action: %f'%tot_act)
# Total number of users
tot_user = no_gender['user_id'].nunique()
print('Total number of no_gender users: %f'%tot_user)
# Average number of action per user
avg_act = tot_act/tot_user
print('Average number of action per no_gender user: %f'%avg_act)

columns = ['no_gender_click_count_', 'no_gender_add_count_', 'no_gender_buy_count_','no_gender_save_count_']
for i in columns:
    #total number of each feature
    tot_feature = no_gender_action[i].sum()
    print("Total number of %s is: %f" % (i, tot_feature))
    #mean of each feature
    mean_feature = no_gender_action[i].mean()
    print("Mean of %s is: %f \n" % (i, mean_feature))

# proportion of click/add/purchased/save in terms of per individual user
action_ls = ['no_gender_click_count_','no_gender_add_count_','no_gender_buy_count_','no_gender_save_count_']
for i in action_ls:
    col_name = i + '_ratio_'
    no_gender_action[col_name] = no_gender_action[i] / no_gender_action['no_genderTotalAction']

no_gender_action.head()

Total number of no_gender action: 2142184.000000
Total number of no_gender users: 13397.000000
Average number of action per no_gender user: 159.900276
Total number of no_gender_click_count_ is: 1895451.000000
Mean of no_gender_click_count_ is: 141.483243 

Total number of no_gender_add_count_ is: 3679.000000
Mean of no_gender_add_count_ is: 0.274614 

Total number of no_gender_buy_count_ is: 129849.000000
Mean of no_gender_buy_count_ is: 9.692394 

Total number of no_gender_save_count_ is: 113205.000000
Mean of no_gender_save_count_ is: 8.450026 



Unnamed: 0,user_id,no_gender_click_count_,no_gender_add_count_,no_gender_buy_count_,no_gender_save_count_,no_genderTotalAction,no_gender_click_count__ratio_,no_gender_add_count__ratio_,no_gender_buy_count__ratio_,no_gender_save_count__ratio_
0,45,19.0,0.0,1.0,0.0,20.0,0.95,0.0,0.05,0.0
1,59,14.0,0.0,2.0,0.0,16.0,0.875,0.0,0.125,0.0
2,94,334.0,2.0,9.0,15.0,360.0,0.927778,0.005556,0.025,0.041667
3,128,22.0,0.0,4.0,0.0,26.0,0.846154,0.0,0.153846,0.0
4,135,68.0,0.0,9.0,0.0,77.0,0.883117,0.0,0.116883,0.0


In [315]:
#### 3). Evaluate the level of activeness, count for the whole period & each month

#extract month and day from time stamp
no_gender['month'] = np.floor(no_gender['time_stamp']/100).astype(int)
no_gender['day'] = no_gender['time_stamp']%100
no_gender.head()

#stat: for each user, in how many days he has action(all types)
time_diff = no_gender[['user_id','month','day','action_type','time_stamp']]
time_diff_stat = pd.DataFrame(time_diff.groupby(['user_id','month','day'])['action_type'].count())
time_diff_stat.head()

time_diff.head()

time_diff_stat.reset_index(inplace=True)

time_diff = pd.get_dummies(time_diff, columns = ["action_type"])

time_diff.head()

time_diff.rename(columns={"action_type_0": "click", "action_type_1":"add", 
                          "action_type_2":"buy", "action_type_3":"save"},inplace=True)

time_diff.head()

click = time_diff[time_diff["click"] == 1]
click_day = click.groupby(["user_id"])["time_stamp"].nunique()
click_day = pd.DataFrame(click_day).reset_index()
click_day.rename(columns = {"time_stamp":"no_gender_click_day_count"},inplace=True)
click_day.head()

add = time_diff[time_diff["add"] == 1]
add_day = add.groupby(["user_id"])["time_stamp"].nunique()
add_day = pd.DataFrame(add_day).reset_index()
add_day.rename(columns= {"time_stamp":"no_gender_add_day_count"}, inplace=True)
add_day.head()

buy = time_diff[time_diff["buy"] == 1]
buy_day = buy.groupby(["user_id"])["time_stamp"].nunique()
buy_day = pd.DataFrame(buy_day).reset_index()
buy_day.rename(columns= {"time_stamp":"no_gender_buy_day_count"}, inplace=True)
buy_day.head()

save = time_diff[time_diff["save"] == 1]
save_day = save.groupby(["user_id"])["time_stamp"].nunique()
save_day = pd.DataFrame(save_day).reset_index()
save_day.rename(columns= {"time_stamp":"no_gender_save_day_count"}, inplace=True)
save_day.head()

day_count = pd.merge(click_day, add_day, how = 'outer', on='user_id')
day_count = day_count.merge(buy_day, how = 'outer', on ='user_id')
day_count = day_count.merge(save_day, how='outer', on='user_id')
day_count.fillna(0, inplace=True)

no_gender_overall = pd.merge(no_gender_action, day_count, how = 'left', on='user_id')
no_gender_overall.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,user_id,no_gender_click_count_,no_gender_add_count_,no_gender_buy_count_,no_gender_save_count_,no_genderTotalAction,no_gender_click_count__ratio_,no_gender_add_count__ratio_,no_gender_buy_count__ratio_,no_gender_save_count__ratio_,no_gender_click_day_count,no_gender_add_day_count,no_gender_buy_day_count,no_gender_save_day_count
0,45,19.0,0.0,1.0,0.0,20.0,0.95,0.0,0.05,0.0,7.0,0.0,1,0.0
1,59,14.0,0.0,2.0,0.0,16.0,0.875,0.0,0.125,0.0,4.0,0.0,1,0.0
2,94,334.0,2.0,9.0,15.0,360.0,0.927778,0.005556,0.025,0.041667,21.0,2.0,4,3.0
3,128,22.0,0.0,4.0,0.0,26.0,0.846154,0.0,0.153846,0.0,2.0,0.0,1,0.0
4,135,68.0,0.0,9.0,0.0,77.0,0.883117,0.0,0.116883,0.0,10.0,0.0,2,0.0


In [316]:
#### 4). Conversion rate per user: click/saved to favourite/added to shopping cart -> purchase

# Conversion rate
#here we noticed that there are many 0 s in our data, thus, we use np.log1p to smooth our data.
conversion_ls = ['no_gender_click_count_', 'no_gender_add_count_', 'no_gender_save_count_']
for i in conversion_ls:
    conversion_name = i + '_conversion'
    conversion_diff_name = i + 'conversion_diff'
    no_gender_overall[conversion_name] = np.log1p(no_gender_overall['no_gender_buy_count_']) - np.log1p(no_gender_overall[i])
    no_gender_overall[conversion_diff_name] = no_gender_overall[conversion_name] - no_gender_overall[conversion_name].mean()

no_gender_overall.head()

Unnamed: 0,user_id,no_gender_click_count_,no_gender_add_count_,no_gender_buy_count_,no_gender_save_count_,no_genderTotalAction,no_gender_click_count__ratio_,no_gender_add_count__ratio_,no_gender_buy_count__ratio_,no_gender_save_count__ratio_,no_gender_click_day_count,no_gender_add_day_count,no_gender_buy_day_count,no_gender_save_day_count,no_gender_click_count__conversion,no_gender_click_count_conversion_diff,no_gender_add_count__conversion,no_gender_add_count_conversion_diff,no_gender_save_count__conversion,no_gender_save_count_conversion_diff
0,45,19.0,0.0,1.0,0.0,20.0,0.95,0.0,0.05,0.0,7.0,0.0,1,0.0,-2.302585,-0.033481,0.693147,-1.234033,0.693147,-0.165353
1,59,14.0,0.0,2.0,0.0,16.0,0.875,0.0,0.125,0.0,4.0,0.0,1,0.0,-1.609438,0.659666,1.098612,-0.828568,1.098612,0.240112
2,94,334.0,2.0,9.0,15.0,360.0,0.927778,0.005556,0.025,0.041667,21.0,2.0,4,3.0,-3.511545,-1.242441,1.203973,-0.723208,-0.470004,-1.328504
3,128,22.0,0.0,4.0,0.0,26.0,0.846154,0.0,0.153846,0.0,2.0,0.0,1,0.0,-1.526056,0.743048,1.609438,-0.317743,1.609438,0.750937
4,135,68.0,0.0,9.0,0.0,77.0,0.883117,0.0,0.116883,0.0,10.0,0.0,2,0.0,-1.931521,0.337583,2.302585,0.375404,2.302585,1.444085


In [317]:
no_gender_overall.shape

(13397, 20)

In [328]:
gender_overall = pd.DataFrame(index = user_overall.user_id)

In [329]:
gender_overall.reset_index(inplace=True)

In [330]:
gender_overall = gender_overall.merge(female_overall, on = 'user_id', how='outer')
gender_overall = gender_overall.merge(male_overall, on = 'user_id', how='outer')
gender_overall = gender_overall.merge(no_gender_overall, on = 'user_id', how='outer')
gender_overall.head()

Unnamed: 0,user_id,female_click_count_,female_add_count_,female_buy_count_,female_save_count_,femaleTotalAction,female_click_count__ratio_,female_add_count__ratio_,female_buy_count__ratio_,female_save_count__ratio_,female_click_day_count,female_add_day_count,female_buy_day_count,female_save_day_count,female_click_count__conversion,female_click_count_conversion_diff,female_add_count__conversion,female_add_count_conversion_diff,female_save_count__conversion,female_save_count_conversion_diff,male_click_count_,male_add_count_,male_buy_count_,male_save_count_,maleTotalAction,male_click_count__ratio_,male_add_count__ratio_,male_buy_count__ratio_,male_save_count__ratio_,male_click_day_count,male_add_day_count,male_buy_day_count,male_save_day_count,male_click_count__conversion,male_click_count_conversion_diff,male_add_count__conversion,male_add_count_conversion_diff,male_save_count__conversion,male_save_count_conversion_diff,no_gender_click_count_,no_gender_add_count_,no_gender_buy_count_,no_gender_save_count_,no_genderTotalAction,no_gender_click_count__ratio_,no_gender_add_count__ratio_,no_gender_buy_count__ratio_,no_gender_save_count__ratio_,no_gender_click_day_count,no_gender_add_day_count,no_gender_buy_day_count,no_gender_save_day_count,no_gender_click_count__conversion,no_gender_click_count_conversion_diff,no_gender_add_count__conversion,no_gender_add_count_conversion_diff,no_gender_save_count__conversion,no_gender_save_count_conversion_diff
0,1,,,,,,,,,,,,,,,,,,,,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,5.0,0.0,3.0,0.0,-1.386294,0.784473,1.94591,0.203838,1.94591,0.941063,,,,,,,,,,,,,,,,,,,
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,5.0,0.0,5.0,2.0,-1.163151,1.158035,2.70805,0.780892,1.609438,0.866011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,12.0,0.0,3.0,1.0,-2.549445,-0.228259,1.609438,-0.31772,0.916291,0.172864,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,29.0,0.0,7.0,9.0,-2.378223,-0.057037,2.639057,0.711899,0.241162,-0.502265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [331]:
#fill na as 9999, means not applicable
gender_overall.fillna(9999, inplace=True)

In [332]:
gender_overall.head()

Unnamed: 0,user_id,female_click_count_,female_add_count_,female_buy_count_,female_save_count_,femaleTotalAction,female_click_count__ratio_,female_add_count__ratio_,female_buy_count__ratio_,female_save_count__ratio_,female_click_day_count,female_add_day_count,female_buy_day_count,female_save_day_count,female_click_count__conversion,female_click_count_conversion_diff,female_add_count__conversion,female_add_count_conversion_diff,female_save_count__conversion,female_save_count_conversion_diff,male_click_count_,male_add_count_,male_buy_count_,male_save_count_,maleTotalAction,male_click_count__ratio_,male_add_count__ratio_,male_buy_count__ratio_,male_save_count__ratio_,male_click_day_count,male_add_day_count,male_buy_day_count,male_save_day_count,male_click_count__conversion,male_click_count_conversion_diff,male_add_count__conversion,male_add_count_conversion_diff,male_save_count__conversion,male_save_count_conversion_diff,no_gender_click_count_,no_gender_add_count_,no_gender_buy_count_,no_gender_save_count_,no_genderTotalAction,no_gender_click_count__ratio_,no_gender_add_count__ratio_,no_gender_buy_count__ratio_,no_gender_save_count__ratio_,no_gender_click_day_count,no_gender_add_day_count,no_gender_buy_day_count,no_gender_save_day_count,no_gender_click_count__conversion,no_gender_click_count_conversion_diff,no_gender_add_count__conversion,no_gender_add_count_conversion_diff,no_gender_save_count__conversion,no_gender_save_count_conversion_diff
0,1,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,27.0,0.0,6.0,0.0,33.0,0.818182,0.0,0.181818,0.0,5.0,0.0,3.0,0.0,-1.386294,0.784473,1.94591,0.203838,1.94591,0.941063,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
1,2,47.0,0.0,14.0,2.0,63.0,0.746032,0.0,0.222222,0.031746,5.0,0.0,5.0,2.0,-1.163151,1.158035,2.70805,0.780892,1.609438,0.866011,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
2,3,63.0,0.0,4.0,1.0,68.0,0.926471,0.0,0.058824,0.014706,12.0,0.0,3.0,1.0,-2.549445,-0.228259,1.609438,-0.31772,0.916291,0.172864,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
3,4,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
4,5,150.0,0.0,13.0,10.0,173.0,0.867052,0.0,0.075145,0.057803,29.0,0.0,7.0,9.0,-2.378223,-0.057037,2.639057,0.711899,0.241162,-0.502265,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0


In [339]:
gender_overall.shape

(424170, 58)

In [340]:
user_overall.shape

(424170, 27)

In [341]:
age_overall.shape

(424170, 153)

In [342]:
user_overall.to_csv('user_overall.csv')
age_overall.to_csv('age_overall.csv')
gender_overall.to_csv('gender_overall.csv')