In this code we will construct behavioral profiles for each customer by month. This notebook will run for each of 7 months (from October 2019 to April 2020) worth of ecommerce data from an online multi-category store. There will be four types of customer profiles:
    - Overall profile - customer overall behavior
    - Customer Category profile
    - Customer Brand profile
    - Customer Product profile
 
- The dataset was gathered from the Kaggle website from the following link: https://drive.google.com/drive/folders/1Nan8X33H8xrXS5XhCKZmSpClFTCJsSpE
- The main Kaggle page for the dataset is the following: https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store
- The dataset in Kaggle originally comes from REES46 Marketing Platform : https://rees46.com/

In [1]:
# Loading basic needed libraries
import pandas as pd
import numpy as np
import gc
from functools import reduce

# Loading libraries for S3 bucket connection
import boto3
import io
from io import StringIO,BytesIO, TextIOWrapper
import gzip

client = boto3.client('s3') 
resource = boto3.resource('s3') 


In [2]:
# Code Parameters - Using code parameters in order to make code reusable and easy to run for each of the 7 months of data
month = '2019-Oct'

#### Data Gathering Step

In [3]:
# Reading csv file from S3
df_month = pd.read_csv('s3://predictive-maintenance-bucket/data/{0}.csv'.format(month))
df_month.nunique()

event_time       2621538
event_type             3
product_id        166794
category_id          624
category_code        126
brand               3444
price              65298
user_id          3022290
user_session     9244421
dtype: int64

In [4]:
# Preliminary Cleaning
# Only keeping records that contain the category_code and brand in order to make more meaningful recommendations
df_month = df_month.dropna(subset=['category_code', 'brand'])
df_month['category'] = df_month['category_id'].astype(str) + '_' + df_month['category_code']# Creating category column
df_month.nunique()

event_time       2558113
event_type             3
product_id         60371
category_id          248
category_code        126
brand               1731
price              50795
user_id          2323036
user_session     6419693
category             248
dtype: int64

In [5]:
# Checking distinct events
pd.DataFrame(df_month['event_type'].value_counts()).reset_index()

Unnamed: 0,index,event_type
0,view,25201706
1,cart,809409
2,purchase,549507


In [6]:
# Checking distribution of price
pd.options.display.float_format = '{:.2f}'.format# Suppresing scientific notation
df_month['price'].describe()

count   26560622.00
mean         352.86
std          381.08
min            0.88
25%          108.11
50%          218.51
75%          459.09
max         2574.07
Name: price, dtype: float64

In [7]:
# Creating date column from the event_time column 
df_month['event_date'] = df_month['event_time'].str[:10]# Grabbing only date portion from original event_time column
df_month['event_date'] = df_month['event_date'].astype('datetime64[ns]')# Transforming to datetime format
df_month.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,category,event_date
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,2053013552326770905_appliances.environment.wat...,2019-10-01
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,2053013558920217191_computers.notebook,2019-10-01
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,2053013555631882655_electronics.smartphone,2019-10-01
5,2019-10-01 00:00:05 UTC,view,1480613,2053013561092866779,computers.desktop,pulser,908.62,512742880,0d0d91c2-c9c2-4e81-90a5-86594dec0db9,2053013561092866779_computers.desktop,2019-10-01
8,2019-10-01 00:00:10 UTC,view,28719074,2053013565480109009,apparel.shoes.keds,baden,102.71,520571932,ac1cd4e5-a3ce-4224-a2d7-ff660a105880,2053013565480109009_apparel.shoes.keds,2019-10-01


In [8]:
df_month.dtypes# Checking typing of columns

event_time               object
event_type               object
product_id                int64
category_id               int64
category_code            object
brand                    object
price                   float64
user_id                   int64
user_session             object
category                 object
event_date       datetime64[ns]
dtype: object

#### Customer Overall Profile Construction

In [9]:
# Obtaining event counts by customer
customer_views = pd.DataFrame(df_month.loc[df_month['event_type'] == 'view'].groupby(['user_id', 'event_type']).size()).reset_index()
customer_views.columns = ['user_id','event_type','view_count']# Renaming columns
customer_views = customer_views[['user_id','view_count']]# Keeping only wanted columns

customer_carts = pd.DataFrame(df_month.loc[df_month['event_type'] == 'cart'].groupby(['user_id', 'event_type']).size()).reset_index()
customer_carts.columns = ['user_id','event_type','cart_count']# Renaming columns
customer_carts = customer_carts[['user_id','cart_count']]# Keeping only wanted columns

customer_purchases = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id', 'event_type']).size()).reset_index()
customer_purchases.columns = ['user_id','event_type','purchase_count']# Renaming columns
customer_purchases = customer_purchases[['user_id','purchase_count']]# Keeping only wanted columns

# Merging event counts dfs together
data_frames = [customer_views, customer_carts, customer_purchases]
customer_profile = reduce(lambda  left,right: pd.merge(left,right,on=['user_id'],how='outer'), data_frames)
# Filling NaN values with 0
customer_profile[['view_count', 'cart_count', 'purchase_count']] = customer_profile[['view_count', 'cart_count', 'purchase_count']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,view_count,cart_count,purchase_count
0,33869381,1.0,0.0,0.0
1,184265397,4.0,0.0,0.0
2,195082191,1.0,0.0,0.0
3,200673532,4.0,0.0,0.0
4,208669541,1.0,0.0,0.0


In [10]:
customer_profile.nunique()

user_id           2323036
view_count            754
cart_count            128
purchase_count        123
dtype: int64

In [11]:
# Obtaining sessions by customer
customer_sessions = pd.DataFrame(df_month.groupby(['user_id', 'user_session']).size()).reset_index()
customer_sessions.columns = ['user_id','user_session','session_count']# Renaming columns
customer_sessions = customer_sessions[['user_id','session_count']]# Keeping only wanted columns

#Merging customer_sessions with customer_profile
customer_profile = pd.merge(customer_profile, customer_sessions, on=["user_id"], how='left')
# Filling NaN values with 0
customer_profile[['session_count']] = customer_profile[['session_count']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,view_count,cart_count,purchase_count,session_count
0,33869381,1.0,0.0,0.0,1
1,184265397,4.0,0.0,0.0,4
2,195082191,1.0,0.0,0.0,1
3,200673532,4.0,0.0,0.0,1
4,200673532,4.0,0.0,0.0,1


In [12]:
# Dropping duplicates as we only want one row per each user_id
customer_profile = customer_profile.drop_duplicates(subset=['user_id'])
customer_profile.shape

(2323036, 5)

In [13]:
customer_profile.head()

Unnamed: 0,user_id,view_count,cart_count,purchase_count,session_count
0,33869381,1.0,0.0,0.0,1
1,184265397,4.0,0.0,0.0,4
2,195082191,1.0,0.0,0.0,1
3,200673532,4.0,0.0,0.0,1
6,208669541,1.0,0.0,0.0,1


In [14]:
# Obtaining spent by customer
customer_spent = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id'])['price'].agg('sum')).reset_index()
customer_spent.columns = ['user_id','spent']# Renaming columns

#Merging customer_spent with customer_profile
customer_profile = pd.merge(customer_profile, customer_spent, on=["user_id"], how='left')
# Filling NaN values with 0
customer_profile[['spent']] = customer_profile[['spent']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,view_count,cart_count,purchase_count,session_count,spent
0,33869381,1.0,0.0,0.0,1,0.0
1,184265397,4.0,0.0,0.0,4,0.0
2,195082191,1.0,0.0,0.0,1,0.0
3,200673532,4.0,0.0,0.0,1,0.0
4,208669541,1.0,0.0,0.0,1,0.0


In [15]:
# Grabbing min and max spent a customer has
cust_spent = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id']).price.agg(min_spent=np.min, max_spent=np.max, median_spent=np.median, avg_spent=np.average).reset_index())
# Merging results
customer_profile = pd.merge(customer_profile, cust_spent, on=["user_id"], how='left')
# Filling NaN values with 0
customer_profile[['min_spent','max_spent','median_spent','avg_spent']] = customer_profile[['min_spent','max_spent','median_spent','avg_spent']].fillna(value=0)                          
customer_profile.head()

Unnamed: 0,user_id,view_count,cart_count,purchase_count,session_count,spent,min_spent,max_spent,median_spent,avg_spent
0,33869381,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
1,184265397,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0
2,195082191,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
3,200673532,4.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
4,208669541,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0


In [16]:
# Grabbing first and last date a customer did an event (any event)
cust_dates = df_month.groupby(['user_id']).event_date.agg(min_activity_date=np.min, max_activity_date=np.max).reset_index()
# Merging results
customer_profile = pd.merge(customer_profile, cust_dates, on=["user_id"], how='inner')
customer_profile.head()

Unnamed: 0,user_id,view_count,cart_count,purchase_count,session_count,spent,min_spent,max_spent,median_spent,avg_spent,min_activity_date,max_activity_date
0,33869381,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23
1,184265397,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04
2,195082191,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10
3,200673532,4.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-13
4,208669541,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04


In [17]:
customer_profile.shape

(2323036, 12)

In [18]:
# Grabbing first and last date by event for each customer
cust_dates_view = df_month.loc[df_month['event_type'] == 'view'].groupby(['user_id']).event_date.agg(min_view_date=np.min, max_view_date=np.max).reset_index()
cust_dates_cart = df_month.loc[df_month['event_type'] == 'cart'].groupby(['user_id']).event_date.agg(min_cart_date=np.min, max_cart_date=np.max).reset_index()
cust_dates_purchase = df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id']).event_date.agg(min_purchase_date=np.min, max_purchase_date=np.max).reset_index()

#Merging dfs together
data_frames = [customer_profile,cust_dates_view, cust_dates_cart, cust_dates_purchase]
customer_profile = reduce(lambda  left,right: pd.merge(left,right,on=['user_id'],how='outer'), data_frames)
customer_profile.head()

Unnamed: 0,user_id,view_count,cart_count,purchase_count,session_count,spent,min_spent,max_spent,median_spent,avg_spent,min_activity_date,max_activity_date,min_view_date,max_view_date,min_cart_date,max_cart_date,min_purchase_date,max_purchase_date
0,33869381,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23,2019-10-23,2019-10-23,NaT,NaT,NaT,NaT
1,184265397,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT
2,195082191,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10,2019-10-10,2019-10-10,NaT,NaT,NaT,NaT
3,200673532,4.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-13,2019-10-10,2019-10-13,NaT,NaT,NaT,NaT
4,208669541,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT


In [19]:
customer_profile.nunique()

user_id              2323036
view_count               754
cart_count               128
purchase_count           123
session_count            167
spent                  80790
min_spent              15621
max_spent              15951
median_spent           46290
avg_spent              79706
min_activity_date         31
max_activity_date         31
min_view_date             31
max_view_date             31
min_cart_date             31
max_cart_date             31
min_purchase_date         31
max_purchase_date         31
dtype: int64

In [20]:
# Adding month columns
customer_profile['month'] = month
customer_profile.head()

Unnamed: 0,user_id,view_count,cart_count,purchase_count,session_count,spent,min_spent,max_spent,median_spent,avg_spent,min_activity_date,max_activity_date,min_view_date,max_view_date,min_cart_date,max_cart_date,min_purchase_date,max_purchase_date,month
0,33869381,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23,2019-10-23,2019-10-23,NaT,NaT,NaT,NaT,2019-Oct
1,184265397,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT,2019-Oct
2,195082191,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10,2019-10-10,2019-10-10,NaT,NaT,NaT,NaT,2019-Oct
3,200673532,4.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-13,2019-10-10,2019-10-13,NaT,NaT,NaT,NaT,2019-Oct
4,208669541,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT,2019-Oct


In [21]:
# Saving results in S3
customer_profile.to_csv('s3://myaws-capstone-bucket/data/{0}/customer_profile.csv'.format(month),index=False)

#### Customer Category Profile Construction

In [22]:
# Obtaining event counts by customer and category
customer_views = pd.DataFrame(df_month.loc[df_month['event_type'] == 'view'].groupby(['user_id','category','event_type']).size()).reset_index()
customer_views.columns = ['user_id','category','event_type','view_count']# Renaming columns
customer_views = customer_views[['user_id','category','view_count']]# Keeping only wanted columns

customer_carts = pd.DataFrame(df_month.loc[df_month['event_type'] == 'cart'].groupby(['user_id','category', 'event_type']).size()).reset_index()
customer_carts.columns = ['user_id','category','event_type','cart_count']# Renaming columns
customer_carts = customer_carts[['user_id','category','cart_count']]# Keeping only wanted columns

customer_purchases = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','category', 'event_type']).size()).reset_index()
customer_purchases.columns = ['user_id','category','event_type','purchase_count']# Renaming columns
customer_purchases = customer_purchases[['user_id','category','purchase_count']]# Keeping only wanted columns

# Merging event counts dfs together
data_frames = [customer_views, customer_carts, customer_purchases]
customer_profile = reduce(lambda  left,right: pd.merge(left,right,on=['user_id','category'],how='outer'), data_frames)
# Filling NaN values with 0
customer_profile[['view_count', 'cart_count', 'purchase_count']] = customer_profile[['view_count', 'cart_count', 'purchase_count']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,category,view_count,cart_count,purchase_count
0,33869381,2053013560346280633_kids.carriage,1.0,0.0,0.0
1,184265397,2053013560312726199_furniture.living_room.chair,4.0,0.0,0.0
2,195082191,2053013554658804075_electronics.audio.headphone,1.0,0.0,0.0
3,200673532,2053013554155487563_computers.components.mothe...,2.0,0.0,0.0
4,200673532,2053013554247762257_computers.components.video...,2.0,0.0,0.0


In [23]:
# Obtaining sessions by customer and category
customer_sessions = pd.DataFrame(df_month.groupby(['user_id','category', 'user_session']).size()).reset_index()
customer_sessions.columns = ['user_id','category','user_session','session_count']# Renaming columns
customer_sessions = customer_sessions[['user_id','category','session_count']]# Keeping only wanted columns

#Merging customer_sessions with customer_profile
customer_profile = pd.merge(customer_profile, customer_sessions, on=["user_id","category"], how='left')
# Filling NaN values with 0
customer_profile[['session_count']] = customer_profile[['session_count']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,category,view_count,cart_count,purchase_count,session_count
0,33869381,2053013560346280633_kids.carriage,1.0,0.0,0.0,1
1,184265397,2053013560312726199_furniture.living_room.chair,4.0,0.0,0.0,4
2,195082191,2053013554658804075_electronics.audio.headphone,1.0,0.0,0.0,1
3,200673532,2053013554155487563_computers.components.mothe...,2.0,0.0,0.0,2
4,200673532,2053013554247762257_computers.components.video...,2.0,0.0,0.0,1


In [24]:
# Dropping duplicates as we only want one row per distinct user_id/category
customer_profile = customer_profile.drop_duplicates(subset=['user_id','category'])

In [25]:
# Obtaining spent by customer/category combination
customer_spent = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','category'])['price'].agg('sum')).reset_index()
customer_spent.columns = ['user_id','category','spent']# Renaming columns

#Merging customer_spent with customer_profile
customer_profile = pd.merge(customer_profile, customer_spent, on=["user_id","category"], how='left')
# Filling NaN values with 0
customer_profile[['spent']] = customer_profile[['spent']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,category,view_count,cart_count,purchase_count,session_count,spent
0,33869381,2053013560346280633_kids.carriage,1.0,0.0,0.0,1,0.0
1,184265397,2053013560312726199_furniture.living_room.chair,4.0,0.0,0.0,4,0.0
2,195082191,2053013554658804075_electronics.audio.headphone,1.0,0.0,0.0,1,0.0
3,200673532,2053013554155487563_computers.components.mothe...,2.0,0.0,0.0,2,0.0
4,200673532,2053013554247762257_computers.components.video...,2.0,0.0,0.0,1,0.0


In [26]:
# Grabbing min and max spent a customer has per category
cust_spent = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','category']).price.agg(min_category_spent=np.min, max_category_spent=np.max, median_category_spent=np.median, avg_category_spent=np.average).reset_index())
# Merging results
customer_profile = pd.merge(customer_profile, cust_spent, on=["user_id","category"], how='left')
# Filling NaN values with 0
customer_profile[['min_category_spent','max_category_spent','median_category_spent','avg_category_spent']] = customer_profile[['min_category_spent','max_category_spent','median_category_spent','avg_category_spent']].fillna(value=0)                          
customer_profile.head()

Unnamed: 0,user_id,category,view_count,cart_count,purchase_count,session_count,spent,min_category_spent,max_category_spent,median_category_spent,avg_category_spent
0,33869381,2053013560346280633_kids.carriage,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
1,184265397,2053013560312726199_furniture.living_room.chair,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0
2,195082191,2053013554658804075_electronics.audio.headphone,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
3,200673532,2053013554155487563_computers.components.mothe...,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0
4,200673532,2053013554247762257_computers.components.video...,2.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0


In [27]:
# Grabbing first and last date a customer did an event (any event) for a specific category
cust_dates = df_month.groupby(['user_id','category']).event_date.agg(min_activity_date=np.min, max_activity_date=np.max).reset_index()
# Merging results
customer_profile = pd.merge(customer_profile, cust_dates, on=["user_id","category"], how='inner')
customer_profile.head()

Unnamed: 0,user_id,category,view_count,cart_count,purchase_count,session_count,spent,min_category_spent,max_category_spent,median_category_spent,avg_category_spent,min_activity_date,max_activity_date
0,33869381,2053013560346280633_kids.carriage,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23
1,184265397,2053013560312726199_furniture.living_room.chair,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04
2,195082191,2053013554658804075_electronics.audio.headphone,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10
3,200673532,2053013554155487563_computers.components.mothe...,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-12,2019-10-13
4,200673532,2053013554247762257_computers.components.video...,2.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-12


In [28]:
# Grabbing first and last date by event for each customer/category
cust_dates_view = df_month.loc[df_month['event_type'] == 'view'].groupby(['user_id','category']).event_date.agg(min_view_date=np.min, max_view_date=np.max).reset_index()
cust_dates_cart = df_month.loc[df_month['event_type'] == 'cart'].groupby(['user_id','category']).event_date.agg(min_cart_date=np.min, max_cart_date=np.max).reset_index()
cust_dates_purchase = df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','category']).event_date.agg(min_purchase_date=np.min, max_purchase_date=np.max).reset_index()

#Merging dfs together
data_frames = [customer_profile,cust_dates_view, cust_dates_cart, cust_dates_purchase]
customer_profile = reduce(lambda  left,right: pd.merge(left,right,on=['user_id','category'],how='outer'), data_frames)
customer_profile.head()

Unnamed: 0,user_id,category,view_count,cart_count,purchase_count,session_count,spent,min_category_spent,max_category_spent,median_category_spent,avg_category_spent,min_activity_date,max_activity_date,min_view_date,max_view_date,min_cart_date,max_cart_date,min_purchase_date,max_purchase_date
0,33869381,2053013560346280633_kids.carriage,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23,2019-10-23,2019-10-23,NaT,NaT,NaT,NaT
1,184265397,2053013560312726199_furniture.living_room.chair,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT
2,195082191,2053013554658804075_electronics.audio.headphone,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10,2019-10-10,2019-10-10,NaT,NaT,NaT,NaT
3,200673532,2053013554155487563_computers.components.mothe...,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-12,2019-10-13,2019-10-12,2019-10-13,NaT,NaT,NaT,NaT
4,200673532,2053013554247762257_computers.components.video...,2.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-12,2019-10-10,2019-10-12,NaT,NaT,NaT,NaT


In [29]:
# Adding month columns
customer_profile['month'] = month
# Dropping duplicates as we only want one row per distinct user_id/category
customer_profile = customer_profile.drop_duplicates(subset=['user_id','category'])
customer_profile.head()

Unnamed: 0,user_id,category,view_count,cart_count,purchase_count,session_count,spent,min_category_spent,max_category_spent,median_category_spent,avg_category_spent,min_activity_date,max_activity_date,min_view_date,max_view_date,min_cart_date,max_cart_date,min_purchase_date,max_purchase_date,month
0,33869381,2053013560346280633_kids.carriage,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23,2019-10-23,2019-10-23,NaT,NaT,NaT,NaT,2019-Oct
1,184265397,2053013560312726199_furniture.living_room.chair,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT,2019-Oct
2,195082191,2053013554658804075_electronics.audio.headphone,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10,2019-10-10,2019-10-10,NaT,NaT,NaT,NaT,2019-Oct
3,200673532,2053013554155487563_computers.components.mothe...,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-12,2019-10-13,2019-10-12,2019-10-13,NaT,NaT,NaT,NaT,2019-Oct
4,200673532,2053013554247762257_computers.components.video...,2.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-12,2019-10-10,2019-10-12,NaT,NaT,NaT,NaT,2019-Oct


In [30]:
# Saving results in S3
customer_profile.to_csv('s3://myaws-capstone-bucket/data/{0}/customer_category_profile.csv'.format(month),index=False)

#### Customer Brand Profile Construction

In [31]:
# Obtaining event counts by customer and brand
customer_views = pd.DataFrame(df_month.loc[df_month['event_type'] == 'view'].groupby(['user_id','brand','event_type']).size()).reset_index()
customer_views.columns = ['user_id','brand','event_type','view_count']# Renaming columns
customer_views = customer_views[['user_id','brand','view_count']]# Keeping only wanted columns

customer_carts = pd.DataFrame(df_month.loc[df_month['event_type'] == 'cart'].groupby(['user_id','brand', 'event_type']).size()).reset_index()
customer_carts.columns = ['user_id','brand','event_type','cart_count']# Renaming columns
customer_carts = customer_carts[['user_id','brand','cart_count']]# Keeping only wanted columns

customer_purchases = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','brand', 'event_type']).size()).reset_index()
customer_purchases.columns = ['user_id','brand','event_type','purchase_count']# Renaming columns
customer_purchases = customer_purchases[['user_id','brand','purchase_count']]# Keeping only wanted columns

# Merging event counts dfs together
data_frames = [customer_views, customer_carts, customer_purchases]
customer_profile = reduce(lambda  left,right: pd.merge(left,right,on=['user_id','brand'],how='outer'), data_frames)
# Filling NaN values with 0
customer_profile[['view_count', 'cart_count', 'purchase_count']] = customer_profile[['view_count', 'cart_count', 'purchase_count']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,brand,view_count,cart_count,purchase_count
0,33869381,bumbleride,1.0,0.0,0.0
1,184265397,joie,4.0,0.0,0.0
2,195082191,apple,1.0,0.0,0.0
3,200673532,asrock,2.0,0.0,0.0
4,200673532,sinotex,2.0,0.0,0.0


In [32]:
# Obtaining sessions by customer and brand
customer_sessions = pd.DataFrame(df_month.groupby(['user_id','brand', 'user_session']).size()).reset_index()
customer_sessions.columns = ['user_id','brand','user_session','session_count']# Renaming columns
customer_sessions = customer_sessions[['user_id','brand','session_count']]# Keeping only wanted columns

#Merging customer_sessions with customer_profile
customer_profile = pd.merge(customer_profile, customer_sessions, on=["user_id","brand"], how='left')
# Filling NaN values with 0
customer_profile[['session_count']] = customer_profile[['session_count']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,brand,view_count,cart_count,purchase_count,session_count
0,33869381,bumbleride,1.0,0.0,0.0,1
1,184265397,joie,4.0,0.0,0.0,4
2,195082191,apple,1.0,0.0,0.0,1
3,200673532,asrock,2.0,0.0,0.0,2
4,200673532,sinotex,2.0,0.0,0.0,1


In [33]:
# Dropping duplicates as we only want one row per distinct user_id/brand
customer_profile = customer_profile.drop_duplicates(subset=['user_id','brand'])

In [34]:
# Obtaining spent by customer/brand combination
customer_spent = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','brand'])['price'].agg('sum')).reset_index()
customer_spent.columns = ['user_id','brand','spent']# Renaming columns

#Merging customer_spent with customer_profile
customer_profile = pd.merge(customer_profile, customer_spent, on=["user_id","brand"], how='left')
# Filling NaN values with 0
customer_profile[['spent']] = customer_profile[['spent']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,brand,view_count,cart_count,purchase_count,session_count,spent
0,33869381,bumbleride,1.0,0.0,0.0,1,0.0
1,184265397,joie,4.0,0.0,0.0,4,0.0
2,195082191,apple,1.0,0.0,0.0,1,0.0
3,200673532,asrock,2.0,0.0,0.0,2,0.0
4,200673532,sinotex,2.0,0.0,0.0,1,0.0


In [35]:
# Grabbing min and max spent a customer has per brand
cust_spent = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','brand']).price.agg(min_brand_spent=np.min, max_brand_spent=np.max, median_brand_spent=np.median, avg_brand_spent=np.average).reset_index())
# Merging results
customer_profile = pd.merge(customer_profile, cust_spent, on=["user_id","brand"], how='left')
# Filling NaN values with 0
customer_profile[['min_brand_spent','max_brand_spent','median_brand_spent','avg_brand_spent']] = customer_profile[['min_brand_spent','max_brand_spent','median_brand_spent','avg_brand_spent']].fillna(value=0)                          
customer_profile.head()

Unnamed: 0,user_id,brand,view_count,cart_count,purchase_count,session_count,spent,min_brand_spent,max_brand_spent,median_brand_spent,avg_brand_spent
0,33869381,bumbleride,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
1,184265397,joie,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0
2,195082191,apple,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
3,200673532,asrock,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0
4,200673532,sinotex,2.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0


In [36]:
# Grabbing first and last date a customer did an event (any event) for a specific brand
cust_dates = df_month.groupby(['user_id','brand']).event_date.agg(min_activity_date=np.min, max_activity_date=np.max).reset_index()
# Merging results
customer_profile = pd.merge(customer_profile, cust_dates, on=["user_id","brand"], how='inner')
customer_profile.head()

Unnamed: 0,user_id,brand,view_count,cart_count,purchase_count,session_count,spent,min_brand_spent,max_brand_spent,median_brand_spent,avg_brand_spent,min_activity_date,max_activity_date
0,33869381,bumbleride,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23
1,184265397,joie,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04
2,195082191,apple,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10
3,200673532,asrock,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-12,2019-10-13
4,200673532,sinotex,2.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-12


In [37]:
# Grabbing first and last date by event for each customer/brand
cust_dates_view = df_month.loc[df_month['event_type'] == 'view'].groupby(['user_id','brand']).event_date.agg(min_view_date=np.min, max_view_date=np.max).reset_index()
cust_dates_cart = df_month.loc[df_month['event_type'] == 'cart'].groupby(['user_id','brand']).event_date.agg(min_cart_date=np.min, max_cart_date=np.max).reset_index()
cust_dates_purchase = df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','brand']).event_date.agg(min_purchase_date=np.min, max_purchase_date=np.max).reset_index()

#Merging dfs together
data_frames = [customer_profile,cust_dates_view, cust_dates_cart, cust_dates_purchase]
customer_profile = reduce(lambda  left,right: pd.merge(left,right,on=['user_id','brand'],how='outer'), data_frames)
customer_profile.head()

Unnamed: 0,user_id,brand,view_count,cart_count,purchase_count,session_count,spent,min_brand_spent,max_brand_spent,median_brand_spent,avg_brand_spent,min_activity_date,max_activity_date,min_view_date,max_view_date,min_cart_date,max_cart_date,min_purchase_date,max_purchase_date
0,33869381,bumbleride,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23,2019-10-23,2019-10-23,NaT,NaT,NaT,NaT
1,184265397,joie,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT
2,195082191,apple,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10,2019-10-10,2019-10-10,NaT,NaT,NaT,NaT
3,200673532,asrock,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-12,2019-10-13,2019-10-12,2019-10-13,NaT,NaT,NaT,NaT
4,200673532,sinotex,2.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-12,2019-10-10,2019-10-12,NaT,NaT,NaT,NaT


In [38]:
# Adding month columns
customer_profile['month'] = month
# Dropping duplicates as we only want one row per distinct user_id/brand
customer_profile = customer_profile.drop_duplicates(subset=['user_id','brand'])
customer_profile.head()

Unnamed: 0,user_id,brand,view_count,cart_count,purchase_count,session_count,spent,min_brand_spent,max_brand_spent,median_brand_spent,avg_brand_spent,min_activity_date,max_activity_date,min_view_date,max_view_date,min_cart_date,max_cart_date,min_purchase_date,max_purchase_date,month
0,33869381,bumbleride,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23,2019-10-23,2019-10-23,NaT,NaT,NaT,NaT,2019-Oct
1,184265397,joie,4.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT,2019-Oct
2,195082191,apple,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10,2019-10-10,2019-10-10,NaT,NaT,NaT,NaT,2019-Oct
3,200673532,asrock,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-12,2019-10-13,2019-10-12,2019-10-13,NaT,NaT,NaT,NaT,2019-Oct
4,200673532,sinotex,2.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-12,2019-10-10,2019-10-12,NaT,NaT,NaT,NaT,2019-Oct


In [39]:
# Saving results in S3
customer_profile.to_csv('s3://myaws-capstone-bucket/data/{0}/customer_brand_profile.csv'.format(month),index=False)

#### Customer Product Profile Construction

In [40]:
# Creating product column that will also contain the category to make it more meaningful when looking at it
df_month['product'] = df_month['product_id'].astype(str) + '_' + df_month['category_code']
df_month.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,category,event_date,product
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,2053013552326770905_appliances.environment.wat...,2019-10-01,3900821_appliances.environment.water_heater
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,2053013558920217191_computers.notebook,2019-10-01,1307067_computers.notebook
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,2053013555631882655_electronics.smartphone,2019-10-01,1004237_electronics.smartphone
5,2019-10-01 00:00:05 UTC,view,1480613,2053013561092866779,computers.desktop,pulser,908.62,512742880,0d0d91c2-c9c2-4e81-90a5-86594dec0db9,2053013561092866779_computers.desktop,2019-10-01,1480613_computers.desktop
8,2019-10-01 00:00:10 UTC,view,28719074,2053013565480109009,apparel.shoes.keds,baden,102.71,520571932,ac1cd4e5-a3ce-4224-a2d7-ff660a105880,2053013565480109009_apparel.shoes.keds,2019-10-01,28719074_apparel.shoes.keds


In [41]:
# Obtaining event counts by customer and product
customer_views = pd.DataFrame(df_month.loc[df_month['event_type'] == 'view'].groupby(['user_id','product','event_type']).size()).reset_index()
customer_views.columns = ['user_id','product','event_type','view_count']# Renaming columns
customer_views = customer_views[['user_id','product','view_count']]# Keeping only wanted columns

customer_carts = pd.DataFrame(df_month.loc[df_month['event_type'] == 'cart'].groupby(['user_id','product', 'event_type']).size()).reset_index()
customer_carts.columns = ['user_id','product','event_type','cart_count']# Renaming columns
customer_carts = customer_carts[['user_id','product','cart_count']]# Keeping only wanted columns

customer_purchases = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','product', 'event_type']).size()).reset_index()
customer_purchases.columns = ['user_id','product','event_type','purchase_count']# Renaming columns
customer_purchases = customer_purchases[['user_id','product','purchase_count']]# Keeping only wanted columns

# Merging event counts dfs together
data_frames = [customer_views, customer_carts, customer_purchases]
customer_profile = reduce(lambda  left,right: pd.merge(left,right,on=['user_id','product'],how='outer'), data_frames)
# Filling NaN values with 0
customer_profile[['view_count', 'cart_count', 'purchase_count']] = customer_profile[['view_count', 'cart_count', 'purchase_count']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,product,view_count,cart_count,purchase_count
0,33869381,7002639_kids.carriage,1.0,0.0,0.0
1,184265397,6902133_furniture.living_room.chair,2.0,0.0,0.0
2,184265397,6902303_furniture.living_room.chair,2.0,0.0,0.0
3,195082191,4804056_electronics.audio.headphone,1.0,0.0,0.0
4,200673532,6501011_computers.components.motherboard,2.0,0.0,0.0


In [42]:
# Obtaining sessions by customer and product
customer_sessions = pd.DataFrame(df_month.groupby(['user_id','product', 'user_session']).size()).reset_index()
customer_sessions.columns = ['user_id','product','user_session','session_count']# Renaming columns
customer_sessions = customer_sessions[['user_id','product','session_count']]# Keeping only wanted columns

#Merging customer_sessions with customer_profile
customer_profile = pd.merge(customer_profile, customer_sessions, on=["user_id","product"], how='left')
# Filling NaN values with 0
customer_profile[['session_count']] = customer_profile[['session_count']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,product,view_count,cart_count,purchase_count,session_count
0,33869381,7002639_kids.carriage,1.0,0.0,0.0,1
1,184265397,6902133_furniture.living_room.chair,2.0,0.0,0.0,2
2,184265397,6902303_furniture.living_room.chair,2.0,0.0,0.0,2
3,195082191,4804056_electronics.audio.headphone,1.0,0.0,0.0,1
4,200673532,6501011_computers.components.motherboard,2.0,0.0,0.0,2


In [43]:
# Dropping duplicates as we only want one row per distinct user_id/product
customer_profile = customer_profile.drop_duplicates(subset=['user_id','product'])

In [44]:
# Obtaining spent by customer/product combination
customer_spent = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','product'])['price'].agg('sum')).reset_index()
customer_spent.columns = ['user_id','product','spent']# Renaming columns

#Merging customer_spent with customer_profile
customer_profile = pd.merge(customer_profile, customer_spent, on=["user_id","product"], how='left')
# Filling NaN values with 0
customer_profile[['spent']] = customer_profile[['spent']].fillna(value=0)
customer_profile.head()

Unnamed: 0,user_id,product,view_count,cart_count,purchase_count,session_count,spent
0,33869381,7002639_kids.carriage,1.0,0.0,0.0,1,0.0
1,184265397,6902133_furniture.living_room.chair,2.0,0.0,0.0,2,0.0
2,184265397,6902303_furniture.living_room.chair,2.0,0.0,0.0,2,0.0
3,195082191,4804056_electronics.audio.headphone,1.0,0.0,0.0,1,0.0
4,200673532,6501011_computers.components.motherboard,2.0,0.0,0.0,2,0.0


In [45]:
# Grabbing min and max spent a customer has per product
cust_spent = pd.DataFrame(df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','product']).price.agg(min_product_spent=np.min, max_product_spent=np.max, median_product_spent=np.median, avg_product_spent=np.average).reset_index())
# Merging results
customer_profile = pd.merge(customer_profile, cust_spent, on=["user_id","product"], how='left')
# Filling NaN values with 0
customer_profile[['min_product_spent','max_product_spent','median_product_spent','avg_product_spent']] = customer_profile[['min_product_spent','max_product_spent','median_product_spent','avg_product_spent']].fillna(value=0)                          
customer_profile.head()

Unnamed: 0,user_id,product,view_count,cart_count,purchase_count,session_count,spent,min_product_spent,max_product_spent,median_product_spent,avg_product_spent
0,33869381,7002639_kids.carriage,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
1,184265397,6902133_furniture.living_room.chair,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0
2,184265397,6902303_furniture.living_room.chair,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0
3,195082191,4804056_electronics.audio.headphone,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
4,200673532,6501011_computers.components.motherboard,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0


In [46]:
# Grabbing first and last date a customer did an event (any event) for a specific product
cust_dates = df_month.groupby(['user_id','product']).event_date.agg(min_activity_date=np.min, max_activity_date=np.max).reset_index()
# Merging results
customer_profile = pd.merge(customer_profile, cust_dates, on=["user_id","product"], how='inner')
customer_profile.head()

Unnamed: 0,user_id,product,view_count,cart_count,purchase_count,session_count,spent,min_product_spent,max_product_spent,median_product_spent,avg_product_spent,min_activity_date,max_activity_date
0,33869381,7002639_kids.carriage,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23
1,184265397,6902133_furniture.living_room.chair,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04
2,184265397,6902303_furniture.living_room.chair,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04
3,195082191,4804056_electronics.audio.headphone,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10
4,200673532,6501011_computers.components.motherboard,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-12,2019-10-13


In [47]:
# Grabbing first and last date by event for each customer/product
cust_dates_view = df_month.loc[df_month['event_type'] == 'view'].groupby(['user_id','product']).event_date.agg(min_view_date=np.min, max_view_date=np.max).reset_index()
cust_dates_cart = df_month.loc[df_month['event_type'] == 'cart'].groupby(['user_id','product']).event_date.agg(min_cart_date=np.min, max_cart_date=np.max).reset_index()
cust_dates_purchase = df_month.loc[df_month['event_type'] == 'purchase'].groupby(['user_id','product']).event_date.agg(min_purchase_date=np.min, max_purchase_date=np.max).reset_index()

#Merging dfs together
data_frames = [customer_profile,cust_dates_view, cust_dates_cart, cust_dates_purchase]
customer_profile = reduce(lambda  left,right: pd.merge(left,right,on=['user_id','product'],how='outer'), data_frames)
customer_profile.head()

Unnamed: 0,user_id,product,view_count,cart_count,purchase_count,session_count,spent,min_product_spent,max_product_spent,median_product_spent,avg_product_spent,min_activity_date,max_activity_date,min_view_date,max_view_date,min_cart_date,max_cart_date,min_purchase_date,max_purchase_date
0,33869381,7002639_kids.carriage,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23,2019-10-23,2019-10-23,NaT,NaT,NaT,NaT
1,184265397,6902133_furniture.living_room.chair,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT
2,184265397,6902303_furniture.living_room.chair,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT
3,195082191,4804056_electronics.audio.headphone,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10,2019-10-10,2019-10-10,NaT,NaT,NaT,NaT
4,200673532,6501011_computers.components.motherboard,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-12,2019-10-13,2019-10-12,2019-10-13,NaT,NaT,NaT,NaT


In [48]:
# Adding month columns
customer_profile['month'] = month
# Dropping duplicates as we only want one row per distinct user_id/product
customer_profile = customer_profile.drop_duplicates(subset=['user_id','product'])
customer_profile.head()

Unnamed: 0,user_id,product,view_count,cart_count,purchase_count,session_count,spent,min_product_spent,max_product_spent,median_product_spent,avg_product_spent,min_activity_date,max_activity_date,min_view_date,max_view_date,min_cart_date,max_cart_date,min_purchase_date,max_purchase_date,month
0,33869381,7002639_kids.carriage,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-23,2019-10-23,2019-10-23,2019-10-23,NaT,NaT,NaT,NaT,2019-Oct
1,184265397,6902133_furniture.living_room.chair,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT,2019-Oct
2,184265397,6902303_furniture.living_room.chair,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-04,2019-10-04,2019-10-04,2019-10-04,NaT,NaT,NaT,NaT,2019-Oct
3,195082191,4804056_electronics.audio.headphone,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2019-10-10,2019-10-10,2019-10-10,2019-10-10,NaT,NaT,NaT,NaT,2019-Oct
4,200673532,6501011_computers.components.motherboard,2.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,2019-10-12,2019-10-13,2019-10-12,2019-10-13,NaT,NaT,NaT,NaT,2019-Oct


In [49]:
# Saving results in S3
customer_profile.to_csv('s3://myaws-capstone-bucket/data/{0}/customer_product_profile.csv'.format(month),index=False)