This notebook loads seven months of data (October 2019 to April 2020), keeping only the purchase events for each user along with their event_time, category_code and brand. At the end we will have one full dataframe containing each user's purchases together with their dates.

The resulting dataframe will be used for Market Basket Analysis, as well as for other product recommendation models based on purchase sequences, such as RNN and CNN recommendation models and a Neural Collaborative Filtering model.

In [1]:
# Loading basic needed libraries
import pandas as pd
import gc

# Loading libraries for S3 bucket connection
import boto3
import io
from io import StringIO,BytesIO, TextIOWrapper
import gzip

# Low-level S3 client and high-level S3 resource handles.
# NOTE(review): neither handle is referenced in the cells visible here (the CSVs are
# read and written through pandas with s3:// URLs) — confirm whether they are still needed.
client = boto3.client('s3') 
resource = boto3.resource('s3') 

#### Loading and preparing data

In [2]:
# October 2019: read the monthly CSV from S3 (only the columns listed in usecols),
# then narrow it down to usable purchase rows in a single chain.
df_oct = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2019-Oct.csv',
        usecols=[
            'user_id', 'user_session', 'event_time', 'category_code',
            'category_id', 'brand', 'product_id', 'event_type',
        ],
    )
    # Rows without a category_code or brand are discarded so that
    # recommendations are built on meaningful product descriptors.
    .dropna(subset=['category_code', 'brand'])
    # Only purchase events are kept.
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column, as a quick sanity check of the month.
df_oct.nunique()

event_time       477551
event_type            1
product_id        16737
category_id         215
category_code       120
brand              1052
user_id          263445
user_session     466799
dtype: int64

In [3]:
# November 2019: read the monthly CSV from S3 (only the columns listed in usecols),
# then narrow it down to usable purchase rows in a single chain.
df_nov = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2019-Nov.csv',
        usecols=[
            'user_id', 'user_session', 'event_time', 'category_code',
            'category_id', 'brand', 'product_id', 'event_type',
        ],
    )
    # Rows without a category_code or brand are discarded so that
    # recommendations are built on meaningful product descriptors.
    .dropna(subset=['category_code', 'brand'])
    # Only purchase events are kept.
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column, as a quick sanity check of the month.
df_nov.nunique()

event_time       483369
event_type            1
product_id        20772
category_id         240
category_code       126
brand              1236
user_id          330394
user_session     558494
dtype: int64

In [4]:
# Concatenating the monthly frames step by step to avoid the memory overload of
# loading and concatenating all 7 months at once.
# The list is passed inline on purpose: binding it to a name would keep df_oct and
# df_nov referenced, so deleting them afterwards could not actually free their memory.
full_data = pd.concat([df_oct, df_nov])

In [5]:
# The October and November frames are no longer needed on their own:
# remove both names, then ask the garbage collector to run a pass.
del df_oct, df_nov
gc.collect()

98

In [6]:
# December 2019: read the monthly CSV from S3 (only the columns listed in usecols),
# then narrow it down to usable purchase rows in a single chain.
df_dec = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2019-Dec.csv',
        usecols=[
            'user_id', 'user_session', 'event_time', 'category_code',
            'category_id', 'brand', 'product_id', 'event_type',
        ],
    )
    # Rows without a category_code or brand are discarded so that
    # recommendations are built on meaningful product descriptors.
    .dropna(subset=['category_code', 'brand'])
    # Only purchase events are kept.
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column, as a quick sanity check of the month.
df_dec.nunique()

event_time       779313
event_type            1
product_id        37504
category_id         786
category_code       134
brand              2643
user_id          444825
user_session     844153
dtype: int64

In [7]:
# Concatenating the monthly frames step by step to avoid the memory overload of
# loading and concatenating all 7 months at once.
# The list is passed inline on purpose: binding it to a name would keep df_dec and
# the previous full_data referenced, so they could not be freed afterwards.
full_data = pd.concat([full_data, df_dec])

In [8]:
# The December frame is no longer needed on its own: remove the name,
# then ask the garbage collector to run a pass (its return value is the cell output).
del df_dec
gc.collect()

464

In [9]:
# January 2020: read the monthly CSV from S3 (only the columns listed in usecols),
# then narrow it down to usable purchase rows in a single chain.
df_jan = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2020-Jan.csv',
        usecols=[
            'user_id', 'user_session', 'event_time', 'category_code',
            'category_id', 'brand', 'product_id', 'event_type',
        ],
    )
    # Rows without a category_code or brand are discarded so that
    # recommendations are built on meaningful product descriptors.
    .dropna(subset=['category_code', 'brand'])
    # Only purchase events are kept.
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column, as a quick sanity check of the month.
df_jan.nunique()

event_time       611861
event_type            1
product_id        31523
category_id         805
category_code       134
brand              2554
user_id          323918
user_session     616833
dtype: int64

In [10]:
# Concatenating the monthly frames step by step to avoid the memory overload of
# loading and concatenating all 7 months at once.
# The list is passed inline on purpose: binding it to a name would keep df_jan and
# the previous full_data referenced, so they could not be freed afterwards.
full_data = pd.concat([full_data, df_jan])

In [11]:
# The January frame is no longer needed on its own: remove the name,
# then ask the garbage collector to run a pass (its return value is the cell output).
del df_jan
gc.collect()

224

In [12]:
# February 2020: read the monthly CSV from S3 (only the columns listed in usecols),
# then narrow it down to usable purchase rows in a single chain.
df_feb = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2020-Feb.csv',
        usecols=[
            'user_id', 'user_session', 'event_time', 'category_code',
            'category_id', 'brand', 'product_id', 'event_type',
        ],
    )
    # Rows without a category_code or brand are discarded so that
    # recommendations are built on meaningful product descriptors.
    .dropna(subset=['category_code', 'brand'])
    # Only purchase events are kept.
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column, as a quick sanity check of the month.
df_feb.nunique()

event_time       710536
event_type            1
product_id        33207
category_id         740
category_code       135
brand              2230
user_id          349081
user_session     662989
dtype: int64

In [13]:
# Concatenating the monthly frames step by step to avoid the memory overload of
# loading and concatenating all 7 months at once.
# The list is passed inline on purpose: binding it to a name would keep df_feb and
# the previous full_data referenced, so they could not be freed afterwards.
full_data = pd.concat([full_data, df_feb])

In [14]:
# The February frame is no longer needed on its own: remove the name,
# then ask the garbage collector to run a pass (its return value is the cell output).
del df_feb
gc.collect()

179

In [15]:
# March 2020: read the monthly CSV from S3 (only the columns listed in usecols),
# then narrow it down to usable purchase rows in a single chain.
df_march = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2020-Mar.csv',
        usecols=[
            'user_id', 'user_session', 'event_time', 'category_code',
            'category_id', 'brand', 'product_id', 'event_type',
        ],
    )
    # Rows without a category_code or brand are discarded so that
    # recommendations are built on meaningful product descriptors.
    .dropna(subset=['category_code', 'brand'])
    # Only purchase events are kept.
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column, as a quick sanity check of the month.
df_march.nunique()

event_time       697345
event_type            1
product_id        37166
category_id         725
category_code       133
brand              2344
user_id          393040
user_session     723857
dtype: int64

In [16]:
# Concatenating the monthly frames step by step to avoid the memory overload of
# loading and concatenating all 7 months at once.
# The list is passed inline on purpose: binding it to a name would keep df_march and
# the previous full_data referenced, so they could not be freed afterwards.
full_data = pd.concat([full_data, df_march])

In [17]:
# The March frame is no longer needed on its own: remove the name,
# then ask the garbage collector to run a pass (its return value is the cell output).
del df_march
gc.collect()

304

In [18]:
# April 2020: read the monthly CSV from S3 (only the columns listed in usecols),
# then narrow it down to usable purchase rows in a single chain.
df_apr = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2020-Apr.csv',
        usecols=[
            'user_id', 'user_session', 'event_time', 'category_code',
            'category_id', 'brand', 'product_id', 'event_type',
        ],
    )
    # Rows without a category_code or brand are discarded so that
    # recommendations are built on meaningful product descriptors.
    .dropna(subset=['category_code', 'brand'])
    # Only purchase events are kept.
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column, as a quick sanity check of the month.
df_apr.nunique()

event_time       655585
event_type            1
product_id        39294
category_id         749
category_code       135
brand              2313
user_id          425603
user_session     673460
dtype: int64

In [19]:
# Concatenating the monthly frames step by step to avoid the memory overload of
# loading and concatenating all 7 months at once.
# The list is passed inline on purpose: binding it to a name would keep df_apr and
# the previous full_data referenced, so they could not be freed afterwards.
full_data = pd.concat([full_data, df_apr])

In [20]:
# The April frame is no longer needed on its own: remove the name,
# then ask the garbage collector to run a pass (its return value is the cell output).
del df_apr
gc.collect()

409

In [24]:
# Keep only the columns needed downstream (event_type is dropped here) and derive
# a combined `category` label of the form "<category_id>_<category_code>".
full_data = full_data[
    ['user_id', 'user_session', 'event_time', 'category_code', 'category_id', 'brand', 'product_id']
].assign(
    category=lambda purchases: purchases['category_id'].astype(str) + '_' + purchases['category_code']
)
# Preview the first rows of the combined purchase table.
full_data.head()

Unnamed: 0,user_id,user_session,event_time,category_code,category_id,brand,product_id,category
162,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564,2019-10-01 00:02:14 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone
308,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68,2019-10-01 00:04:37 UTC,electronics.smartphone,2053013555631882655,apple,1002532,2053013555631882655_electronics.smartphone
442,555332717,1dea3ee2-2ded-42e8-8e7a-4e2ad6ae942f,2019-10-01 00:07:07 UTC,furniture.bathroom.toilet,2053013557418656265,santeri,13800054,2053013557418656265_furniture.bathroom.toilet
574,524601178,2af9b570-0942-4dcd-8f25-4d84fba82553,2019-10-01 00:09:26 UTC,electronics.audio.headphone,2053013554658804075,apple,4804055,2053013554658804075_electronics.audio.headphone
603,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68,2019-10-01 00:09:54 UTC,electronics.audio.headphone,2053013554658804075,apple,4804056,2053013554658804075_electronics.audio.headphone


In [25]:
# Distinct-value count per column across all seven months combined.
full_data.nunique()

user_id          1817173
user_session     4544395
event_time       4415560
category_code        139
category_id          932
brand               4081
product_id         96037
category             932
dtype: int64

In [26]:
# Persist the combined purchase table to S3 as CSV; the DataFrame index is not written.
full_data.to_csv(
    path_or_buf='s3://myaws-capstone-bucket/eCommerce_purchase_data.csv',
    index=False,
)