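This notebook loads seven months of event data (October 2019 through April 2020), keeps only the purchase events that have both a category code and a brand, and extracts the product information (category code, category ID, brand, product ID, and price) for every such product purchased during that period.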

In [1]:
# Loading basic needed libraries
import pandas as pd
import gc

# Loading libraries for S3 bucket connection
import boto3
import io
from io import StringIO,BytesIO, TextIOWrapper
import gzip

# Low-level client and high-level resource handles for the S3 service.
# NOTE(review): the visible cells read and write S3 through pandas' own `s3://` paths,
# not through these handles - confirm whether they are still needed.
client = boto3.client(service_name='s3')
resource = boto3.resource(service_name='s3')

#### Loading and preparing data

In [2]:
# Load the October 2019 event log from S3, restricted to the columns needed for product information
df_oct = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2019-Oct.csv',
        usecols=['event_type', 'category_code', 'category_id', 'brand', 'product_id', 'price'],
    )
    # Records without a category_code or brand are discarded so that recommendations stay meaningful
    .dropna(subset=['category_code', 'brand'])
    # Of the remaining records, only purchase events are of interest
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column for this month's purchases
df_oct.nunique()

event_type           1
product_id       16737
category_id        215
category_code      120
brand             1052
price            18390
dtype: int64

In [3]:
# Load the November 2019 event log from S3, restricted to the columns needed for product information
df_nov = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2019-Nov.csv',
        usecols=['event_type', 'category_code', 'category_id', 'brand', 'product_id', 'price'],
    )
    # Records without a category_code or brand are discarded so that recommendations stay meaningful
    .dropna(subset=['category_code', 'brand'])
    # Of the remaining records, only purchase events are of interest
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column for this month's purchases
df_nov.nunique()

event_type           1
product_id       20772
category_id        240
category_code      126
brand             1236
price            17335
dtype: int64

In [4]:
# Concatenating dfs together step by step to avoid the memory overload of loading and concatenating all 7 at a time.
# The list is built inline instead of being bound to `dfs`: a named list would keep df_oct and
# df_nov referenced after this cell, so deleting them and calling gc.collect() afterwards
# could not actually release their memory.
full_data = pd.concat([df_oct, df_nov])

In [5]:
# The October and November frames now live inside full_data, so drop their names
# and run a full garbage-collection pass to reclaim memory
del df_oct, df_nov
gc.collect(generation=2)

118

In [6]:
# Load the December 2019 event log from S3, restricted to the columns needed for product information
df_dec = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2019-Dec.csv',
        usecols=['event_type', 'category_code', 'category_id', 'brand', 'product_id', 'price'],
    )
    # Records without a category_code or brand are discarded so that recommendations stay meaningful
    .dropna(subset=['category_code', 'brand'])
    # Of the remaining records, only purchase events are of interest
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column for this month's purchases
df_dec.nunique()

event_type           1
product_id       37504
category_id        786
category_code      134
brand             2643
price            21880
dtype: int64

In [7]:
# Concatenating dfs together step by step to avoid the memory overload of loading and concatenating all 7 at a time.
# The list is built inline instead of being bound to `dfs`: a named list would keep the previous
# full_data and df_dec referenced after this cell, so deleting df_dec and calling gc.collect()
# afterwards could not actually release their memory.
full_data = pd.concat([full_data, df_dec])

In [8]:
# The December frame now lives inside full_data, so drop its name
# and run a full garbage-collection pass to reclaim memory
del df_dec
gc.collect(generation=2)

479

In [9]:
# Load the January 2020 event log from S3, restricted to the columns needed for product information
df_jan = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2020-Jan.csv',
        usecols=['event_type', 'category_code', 'category_id', 'brand', 'product_id', 'price'],
    )
    # Records without a category_code or brand are discarded so that recommendations stay meaningful
    .dropna(subset=['category_code', 'brand'])
    # Of the remaining records, only purchase events are of interest
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column for this month's purchases
df_jan.nunique()

event_type           1
product_id       31523
category_id        805
category_code      134
brand             2554
price            29890
dtype: int64

In [10]:
# Concatenating dfs together step by step to avoid the memory overload of loading and concatenating all 7 at a time.
# The list is built inline instead of being bound to `dfs`: a named list would keep the previous
# full_data and df_jan referenced after this cell, so deleting df_jan and calling gc.collect()
# afterwards could not actually release their memory.
full_data = pd.concat([full_data, df_jan])

In [11]:
# The January frame now lives inside full_data, so drop its name
# and run a full garbage-collection pass to reclaim memory
del df_jan
gc.collect(generation=2)

474

In [12]:
# Load the February 2020 event log from S3, restricted to the columns needed for product information
df_feb = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2020-Feb.csv',
        usecols=['event_type', 'category_code', 'category_id', 'brand', 'product_id', 'price'],
    )
    # Records without a category_code or brand are discarded so that recommendations stay meaningful
    .dropna(subset=['category_code', 'brand'])
    # Of the remaining records, only purchase events are of interest
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column for this month's purchases
df_feb.nunique()

event_type           1
product_id       33207
category_id        740
category_code      135
brand             2230
price            22874
dtype: int64

In [13]:
# Concatenating dfs together step by step to avoid the memory overload of loading and concatenating all 7 at a time.
# The list is built inline instead of being bound to `dfs`: a named list would keep the previous
# full_data and df_feb referenced after this cell, so deleting df_feb and calling gc.collect()
# afterwards could not actually release their memory.
full_data = pd.concat([full_data, df_feb])

In [14]:
# The February frame now lives inside full_data, so drop its name
# and run a full garbage-collection pass to reclaim memory
del df_feb
gc.collect(generation=2)

439

In [15]:
# Load the March 2020 event log from S3, restricted to the columns needed for product information
df_march = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2020-Mar.csv',
        usecols=['event_type', 'category_code', 'category_id', 'brand', 'product_id', 'price'],
    )
    # Records without a category_code or brand are discarded so that recommendations stay meaningful
    .dropna(subset=['category_code', 'brand'])
    # Of the remaining records, only purchase events are of interest
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column for this month's purchases
df_march.nunique()

event_type           1
product_id       37166
category_id        725
category_code      133
brand             2344
price            21352
dtype: int64

In [16]:
# Concatenating dfs together step by step to avoid the memory overload of loading and concatenating all 7 at a time.
# The list is built inline instead of being bound to `dfs`: a named list would keep the previous
# full_data and df_march referenced after this cell, so deleting df_march and calling gc.collect()
# afterwards could not actually release their memory.
full_data = pd.concat([full_data, df_march])

In [17]:
# The March frame now lives inside full_data, so drop its name
# and run a full garbage-collection pass to reclaim memory
del df_march
gc.collect(generation=2)

29

In [18]:
# Load the April 2020 event log from S3, restricted to the columns needed for product information
df_apr = (
    pd.read_csv(
        's3://predictive-maintenance-bucket/data/2020-Apr.csv',
        usecols=['event_type', 'category_code', 'category_id', 'brand', 'product_id', 'price'],
    )
    # Records without a category_code or brand are discarded so that recommendations stay meaningful
    .dropna(subset=['category_code', 'brand'])
    # Of the remaining records, only purchase events are of interest
    .loc[lambda events: events['event_type'] == 'purchase']
)
# Distinct-value count per column for this month's purchases
df_apr.nunique()

event_type           1
product_id       39294
category_id        749
category_code      135
brand             2313
price            21481
dtype: int64

In [19]:
# Concatenating dfs together step by step to avoid the memory overload of loading and concatenating all 7 at a time.
# The list is built inline instead of being bound to `dfs`: a named list would keep the previous
# full_data and df_apr referenced after this cell, so deleting df_apr and calling gc.collect()
# afterwards could not actually release their memory.
full_data = pd.concat([full_data, df_apr])

In [20]:
# The April frame now lives inside full_data, so drop its name
# and run a full garbage-collection pass to reclaim memory
del df_apr
gc.collect(generation=2)

359

In [21]:
# Keep only the product-descriptive columns (event_type is no longer needed), then collapse
# the purchases to one row per distinct combination of those columns (first occurrence kept)
full_data = full_data.loc[:, ['category_code','category_id','brand', 'product_id', 'price']].drop_duplicates()
# Preview the first few product rows
full_data.head()

Unnamed: 0,category_code,category_id,brand,product_id,price
162,electronics.smartphone,2053013555631882655,samsung,1004856,130.76
308,electronics.smartphone,2053013555631882655,apple,1002532,642.69
442,furniture.bathroom.toilet,2053013557418656265,santeri,13800054,54.42
574,electronics.audio.headphone,2053013554658804075,apple,4804055,189.91
603,electronics.audio.headphone,2053013554658804075,apple,4804056,161.98


In [22]:
# Distinct-value count for each column of the deduplicated product table
full_data.nunique(axis=0)

category_code      139
category_id        932
brand             4081
product_id       96037
price            67748
dtype: int64

In [23]:
# Persist the product table to S3 as CSV; index=False leaves out the row labels
# carried over from the monthly frames
full_data.to_csv(path_or_buf='s3://myaws-capstone-bucket/eCommerce_product_data.csv', index=False)