In this code a simple popularity based recommendation model is built in order to provide recommendations based on most purchased categories to different customers. This is meant to have options for customers that we do not have a lot of information about or in cases where other recommendation systems fail to provide recommendations to customers.

Popularity Based recommendations will be provided to each customer based on their Segment which was predicted on the notebook:Customer_Segmentation_Clustering.ipynb

The idea takes inspiration from the research paper:
- Rodrigues, F., & Ferreira, B. (2016). Product recommendation based on shared Customer's Behaviour. Procedia Computer Science, 100, 136-146. doi:10.1016/j.procs.2016.09.133

In [11]:
# Loading basic needed libraries
import pandas as pd
import numpy as np
import gc

# Loading libraries for S3 bucket connection
import boto3
import io
from io import StringIO,BytesIO, TextIOWrapper
import gzip

client = boto3.client('s3') 
resource = boto3.resource('s3') 

In [12]:
# Reading clustered dataset to segment the customers based on their K-means cluster segments and perform Basket Market Analysis per Segment
# dataset came from notebook - Customer_Segmentation_Clustering.ipynb
cluster_df = pd.read_csv('s3://myaws-capstone-bucket/data/cluster_segments.csv')
cluster_df.nunique()

user_id    914574
cluster         6
dtype: int64

In [13]:
# Reading purchased dataframe 
purchase_df = pd.read_csv('s3://myaws-capstone-bucket/eCommerce_purchase_data.csv')
# Combining each user in the training df with their respective cluster
purchase_df = pd.merge(purchase_df,cluster_df, on=["user_id"], how='inner')
purchase_df.nunique()

user_id           914574
user_session     3641796
event_time       3828804
category_code        139
category_id          930
brand               3959
product_id         89325
category             930
cluster                6
dtype: int64

In [14]:
purchase_df.head()

Unnamed: 0,user_id,user_session,event_time,category_code,category_id,brand,product_id,category,cluster
0,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564,2019-10-01 00:02:14 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone,3
1,543272936,3591a683-59b0-41d0-94b7-fbc381401119,2019-10-01 03:42:37 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone,3
2,543272936,4ab63ddd-717a-435b-93cd-934176ecfc0e,2019-10-02 00:38:12 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone,3
3,543272936,d5de376a-9d8a-4e66-9fcd-74d00a384daa,2019-10-02 00:43:45 UTC,electronics.telephone,2053013555531219353,panasonic,11300010,2053013555531219353_electronics.telephone,3
4,543272936,5fa53005-2891-4880-a4ac-aa039b71c37c,2019-10-02 22:42:22 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone,3


In [15]:
# Creating a dfaining df for each customer cluster 
df1 = purchase_df.loc[purchase_df['cluster'] == 0]
df2 = purchase_df.loc[purchase_df['cluster'] == 1]
df3 = purchase_df.loc[purchase_df['cluster'] == 2]
df4 = purchase_df.loc[purchase_df['cluster'] == 3]
df5 = purchase_df.loc[purchase_df['cluster'] == 4]
df6 = purchase_df.loc[purchase_df['cluster'] == 5]

In [16]:
# Counting the number of times each category was purchased for this cluster group
cluster0_recs = df1.groupby(['category', 'cluster']).size().reset_index(name = 'total_purchases')
cluster0_recs.head(10)

Unnamed: 0,category,cluster,total_purchases
0,2053013551865397438_sport.trainer,0,1
1,2053013551882174655_construction.tools.welding,0,144
2,2053013551898951873_construction.tools.light,0,81
3,2053013551907340482_sport.ski,0,632
4,2053013551924117699_sport.ski,0,12
5,2053013551932506308_construction.tools.drill,0,1480
6,2053013551940894917_computers.desktop,0,6
7,2053013551966060743_kids.carriage,0,279
8,2053013552058335434_appliances.kitchen.meat_gr...,0,36
9,2053013552125444301_appliances.environment.vacuum,0,1


In [17]:
# Grabbing Top 10 most purchased categories for this cluster group
cluster0_recs = cluster0_recs.sort_values(['total_purchases'],ascending = False).head(10)
cluster0_recs

Unnamed: 0,category,cluster,total_purchases
605,2232732093077520756_construction.tools.light,0,308503
106,2053013555631882655_electronics.smartphone,0,69289
83,2053013554658804075_electronics.audio.headphone,0,33562
651,2232732099754852875_appliances.personal.massager,0,26992
545,2232732079706079299_sport.bicycle,0,26952
600,2232732092297380188_appliances.kitchen.washer,0,11043
678,2232732103101907535_electronics.clocks,0,10994
662,2232732101063475749_appliances.environment.vacuum,0,10920
595,2232732091718566220_appliances.kitchen.refrige...,0,8381
314,2053013563835941749_appliances.kitchen.refrige...,0,7749


In [21]:
# Creating purchase_rank column based on total purchases
cluster0_recs['purchase_rank'] = cluster0_recs.sort_values(['total_purchases'],ascending=False).groupby('cluster').cumcount()+1
cluster0_recs['purchase_rank'] = cluster0_recs['purchase_rank'].astype('int64')
cluster0_recs = cluster0_recs[['category', 'cluster', 'purchase_rank']]
cluster0_recs

Unnamed: 0,category,cluster,purchase_rank
605,2232732093077520756_construction.tools.light,0,1
106,2053013555631882655_electronics.smartphone,0,2
83,2053013554658804075_electronics.audio.headphone,0,3
651,2232732099754852875_appliances.personal.massager,0,4
545,2232732079706079299_sport.bicycle,0,5
600,2232732092297380188_appliances.kitchen.washer,0,6
678,2232732103101907535_electronics.clocks,0,7
662,2232732101063475749_appliances.environment.vacuum,0,8
595,2232732091718566220_appliances.kitchen.refrige...,0,9
314,2053013563835941749_appliances.kitchen.refrige...,0,10


In [23]:
# Counting the number of times each category was purchased for this cluster group
cluster1_recs = df2.groupby(['category', 'cluster']).size().reset_index(name = 'total_purchases')

cluster1_recs = cluster1_recs.sort_values(['total_purchases'],ascending = False).head(10)

cluster1_recs['purchase_rank'] = cluster1_recs.sort_values(['total_purchases'],ascending=False).groupby('cluster').cumcount()+1
cluster1_recs['purchase_rank'] = cluster1_recs['purchase_rank'].astype('int64')
cluster1_recs = cluster1_recs[['category', 'cluster', 'purchase_rank']]
cluster1_recs

Unnamed: 0,category,cluster,purchase_rank
66,2053013555631882655_electronics.smartphone,1,1
245,2232732093077520756_construction.tools.light,1,2
139,2053013563810775923_appliances.kitchen.washer,1,3
98,2053013558920217191_computers.notebook,1,4
48,2053013554658804075_electronics.audio.headphone,1,5
42,2053013554415534427_electronics.video.tv,1,6
163,2053013565983425517_appliances.environment.vacuum,1,7
142,2053013563911439225_appliances.kitchen.refrige...,1,8
126,2053013561579406073_electronics.clocks,1,9
28,2053013553341792533_electronics.clocks,1,10


In [24]:
# Counting the number of times each category was purchased for this cluster group
cluster2_recs = df3.groupby(['category', 'cluster']).size().reset_index(name = 'total_purchases')

cluster2_recs = cluster2_recs.sort_values(['total_purchases'],ascending = False).head(10)

cluster2_recs['purchase_rank'] = cluster2_recs.sort_values(['total_purchases'],ascending=False).groupby('cluster').cumcount()+1
cluster2_recs['purchase_rank'] = cluster2_recs['purchase_rank'].astype('int64')
cluster2_recs = cluster2_recs[['category', 'cluster', 'purchase_rank']]
cluster2_recs

Unnamed: 0,category,cluster,purchase_rank
103,2053013555631882655_electronics.smartphone,2,1
573,2232732093077520756_construction.tools.light,2,2
515,2232732079706079299_sport.bicycle,2,3
80,2053013554658804075_electronics.audio.headphone,2,4
651,2232732103831716449_apparel.shoes,2,5
626,2232732101063475749_appliances.environment.vacuum,2,6
347,2053013565983425517_appliances.environment.vacuum,2,7
562,2232732091391410500_appliances.kitchen.blender,2,8
604,2232732098446229999_apparel.shoes.sandals,2,9
634,2232732102103663163_furniture.bedroom.blanket,2,10


In [25]:
# Counting the number of times each category was purchased for this cluster group
cluster3_recs = df4.groupby(['category', 'cluster']).size().reset_index(name = 'total_purchases')

cluster3_recs = cluster3_recs.sort_values(['total_purchases'],ascending = False).head(10)

cluster3_recs['purchase_rank'] = cluster3_recs.sort_values(['total_purchases'],ascending=False).groupby('cluster').cumcount()+1
cluster3_recs['purchase_rank'] = cluster3_recs['purchase_rank'].astype('int64')
cluster3_recs = cluster3_recs[['category', 'cluster', 'purchase_rank']]
cluster3_recs

Unnamed: 0,category,cluster,purchase_rank
645,2232732093077520756_construction.tools.light,3,1
107,2053013555631882655_electronics.smartphone,3,2
583,2232732079706079299_sport.bicycle,3,3
84,2053013554658804075_electronics.audio.headphone,3,4
691,2232732099754852875_appliances.personal.massager,3,5
718,2232732103101907535_electronics.clocks,3,6
640,2232732092297380188_appliances.kitchen.washer,3,7
76,2053013554415534427_electronics.video.tv,3,8
701,2232732101063475749_appliances.environment.vacuum,3,9
86,2053013554725912943_appliances.kitchen.coffee_...,3,10


In [26]:
# Counting the number of times each category was purchased for this cluster group
cluster4_recs = df5.groupby(['category', 'cluster']).size().reset_index(name = 'total_purchases')

cluster4_recs = cluster4_recs.sort_values(['total_purchases'],ascending = False).head(10)

cluster4_recs['purchase_rank'] = cluster4_recs.sort_values(['total_purchases'],ascending=False).groupby('cluster').cumcount()+1
cluster4_recs['purchase_rank'] = cluster4_recs['purchase_rank'].astype('int64')
cluster4_recs = cluster4_recs[['category', 'cluster', 'purchase_rank']]
cluster4_recs

Unnamed: 0,category,cluster,purchase_rank
604,2232732093077520756_construction.tools.light,4,1
104,2053013555631882655_electronics.smartphone,4,2
542,2232732079706079299_sport.bicycle,4,3
81,2053013554658804075_electronics.audio.headphone,4,4
658,2232732101063475749_appliances.environment.vacuum,4,5
647,2232732099754852875_appliances.personal.massager,4,6
73,2053013554415534427_electronics.video.tv,4,7
599,2232732092297380188_appliances.kitchen.washer,4,8
313,2053013563810775923_appliances.kitchen.washer,4,9
674,2232732103101907535_electronics.clocks,4,10


In [27]:
# Counting the number of times each category was purchased for this cluster group
cluster5_recs = df6.groupby(['category', 'cluster']).size().reset_index(name = 'total_purchases')

cluster5_recs = cluster5_recs.sort_values(['total_purchases'],ascending = False).head(10)

cluster5_recs['purchase_rank'] = cluster5_recs.sort_values(['total_purchases'],ascending=False).groupby('cluster').cumcount()+1
cluster5_recs['purchase_rank'] = cluster5_recs['purchase_rank'].astype('int64')
cluster5_recs = cluster5_recs[['category', 'cluster', 'purchase_rank']]
cluster5_recs

Unnamed: 0,category,cluster,purchase_rank
604,2232732093077520756_construction.tools.light,5,1
544,2232732079706079299_sport.bicycle,5,2
659,2232732101063475749_appliances.environment.vacuum,5,3
105,2053013555631882655_electronics.smartphone,5,4
684,2232732103831716449_apparel.shoes,5,5
585,2232732089587859740_appliances.personal.hair_c...,5,6
593,2232732091391410500_appliances.kitchen.blender,5,7
575,2232732086928670945_electronics.camera.photo,5,8
82,2053013554658804075_electronics.audio.headphone,5,9
636,2232732098446229999_apparel.shoes.sandals,5,10


In [28]:
# Stacking cluster recs together with concat function
data_frames = [cluster0_recs, cluster1_recs, cluster2_recs, cluster3_recs, cluster4_recs, cluster5_recs]
top_recs_df = pd.concat(data_frames)
top_recs_df.nunique()

category         24
cluster           6
purchase_rank    10
dtype: int64

In [29]:
top_recs_df.head()

Unnamed: 0,category,cluster,purchase_rank
605,2232732093077520756_construction.tools.light,0,1
106,2053013555631882655_electronics.smartphone,0,2
83,2053013554658804075_electronics.audio.headphone,0,3
651,2232732099754852875_appliances.personal.massager,0,4
545,2232732079706079299_sport.bicycle,0,5


In [30]:
# Reading the test users and mapping each user to their cluster
test_df = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_test.csv')
test_df = pd.merge(test_df,cluster_df, on=["user_id"], how='inner')
test_df.nunique()

user_id            548860
category              916
category_id           916
implicit_rating         5
catID                 916
cluster                 6
dtype: int64

In [32]:
# Mapping each user to their top 10 popular recs based on their cluster
popular_recs_df = pd.merge(test_df[['user_id','cluster']],top_recs_df, on=["cluster"], how='inner')
popular_recs_df.nunique()

user_id          548860
cluster               6
category             24
purchase_rank        10
dtype: int64

In [35]:
popular_recs_df = popular_recs_df.drop_duplicates(subset=['user_id', 'category', 'purchase_rank'])
popular_recs_df.head(50)

Unnamed: 0,user_id,cluster,category,purchase_rank
0,512823699,2,2053013555631882655_electronics.smartphone,1
1,512823699,2,2232732093077520756_construction.tools.light,2
2,512823699,2,2232732079706079299_sport.bicycle,3
3,512823699,2,2053013554658804075_electronics.audio.headphone,4
4,512823699,2,2232732103831716449_apparel.shoes,5
5,512823699,2,2232732101063475749_appliances.environment.vacuum,6
6,512823699,2,2053013565983425517_appliances.environment.vacuum,7
7,512823699,2,2232732091391410500_appliances.kitchen.blender,8
8,512823699,2,2232732098446229999_apparel.shoes.sandals,9
9,512823699,2,2232732102103663163_furniture.bedroom.blanket,10


In [36]:
popular_recs_df.tail(50)

Unnamed: 0,user_id,cluster,category,purchase_rank
7860760,533156827,1,2053013555631882655_electronics.smartphone,1
7860761,533156827,1,2232732093077520756_construction.tools.light,2
7860762,533156827,1,2053013563810775923_appliances.kitchen.washer,3
7860763,533156827,1,2053013558920217191_computers.notebook,4
7860764,533156827,1,2053013554658804075_electronics.audio.headphone,5
7860765,533156827,1,2053013554415534427_electronics.video.tv,6
7860766,533156827,1,2053013565983425517_appliances.environment.vacuum,7
7860767,533156827,1,2053013563911439225_appliances.kitchen.refrige...,8
7860768,533156827,1,2053013561579406073_electronics.clocks,9
7860769,533156827,1,2053013553341792533_electronics.clocks,10


In [38]:
# Saving results of recommendations in S3
popular_recs_df.to_csv('s3://myaws-capstone-bucket/data/modeling/output/popular_recs.csv',index=False)