This code makes use of the MAP@K Rank Accuracy Metric to evaluate the created models 

In [1]:
! pip install ml_metrics



In [2]:
# Loading basic needed libraries
import pandas as pd
import numpy as np
import gc
import ml_metrics as metrics

# Loading libraries for S3 bucket connection
import boto3
import io
from io import StringIO,BytesIO, TextIOWrapper
import gzip

client = boto3.client('s3') 
resource = boto3.resource('s3') 

#### Reading the testing dataset

- All models will be tested for accuracy and compared with the same test dataset so that the results are comparable

In [3]:
test_df = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/input/implicit_cat_rating_test.csv')
test_df.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID
0,512823699,2232732109796016868_apparel.shirt,2232732109796016868,2,803
1,543131199,2232732135054115162_apparel.trousers,2232732135054115162,2,916
2,519321120,2053013555631882655_electronics.smartphone,2053013555631882655,2,107
3,567347471,2232732101063475749_appliances.environment.vacuum,2232732101063475749,2,725
4,566466570,2053013556311359947_construction.tools.drill,2053013556311359947,2,126


In [4]:
test_df.shape

(786081, 5)

In [5]:
test_df.nunique()

user_id            548860
category              916
category_id           916
implicit_rating         5
catID                 916
dtype: int64

In [6]:
# Creating rank columns for implicit rating
test_df['category_rank'] = test_df.sort_values(['implicit_rating'],ascending=False).groupby('user_id').cumcount()+1
test_df = test_df.sort_values(['user_id','category_rank'],ascending=True)
test_df.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID,category_rank
465153,128968633,2232732093077520756_construction.tools.light,2232732093077520756,3,668,1
87773,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,2,16,2
706020,145611266,2053013552259662037_computers.components.power...,2053013552259662037,2,13,1
119371,200985178,2053013555631882655_electronics.smartphone,2053013555631882655,1,107,1
159886,221480173,2232732093077520756_construction.tools.light,2232732093077520756,2,668,1


In [7]:
users_2_test = test_df[['user_id']]
users_2_test = users_2_test.drop_duplicates(subset=['user_id'])
users_2_test.nunique()

user_id    548860
dtype: int64

#### Testing Popularity Based Recommender

In [8]:
pop_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/popular_recs.csv')
pop_recs.head()

Unnamed: 0,user_id,cluster,category,purchase_rank
0,512823699,2,2053013555631882655_electronics.smartphone,1
1,512823699,2,2232732093077520756_construction.tools.light,2
2,512823699,2,2232732079706079299_sport.bicycle,3
3,512823699,2,2053013554658804075_electronics.audio.headphone,4
4,512823699,2,2232732103831716449_apparel.shoes,5


In [9]:
pop_recs.nunique()

user_id          548860
cluster               6
category             24
purchase_rank        10
dtype: int64

In [10]:
pop_recs.shape

(5488600, 4)

In [11]:
pop_recs = pop_recs.sort_values(['user_id','purchase_rank'],ascending=True)
pop_recs.head()

Unnamed: 0,user_id,cluster,category,purchase_rank
729090,128968633,5,2232732093077520756_construction.tools.light,1
729091,128968633,5,2232732079706079299_sport.bicycle,2
729092,128968633,5,2232732101063475749_appliances.environment.vacuum,3
729093,128968633,5,2053013555631882655_electronics.smartphone,4
729094,128968633,5,2232732103831716449_apparel.shoes,5


Making sure output is as expected for mapk function 

Using the following article as reference: https://www.kaggle.com/wendykan/map-k-demo/notebook?select=test.csv

In [13]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = pop_recs.groupby('user_id')['category'].apply(list)
actual = test_df.groupby('user_id')['category'].apply(list)

In [14]:
predicted

user_id
128968633    [2232732093077520756_construction.tools.light,...
145611266    [2053013555631882655_electronics.smartphone, 2...
200985178    [2232732093077520756_construction.tools.light,...
221480173    [2232732093077520756_construction.tools.light,...
237973968    [2232732093077520756_construction.tools.light,...
                                   ...                        
649728050    [2232732093077520756_construction.tools.light,...
649736039    [2232732093077520756_construction.tools.light,...
649740417    [2232732093077520756_construction.tools.light,...
649754834    [2232732093077520756_construction.tools.light,...
649768788    [2232732093077520756_construction.tools.light,...
Name: category, Length: 548860, dtype: object

In [15]:
actual

user_id
128968633    [2232732093077520756_construction.tools.light,...
145611266    [2053013552259662037_computers.components.powe...
200985178         [2053013555631882655_electronics.smartphone]
221480173    [2232732093077520756_construction.tools.light,...
237973968    [2232732091307524418_appliances.kitchen.refrig...
                                   ...                        
649728050       [2232732093077520756_construction.tools.light]
649736039                     [2232732105635267203_kids.swing]
649740417    [2232732089587859740_appliances.personal.hair_...
649754834       [2232732093077520756_construction.tools.light]
649768788       [2232732093077520756_construction.tools.light]
Name: category, Length: 548860, dtype: object

In [16]:
# Obtaining mapk value from popular recs vs actual test dataset comparrison 
mapk_score = metrics.mapk(actual, predicted, 10)
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5)
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.34294623
MAP@10: 0.33130742


#### Testing MF Spotlight Recommender

In [17]:
MF_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/MF_Spotlight_Param1.csv')
MF_recs.head()

Unnamed: 0,userID,Category_Rank,catID,category,category_id,user_id
0,0,0,843,2232732113948377930_sport.bicycle,2232732113948377930,128968633
1,0,7,668,2232732093077520756_construction.tools.light,2232732093077520756,128968633
2,0,6,855,2232732115005342564_apparel.shoes.keds,2232732115005342564,128968633
3,0,3,791,2232732108839715530_apparel.costume,2232732108839715530,128968633
4,0,9,862,2232732116347519880_appliances.environment.vacuum,2232732116347519880,128968633


In [18]:
MF_recs.nunique()

userID           548860
Category_Rank        10
catID               588
category            588
category_id         588
user_id          548860
dtype: int64

In [19]:
MF_recs = MF_recs.sort_values(['user_id','Category_Rank'],ascending=True)
MF_recs.head()

Unnamed: 0,userID,Category_Rank,catID,category,category_id,user_id
0,0,0,843,2232732113948377930_sport.bicycle,2232732113948377930,128968633
8,0,1,611,2232732082063278200_electronics.clocks,2232732082063278200,128968633
5,0,2,365,2053013565639492569_apparel.shoes,2053013565639492569,128968633
3,0,3,791,2232732108839715530_apparel.costume,2232732108839715530,128968633
9,0,4,602,2232732079009824823_kids.skates,2232732079009824823,128968633


In [20]:
MF_recs.shape

(5488600, 6)

In [21]:
test_df = test_df.sort_values(['user_id','category_rank'],ascending=True)
test_df.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID,category_rank
465153,128968633,2232732093077520756_construction.tools.light,2232732093077520756,3,668,1
87773,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,2,16,2
706020,145611266,2053013552259662037_computers.components.power...,2053013552259662037,2,13,1
119371,200985178,2053013555631882655_electronics.smartphone,2053013555631882655,1,107,1
159886,221480173,2232732093077520756_construction.tools.light,2232732093077520756,2,668,1


In [22]:
test_df.shape

(786081, 6)

In [23]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = MF_recs.groupby('user_id')['category'].apply(list)
actual = test_df.groupby('user_id')['category'].apply(list)

# Obtaining mapk value from MF recs vs actual test dataset comparrison
mapk_score = metrics.mapk(actual, predicted, 10) 
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5) 
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.05093121
MAP@5: 0.04654937


In [24]:
MF_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/MF_Spotlight_Param2.csv')
MF_recs = MF_recs.sort_values(['user_id','Category_Rank'],ascending=True)
MF_recs.shape

(5488600, 6)

In [25]:
MF_recs.head(15)

Unnamed: 0,userID,Category_Rank,catID,category,category_id,user_id
0,0,0,30,2053013552863641845_appliances.environment.fan,2053013552863641845,128968633
6,0,1,563,2173216765583032544_apparel.shirt,2173216765583032544,128968633
1,0,2,523,2135658543242543872_apparel.shoes,2135658543242543872,128968633
4,0,3,285,2053013562183385881_apparel.shoes,2053013562183385881,128968633
2,0,4,915,2232732134987006296_electronics.audio.headphone,2232732134987006296,128968633
5,0,5,900,2232732130591375596_sport.ski,2232732130591375596,128968633
3,0,6,513,2134904980736311929_electronics.clocks,2134904980736311929,128968633
7,0,7,1,2053013551865397438_sport.trainer,2053013551865397438,128968633
8,0,8,499,2116907524379639897_apparel.shoes,2116907524379639897,128968633
9,0,9,536,2145039101303194425_sport.tennis,2145039101303194425,128968633


In [26]:
test_df.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID,category_rank
465153,128968633,2232732093077520756_construction.tools.light,2232732093077520756,3,668,1
87773,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,2,16,2
706020,145611266,2053013552259662037_computers.components.power...,2053013552259662037,2,13,1
119371,200985178,2053013555631882655_electronics.smartphone,2053013555631882655,1,107,1
159886,221480173,2232732093077520756_construction.tools.light,2232732093077520756,2,668,1


In [27]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = MF_recs.groupby('user_id')['category'].apply(list)
actual = test_df.groupby('user_id')['category'].apply(list)

# Obtaining mapk value from MF recs vs actual test dataset comparrison
mapk_score = metrics.mapk(actual, predicted, 10) 
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5) 
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.00026392
MAP@5: 0.00018179


In [28]:
MF_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/MF_Spotlight_Param3.csv')
MF_recs = MF_recs.sort_values(['user_id','Category_Rank'],ascending=True)
MF_recs.shape

(5488600, 6)

In [29]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = MF_recs.groupby('user_id')['category'].apply(list)
actual = test_df.groupby('user_id')['category'].apply(list)

# Obtaining mapk value from MF recs vs actual test dataset comparrison
mapk_score = metrics.mapk(actual, predicted, 10) 
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5) 
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.34039973
MAP@5: 0.32977143


#### Testing LSTM - RNN Model

In [32]:
LSTM_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/LSTM_param1.csv')
LSTM_recs.head()

Unnamed: 0,userID,Category_Rank,catID,category,category_id,user_id
0,1,0,669,2232732093077520756_construction.tools.light,2232732093077520756,128968633
1,1,6,715,2232732099754852875_appliances.personal.massager,2232732099754852875,128968633
2,1,9,72,2053013554155487563_computers.components.mothe...,2053013554155487563,128968633
3,1,2,639,2232732086928670945_electronics.camera.photo,2232732086928670945,128968633
4,1,1,789,2232732108613223108_sport.trainer,2232732108613223108,128968633


In [33]:
LSTM_recs.nunique()

userID           535748
Category_Rank        10
catID               578
category            578
category_id         578
user_id          535748
dtype: int64

In [34]:
LSTM_recs = LSTM_recs.sort_values(['user_id','Category_Rank'],ascending=True)
LSTM_recs.head()

Unnamed: 0,userID,Category_Rank,catID,category,category_id,user_id
0,1,0,669,2232732093077520756_construction.tools.light,2232732093077520756,128968633
4,1,1,789,2232732108613223108_sport.trainer,2232732108613223108,128968633
3,1,2,639,2232732086928670945_electronics.camera.photo,2232732086928670945,128968633
9,1,3,155,2053013557452210699_electronics.clocks,2053013557452210699,128968633
6,1,4,726,2232732101063475749_appliances.environment.vacuum,2232732101063475749,128968633


In [35]:
LSTM_recs.shape

(5357480, 6)

In [36]:
# Grabbing ids to use for test_df to verify performance of LSTM model
lstm_users = LSTM_recs[['user_id']]
lstm_users.drop_duplicates(subset=['user_id'])
lstm_users.nunique()

user_id    535748
dtype: int64

In [37]:
actual = pd.merge(test_df,lstm_users, on=["user_id"], how='inner')
actual = actual.drop_duplicates(subset=['user_id', 'category', 'category_id', 'implicit_rating', 'catID'])
actual = actual.sort_values(['user_id','category_rank'],ascending=True)
actual.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID,category_rank
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,3,668,1
10,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,2,16,2
20,145611266,2053013552259662037_computers.components.power...,2053013552259662037,2,13,1
30,221480173,2232732093077520756_construction.tools.light,2232732093077520756,2,668,1
40,221480173,2053013553140465927_kids.toys,2053013553140465927,1,39,2


In [38]:
actual.nunique()

user_id            535748
category              915
category_id           915
implicit_rating         5
catID                 915
category_rank           6
dtype: int64

In [39]:
actual.shape

(768462, 6)

In [40]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = LSTM_recs.groupby('user_id')['category'].apply(list)
actual = actual.groupby('user_id')['category'].apply(list)

# Obtaining mapk value from LSTM recs vs actual test dataset comparrison 
mapk_score = metrics.mapk(actual, predicted, 10)
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5)
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.38102643
MAP@5: 0.36714771


In [41]:
LSTM_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/LSTM_param2.csv')
LSTM_recs = LSTM_recs.sort_values(['user_id','Category_Rank'],ascending=True)
LSTM_recs.shape

(5357479, 6)

In [42]:
# Grabbing ids to use for test_df to verify performance of LSTM model
lstm_users = LSTM_recs[['user_id']]
lstm_users.drop_duplicates(subset=['user_id'])

actual = pd.merge(test_df,lstm_users, on=["user_id"], how='inner')
actual = actual.drop_duplicates(subset=['user_id', 'category', 'category_id', 'implicit_rating', 'catID'])
actual = actual.sort_values(['user_id','category_rank'],ascending=True)
actual.shape

(768462, 6)

In [43]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = LSTM_recs.groupby('user_id')['category'].apply(list)
actual = actual.groupby('user_id')['category'].apply(list)

# Obtaining mapk value from LSTM recs vs actual test dataset comparrison 
mapk_score = metrics.mapk(actual, predicted, 10)
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5)
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.36536315
MAP@5: 0.35434608


In [44]:
LSTM_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/LSTM_param3.csv')
LSTM_recs = LSTM_recs.sort_values(['user_id','Category_Rank'],ascending=True)
LSTM_recs.shape

(5357480, 6)

In [45]:
# Grabbing ids to use for test_df to verify performance of LSTM model
lstm_users = LSTM_recs[['user_id']]
lstm_users.drop_duplicates(subset=['user_id'])

actual = pd.merge(test_df,lstm_users, on=["user_id"], how='inner')
actual = actual.drop_duplicates(subset=['user_id', 'category', 'category_id', 'implicit_rating', 'catID'])
actual = actual.sort_values(['user_id','category_rank'],ascending=True)
actual.shape

(768462, 6)

In [46]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = LSTM_recs.groupby('user_id')['category'].apply(list)
actual = actual.groupby('user_id')['category'].apply(list)

# Obtaining mapk value from LSTM recs vs actual test dataset comparrison 
mapk_score = metrics.mapk(actual, predicted, 10)
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5)
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.41688882
MAP@5: 0.40584938


#### Testing CNN Model

In [47]:
CNN_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/CNN_param1.csv')
CNN_recs.head()

Unnamed: 0,userID,Category_Rank,catID,category,category_id,user_id
0,1,0,789,2232732108613223108_sport.trainer,2232732108613223108,128968633
1,1,1,669,2232732093077520756_construction.tools.light,2232732093077520756,128968633
2,1,2,726,2232732101063475749_appliances.environment.vacuum,2232732101063475749,128968633
3,1,5,715,2232732099754852875_appliances.personal.massager,2232732099754852875,128968633
4,1,3,606,2232732079706079299_sport.bicycle,2232732079706079299,128968633


In [48]:
CNN_recs.nunique()

userID           535748
Category_Rank        10
catID               851
category            851
category_id         851
user_id          535748
dtype: int64

In [49]:
CNN_recs = CNN_recs.sort_values(['user_id','Category_Rank'],ascending=True)
CNN_recs.head()

Unnamed: 0,userID,Category_Rank,catID,category,category_id,user_id
0,1,0,789,2232732108613223108_sport.trainer,2232732108613223108,128968633
1,1,1,669,2232732093077520756_construction.tools.light,2232732093077520756,128968633
2,1,2,726,2232732101063475749_appliances.environment.vacuum,2232732101063475749,128968633
4,1,3,606,2232732079706079299_sport.bicycle,2232732079706079299,128968633
9,1,4,155,2053013557452210699_electronics.clocks,2053013557452210699,128968633


In [50]:
CNN_recs.shape

(5357480, 6)

In [51]:
# Grabbing ids to use for test_df to verify performance of cnn model
cnn_users = CNN_recs[['user_id']]
cnn_users.drop_duplicates(subset=['user_id'])
cnn_users.nunique()

user_id    535748
dtype: int64

In [52]:
actual = pd.merge(test_df,cnn_users, on=["user_id"], how='inner')
actual = actual.drop_duplicates(subset=['user_id', 'category', 'category_id', 'implicit_rating', 'catID'])
actual = actual.sort_values(['user_id','category_rank'],ascending=True)
actual.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID,category_rank
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,3,668,1
10,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,2,16,2
20,145611266,2053013552259662037_computers.components.power...,2053013552259662037,2,13,1
30,221480173,2232732093077520756_construction.tools.light,2232732093077520756,2,668,1
40,221480173,2053013553140465927_kids.toys,2053013553140465927,1,39,2


In [53]:
actual.nunique()

user_id            535748
category              915
category_id           915
implicit_rating         5
catID                 915
category_rank           6
dtype: int64

In [54]:
actual.shape

(768462, 6)

In [55]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = CNN_recs.groupby('user_id')['category'].apply(list)
actual = actual.groupby('user_id')['category'].apply(list)

# Obtaining mapk value from CNN recs vs actual test dataset comparrison 
mapk_score = metrics.mapk(actual, predicted, 10) 
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5) 
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.47194730
MAP@5: 0.46082251


In [56]:
CNN_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/CNN_param2.csv')
CNN_recs = CNN_recs.sort_values(['user_id','Category_Rank'],ascending=True)
CNN_recs.shape

(5357480, 6)

In [57]:
# Grabbing ids to use for test_df to verify performance of cnn model
cnn_users = CNN_recs[['user_id']]
cnn_users.drop_duplicates(subset=['user_id'])

actual = pd.merge(test_df,cnn_users, on=["user_id"], how='inner')
actual = actual.drop_duplicates(subset=['user_id', 'category', 'category_id', 'implicit_rating', 'catID'])
actual = actual.sort_values(['user_id','category_rank'],ascending=True)
actual.shape

(768462, 6)

In [58]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = CNN_recs.groupby('user_id')['category'].apply(list)
actual = actual.groupby('user_id')['category'].apply(list)

# Obtaining mapk value from CNN recs vs actual test dataset comparrison 
mapk_score = metrics.mapk(actual, predicted, 10) 
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5) 
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.45011793
MAP@5: 0.43937514


In [59]:
CNN_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/CNN_param3.csv')
CNN_recs = CNN_recs.sort_values(['user_id','Category_Rank'],ascending=True)
CNN_recs.shape

(5357480, 6)

In [60]:
# Grabbing ids to use for test_df to verify performance of cnn model
cnn_users = CNN_recs[['user_id']]
cnn_users.drop_duplicates(subset=['user_id'])

actual = pd.merge(test_df,cnn_users, on=["user_id"], how='inner')
actual = actual.drop_duplicates(subset=['user_id', 'category', 'category_id', 'implicit_rating', 'catID'])
actual = actual.sort_values(['user_id','category_rank'],ascending=True)
actual.shape

(768462, 6)

In [61]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = CNN_recs.groupby('user_id')['category'].apply(list)
actual = actual.groupby('user_id')['category'].apply(list)

# Obtaining mapk value from CNN recs vs actual test dataset comparrison 
mapk_score = metrics.mapk(actual, predicted, 10) 
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5) 
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.46774429
MAP@5: 0.45731451


#### Testing RankFM model

In [62]:
RankFM_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/output/rankFM_T_category.csv')
RankFM_recs.columns = ['user_id','predicted_Rank','catID']
RankFM_recs.head()

Unnamed: 0,user_id,predicted_Rank,catID
0,128968633,0,668
1,145611266,0,668
2,200985178,0,107
3,221480173,0,107
4,237973968,0,668


In [63]:
RankFM_recs = RankFM_recs.sort_values(['user_id','predicted_Rank'],ascending=True)
RankFM_recs.head()

Unnamed: 0,user_id,predicted_Rank,catID
0,128968633,0,668
473469,128968633,1,107
946938,128968633,2,605
1420407,128968633,3,714
1893876,128968633,4,84


In [64]:
RankFM_recs.nunique()

user_id           473469
predicted_Rank        10
catID                474
dtype: int64

In [65]:
RankFM_recs.shape

(4734690, 3)

In [66]:
# Grabbing ids to use for test_df to verify performance of RankFM model
RankFM_users = RankFM_recs[['user_id']]
RankFM_users.drop_duplicates(subset=['user_id'])
RankFM_users.nunique()

user_id    473469
dtype: int64

In [67]:
actual = pd.merge(test_df,RankFM_users, on=["user_id"], how='inner')
actual = actual.drop_duplicates(subset=['user_id', 'category', 'category_id', 'implicit_rating', 'catID'])
actual = actual.sort_values(['user_id','category_rank'],ascending=True)
actual.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID,category_rank
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,3,668,1
10,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,2,16,2
20,145611266,2053013552259662037_computers.components.power...,2053013552259662037,2,13,1
30,200985178,2053013555631882655_electronics.smartphone,2053013555631882655,1,107,1
40,221480173,2232732093077520756_construction.tools.light,2232732093077520756,2,668,1


In [68]:
actual.nunique()

user_id            473469
category              916
category_id           916
implicit_rating         5
catID                 916
category_rank           6
dtype: int64

In [69]:
actual.shape

(672115, 6)

In [70]:
RankFM_recs.head()

Unnamed: 0,user_id,predicted_Rank,catID
0,128968633,0,668
473469,128968633,1,107
946938,128968633,2,605
1420407,128968633,3,714
1893876,128968633,4,84


In [71]:
predicted = pd.merge(actual[['user_id']],RankFM_recs, on=["user_id"], how='inner')

In [72]:
predicted = predicted.drop_duplicates(subset=['user_id', 'predicted_Rank', 'catID'])
predicted = predicted.sort_values(['user_id','predicted_Rank'],ascending=True)
predicted.head()

Unnamed: 0,user_id,predicted_Rank,catID
0,128968633,0,668
1,128968633,1,107
2,128968633,2,605
3,128968633,3,714
4,128968633,4,84


In [73]:
predicted.nunique()

user_id           473469
predicted_Rank        10
catID                474
dtype: int64

In [74]:
predicted.shape

(4734690, 3)

In [76]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = predicted.groupby('user_id')['catID'].apply(list)
actual = actual.groupby('user_id')['catID'].apply(list)

# Obtaining mapk value from RankFM recs vs actual test dataset comparrison 
mapk_score = metrics.mapk(actual, predicted, 10) 
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5) 
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.08936124
MAP@5: 0.08513299


#### Testing CF Model

In [83]:
# Reading CF results
CF_recs = pd.read_csv('s3://myaws-capstone-bucket/data/modeling/ALS/results/part-00000-4e8dadd0-ae3c-4650-a762-ccf1cf39864a-c000.csv')
CF_recs.columns = ['user_id','catID','predicted_rank','predicted_rating']
# Keeping only test users for model evaluation
CF_recs = pd.merge(users_2_test,CF_recs, on=["user_id"], how='inner')
CF_recs = CF_recs.drop_duplicates(subset=['user_id','catID','predicted_rank','predicted_rating'])
CF_recs = CF_recs.sort_values(['user_id','predicted_rank'],ascending=True)
CF_recs.head()

Unnamed: 0,user_id,catID,predicted_rank,predicted_rating
0,128968633,416,0,2.944473
1,128968633,570,1,2.679109
2,128968633,668,2,2.218332
3,128968633,577,3,2.137392
4,128968633,742,4,2.098948


In [84]:
CF_recs.nunique()

user_id              473469
catID                    20
predicted_rank           10
predicted_rating    1657816
dtype: int64

In [85]:
# Keeping same amount of users on test data
actual = pd.merge(test_df,CF_recs[['user_id']], on=["user_id"], how='inner')
actual = actual.drop_duplicates(subset=['user_id', 'category', 'category_id', 'implicit_rating', 'catID'])
actual = actual.sort_values(['user_id','category_rank'],ascending=True)
actual.head()

Unnamed: 0,user_id,category,category_id,implicit_rating,catID,category_rank
0,128968633,2232732093077520756_construction.tools.light,2232732093077520756,3,668,1
10,128968633,2053013552351936731_appliances.environment.air...,2053013552351936731,2,16,2
20,145611266,2053013552259662037_computers.components.power...,2053013552259662037,2,13,1
30,200985178,2053013555631882655_electronics.smartphone,2053013555631882655,1,107,1
40,221480173,2232732093077520756_construction.tools.light,2232732093077520756,2,668,1


In [86]:
actual.nunique()

user_id            473469
category              916
category_id           916
implicit_rating         5
catID                 916
category_rank           6
dtype: int64

In [87]:
#Transforming categories for both recs and actual categories to lists for each user
predicted = CF_recs.groupby('user_id')['catID'].apply(list)
actual = actual.groupby('user_id')['catID'].apply(list)

# Obtaining mapk value from RankFM recs vs actual test dataset comparrison 
mapk_score = metrics.mapk(actual, predicted, 10) 
print('MAP@10:','{:.8f}'.format(mapk_score))

mapk_score = metrics.mapk(actual, predicted, 5) 
print('MAP@5:','{:.8f}'.format(mapk_score))

MAP@10: 0.07047043
MAP@5: 0.06241091
