In [1]:
# Import libraries
import pandas as pd
from surprise import prediction_algorithms, Reader, Dataset, accuracy, SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from collections import defaultdict


In [2]:
# Load the combined event export (one level above the notebook) into a DataFrame.
# low_memory=False makes pandas parse the file in a single pass.
df_combined = pd.read_csv(
    '../combined.csv',
    encoding='latin-1',
    low_memory=False,
)

In [3]:
# Keep only rows that have an item_id ...
df_combined_filtered = df_combined[df_combined.item_id.notnull()]
# ... and discard rows whose event_name is the literal text "event_name"
# (presumably header lines repeated inside the combined CSV — TODO confirm).
df_combined_filtered = df_combined_filtered[df_combined_filtered.event_name != "event_name"]

In [4]:
# Replace each event type with its weight (view < add to cart < purchase).
# The substitution is restricted to the event_name column: replacing on the whole
# frame scans every column once per event type and would also rewrite any other
# column that happened to contain one of these strings.
# Values that are not in the mapping are left untouched.
event_weights = {'view_item': 1, 'add_to_cart': 2, 'purchase': 3}
df_combined_filtered = df_combined_filtered.assign(
    event_name=df_combined_filtered['event_name'].replace(event_weights)
)

In [5]:
# Narrow the frame to the three columns used below: user, item and weighted event.
df_matrix = df_combined_filtered.filter(
    items=['user_id', 'item_id', 'event_name'],
    axis='columns',
)
df_matrix.head()

Unnamed: 0,user_id,item_id,event_name
0,02FE33EE8FA641E8B0510FAAA737D927,505e396a-f4a0-46be-911f-809d29d7d3d6,2
1,1D9B56EB0D5347C586799CAC48397B5E,5459ae0d-ae4f-49fa-bd03-4a101f02b6a3,1
2,E8AE92144FB94FBA91C99DE513C6F910,494bc0b7-de10-428a-94f1-8a0da5161774,1
3,7F35FA4501F34494A9EC6B5BD77368F8,15adc2b9-0e7c-439b-a265-3f7ae703a82d,1
4,BA9567962ABE41EDBC7A7FF9476C80FB,00d0a735-79ad-4dad-b8f5-88e305f444c2,2


In [6]:
# A user/item pair can appear once per event type (up to three rows).
# Sort so the highest event weight comes first, then keep only the first row
# of every (user_id, item_id) pair, i.e. the strongest interaction.
df_matrix = (
    df_matrix
    .sort_values(by="event_name", ascending=False)
    .drop_duplicates(subset=['user_id', 'item_id'], keep='first')
)

In [7]:
# Shuffle all rows, then rebuild a clean 0..n-1 index.
# The frame was just sorted by event weight, and the train/test split further
# down slices by position, so the rows must be shuffled first. A fixed
# random_state makes the shuffle (and therefore the split) reproducible across
# kernel restarts.
df_matrix = df_matrix.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
# Encode raw user ids as consecutive integers starting at 1.
# NOTE: this cell overwrites df_matrix['user_id'], so it is not idempotent —
# running it a second time would rebuild the dictionaries from the integer codes.

# Distinct raw user ids, in order of first appearance
users_unique = df_matrix['user_id'].unique()
# integer code -> raw user id
users_dict = {code: raw_id for code, raw_id in enumerate(users_unique.flatten(), start=1)}
# raw user id -> integer code
users_dict_inv = {raw_id: code for code, raw_id in users_dict.items()}
# Swap the raw ids in the frame for their integer codes
df_matrix['user_id'] = df_matrix['user_id'].map(users_dict_inv)
df_matrix

Unnamed: 0,user_id,item_id,event_name
0,1,57d552fd-043f-449e-94af-d0d6b98d0b70,1
1,2,41fa9bb7-27a9-4741-bd6d-b2c9da441d58,2
2,3,57533f59-9589-4ccc-b549-d2424e938119,2
3,4,2ce8cf90-d41c-4bdc-9265-c9e530040f84,3
4,5,615bfb32-bd81-4bda-9d7e-e234e51162db,2
...,...,...,...
9340219,287265,ded6f1c6-da60-48d5-b083-37c019623395,2
9340220,115733,3f5bd747-60a0-484f-b757-8953048dda2e,3
9340221,52428,e559647d-635d-4545-9a50-cad7a0b17239,3
9340222,112493,bbf04a5f-c814-4d8b-a5da-6db35e70a6dd,3


In [9]:
# Encode raw item ids as consecutive integers starting at 1.
# NOTE: this cell overwrites df_matrix['item_id'], so it is not idempotent —
# running it a second time would rebuild the dictionaries from the integer codes.

# Distinct raw item ids, in order of first appearance
items_unique = df_matrix['item_id'].unique()
# integer code -> raw item id
items_dict = {code: raw_id for code, raw_id in enumerate(items_unique.flatten(), start=1)}
# raw item id -> integer code
items_dict_inv = {raw_id: code for code, raw_id in items_dict.items()}
# Swap the raw ids in the frame for their integer codes
df_matrix['item_id'] = df_matrix['item_id'].map(items_dict_inv)
df_matrix

Unnamed: 0,user_id,item_id,event_name
0,1,1,1
1,2,2,2
2,3,3,2
3,4,4,3
4,5,5,2
...,...,...,...
9340219,287265,3592,2
9340220,115733,14565,3
9340221,52428,1831,3
9340222,112493,316,3


In [10]:
# The ratings are the event weights, so the rating scale runs from 1 to 3 (min, max).
reader = Reader(rating_scale=(1, 3))
# Hand Surprise only the columns it needs; event_name plays the role of the rating.
data = Dataset.load_from_df(
    df_matrix.loc[:, ['user_id', 'item_id', 'event_name']],
    reader,
)

In [11]:
# Grab the dataset's raw ratings (the rating is the weighted event_name) so they
# can be sliced into a training part and a held-out part below.
raw_ratings = data.raw_ratings

In [12]:
# Positional 90/10 split of the raw ratings:
#   set A = first 90% (used for grid search and training)
#   set B = last 10%  (held out for the unbiased evaluation)
threshold = int(.9 * len(raw_ratings))
A_raw_ratings, B_raw_ratings = raw_ratings[:threshold], raw_ratings[threshold:]
# From here on `data` only contains set A
data.raw_ratings = A_raw_ratings

In [13]:
# Hyper-parameter grid for SVD. Every list currently holds a single value, so the
# search evaluates exactly one combination; add values to a list to widen it.
# verbose=True makes each fit print its "Processing epoch N" progress lines.
param_grid = dict(
    n_factors=[5],
    n_epochs=[5],
    lr_all=[0.01],
    reg_all=[0.05],
    verbose=[True],
)
# Set up a 2-fold cross-validated grid search over SVD, scored with RMSE and MAE
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2)

In [14]:
# Run the cross-validated grid search on `data` (which now holds only set A)
grid_search.fit(data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


In [15]:
# Best mean cross-validation score, keyed by measure name ('rmse', 'mae')
grid_search.best_score

{'rmse': 0.6605484516177441, 'mae': 0.5545457289133329}

In [16]:
# Full cross-validation report: per-split scores, fit/test timings and the
# parameters of every combination that was tried
grid_search.cv_results

{'split0_test_rmse': array([0.66069653]),
 'split1_test_rmse': array([0.66040038]),
 'mean_test_rmse': array([0.66054845]),
 'std_test_rmse': array([0.00014807]),
 'rank_test_rmse': array([1]),
 'split0_test_mae': array([0.55462998]),
 'split1_test_mae': array([0.55446148]),
 'mean_test_mae': array([0.55454573]),
 'std_test_mae': array([8.42484046e-05]),
 'rank_test_mae': array([1]),
 'mean_fit_time': array([24.5442791]),
 'std_fit_time': array([0.14565611]),
 'mean_test_time': array([44.7590735]),
 'std_test_time': array([0.33422256]),
 'params': [{'n_factors': 5,
   'n_epochs': 5,
   'lr_all': 0.01,
   'reg_all': 0.05,
   'verbose': True}],
 'param_n_factors': [5],
 'param_n_epochs': [5],
 'param_lr_all': [0.01],
 'param_reg_all': [0.05],
 'param_verbose': [True]}

In [17]:
# Take the estimator configured with the parameter combination that achieved the
# best RMSE (n_factors, n_epochs, lr_all, reg_all)
algo = grid_search.best_estimator['rmse']

In [18]:
# Retrain on the whole of set A. data.raw_ratings was replaced by set A earlier,
# so the "full" trainset built here does not include the held-out set B.
trainset = data.build_full_trainset()
algo.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb1d4fee070>

### RMSE for traindata

In [19]:
# Compute biased accuracy on A: the model is scored on the same ratings it was
# trained on, so this RMSE is optimistic and only useful as a comparison point
# for the held-out score.
predictions = algo.test(trainset.build_testset())
print('Biased accuracy on A,', end='   ')
accuracy.rmse(predictions)

Biased accuracy on A,   RMSE: 0.6472


0.6471911263125557

### RMSE for testdata

In [20]:
# Compute unbiased accuracy on B: set B was sliced off before the grid search and
# the final fit, so the model has not been trained on these ratings.
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions_for_test = algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions_for_test)

Unbiased accuracy on B, RMSE: 0.6584


0.6584451700028027

In [21]:
# manually checking predictions for testset
def get_prediction_for_index(id_prediction):
    """Print one test-set prediction next to the rating stored in df_matrix.

    Reads the notebook-level names predictions_for_test, users_dict, items_dict
    and df_matrix.

    Args:
        id_prediction: position of the prediction inside predictions_for_test.

    Prints the raw user id and item id, the estimated score (position 3 of the
    prediction) and the event_name value of the first df_matrix row matching
    that user and item. Raises IndexError if the position is out of range or if
    df_matrix has no matching row.
    """
    prediction = predictions_for_test[id_prediction]
    user, item = prediction[0], prediction[1]
    print('user_id:', users_dict[user], ' product_id: ', items_dict[item])

    # Look the pair up in the full interaction frame to show the recorded rating
    same_user = df_matrix['user_id'] == user
    same_item = df_matrix['item_id'] == item
    ratings = df_matrix.loc[same_user & same_item]

    print('\n')
    print('Recommended Score: ', prediction[3])
    print('User rating (from settest): ', ratings.event_name.values[0] )


In [22]:
# get top recommendations by each user
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n highest estimates.

    Args:
        predictions: iterable of 5-item records
            (user id, item id, true rating, estimate, details).
        n: maximum number of (item id, estimate) pairs kept per user.

    Returns:
        A defaultdict(list) mapping each user id to its (item id, estimate)
        pairs, ordered from highest to lowest estimate and cut to n entries.
        Pairs with equal estimates keep their original relative order.
    """
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Only existing keys are reassigned, so the dict size is stable while iterating.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]
    return per_user

In [23]:
# Per-user top recommendations from the test-set predictions (get_top_n's default n=10)
top_n = get_top_n(predictions_for_test)

In [24]:
def get_user_top_n(user):
    """Return the stored top recommendations for one raw user id.

    Reads the notebook-level names users_dict_inv, top_n and items_dict.

    Args:
        user: raw user id; it must be a key of users_dict_inv, otherwise the
            lookup raises KeyError.

    Returns:
        A list of (raw item id, estimate) tuples in the order stored in top_n.
    """
    internal_id = users_dict_inv[user]
    return [(items_dict[item_code], score) for item_code, score in top_n[internal_id]]

In [52]:
# Return precision and recall at k metrics for each user
def precision_recall_at_k(predictions, k = 15, threshold = 3):
    """Compute precision@k and recall@k separately for every user.

    Args:
        predictions: iterable of 5-item records
            (user id, item id, true rating, estimate, details).
        k: how many of each user's highest-estimate items are inspected.
        threshold: a true rating >= threshold counts as "relevant"; an
            estimate >= threshold counts as "recommended".

    Returns:
        (precisions, recalls): two dicts keyed by user id.
        precision = relevant-and-recommended in top k / recommended in top k
        recall    = relevant-and-recommended in top k / all relevant items
        When a denominator is 0 the metric is reported as 1.
    """
    # Collect (estimate, true rating) pairs per user.
    pairs_by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, true_r, est, _ = prediction
        pairs_by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in pairs_by_user.items():
        # Highest estimates first; only the first k are treated as the recommendation list
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items among everything this user has in `predictions`
        n_rel = sum(1 for _, true_r in ranked if true_r >= threshold)
        # Recommended items inside the top k
        n_rec_k = sum(1 for est, _ in top_k if est >= threshold)
        # Items inside the top k that are both relevant and recommended
        n_rel_and_rec_k = sum(
            1 for est, true_r in top_k
            if true_r >= threshold and est >= threshold
        )

        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 1

        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 1

    return precisions, recalls

## Recall and Precision

In [59]:
# Per-user precision/recall over each user's top 20 test predictions, treating
# ratings and estimates of 2 or more as relevant / recommended
precisions, recalls = precision_recall_at_k(predictions_for_test, k=20, threshold=2)
# Unweighted mean over users: every user counts equally
precision = sum(precisions.values()) / len(precisions)
precision

0.934766635971483

In [60]:
# Unweighted mean of the per-user recall values
recall = sum(recalls.values()) / len(recalls)
recall

0.681918040356784

In [44]:
# Spot-check: compare estimate and recorded rating for test prediction 6718
get_prediction_for_index(6718)

user_id: BAD094451BEE4D159F86FC150FF9728B  product_id:  9e096825-4f3c-4273-bfea-116164b7691a


Recommended Score:  1.6270987824413818
User rating (from settest):  2


In [45]:
# Spot-check: compare estimate and recorded rating for test prediction 912379
get_prediction_for_index(912379)

user_id: 18A564B8B3604A33ABCE71153787A4BE  product_id:  8f9fb479-af05-45dc-b81e-d76ae8274b30


Recommended Score:  2.5931078259921505
User rating (from settest):  3


In [46]:
# Spot-check: compare estimate and recorded rating for test prediction 86123
get_prediction_for_index(86123)

user_id: E15E6143272D4D778C87782CD83B0E20  product_id:  d3602fe3-829e-41a6-a06a-1eb4964d1eb1


Recommended Score:  1.4817500375522719
User rating (from settest):  3


In [47]:
# Spot-check: compare estimate and recorded rating for test prediction 12319
get_prediction_for_index(12319)

user_id: 6d9a6646a8e5b77a4ea8232c863d3bd3  product_id:  cb55c1de-6f0e-44d5-85a5-61041f6063df


Recommended Score:  2.2090904958027777
User rating (from settest):  2


In [48]:
# Spot-check: compare estimate and recorded rating for test prediction 8612
get_prediction_for_index(8612)

user_id: daf2d2862ead2e24eeeee3a017a6eb95  product_id:  eeb38d9d-5fb6-4927-b66d-8b8b7487a722


Recommended Score:  2.107171162838111
User rating (from settest):  2


## testing by user id
#### This returns the top recommendations for a given user

In [51]:
# The argument is a placeholder: substitute a raw user_id string (a key of users_dict_inv).
# NOTE(review): run verbatim, the placeholder is presumably not a known user id,
# so the users_dict_inv lookup would raise KeyError.
get_user_top_n('enter user id above')

[('89eef3ca-786d-41eb-ad8c-b7b0613f32c7', 2.356930416142695),
 ('67220e57-17fd-40f4-8d31-40e7fe348622', 2.228293225402106),
 ('017be44f-ffbc-44dd-951f-2abc929c27ff', 2.2278075898308622),
 ('cb55c1de-6f0e-44d5-85a5-61041f6063df', 2.2090904958027777),
 ('13a46f60-6e5b-4da3-ac1b-2223b63da305', 2.194459578002611)]

In [52]:
# The argument is a placeholder: substitute a raw user_id string (a key of users_dict_inv).
# NOTE(review): run verbatim, the placeholder is presumably not a known user id,
# so the users_dict_inv lookup would raise KeyError.
get_user_top_n('enter user id above')

[('0f4a329f-ee1d-495e-bd49-6d57e41da1c2', 2.397619920982559),
 ('6a9525b8-23c8-40f2-80e8-3f9e7c75f605', 2.371654305239083),
 ('9a433a53-d489-4bff-abe2-2cc611d00d3e', 2.294907019489244),
 ('641dec60-82d4-48c6-a1be-82593c73ebe3', 2.272811685435234),
 ('2bbc897c-368b-4615-9f76-48a871806172', 2.2272181433378138),
 ('eeb38d9d-5fb6-4927-b66d-8b8b7487a722', 2.107171162838111),
 ('44bfa1d5-b6bc-4917-b899-72460a3d9dd4', 2.0930094440031004),
 ('8662b085-511f-4ae1-a4ef-063ebf5c5b98', 2.082354121965674),
 ('148f9a23-082a-4fbf-9c5b-4f53c86931c6', 1.9933496013479801),
 ('b2166d46-d0b2-4c04-bae0-6eacc62128f1', 1.8972737689258985)]