In [101]:
# Import libraries
import pandas as pd
from surprise import prediction_algorithms, Reader, Dataset, accuracy, SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [102]:
# import CSV
df_combined = pd.read_csv('combined.csv', encoding='latin-1', low_memory=False)

In [103]:
# Filtering NaN from item_id column and removing event_name from event_name column
df_combined_filtered = df_combined.drop(df_combined[df_combined.item_id.isnull()].index)
df_combined_filtered = df_combined_filtered.drop(df_combined_filtered[df_combined_filtered.event_name == "event_name"].index)

In [104]:
# Replacing events by assigned weight
df_combined_filtered = df_combined_filtered.replace('view_item', 1)
df_combined_filtered = df_combined_filtered.replace('add_to_cart', 2)
df_combined_filtered = df_combined_filtered.replace('purchase', 3)

In [105]:
# Create a new DF with only the columns we need
df_matrix = df_combined_filtered.filter(['user_id','item_id','event_name'], axis=1)
df_matrix.head()

Unnamed: 0,user_id,item_id,event_name
0,02FE33EE8FA641E8B0510FAAA737D927,505e396a-f4a0-46be-911f-809d29d7d3d6,2
1,1D9B56EB0D5347C586799CAC48397B5E,5459ae0d-ae4f-49fa-bd03-4a101f02b6a3,1
2,E8AE92144FB94FBA91C99DE513C6F910,494bc0b7-de10-428a-94f1-8a0da5161774,1
3,7F35FA4501F34494A9EC6B5BD77368F8,15adc2b9-0e7c-439b-a265-3f7ae703a82d,1
4,BA9567962ABE41EDBC7A7FF9476C80FB,00d0a735-79ad-4dad-b8f5-88e305f444c2,2


In [106]:
# we only need highest event_name value 
# (there are some cases that for an uiser and item we have 3 records for each event_name value)
df_matrix = df_matrix.sort_values("event_name", ascending=False).drop_duplicates(['user_id','item_id'])

In [None]:
# reset indexes 
df_matrix = df_matrix.sample(frac=1).reset_index(drop=True)

In [107]:
# We extract the unique values from the user_id to a new variable
users_unique = df_matrix.user_id.unique()
# Assigning the users_unique to a dictionary
users_dict = dict(enumerate(users_unique.flatten(), 1))
# Same dictionary as above but with the keys and values inverted
users_dict_inv = {v: k for k, v in users_dict.items()}
# We assign the dictionary to our user_id column in the df_matrix
df_matrix['user_id'] = df_matrix['user_id'].map(users_dict_inv)
df_matrix

Unnamed: 0,user_id,item_id,event_name
11374311,1,02fd5cc5-848e-42f9-9621-31f58f68ad72,3
16473398,2,3c7b1162-1ea6-4b25-a47c-ea5996bdb68e,3
16473406,3,31bacab6-33bb-4faf-9d48-35da0422032f,3
16473407,4,7774266f-8f12-44fc-8a54-44afe830892d,3
6546387,5,b2a6781e-8499-4643-8b56-b7467c29ea4e,3
...,...,...,...
9178040,52359,8146e83d-78a3-4804-9857-b228ff0074ee,1
4732733,8463,ba3e965b-4fcf-4e3c-931b-d032faedbaf2,1
4732734,68643,60c4a84f-5ffa-40fc-8d87-93ea9b2368fd,1
9178074,40734,0ab72f7f-8912-490b-8d06-17e34240428d,1


In [108]:
# We extract the unique values from the user_id to a new variable
items_unique = df_matrix.item_id.unique()
# Assigning the users_unique to a dictionary
items_dict = dict(enumerate(items_unique.flatten(), 1))
# Same dictionary as above but with the keys and values inverted
items_dict_inv = {v: k for k, v in items_dict.items()}
# We assign the dictionary to our user_id column in the df_matrix
df_matrix['item_id'] = df_matrix['item_id'].map(items_dict_inv)
df_matrix

Unnamed: 0,user_id,item_id,event_name
11374311,1,1,3
16473398,2,2,3
16473406,3,3,3
16473407,4,4,3
6546387,5,5,3
...,...,...,...
9178040,52359,480,1
4732733,8463,6812,1
4732734,68643,436,1
9178074,40734,7127,1


In [109]:
#create a data reader in scale of 1-3 of rating (min-max)
reader = Reader(rating_scale=(1, 3))
# generate data only with needed columns, event_name colums will be rating for Suprise
data = Dataset.load_from_df(df_matrix[['user_id', 'item_id', 'event_name']], reader)

In [110]:
# event_name (rating)
raw_ratings = data.raw_ratings

In [111]:
# train with 90%
# test set with 10%
threshold = int(.9 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]
data.raw_ratings = A_raw_ratings  # data is now the set A

In [126]:
# we have 3*3*3*2 combinations of model, later we will pick the best combination
param_grid = {'n_factors': [5, 10, 20], # , 10, 20
              'n_epochs': [5], # 10, 20
              'lr_all': [0.005, 0.002, 0.01], # 0.002, 0.01
              'reg_all': [0.05, 0.02, 0.1], # 0.02, 0.1
              'verbose': [True]}

# initialize SVD algo
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=2)

In [127]:
# fit
grid_search.fit(data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing e

In [128]:
# get the best score
grid_search.best_score

{'rmse': 0.6096369290422055, 'mae': 0.5060156869795038}

In [154]:
grid_search.best_params

{'rmse': {'n_factors': 5,
  'n_epochs': 5,
  'lr_all': 0.01,
  'reg_all': 0.05,
  'verbose': True},
 'mae': {'n_factors': 5,
  'n_epochs': 5,
  'lr_all': 0.01,
  'reg_all': 0.02,
  'verbose': True}}

In [129]:
# see results for each combination.
grid_search.cv_results

{'split0_test_rmse': array([0.615897  , 0.61562369, 0.6165214 , 0.63239955, 0.63203274,
        0.63307999, 0.60979361, 0.60988739, 0.61003403, 0.61605359,
        0.61588015, 0.61662485, 0.6325973 , 0.63229899, 0.6332387 ,
        0.6098934 , 0.61008888, 0.61009793, 0.61636691, 0.61632663,
        0.61680215, 0.63303236, 0.63283469, 0.63354144, 0.61012844,
        0.61052919, 0.61020664]),
 'split1_test_rmse': array([0.6155621 , 0.61531323, 0.61615608, 0.63197907, 0.63160785,
        0.63264287, 0.60948024, 0.6095807 , 0.60967339, 0.61571514,
        0.6155634 , 0.61625366, 0.63218354, 0.63191808, 0.63279253,
        0.6095918 , 0.60979586, 0.60973774, 0.61601236, 0.6160352 ,
        0.61643982, 0.63262379, 0.63245003, 0.6330891 , 0.6098172 ,
        0.61021057, 0.609849  ]),
 'mean_test_rmse': array([0.61572955, 0.61546846, 0.61633874, 0.63218931, 0.63182029,
        0.63286143, 0.60963693, 0.60973404, 0.60985371, 0.61588437,
        0.61572178, 0.61643926, 0.63239042, 0.63210854, 0.

In [130]:
# set the algorithm with best options (learning rate, epochs, reg_alls, nfactor)
algo = grid_search.best_estimator['rmse']

In [131]:
# retrain on the whole set A
trainset = data.build_full_trainset()
algo.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd04949e910>

In [132]:
# Compute biased accuracy on A
predictions = algo.test(trainset.build_testset())
print('Biased accuracy on A,', end='   ')
accuracy.rmse(predictions)

Biased accuracy on A,   RMSE: 0.7533


0.7532748601851799

In [133]:
# Compute unbiased accuracy on B
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions_for_test = algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions_for_test)

Unbiased accuracy on B, RMSE: 0.8149


0.8148926216543957

In [170]:
# manually checking predictions for testset
def get_prediction_for_index(id_prediction):
    user = predictions_for_test[id_prediction][0]
    item = predictions_for_test[id_prediction][1]
    print('user_id:', users_dict[user], ' product_id: ', items_dict[item])
    ratings = df_matrix.loc[(df_matrix['user_id'] == user) & (df_matrix['item_id'] == item)]
    print('\n')
    print('Recommended Score: ',predictions_for_test[id_prediction][3])
    print('User rating (from settest): ', ratings.event_name.values[0] )


In [171]:
get_prediction_for_index(6718)

user_id: EDA37DABA7224214BB00F69751BC3FB7  product_id:  235b7f6c-3801-4ff2-9bbb-dcb3e72ff70b


Recommended Score:  1.3822629413971663
User rating (from settest):  1


In [172]:
get_prediction_for_index(912379)

user_id: DFF98225AE7244C6B927616BA98C26CC  product_id:  c5f1d65c-e2c7-4fc9-be15-dfeb171bd0f3


Recommended Score:  1.5800868948885718
User rating (from settest):  1


In [173]:
get_prediction_for_index(86123)

user_id: 15DCD20C37E240499DAD1B6E1F2D3447  product_id:  ce0e51d1-563f-43c0-888a-537c3c423102


Recommended Score:  1.372545865386999
User rating (from settest):  1


In [174]:
get_prediction_for_index(5612)

user_id: 312FD614380945FC8715560F6273E166  product_id:  cdb7185f-a309-4865-ae19-8922365239b1


Recommended Score:  1.6289798607360364
User rating (from settest):  1


In [175]:
get_prediction_for_index(7612)

user_id: 44bb02858494cc3e497173c68b3e0406  product_id:  15ce428d-6262-4afd-948d-07b572d38a0f


Recommended Score:  1.6000187103357784
User rating (from settest):  1


In [176]:
get_prediction_for_index(9999)

user_id: B86B41E27016445AA92ED0094571575A  product_id:  c5f1d65c-e2c7-4fc9-be15-dfeb171bd0f3


Recommended Score:  1.6270755285424643
User rating (from settest):  1
