In [1]:
# Import libraries
import pandas as pd
from surprise import prediction_algorithms, Reader, Dataset, accuracy, SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [2]:
# import CSV
df_combined = pd.read_csv('combined.csv', encoding='latin-1', low_memory=False)

In [3]:
# Filtering NaN from item_id column and removing event_name from event_name column
df_combined_filtered = df_combined.drop(df_combined[df_combined.item_id.isnull()].index)
df_combined_filtered = df_combined_filtered.drop(df_combined_filtered[df_combined_filtered.event_name == "event_name"].index)

In [4]:
# Replacing events by assigned weight
df_combined_filtered = df_combined_filtered.replace('view_item', 1)
df_combined_filtered = df_combined_filtered.replace('add_to_cart', 2)
df_combined_filtered = df_combined_filtered.replace('purchase', 3)

In [5]:
# Create a new DF with only the columns we need
df_matrix = df_combined_filtered.filter(['user_id','item_id','event_name'], axis=1)
df_matrix.head()

Unnamed: 0,user_id,item_id,event_name
0,02FE33EE8FA641E8B0510FAAA737D927,505e396a-f4a0-46be-911f-809d29d7d3d6,2
1,1D9B56EB0D5347C586799CAC48397B5E,5459ae0d-ae4f-49fa-bd03-4a101f02b6a3,1
2,E8AE92144FB94FBA91C99DE513C6F910,494bc0b7-de10-428a-94f1-8a0da5161774,1
3,7F35FA4501F34494A9EC6B5BD77368F8,15adc2b9-0e7c-439b-a265-3f7ae703a82d,1
4,BA9567962ABE41EDBC7A7FF9476C80FB,00d0a735-79ad-4dad-b8f5-88e305f444c2,2


In [6]:
# we only need highest event_name value 
# (there are some cases that for an uiser and item we have 3 records for each event_name value)
df_matrix = df_matrix.sort_values("event_name", ascending=False).drop_duplicates(['user_id','item_id'])

In [7]:
# reset indexes 
df_matrix = df_matrix.sample(frac=1).reset_index(drop=True)

In [8]:
# We extract the unique values from the user_id to a new variable
users_unique = df_matrix.user_id.unique()
# Assigning the users_unique to a dictionary
users_dict = dict(enumerate(users_unique.flatten(), 1))
# Same dictionary as above but with the keys and values inverted
users_dict_inv = {v: k for k, v in users_dict.items()}
# We assign the dictionary to our user_id column in the df_matrix
df_matrix['user_id'] = df_matrix['user_id'].map(users_dict_inv)
df_matrix

Unnamed: 0,user_id,item_id,event_name
0,1,80786dd9-3705-4b75-9544-2b3cbb669937,2
1,2,988f672b-8238-4872-8c6f-eeff15179ba0,2
2,3,66bed58d-05d6-4821-a621-dbdec9347f9f,1
3,4,53da424b-f9e0-4c3b-8936-1664a25051f4,2
4,5,e8e60f4d-e0f9-402b-8cfd-a9d971adfbc9,1
...,...,...,...
9340219,10567,75b098d0-3952-42aa-920e-56f3c943212d,3
9340220,61224,419d5caa-03ec-4850-aac0-b0cee0938af2,2
9340221,82364,672a6167-f404-44c4-83d4-9b77c60925ec,3
9340222,1889,7d2a0a74-cfa3-4253-9664-e046100fb5dd,3


In [9]:
# We extract the unique values from the user_id to a new variable
items_unique = df_matrix.item_id.unique()
# Assigning the users_unique to a dictionary
items_dict = dict(enumerate(items_unique.flatten(), 1))
# Same dictionary as above but with the keys and values inverted
items_dict_inv = {v: k for k, v in items_dict.items()}
# We assign the dictionary to our user_id column in the df_matrix
df_matrix['item_id'] = df_matrix['item_id'].map(items_dict_inv)
df_matrix

Unnamed: 0,user_id,item_id,event_name
0,1,1,2
1,2,2,2
2,3,3,1
3,4,4,2
4,5,5,1
...,...,...,...
9340219,10567,2272,3
9340220,61224,3380,2
9340221,82364,1815,3
9340222,1889,4144,3


In [10]:
#create a data reader in scale of 1-3 of rating (min-max)
reader = Reader(rating_scale=(1, 3))
# generate data only with needed columns, event_name colums will be rating for Suprise
data = Dataset.load_from_df(df_matrix[['user_id', 'item_id', 'event_name']], reader)

In [11]:
# event_name (rating)
raw_ratings = data.raw_ratings

In [12]:
# train with 90%
# test set with 10%
threshold = int(.9 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]
data.raw_ratings = A_raw_ratings  # data is now the set A

In [14]:
# we have 3*3*3*2 combinations of model, later we will pick the best combination
param_grid = {'n_factors': [5],
              'n_epochs': [5],
              'lr_all': [0.01],
              'reg_all': [0.05],
              'verbose': [True]}
# initialize SVD algo
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=2)

In [15]:
# fit
grid_search.fit(data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


In [16]:
# get the best score
grid_search.best_score

{'rmse': 0.660463717212572, 'mae': 0.554368173990127}

In [17]:
# see results for each combination.
grid_search.cv_results

{'split0_test_rmse': array([0.66057558]),
 'split1_test_rmse': array([0.66035185]),
 'mean_test_rmse': array([0.66046372]),
 'std_test_rmse': array([0.00011187]),
 'rank_test_rmse': array([1]),
 'split0_test_mae': array([0.55439299]),
 'split1_test_mae': array([0.55434336]),
 'mean_test_mae': array([0.55436817]),
 'std_test_mae': array([2.48137127e-05]),
 'rank_test_mae': array([1]),
 'mean_fit_time': array([23.15863311]),
 'std_fit_time': array([0.03759801]),
 'mean_test_time': array([42.95196009]),
 'std_test_time': array([0.00791597]),
 'params': [{'n_factors': 5,
   'n_epochs': 5,
   'lr_all': 0.01,
   'reg_all': 0.05,
   'verbose': True}],
 'param_n_factors': [5],
 'param_n_epochs': [5],
 'param_lr_all': [0.01],
 'param_reg_all': [0.05],
 'param_verbose': [True]}

In [18]:
# set the algorithm with best options (learning rate, epochs, reg_alls, nfactor)
algo = grid_search.best_estimator['rmse']

In [19]:
# retrain on the whole set A
trainset = data.build_full_trainset()
algo.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f900b446730>

### RMSE for traindata

In [20]:
# Compute biased accuracy on A
predictions = algo.test(trainset.build_testset())
print('Biased accuracy on A,', end='   ')
accuracy.rmse(predictions)

Biased accuracy on A,   RMSE: 0.6471


0.6470847318322269

### RMSE for testdata

In [21]:
# Compute unbiased accuracy on B
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions_for_test = algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions_for_test)

Unbiased accuracy on B, RMSE: 0.6579


0.6578851487338951

In [31]:
# manually checking predictions for testset
def get_prediction_for_index(id_prediction):
    user = predictions_for_test[id_prediction][0]
    item = predictions_for_test[id_prediction][1]
    print('user_id:', users_dict[user], ' product_id: ', items_dict[item])
    ratings = df_matrix.loc[(df_matrix['user_id'] == user) & (df_matrix['item_id'] == item)]
    print('\n')
    print('Recommended Score: ',predictions_for_test[id_prediction][3])
    print('User rating (from settest): ', ratings.event_name.values[0] )


## Testing agains 10% (testset)

In [32]:
get_prediction_for_index(6718)

user_id: E6628FCB85BD411896243927671B5A91  product_id:  74b0ceab-93fa-4393-926d-d75bf6d7b9ec


Recommended Score:  1.9722018596953674
User rating (from settest):  2


In [33]:
get_prediction_for_index(912379)

user_id: ab16a0070adea5e8b10de7902d309c77  product_id:  b9ebe038-c068-4c4d-8b95-06581bff5dde


Recommended Score:  2.2415990190866366
User rating (from settest):  2


In [34]:
get_prediction_for_index(86123)

user_id: a2a5effaf38a3e5b00b8dba649625198  product_id:  187e9f20-4112-4be0-a988-de7e314a0dcb


Recommended Score:  1.4257712999658985
User rating (from settest):  1


In [35]:
get_prediction_for_index(12319)

user_id: 4cbd457ebaca827fe5e9cba0b8bd6b60  product_id:  22e4e58d-60d5-41f9-be59-f453ccbf6902


Recommended Score:  2.5437329453936823
User rating (from settest):  3


In [36]:
get_prediction_for_index(8612)

user_id: ec35c05f2b000056c3e44c6e4cde1bb4  product_id:  39c363cb-31c6-4df7-b1f0-b21b41e02ae9


Recommended Score:  2.446613930898492
User rating (from settest):  3
