# E-commerce recommendation system
We will be using [Surprise](https://surprise.readthedocs.io/en/stable/index.html), a Python scikit for recommender systems, to build our model. Surprise contains lots of algorithms that are commonly used for recommendation.

In [None]:
!pip install surprise

## If you get a NumPy-related error, revert to an older version of NumPy by uncommenting the line below
#!pip install numpy==1.23.5

After installing Surprise, we can begin to create our model. We are using the Amazon Reviews '23 dataset found [here](https://amazon-reviews-2023.github.io/data_processing/5core.html). Because our memory is limited and many of these datasets are very large (several contain millions of reviews), we are restricted to the smaller datasets. Specifically, we will be using the "all beauty" review data.

We can read the data, which is downloaded as a CSV file, into a pandas DataFrame. The DataFrame is then trimmed to only the user ID, item ID, and rating columns so that it can be converted into a Surprise dataset.

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from surprise import Dataset, Reader, SVD, SVDpp, CoClustering, KNNBasic
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from surprise.model_selection import KFold, GridSearchCV

# Ratings in this dataset range from 1 to 5
reader = Reader(rating_scale=(1, 5))

# Load the raw review export into a DataFrame
filename = "/kaggle/input/all-beauty/All_Beauty.csv"
df = pd.read_csv(filename)

# Keep only the user, item and rating columns (in that order) and wrap them
# in a Surprise dataset
data = Dataset.load_from_df(
    df.loc[:, ["user_id", "parent_asin", "rating"]],
    reader,
)

Now, we want to return the top-N recommendations for each user. We will try multiple different models.

In [None]:
# Return the n (5 by default) highest-estimated items for each user (from documentation)
def get_top_n(predictions, n=5):
    """Collect the ``n`` best-scoring items for every user.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): How many recommendations to keep per user.

    Returns:
        A dict keyed by raw user id whose values are lists of
        ``(raw item id, estimated rating)`` tuples, highest estimate first
        and at most ``n`` entries long.
    """

    # Group every (item, estimate) pair under the user it belongs to
    recs_by_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        recs_by_user[uid].append((iid, est))

    # Rank each user's candidates by estimate (descending) and keep the first n
    for uid in recs_by_user:
        ranked = sorted(recs_by_user[uid], key=lambda pair: pair[1], reverse=True)
        recs_by_user[uid] = ranked[:n]

    return recs_by_user

In [None]:
# Precision@K and Recall@K (from documentation)
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k separately for every user.

    An item is *relevant* when its true rating is at least ``threshold``; it
    is *recommended* when it is among the user's ``k`` highest estimates and
    that estimate is at least ``threshold``.

    Returns:
        A ``(precisions, recalls)`` pair of dicts keyed by user id. A metric
        whose denominator is zero (nothing recommended / nothing relevant)
        is reported as 0.
    """

    # Gather (estimate, true rating) pairs per user
    ratings_by_user = defaultdict(list)
    for uid, _iid, true_r, est, _details in predictions:
        ratings_by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in ratings_by_user.items():
        # Highest estimates first; only the first k count as "top k"
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items over the user's whole list
        n_rel = sum((true_r >= threshold) for (_, true_r) in ranked)

        # Items in the top k whose estimate clears the threshold
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)

        # Items in the top k that are both relevant and recommended
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in top_k
        )

        # Precision@K: share of recommended items that are relevant
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 0

        # Recall@K: share of relevant items that were recommended
        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 0

    return precisions, recalls

# No longer used by get_precision_recall; kept so any cell that refers to
# `kf` keeps working.
kf = KFold(n_splits=5)

## calculates the average precision@k and recall@k
def get_precision_recall(prediction):
    """Print the mean precision@5 and recall@5 over all users.

    Metrics are computed with ``precision_recall_at_k`` using ``k=5`` and a
    relevance threshold of 4, then averaged over the users present in
    ``prediction``.

    The metrics depend only on ``prediction``, so they are computed once.
    (Previously they were recomputed inside a loop over ``kf.split(data)``
    that never used the folds, yielding five identical values and paying for
    five needless re-splits of the dataset.)

    Args:
        prediction(list of Prediction objects): Predictions as returned by
            the test method of an algorithm.

    Returns:
        None. The averages are printed; ``nan`` is printed for both when
        ``prediction`` contains no users.
    """
    precisions, recalls = precision_recall_at_k(prediction, k=5, threshold=4)

    if precisions:
        # Both dicts have one entry per user, so they share the same length
        avg_precision = sum(precisions.values()) / len(precisions)
        avg_recall = sum(recalls.values()) / len(recalls)
    else:
        # Nothing to average: avoid dividing by zero
        avg_precision = avg_recall = float("nan")

    print("avg precision: " + str(avg_precision))
    print("avg recall: " + str(avg_recall))

In [None]:
## Create trainset and testset using 80/20 split.
## A fixed random_state makes the split identical on every run, so scores
## from different runs are measured on the same held-out ratings.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

The first model we will try is SVD.

In [None]:
# Candidate hyperparameter values explored for SVD
param_grid_svd = dict(
    n_epochs=[20, 25, 30],
    n_factors=[140, 155, 160],
    lr_all=[0.017, 0.02, 0.025],
    reg_all=[0.12, 0.15, 0.18],
    init_std_dev=[0.05, 0.1, 0.15],
)

# Grid search over SVD with 5-fold cross-validation on the entire dataset
gs_svd = GridSearchCV(
    SVD,
    param_grid_svd,
    measures=['rmse', 'mae'],
    cv=5,
    n_jobs=-1,
)
gs_svd.fit(data)

# Report the combination with the lowest RMSE
best_params = gs_svd.best_params['rmse']
print("Best RMSE score:", gs_svd.best_score['rmse'])
print("Best parameters:", best_params)

# Build the final model from exactly the tuned parameters and fit it on the
# training split
svd_algo = SVD(**best_params)
svd_algo.fit(trainset)

# Score the held-out test split
svd_predictions = svd_algo.test(testset)
accuracy.rmse(svd_predictions)

Cross validation for SVD:

In [None]:
# 5-fold cross-validation of the tuned SVD model on the full dataset;
# verbose=True prints the RMSE and MAE results.
cross_validate(svd_algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

# Average precision@5 / recall@5 of the SVD predictions on the held-out test split
get_precision_recall(svd_predictions)

Now we will try an extension of SVD, using the SVD++ algorithm.

In [None]:
#IMPORTANT NOTES

#When we initially ran this algorithm, we observed low scores for RMSE and MAE (0.40 and 0.31 respectively).
#However, we were unable to reproduce these results consistently in later runs, even when using the same dataset and training/testing sets
#We investigated the issue by running the same code on multiple environments such as Google Colab and locally, and trying multiple random seeds and tuning hyperparameters
#Despite our efforts, we could not replicate the original results, and we believe this could be due to cached variables in the kernel or unintentional overlap in train-test sets
#Moving forward, we continued to tune this algorithm in the best possible way using cross-validation and tuning hyperparameters, so these are the new and correct results.


# Define parameter grid for tuning
param_grid_svdpp = {
    'n_epochs': [35, 40, 45],
    'lr_all': [0.004, 0.005, 0.006],
    'reg_all': [0.08, 0.1, 0.12]
}

# Run grid search (3-fold cross-validation on the entire dataset)
gs_svdpp = GridSearchCV(SVDpp, param_grid_svdpp, measures=['rmse', 'mae'], cv=3, joblib_verbose=1)
gs_svdpp.fit(data)

# Prints the best parameters to show which ones will be used to train final model
print("Best RMSE score:", gs_svdpp.best_score['rmse'])
print("Best parameters:", gs_svdpp.best_params['rmse'])

# Train final model with best parameters
svdpp_algo = gs_svdpp.best_estimator['rmse']
svdpp_algo.fit(trainset)

# Evaluate on the test set.
# accuracy.rmse / accuracy.mae print their own line by default; verbose=False
# suppresses that so each metric is reported once, by the print calls below.
svdpp_predictions = svdpp_algo.test(testset)
print("Final RMSE:", accuracy.rmse(svdpp_predictions, verbose=False))
print("Final MAE:", accuracy.mae(svdpp_predictions, verbose=False))

Cross validation for SVD++

In [None]:
# 5-fold cross-validation of the tuned SVD++ model on the full dataset;
# verbose=True prints the RMSE and MAE results.
cross_validate(svdpp_algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

# Average precision@5 / recall@5 of the SVD++ predictions on the held-out test split
get_precision_recall(svdpp_predictions)

Next we'll try coclustering.

In [None]:
# Candidate hyperparameter values explored for co-clustering
param_grid_cocluster = dict(
    n_epochs=[11, 13, 15],
    n_cltr_u=[10, 12, 15],
    n_cltr_i=[5, 7, 10],
)

# Grid search over CoClustering with 3-fold cross-validation on the entire dataset
gs_cc = GridSearchCV(
    CoClustering,
    param_grid_cocluster,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
)
gs_cc.fit(data)

# Report the combination with the lowest RMSE
best_params = gs_cc.best_params['rmse']
print("Best RMSE score:", gs_cc.best_score['rmse'])
print("Best parameters:", best_params)

# Build the final model from exactly the tuned parameters and fit it on the
# training split
cc_algo = CoClustering(**best_params)
cc_algo.fit(trainset)

# Score the held-out test split
cc_predictions = cc_algo.test(testset)

accuracy.rmse(cc_predictions)

Get the accuracy measures for coclustering.

In [None]:
# 5-fold cross-validation of the tuned co-clustering model on the full dataset;
# verbose=True prints the RMSE and MAE results.
cross_validate(cc_algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

# Average precision@5 / recall@5 of the co-clustering predictions on the held-out test split
get_precision_recall(cc_predictions)

Next we'll try KNN.

In [None]:
# Similarity configuration of the final model: item-based cosine similarity
sim_options = {'name': 'cosine', 'user_based': False}

param_grid_knn = {
    'k': [30, 40, 45],
    'min_k': [5, 7, 9],
    # The grid search must tune the same model that is trained below.
    # Without this entry it would tune k / min_k for KNNBasic's default
    # similarity instead of item-based cosine. GridSearchCV takes
    # dict-valued options as lists of candidates, hence the one-element lists.
    'sim_options': {option: [value] for option, value in sim_options.items()},
}

# Initialize GridSearchCV with the KNNBasic algorithm
gs_knn = GridSearchCV(KNNBasic, param_grid_knn, measures=['rmse', 'mae'], cv=5, n_jobs=-1)

# Perform grid search on the entire dataset
gs_knn.fit(data)

# Get the best score and corresponding parameters
print("Best RMSE score:", gs_knn.best_score['rmse'])
print("Best parameters:", gs_knn.best_params['rmse'])

# Use the best parameters to train the final model
best_params = gs_knn.best_params['rmse']
knn_algo = KNNBasic(k=best_params['k'], min_k=best_params['min_k'],
                    sim_options=sim_options)

knn_algo.fit(trainset)

# Test the model on the testset
knn_predictions = knn_algo.test(testset)
accuracy.rmse(knn_predictions)

Get accuracy measures for KNN.

In [None]:
# 5-fold cross-validation of the tuned KNN model on the full dataset;
# verbose=True prints the RMSE and MAE results.
cross_validate(knn_algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

# Average precision@5 / recall@5 of the KNN predictions on the held-out test split
get_precision_recall(knn_predictions)

Since SVD and SVD++ have similar performance but SVD runs faster, we will apply the SVD model to get the top 5 items.

In [None]:
full_trainset = data.build_full_trainset()

# Refit on every available rating before recommending. Until now the model was
# only fitted on the 80% training split, so some users and items in the
# anti-testset below would be unknown to it.
svd_algo.fit(full_trainset)

new_testset = full_trainset.build_anti_testset()  # get all items not yet rated by user
new_svd_predictions = svd_algo.test(new_testset)
svd_top_n = get_top_n(new_svd_predictions, n=5)

# Show the recommendations of the first five users as product links
first_five_items = list(svd_top_n.items())[:5]
for key, value in first_five_items:
    # Generate a link using the format: https://www.amazon.com/dp/[ASIN]
    print(key, [f"https://www.amazon.com/dp/{iid}" for (iid, _) in value])