In [None]:
import json
import gzip
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import random

import implicit
from scipy.sparse import csr_matrix

In [None]:
# Read the reviews, order them chronologically, and split without shuffling:
# the first 60% of rows become the training frame `df`, the last 40% `testdf`.
df, testdf = train_test_split(
    pd.read_csv('sports.csv').sort_values("unixReviewTime"),
    test_size=0.4,
    shuffle=False,
)

In [None]:
# Extract (reviewerID, asin, rating) triples from the training split.
# Zipping the three columns avoids DataFrame.iterrows(), which materialises a
# Series for every row and is far slower on large frames.
user_item_ratings = list(zip(df['reviewerID'], df['asin'], df['overall']))

In [None]:
# Build the ID -> matrix-index lookups used for the interaction matrix.
unique_users = set(item[0] for item in user_item_ratings)  # set() keeps one copy of each distinct reviewerID
unique_items = set(item[1] for item in user_item_ratings)  # ... and of each distinct asin

# Number the IDs in order of first appearance in the training data rather than
# in set-iteration order: iterating a set of strings can yield a different
# order on every kernel restart (string hash randomisation), which would make
# the matrix layout -- and therefore any seeded model fit -- irreproducible.
# dict.fromkeys() de-duplicates while preserving insertion order.
user_to_index = {user: i for i, user in enumerate(dict.fromkeys(item[0] for item in user_item_ratings))}
item_to_index = {item: i for i, item in enumerate(dict.fromkeys(item[1] for item in user_item_ratings))}

num_users = len(unique_users)
num_items = len(unique_items)

In [None]:
# Find products that occur in both the training frame (df) and the test frame
# (testdf). Filtering with isin() selects the same set of asin values, in the
# same first-appearance order, as an inner merge on 'asin' would, but without
# materialising one row per (train review, test review) pair for each product.
similar_df = df[df['asin'].isin(testdf['asin'])]
print("number of unique products:", similar_df['asin'].nunique())
products_to_advertise = similar_df.asin.unique()

number of unique products: 146


In [None]:
# Hyperparameter grids swept one at a time in the TESTING section below.
no_latent = list(range(10, 51, 10))                # 10, 20, ..., 50
regularization = [step / 10 for step in range(1, 6)]  # 0.1, 0.2, ..., 0.5
no_iterations = list(range(10, 51, 10))            # 10, 20, ..., 50
alpha = list(range(10, 51, 10))                    # 10, 20, ..., 50

In [None]:
def test(no_latent, regularization, no_iterations, alpha, top_n=15, random_state=None):
    """Fit an ALS model on the training interactions and score its user targeting.

    Rows of the interaction matrix are products and columns are reviewers, so
    the model "recommends" reviewers for each product. For every product in
    ``products_to_advertise`` the ``top_n`` recommended reviewers are compared
    with the reviewers of that product in ``testdf``; the per-product hit
    fraction (matches / ``top_n``) is averaged over all products.

    Reads the notebook globals ``user_item_ratings``, ``user_to_index``,
    ``item_to_index``, ``num_users``, ``num_items``, ``products_to_advertise``
    and ``testdf``. Note that the parameter names shadow the global grid lists
    of the same name inside this function.

    Parameters
    ----------
    no_latent : int
        Number of latent factors passed to the ALS model.
    regularization : float
        Regularization strength passed to the ALS model.
    no_iterations : int
        Number of ALS iterations.
    alpha : float
        Confidence scaling; each stored value is ``rating * alpha + 1``.
    top_n : int, default 15
        How many reviewers to request per product (also the denominator of
        the per-product score).
    random_state : int or None, default None
        Forwarded to the ALS model. Pass a fixed integer to make repeated
        runs with the same hyperparameters comparable; ``None`` keeps the
        library's default (unseeded) initialisation.

    Returns
    -------
    float
        Mean hit fraction over ``products_to_advertise`` (0 when there are no
        products to evaluate).

    Raises
    ------
    ValueError
        If ``top_n`` is not positive.
    """
    if top_n <= 0:
        raise ValueError("top_n must be a positive integer")

    # Confidence for every observed (product, reviewer) cell. Keying by matrix
    # coordinates keeps only the last rating when a pair occurs more than once
    # (passing duplicates straight to csr_matrix would sum them instead).
    confidence = {}
    for user, item, rating in user_item_ratings:
        confidence[(item_to_index[item], user_to_index[user])] = rating * alpha + 1

    # Build the CSR matrix directly from coordinates instead of allocating a
    # dense num_items x num_users array first.
    row_idx = [coord[0] for coord in confidence]
    col_idx = [coord[1] for coord in confidence]
    sparse_interaction_matrix = csr_matrix(
        (list(confidence.values()), (row_idx, col_idx)),
        shape=(num_items, num_users),
        dtype=np.float64,
    )
    # Drop any explicitly stored zeros so the sparsity pattern only contains
    # non-zero confidences.
    sparse_interaction_matrix.eliminate_zeros()

    # Initialize and fit the ALS model (products play the "user" role here)
    model = implicit.als.AlternatingLeastSquares(
        factors=no_latent,
        regularization=regularization,
        iterations=no_iterations,
        random_state=random_state,
    )
    model.fit(sparse_interaction_matrix, show_progress=False)

    # Reverse lookup built once, rather than rebuilding key/value lists for
    # every recommended index.
    index_to_user = {idx: user for user, idx in user_to_index.items()}

    accuracy = 0
    for product in products_to_advertise:
        item_idx = item_to_index[product]  # product = asin

        # Top-N reviewers the model would target for this product
        ids, _ = model.recommend(
            item_idx,
            sparse_interaction_matrix[item_idx],
            N=top_n,
            filter_already_liked_items=False,
        )
        targeted_users = {index_to_user[int(idx)] for idx in ids}

        # Reviewers who actually reviewed this product in the held-out data
        actually_bought = set(testdf.loc[testdf["asin"] == product, "reviewerID"])

        # Proportion of the top-N slots that were real held-out reviewers
        accuracy += len(targeted_users & actually_bought) / top_n

    # Average over the evaluated products; guard against an empty product list
    n_products = len(products_to_advertise)
    accuracy = accuracy / n_products if n_products else 0.0
    print(f"no_latent:{no_latent} regularization {regularization}, no_iterations {no_iterations}, alpha {alpha}")
    print(accuracy)

    return accuracy



TESTING

In [None]:
# Sweep the number of latent factors; the other hyperparameters stay fixed.
lat_acc = [
    test(latent, regularization=0.1, no_iterations=10, alpha=40)
    for latent in no_latent
]

no_latent:10 regularization 0.1, no_iterations 10, alpha 40
0.02054794520547945
no_latent:20 regularization 0.1, no_iterations 10, alpha 40
0.0136986301369863
no_latent:30 regularization 0.1, no_iterations 10, alpha 40
0.0136986301369863
no_latent:40 regularization 0.1, no_iterations 10, alpha 40
0.02054794520547945
no_latent:50 regularization 0.1, no_iterations 10, alpha 40
0.0273972602739726


In [None]:
# Sweep the number of ALS iterations; the other hyperparameters stay fixed.
iters_accs = [
    test(no_latent=40, regularization=0.1, no_iterations=no_iter, alpha=40)
    for no_iter in no_iterations
]

no_latent:40 regularization 0.1, no_iterations 10, alpha 40
0.0136986301369863
no_latent:40 regularization 0.1, no_iterations 20, alpha 40
0.0273972602739726
no_latent:40 regularization 0.1, no_iterations 30, alpha 40
0.02054794520547945
no_latent:40 regularization 0.1, no_iterations 40, alpha 40
0.02054794520547945
no_latent:40 regularization 0.1, no_iterations 50, alpha 40
0.0273972602739726


In [None]:
# Sweep the regularization strength; the other hyperparameters stay fixed.
reg_accs = [
    test(no_latent=40, regularization=reg, no_iterations=20, alpha=40)
    for reg in regularization
]

no_latent:40 regularization 0.1, no_iterations 20, alpha 40
0.0273972602739726
no_latent:40 regularization 0.2, no_iterations 20, alpha 40
0.02054794520547945
no_latent:40 regularization 0.3, no_iterations 20, alpha 40
0.0136986301369863
no_latent:40 regularization 0.4, no_iterations 20, alpha 40
0.0273972602739726
no_latent:40 regularization 0.5, no_iterations 20, alpha 40
0.02054794520547945


In [None]:
# Sweep the confidence scaling alpha; the other hyperparameters stay fixed.
alph_accs = [
    test(no_latent=40, regularization=0.1, no_iterations=20, alpha=alph)
    for alph in alpha
]


no_latent:40 regularization 0.1, no_iterations 20, alpha 10
0.02054794520547945
no_latent:40 regularization 0.1, no_iterations 20, alpha 20
0.0273972602739726
no_latent:40 regularization 0.1, no_iterations 20, alpha 30
0.0273972602739726
no_latent:40 regularization 0.1, no_iterations 20, alpha 40
0.0136986301369863
no_latent:40 regularization 0.1, no_iterations 20, alpha 50
0.02054794520547945


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2ca87312-dd22-4033-aec7-588defb6d391' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>