In [1]:
import pandas as pd 
import numpy as np
import pickle 
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm


# Train, valid, and Test

First, we get preprocessed train, valid, test data from the Amazon beauty dataset. In this implementation, we care only about 5 core leave one out case. Refer to https://amazon-reviews-2023.github.io/data_processing/5core.html.

In [2]:
from datasets import load_dataset

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "5core_last_out_w_his_All_Beauty", trust_remote_code=True)

# Create Ratings matrix from train and valid dataset

In this project, we are not going to "train" on anything. So we'll just use train and valid dataset to construct all ratings.

In [3]:
ratings = []
for i in range(len(dataset['train'])):
    ratings.append({
        'user_id': dataset['train'][i]['user_id'],
        'parent_asin': dataset['train'][i]['parent_asin'],
        'rating': float(dataset['train'][i]['rating'])
    })

for j in range(len(dataset['valid'])):
    ratings.append({
        'user_id': dataset['valid'][j]['user_id'],
        'parent_asin': dataset['valid'][j]['parent_asin'],
        'rating': float(dataset['valid'][j]['rating'])
    })
    

In [4]:
ratings_df = pd.DataFrame(ratings)

In [5]:

# Convert user and product IDs to integer indices for the matrix
user_mapping = {user: idx for idx, user in enumerate(ratings_df['user_id'].unique())}
product_mapping = {product: idx for idx, product in enumerate(ratings_df['parent_asin'].unique())}
reverse_user_mapping = {idx: user for user, idx in user_mapping.items()}
reverse_product_mapping = {idx: product for product, idx in product_mapping.items()}



In [24]:
import pickle 

with open('../draft_retrieval/neuralcf_sideinfo_retrieval.pkl', 'rb') as f:
    retrieval = pickle.load(f)
    
with open('../draft_retrieval/cf_retrieval.pkl', 'rb') as f:
    cf = pickle.load(f)
    
combined_retrieval = {}
for user, idx in user_mapping.items():
    combined_retrieval[user] = list(retrieval[idx]) + list(cf[user])

combined_retrieval

{'AFSKPY37N3C43SOI5IEXEK5JSIYA': ['B08XZDXTG2',
  'B085SY4WC3',
  'B08S3B8Y5G',
  'B08N6YHQXT',
  'B08NJ5BTWG',
  'B08W8LKLHB',
  'B08SG2MBRY',
  'B0998BD871',
  'B08RYN11N9',
  'B07YS9W97B',
  'B088TYPM71',
  'B09BJM95J7',
  'B08MRRNL18',
  'B005AL5H9S',
  'B08PVH18Z6',
  'B08CVTNQP1',
  'B086N136NH',
  'B08H4SYXR4',
  'B08JTNQFZY',
  'B093H28PD4',
  'B08RRSPNWV',
  'B07X8W7GJZ',
  'B086VYKNDF',
  'B08TB3DZ1D',
  'B08BF4BKKM',
  'B09HMXY36F',
  'B088LWRYJC',
  'B08QVJ4NVD',
  'B07Z3NRMBS',
  'B07VQR3W3Z',
  'B08LYT4Q2X',
  'B07LCHCD6Q',
  'B0855L611L',
  'B07W1WJZFG',
  'B08BB3P4VQ',
  'B086TS3BKQ',
  'B08BY91SGT',
  'B08KGVBW41',
  'B07ZJX5MNJ',
  'B08N6CLJ6P',
  'B08693T3XR',
  'B07FP2C8N8',
  'B084ZHP45Y',
  'B081632HX6',
  'B08C37KWRR',
  'B08C71WBLC',
  'B08T7GPT1D',
  'B089ZQ8Y95',
  'B07HN1L8NQ',
  'B07NPCT6L5',
  'B08L4HTQ3R',
  'B08BYK8SKR',
  'B08NPBQR9L',
  'B0932Z1NM1',
  'B07D5FBFQ4',
  'B08DK4NDM3',
  'B08QHP717Z',
  'B08SJKR877',
  'B00O2FGBJS',
  'B087ZQG11L'],
 'AHV6Q

In [26]:
with open('../draft_retrieval/combined_retrieval.pkl', 'wb') as f:
    pickle.dump(combined_retrieval, f)

In [25]:
len(combined_retrieval['AFSKPY37N3C43SOI5IEXEK5JSIYA'])

60

In [13]:
# convert to scipy sparse matrix
ratings_df['user_id'] = ratings_df['user_id'].map(user_mapping)
ratings_df['parent_asin'] = ratings_df['parent_asin'].map(product_mapping)

In [14]:
# Create the sparse matrix
user_item = coo_matrix(
    (ratings_df['rating'], (ratings_df['user_id'], ratings_df['parent_asin']))
).tocsr()

In [15]:

# Train the ALS model
model = AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

# Train the model
model.fit(user_item)


  check_blas_config()
100%|██████████| 50/50 [00:00<00:00, 1376.09it/s]


In [16]:
user_ids = np.arange(user_item.shape[0])
item_ids, scores = model.recommend(user_ids, user_item[user_ids], N=30)

def map_value(x):
    return reverse_product_mapping.get(x)  

vectorized_map = np.vectorize(map_value)
recommended_products = vectorized_map(item_ids)

In [17]:
recommended_products.shape

(253, 30)

In [18]:
ground_truth = [x['parent_asin'] for x in dataset['test']]

In [19]:
def evaluate_recommendations(recommendations, ground_truth):
    """
    Evaluate Hit Rate and Recall for the recommendations.
    """
    hits = 0
    recall_sum = 0
    total_relevant_items = 0

    for i, actual_items in enumerate(ground_truth):
        recommended_items = recommendations[i]
        actual_set = set([actual_items])

        # Calculate hits (whether there's any overlap)
        if any(item in actual_set for item in recommended_items):
            hits += 1

        # Calculate recall (proportion of relevant items in recommendations)
        relevant_items = actual_set.intersection(recommended_items)
        recall_sum += len(relevant_items)
        total_relevant_items += len(actual_set)

    # Hit Rate: Proportion of users with at least one hit
    hit_rate = hits / len(ground_truth)

    # Recall: Average recall across all users
    recall = recall_sum / total_relevant_items

    return {'hit_rate': hit_rate, 'recall': recall}


In [20]:
results = evaluate_recommendations(recommended_products, ground_truth)
print('hit rate is', results['hit_rate'])
print('recall is', results['recall'])

hit rate is 0.11462450592885376
recall is 0.11462450592885376


In [21]:
retrieval = {}
for user, idx in user_mapping.items():
    retrieval[user] = list(recommended_products[idx])

In [22]:
with open('../draft_retrieval/cf_retrieval.pkl', 'wb') as f:
    pickle.dump(retrieval, f)