In [130]:
import pandas as pd 
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

# Train, valid, and Test

First, we get preprocessed train, valid, test data from the Amazon beauty dataset. In this implementation, we care only about 5 core leave one out case. Refer to https://amazon-reviews-2023.github.io/data_processing/5core.html.

In [64]:
from datasets import load_dataset

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "5core_last_out_w_his_All_Beauty", trust_remote_code=True)

In [83]:
dataset

DatasetDict({
    train: Dataset({
        features: ['user_id', 'parent_asin', 'rating', 'timestamp', 'history'],
        num_rows: 2029
    })
    valid: Dataset({
        features: ['user_id', 'parent_asin', 'rating', 'timestamp', 'history'],
        num_rows: 253
    })
    test: Dataset({
        features: ['user_id', 'parent_asin', 'rating', 'timestamp', 'history'],
        num_rows: 253
    })
})

In [9]:
reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)

In [14]:
reviews['full'][0]

{'rating': 5.0,
 'title': 'Such a lovely scent but not overpowering.',
 'text': "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!",
 'images': [],
 'asin': 'B00YQ6X8EO',
 'parent_asin': 'B00YQ6X8EO',
 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ',
 'timestamp': 1588687728923,
 'helpful_vote': 0,
 'verified_purchase': True}

# Create Ratings matrix from train and valid dataset

In this project, we are not going to "train" on anything. So we'll just use train and valid dataset to construct all ratings.

In [97]:
ratings = []
for i in range(len(dataset['train'])):
    ratings.append({
        'user_id': dataset['train'][i]['user_id'],
        'parent_asin': dataset['train'][i]['parent_asin'],
        'rating': float(dataset['train'][i]['rating'])
    })

for j in range(len(dataset['valid'])):
    ratings.append({
        'user_id': dataset['valid'][j]['user_id'],
        'parent_asin': dataset['valid'][j]['parent_asin'],
        'rating': float(dataset['valid'][j]['rating'])
    })
    

In [107]:
ratings_df = pd.DataFrame(ratings)

In [108]:

# Convert user and product IDs to integer indices for the matrix
user_mapping = {user: idx for idx, user in enumerate(ratings_df['user_id'].unique())}
product_mapping = {product: idx for idx, product in enumerate(ratings_df['parent_asin'].unique())}
reverse_user_mapping = {idx: user for user, idx in user_mapping.items()}
reverse_product_mapping = {idx: product for product, idx in product_mapping.items()}



In [109]:
# convert to scipy sparse matrix
ratings_df['user_id'] = ratings_df['user_id'].map(user_mapping)
ratings_df['parent_asin'] = ratings_df['parent_asin'].map(product_mapping)

In [122]:
# Create the sparse matrix
user_item = coo_matrix(
    (ratings_df['rating'], (ratings_df['user_id'], ratings_df['parent_asin']))
).tocsr()

In [123]:

# Train the ALS model
model = AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

# Train the model
model.fit(user_item)


100%|██████████| 50/50 [00:00<00:00, 1224.32it/s]


In [132]:
user_ids = np.arange(user_item.shape[0])
item_ids, scores = model.recommend(user_ids, user_item[user_ids], N=30)

def map_value(x):
    return reverse_product_mapping.get(x)  

vectorized_map = np.vectorize(map_value)
recommended_products = vectorized_map(item_ids)

In [134]:
recommended_products.shape

(253, 30)

In [137]:
ground_truth = [x['parent_asin'] for x in dataset['test']]

In [145]:
for i, x in enumerate(ground_truth):
    if x in recommended_products[i]:
        print(x)
        break

B0BTJ6SYKB


In [154]:
def evaluate_recommendations(recommendations, ground_truth):
    """
    Evaluate Hit Rate and Recall for the recommendations.
    """
    hits = 0
    recall_sum = 0
    total_relevant_items = 0

    for i, actual_items in enumerate(ground_truth):
        recommended_items = recommendations[i]
        actual_set = set([actual_items])

        # Calculate hits (whether there's any overlap)
        if any(item in actual_set for item in recommended_items):
            hits += 1

        # Calculate recall (proportion of relevant items in recommendations)
        relevant_items = actual_set.intersection(recommended_items)
        recall_sum += len(relevant_items)
        total_relevant_items += len(actual_set)

    # Hit Rate: Proportion of users with at least one hit
    hit_rate = hits / len(ground_truth)

    # Recall: Average recall across all users
    recall = recall_sum / total_relevant_items

    return {'hit_rate': hit_rate, 'recall': recall}


In [155]:
results = evaluate_recommendations(recommended_products, ground_truth)
print('hit rate is', results['hit_rate'])
print('recall is', results['recall'])

hit rate is 0.1067193675889328
recall is 0.1067193675889328


In [156]:
with open('cf_retrive.npy', 'wb') as f:
    np.save(f, recommended_products)