# Collaborative Filltering 

* Goal: implement a user-based collaborative filtering recommendation system.
* Calculate user-user similarity using cosine siilarity on a user-item matrix
* Rank items by their predicted scores and return the top-N recommendations for the user.


Results:
* Results are really bad, probably due to high matrix sparsity
* Users and items have few interactions
* With so few non-zeros finding similarites becomes challenging

In [95]:
from collections import defaultdict

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from train_test_split import temporal_split_users_in_both_sets

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# Load Data

In [81]:
dataset_reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
df_reviews = dataset_reviews["full"].to_pandas()

dataset_items = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", split="full", trust_remote_code=True)
df_items = dataset_items.to_pandas()


In [82]:
reviews_per_user = df_reviews.groupby('user_id').size()
df_reviews['num_reviews_per_user'] = df_reviews['user_id'].map(reviews_per_user)
df_reviews = df_reviews.loc[df_reviews.num_reviews_per_user >= 5].reset_index(drop=True)

In [96]:
df_reviews.columns

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase',
       'num_reviews_per_user'],
      dtype='object')

In [98]:
train_df, test_df = temporal_split_users_in_both_sets(df_reviews, split_ratio=0.9)


In [99]:
train_df.user_id.unique().shape

(1620,)

In [100]:
test_df.user_id.unique().shape

(1620,)

In [102]:
df_items.parent_asin.unique().shape

(112590,)

# Create User-Item Matrix

In [111]:
# user and item id mappings
user_id_mapping = {user_id: idx for idx, user_id in enumerate(df_reviews['user_id'].unique())}
item_id_mapping = {item_id: idx for idx, item_id in enumerate(df_reviews['parent_asin'].unique())}
reverse_item_id_mapping = {idx: item_id for item_id, idx in item_id_mapping.items()}

In [112]:
train_user_item_matrix = train_df.pivot_table(
    index='user_id', columns='parent_asin', values='rating', fill_value=0
).values

test_user_item_matrix = test_df.pivot_table(
    index='user_id', columns='parent_asin', values='rating', fill_value=0
).values


In [113]:
train_user_item_matrix.shape

(1620, 6177)

In [114]:
test_user_item_matrix.shape

(1620, 1688)

# Calculate User-User Similarity

In [106]:
user_similarity = cosine_similarity(train_user_item_matrix)
np.fill_diagonal(user_similarity, 0)
user_similarity.shape

(1620, 1620)

# Recommend Items 

For a target user $U_t$, predict their rating for an item $I_j$ they haven’t interacted with by:
* Identifying all other users $U_k$ who have rated $I_j$
* Weighting each user's rating by their similarity to $U_t$
  
$$
\text{Predicted\_Score}(I_j) = \frac{\sum_k \left| \text{Similarity}(U_t, U_k) \right|}{\sum_k \left( \text{Similarity}(U_t, U_k) \times \text{Rating}(U_k, I_j) \right)}
$$

Where:
- $I_j$: Item $j$
- $U_t$: Target user
- $U_k$: Another user
- $\text{Similarity}(U_t, U_k)$: Similarity between users $U_t$ and $U_k$
- $\text{Rating}(U_k, I_j)$: Rating given by user $U_k$ to item $I_j$


* Recommend the top items


In [120]:
def recommend_items_user_based(user_index, user_item_matrix, user_similarity, top_n=5):
    """
    Recommend top-N items for a given user based on user-user similarity.

    Parameters:
        user_index: Row index of the target user in the user-item matrix.
        user_item_matrix: User-item interaction matrix.
        user_similarity: User-user similarity matrix.
        top_n: Number of recommendations to generate.

    Returns:
        List of tuples [(item_index, predicted_score), ...].
    """
    # similarities for the target user
    similarities = user_similarity[user_index]
    weighted_ratings = similarities @ train_user_item_matrix
    normalization_factor = np.abs(similarities).sum()
    predicted_scores = weighted_ratings / (normalization_factor + 1e-8)

    # exclude items already rated in the training set
    interacted_items = train_user_item_matrix[user_index] > 0
    predicted_scores[interacted_items] = -np.inf

    # top-n items
    top_indices = np.argsort(predicted_scores)[::-1][:top_n]
    return top_indices, predicted_scores[top_indices]

# Recommend for a few users

Analyse a few recommendations and compare to items users purchased in the past

In [167]:
user_id_to_recommend = 'AE3KLVXGZPANXE5XLXYKHTVAZ3FQ'

user_index = user_id_mapping[user_id_to_recommend]

item_indices, scores = recommend_items_user_based(
    user_index=user_index,
    user_item_matrix=train_user_item_matrix,
    user_similarity=user_similarity,
    top_n=5
)

recommendations = [(reverse_item_id_mapping[i], score) for i, score in zip(item_indices, scores )]

In [169]:
purchased_products = df_reviews[df_reviews.user_id == user_id_to_recommend].parent_asin.values
df_items[df_items.parent_asin.isin(purchased_products)][['title', 'description', 'features']].head()

Unnamed: 0,title,description,features
1666,"5-Pack Hair Scalp scrubber Massager Shampoo Brush for Straight Curly Long Short Thick Thin Wet Dry Hair, Men Women Kids Scalp Care Hair Cleaning Shower, Scalp Care Brush,100% comfortable",[],[]
11445,NOBLE Wigs for White Women Purple Lace Wigs for Black Women 27 Inch Long Curly Wavy Colored Wigs Middle Part Heat Resistant Synthetic Ombre Purple Wavy Wigs for Daily Party Cosplay Use,[],[]
14591,Fragrantshare Makeup Brushes Professional Organizer Foundation Brush for Liquid makeup Travel 9Pcs Sets - Odorless Fiber Hair - Light Blue,[],[]
15061,SportsWell Flexible Reusable Bun Deft Bun for Hair Magic Donut Hair Bun Maker French Twist Hair Tool Bun Crown Twister #5,[],[]
24577,Bejeweled Rhinestone Padded Headbands For Girls Velvet Padded Diamond Headbands For Women Party Wedding Fashion Bejewelled Hairband Headpiece Hair Accessories,"[PACKAGE: 3pcs one piece Diamond Headband one piece Diamond Earrings and one piece Elastic Butterfly headband, which can match your daily wearing and makes you more lovely and charming in party. HIGH QUALITY MATERIAL: Fashion headwear made from rhinestones and plastic hair hoop. Each plastic headband has fabric covered to protect your head. BLING RHINESTONE DESIGN: Bling bling crystal rhinestone design makes it more different. Each diamond headband are handmade,looks and touch not as cheap as others Upgrade Your Style This Luxury Rhinestone Headband is Perfect for Workday, Special Occasions and All Festive Activities in Any Season ,Suitable for Party, Wedding and other Occasions ; We are Sure that you will WIN a Lot of Compliments With These Beautiful Headbands. Fits All Styles Suitable for Long Hair, Short Hair, Straight Hair, Curly or Wavy Hair, Thin Hair, Medium-Thick Hair, Thick Hair, and Extra Thick Long Hair.]","[About this item ☺♥🦋Package includes: you will receive 3 pieces :one piece Diamond Headband one piece Diamond Earrings and one piece Elastic Butterfly headband, the different type match your daily fashion outfits, and make your life be colorful and shinning!, ☺♥🦋Quality material: 1.the diamond of padded rhinestone headband is more shiny and fuller, and better gloss than ordinary rhinestones and not easy to fall off and break, soft velvet for long-term usage 2.Earrings:High Quality Crystal Diamond Vintage Court Bohemian fashion style Earrings with alloy material 3. Elastic Butterfly headband:The Butterfly headband is very fashional and will ensure to absorb sweat.Great to wear in sport or daily fashion outfits., ☺♥🦋Diamond headband sewn from high quality velvet, it touches soft, will not hurt your hair and scalp, bring no pressure and non-slip, which will let you feel light-weighted and comfortable when wearing, everyone needs to have a diamond headband., ☺♥🦋 Exquiste Design: Fashionable headpieces made of rhinestones and velvet reflect light and natural light, making hair hoop look bright and elegant., ☺♥🦋Comfortable to wear: pretty and shiny padded headband measures approx. 1.2 inches in width in centre widest point, and 0.8 inch thick in centre thickest point; The advantages of padded velvet is the key to avoiding the headband headache, ☺♥🦋Fit more sizes: rhinestone headband is handmade from memory elastic plastic, can stretch to fit most women' head widths without worrying about breaking, velvet padded thick headbands suitable for most women and girls, ☺♥🦋Chic accessory: shine bright like a diamond in the party with our rhinestone velvet padded headband, a chic accessory to amp up your everyday style or provide the finishing touches to your evening outfits]"


In [170]:
recommended_item_ids = [i[0] for i in recommendations]

df_items.loc[df_items.parent_asin.isin(recommended_item_ids)][['title', 'description', 'features']].head()

Unnamed: 0,title,description,features
19986,ALICROWN False Eyelashes Fluffy Volume 3D Lashes Pack Handmade Dramatic Thick Crossed Fake Eyelashes Soft Reusable 8 Pairs,[],[]
32219,Auban Sleep Hair Wrap Bonnet Eye Cover Large Black Satin Cap for Sleeping Silk Cap Double Layer for Long Hair Curl Women Sleeping Protection Hair Beanie Block Light,"[Thanks for your interest in, Auban Sleep Bonnet Eye Cover Cap, FEATURES 1. Two layer design, outer soft cotton and inner smooth satin, both are healthy to hair and will not stain. 2. Eye cover style, block lights and prevent skin wrinkles. 3. Fashionable and practical item, not only can you wear Auban eye cover bonnet at sleep, but also can put it on when doing sports or spas. DETAILS Large size: 28 cm wide, 28 cm high Head girth: 56 cm (human average 52-58 cm) Color: Outer black, inner gold, SUGGESTIONS:, 1. Dry your hair before you put on this sleep cap. 2. Better hand wash, turn inside around and spread to air.]","[[3 FUNCTIONS SLEEP BONNET] Auban hair wrap has 3 main benefits, first to maintain your hair neat at the morning, reducing hair mess after woke up, second to keep your head warm at night, and third to shield eyes from strong lights., [DOUBLE LAYER] Hair wrap has two no-fad.ing layers design. Outer layer is made of cotton, keeps warm in cold nights. Inner layer is smooth breathable satin that is friendly to hair and help reducing static., [EYE COVER SLEEP CAP] Auban eye cover hair bonnet is a new emerging sleep cap. It combine bonnet and blindfold together to provide a luxury sleep experience. Wearing our silk cap reduce friction between hair and pillow, prevent night hair loss, eye cover design block light and prevent skin wrinkles., [MULTIPLE USES] More than a sleep bonnet, Auban satin bonnet can be worn as a hair cover on various life scenes such as workout hat when doing sports, or hair wrap when house cleaning, or a fashion item that allows you to hide grease hair when you stay up late., [LARGE SIZE FOR CURL] Hair cap is 28cm wide and 28 cm high, measures 56cm head circumstance while people average 52-58 cm. Elastic fabric has no pressure on head but still stay on head firmly.]"
76452,"LOKFAR Velvet Adult Headbands for Women Thin Rhinestone Beaded Headband Jeweled Headband, Fashion Glitter Bedazzled Headband Crystal Embellished Hair Hoop Trendy Wedding Headband for Girls (Red+Black)",[],[]
76804,HICOCU 25mm 3D Mink Lashes Full Volume Mink Eyelashes Fluffy Volume Dramatic 25mm Mink Lashes Extension Cruelty-free Siberian Mink Strip Eyelashes …… (Y715),[],[]
87449,Moshina Graceful Afro Black Kinky Curly Ponytail with 2 Clips-Natural Looking As Human Hair-Afro Puff Drawstring Ponytail for Black Women -Short Afro Curly Extensions(black 1b),[],[]


# Evaluate

In [151]:
def evaluate_recommendations(test_df, train_user_item_matrix, user_similarity, user_id_mapping, item_id_mapping, top_n=5):
    """
    Evaluate the recommendation system using precision and recall.
    """
    precisions = []
    recalls = []
    recommendations = defaultdict(list)

    # build ground truth
    ground_truth = test_df.groupby('user_id')['parent_asin'].apply(set).to_dict()

    for user_id in tqdm(test_df['user_id'].unique(), total=len(test_df['user_id'].unique())):
        if user_id in user_id_mapping:
            user_index = user_id_mapping[user_id]

            # generate recommendations
            top_indices, predicted_scores = recommend_items_user_based(
                user_index=user_index,
                user_item_matrix=train_user_item_matrix,
                user_similarity=user_similarity,
                top_n=top_n
            )
            recommendations[user_id] = [reverse_item_id_mapping[idx] for idx in top_indices]

            # items user rated 
            relevant_items = ground_truth.get(user_id, set())
            # recomemended items
            recommended_set = set(recommendations[user_id])
            # items that are relevant and were recommended
            relevant_recommended = recommended_set & relevant_items

            # precision and recall
            precision = len(relevant_recommended) / len(recommendations[user_id]) if recommendations[user_id] else 0
            recall = len(relevant_recommended) / len(relevant_items) if relevant_items else 0

            precisions.append(precision)
            recalls.append(recall)

    return np.mean(precisions), np.mean(recalls)


In [175]:
top_n = 10
# Evaluate recommendations
precision, recall = evaluate_recommendations(
    test_df=test_df,
    train_user_item_matrix=train_user_item_matrix,
    user_similarity=user_similarity,
    user_id_mapping=user_id_mapping,
    item_id_mapping=item_id_mapping,
    top_n=top_n
)

print(f"Precision@{top_n}: {precision:.4f}")
print(f"Recall@{top_n}: {recall:.4f}")


100%|██████████████████████████████████████| 1620/1620 [00:07<00:00, 210.61it/s]

Precision@10: 0.0002
Recall@10: 0.0013





In [None]:
# rating prediction error

In [155]:
def calculate_rmse(test_df, train_user_item_matrix, user_similarity, user_id_mapping, item_id_mapping):
    errors = []

    for _, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        user_id = row['user_id']
        item_id = row['parent_asin']
        true_rating = row['rating']

        # map user and item ids to their indices
        if user_id in user_id_mapping and item_id in item_id_mapping:
            user_index = user_id_mapping[user_id]
            item_index = item_id_mapping[item_id]

            # predict the rating
            similarities = user_similarity[user_index]
            ratings = train_user_item_matrix[:, item_index]

            weighted_sum = np.dot(similarities, ratings)
            normalization_factor = np.abs(similarities).sum()
            predicted_rating = weighted_sum / (normalization_factor + 1e-8)

            # add the squared error
            errors.append((predicted_rating - true_rating) ** 2)

    return np.sqrt(np.mean(errors))



In [156]:
# Example usage
user_id_mapping = {user_id: idx for idx, user_id in enumerate(train_df['user_id'].unique())}
item_id_mapping = {item_id: idx for idx, item_id in enumerate(train_df['parent_asin'].unique())}

rmse = calculate_rmse(test_df, train_user_item_matrix, user_similarity, user_id_mapping, item_id_mapping)
print("RMSE:", rmse)


100%|████████████████████████████████████| 2171/2171 [00:00<00:00, 28682.50it/s]

RMSE: 4.354642559158511





# Result Analysis

* Results are really bad, probably due to high matrix sparsity
* Users and items have few interactions
* With so few non-zeros finding similarites becomes challenging

In [181]:
1 - (np.count_nonzero(train_user_item_matrix) / train_user_item_matrix.size  )

0.9987806218608658