# Content Based Filtering with LSA


* Goal: find similar items based on text features
* Use LSA to capture the latent relationship between items
* Calculates cosine similarity to identify items that are most similar to the given item


In [195]:
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from text_pre_processing import combine_text_features, pre_process_text

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# Load Data

In [6]:
dataset_reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
df_reviews = dataset_reviews["full"].to_pandas()

dataset_items = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", split="full", trust_remote_code=True)
df_items = dataset_items.to_pandas()

In [184]:
df_reviews.columns

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')

In [8]:
df_items.head(1)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Beauty,"Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)",4.8,10,[],[],,"{'hi_res': [None, 'https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg'], 'large': ['https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg', 'https://m.media-amazon.com/images/I/41w2yznfuZL.jpg'], 'thumb': ['https://m.media-amazon.com/images/I/41qfjSfqNyL._SS40_.jpg', 'https://m.media-amazon.com/images/I/41w2yznfuZL._SS40_.jpg'], 'variant': ['MAIN', 'PT01']}","{'title': [], 'url': [], 'user_id': []}",Howard Products,[],"{""Package Dimensions"": ""7.1 x 5.5 x 3 inches; 2.38 Pounds"", ""UPC"": ""617390882781""}",B01CUPMQZE,,,


In [161]:
df_items['title_description_features'] = df_items.apply(combine_text_features, axis=1)

In [162]:
df_items.title_description_features.head()

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)
1                                                                                                                                                                                                                                                               

In [163]:
df_items = pre_process_text(df_items, input_col='title_description_features', output_col='title_description_features_pre_processed')

In [142]:
df_items.title_description_features_pre_processed.head(10)

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 howard lc leather conditioner ounce pack
1                                                                                                                                                                            

In [13]:
# Tf IDF

In [164]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df_items['title_description_features_pre_processed'])

In [165]:
tfidf_matrix.shape

(112590, 5000)

# LSA

In [167]:
n_topics = 50
lsa = TruncatedSVD(n_components=n_topics, random_state=42)
lsa_matrix = lsa.fit_transform(tfidf_matrix)

In [168]:
lsa_matrix.shape

(112590, 50)

# Analyse a few Predictions

In [169]:
def get_similar_items(item_id, item_ids, lsa_matrix, top_n=5):
    """
    Find top-N similar items to a given item.

    Parameters:
        item_id (str): The ID of the item to find similar items for.
        item_profiles (dict): Dictionary with item IDs as keys and their vector representations as values.
        item_vectors (list): List of vector representations for all items.
        top_n (int): Number of similar items to retrieve.

    Returns:
        List of tuples [(similar_item_id, similarity_score), ...].
    """
    if item_id not in item_ids:
        print(f"Item {item_id} not found in the dataset.")
        return []

    item_index = np.where(item_ids == item_id)[0][0] 
    item_vector = lsa_matrix[item_index].reshape(1, -1)

    # calculate cosine similarity with all other items
    similarities = cosine_similarity(item_vector, lsa_matrix)[0]
    sorted_indices = np.argsort(similarities)[::-1]

    # exclude queried item itself
    top_indices = [idx for idx in sorted_indices][:top_n]
    
    similar_items = [(list(item_profiles.keys())[idx], similarities[idx]) for idx in top_indices]
    return similar_items



In [170]:
def get_similar_items(item_id, item_ids, lsa_matrix, top_n=5):
    """
    Find top-N similar items to a given item.

    Parameters:
        item_id (str): The ID of the item to find similar items for.
        item_ids (array-like): List or array of item IDs corresponding to rows in lsa_matrix.
        lsa_matrix (ndarray): The matrix of item vectors (each row is an item's vector).
        top_n (int): Number of similar items to retrieve.

    Returns:
        List of tuples [(similar_item_id, similarity_score), ...].
    """
    if item_id not in item_ids:
        print(f"Item {item_id} not found in the dataset.")
        return []

    # find the index of the queried item
    item_index = np.where(item_ids == item_id)[0][0]
    normalized_matrix  = normalize(lsa_matrix)
    item_vector = normalized_matrix[item_index].reshape(1, -1)

    # calculate cosine similarity with all other items
    similarities = cosine_similarity(item_vector, lsa_matrix)[0]
    sorted_indices = np.argsort(similarities)[::-1]

    # Include the queried item itself as the most similar
    top_indices = [idx for idx in sorted_indices if idx != item_index][:top_n]

    # Map indices back to item IDs and similarity scores
    similar_items = [(item_ids[idx], similarities[idx]) for idx in top_indices]

    return similar_items

In [171]:
item_ids = df_items.parent_asin.values


In [172]:
item_id_to_recommend = 'B010PJYJIY'  # CeraVe AM Facial Moisturizing Lotion SPF 30 | Oil-Free Face Moisturizer with Sunscreen | Non-Comedogenic | 3 Ounce
similar_items = get_similar_items(item_id_to_recommend, item_ids, lsa_matrix, top_n=5)

similar_item_ids = [i[0] for i in similar_items]
similarity_scores = {i[0]: i[1] for i in similar_items}  

similar_items_df = df_items[df_items.parent_asin.isin(similar_item_ids)][['parent_asin', 'title', 'title_description_pre_processed']]
similar_items_df['similarity'] = similar_items_df['parent_asin'].map(similarity_scores)

similar_items_df = similar_items_df.sort_values(by='similarity', ascending=False)
similar_items_df

Unnamed: 0,parent_asin,title,title_description_pre_processed,similarity
88042,B00X6P92OG,"OZNaturals Tinted Moisturizer, SPF 30 Sunscreen. Broad Spectrum Protection. Zinc Oxide + Minerals In This Tinted Sunscreen Protects, Moisturizes And Blends Well To Provide A Rich, Youthful Glow!",oznaturals tinted moisturizer spf sunscreen broad spectrum protection zinc oxide mineral tinted sunscreen protects moisturizes blend well provide rich youthful glow skin care expert dermatologist agreement importance wearing sunscreen premature aging skin caused sun exposure photo aging known skin care circle includes wrinkle freckle age spot blotchy complexion dullness sagging skin rough leathery texture word every complaint surrounding aging skin traced least part sun exposure making daily sunscreen protection spf sunscreen must sunscreen created equal oznaturals age defying solar shield spf broad spectrum tinted sunscreen zinc oxide provides broad spectrum protection sun harmful uvauvb ray moisturizing skin healthylooking hint color utilizing highly stable reflective property zinc oxide natural uv diluting quality octyl methoxycinnamate formula offer sheer broadspectrum uv coverage luxurious tinted moisturizer formulation,0.9826
74097,B079P8C9R9,D24k 24K Day Moisturizer with SPF 15 - All Skin Types,dk k day moisturizer spf skin type,0.981019
72397,B06XP3T93N,MŪN Skin Care Anarose Hydrating Rose Toner,mn skin care anarose hydrating rose toner calming toner essential balancing skin ph cleansed bulgarian rosewater instantly soothes even sensitive skin hyaluronic acid firm skin provides longlasting hydration willow bark extract contains natural salicylic acid remove dead cell healthy glow prepares skin receive nutrient goji berry extract help reduce free radical damage provides antiaging benefit combination rose jasmine daisy extract recreates velvety sensation petal skin toner ph range effective skin type,0.980434
59283,B075GDX7G4,HISTOLAB Vitamin C Complex Ampoule 47% Made in Korea Essential Korean Beauty Skin Care Brilliant Skin Vita C,histolab vitamin c complex ampoule made korea essential korean beauty skin care brilliant skin vita c vita c complex ampoule contains vita c complex excellent skin brightening agent help inhibit melanin synthesis also soothes moisturizes skin main ingredient vita c complexskin brightening arbutinmelanin synthesis inhibitory glycyrrhiza glabra powderevening complexion sodium hyaluronatesoothingmoisturizing,0.980099
72541,B01MR7FQB8,"Shi Hydrate, Facial Moisturizer",shi hydrate facial moisturizer facial moisturizer lightweight creme specially chosen ingredient help condition tone skin sage watercress lemon bioflavonoids help purify revitalize oily combination skin condition antioxidant vitamin ac e also incorporated formula protect improve skin elasticity,0.97924


In [173]:
item_id_to_recommend = 'B00BD1D75U' # Eucerin Q10 Anti-Wrinkle Sensitive Skin Creme 1.7 oz (48 g)
similar_items = get_similar_items(item_id_to_recommend,  item_ids, lsa_matrix, top_n=5)
print(f"Top similar items for item {item_id_to_recommend}: {similar_items}")

similar_item_ids = [i[0] for i in similar_items]
df_items[df_items.parent_asin.isin(similar_item_ids)][['parent_asin', 'title', 'title_description_pre_processed']]

Top similar items for item B00BD1D75U: [('B009AG5ZSC', 0.9882973564001214), ('B000G8LWZI', 0.9865733462255789), ('B004RRFO4O', 0.9865733462255789), ('B000N31QP2', 0.9865733462255789), ('B01IT9P9FO', 0.9860346615798006)]


Unnamed: 0,parent_asin,title,title_description_pre_processed
8292,B000G8LWZI,Alpha Hydrox AHA Souffle Soothing Anti-Wrinkle 1.6 oz.,alpha hydrox aha souffle soothing antiwrinkle oz renew skin antiwrinkle souffle enjoy velvety soft skin using alpha hydrox glycolic aha souffle reduce fine line wrinkle therefore soften restore skin contains glycolic aha make skin radiant offer improved texture elasticity softens smoothens skin softens smoothens skin help reduce fine line wrinkle help reduce fine line wrinkle work antiwrinkle exfoliant work antiwrinkle exfoliant skin type closer look alpha hydrox glycolic aha souffle contains glycolic aha thats highly powerful rejuvenate skin also green tea preserve firmness therefore prevent harmful effect free radical get started follow instruction mentioned label best result
17078,B009AG5ZSC,"Hale Cosmeceuticals C-Fine Milk Lotion, 1 oz",hale cosmeceuticals cfine milk lotion oz hale cosmeceuticals cfine milk lotion improves skin texture elasticity skinhealing vitamin c using vitamin c help decrease line wrinkle improving skin elasticity skin age vitamin c product concentrated refined directly absorbed skin maximum effect use hale cosmeceuticals cfine milk lotion treat damage freeradicals caused overexposure sun pollutant poor dieting habit younger skin
31557,B01IT9P9FO,"Timeless Skin Care 20% Vitamin C + E Ferulic Acid Serum - 1 oz, Pack of 2 - Lightweight, Non-Greasy Formula - Use Daily to Brighten, Restore & Correct Skin - Recommended for All Skin Types",timeless skin care vitamin c e ferulic acid serum oz pack lightweight nongreasy formula use daily brighten restore correct skin recommended skin type
53049,B004RRFO4O,Alpha Hydrox AHA Souffle Soothing Anti-Wrinkle 1.6 oz.,alpha hydrox aha souffle soothing antiwrinkle oz renew skin antiwrinkle souffle enjoy velvety soft skin using alpha hydrox glycolic aha souffle reduce fine line wrinkle therefore soften restore skin contains glycolic aha make skin radiant offer improved texture elasticity softens smoothens skin softens smoothens skin help reduce fine line wrinkle help reduce fine line wrinkle work antiwrinkle exfoliant work antiwrinkle exfoliant skin type closer look alpha hydrox glycolic aha souffle contains glycolic aha thats highly powerful rejuvenate skin also green tea preserve firmness therefore prevent harmful effect free radical get started follow instruction mentioned label best result
87593,B000N31QP2,Alpha Hydrox AHA Souffle Soothing Anti-Wrinkle 1.6 oz.,alpha hydrox aha souffle soothing antiwrinkle oz renew skin antiwrinkle souffle enjoy velvety soft skin using alpha hydrox glycolic aha souffle reduce fine line wrinkle therefore soften restore skin contains glycolic aha make skin radiant offer improved texture elasticity softens smoothens skin softens smoothens skin help reduce fine line wrinkle help reduce fine line wrinkle work antiwrinkle exfoliant work antiwrinkle exfoliant skin type closer look alpha hydrox glycolic aha souffle contains glycolic aha thats highly powerful rejuvenate skin also green tea preserve firmness therefore prevent harmful effect free radical get started follow instruction mentioned label best result


In [182]:
item_id_to_recommend = 'B00CBQUQBO' # Mac Studio Tech Foundation 10g NC30
similar_items = get_similar_items(item_id_to_recommend,  item_ids, lsa_matrix, top_n=5)
print(f"Top similar items for item {item_id_to_recommend}: {similar_items}")

similar_item_ids = [i[0] for i in similar_items]
df_items[df_items.parent_asin.isin(similar_item_ids)][['parent_asin', 'title', 'title_description_pre_processed']]

Top similar items for item B00CBQUQBO: [('B00O47HQUC', 0.9322358341288715), ('B079JPNPF3', 0.925584406733888), ('B01DI6U9BW', 0.921435489168978), ('B0194N0Z3A', 0.9202104557580403), ('B01EO5V51O', 0.9157751721468982)]


Unnamed: 0,parent_asin,title,title_description_pre_processed
1003,B079JPNPF3,"Aeroblend Airbrush Makeup Foundation (O35), Suitable for All Skin Type Giving Flawless Effect, Hypoallergenic Foundation, Fragrance Free and Long Lasting",aeroblend airbrush makeup foundation suitable skin type giving flawless effect hypoallergenic foundation fragrance free long lasting waterbased formulated even sensitive skin type contains fragrance parabens petroleum derived ingredient added botanical mineral ingredient like kaolin lavender jojoba hydrate sooth skin soft focus pigment give coverage appropriate even hd use color wide range color selection ensures skin tone ml fl oz
8393,B01DI6U9BW,Ulta Beauty Adjustable Coverage Foundation Wet/Dry Powder ~ (beige with more neutral undertones) Medium,ulta beauty adjustable coverage foundation wetdry powder beige neutral undertone medium creamy moisturizing formula provides flawless luminous finish formula glide effortlessly onto skin skin feel smooth comfortable application innovative blend ingredient ensures powder adheres skin look last last use formula wet dry adjustable coverage
75744,B0194N0Z3A,Charlotte Tilbury 'The Retoucher' Conceal & Treat Stick Wand - 2 Fair,charlotte tilbury retoucher conceal treat stick wand fair retoucher concealer fair perfect people light skin tone highly pigmented formula ultracovering colour correcting texture smooth moisturise skin whilst reducing imperfection make retoucher concealer magic hydrating complex keep skin plump moisturized natural luminous finish black tea derivative rich vitamin fill hollowed skin groove creating smooth seamless surface breakthrough liposiliconic compound special elastomer pearl create invisible plaster imperfection creating traceless natural finish ideal concealing imperfection blemish flawless skin every day parabenfree fill ml carton dimension mm x x create perfect base makeup top concealer tip charlotte recommends use shade light wonder shade magic foundation shade airbrush flawless finish fair medium
77584,B00O47HQUC,Shiseido FUTURE SOLUTION LX Total Radiance Foundation # I20 Natural Light Ivory 30ml,shiseido future solution lx total radiance foundation natural light ivory ml ultimate agedefying foundation infinite beauty work perfect synergy future solution lx skincare create exquisitely radiant finish retextured lifted brightened skin beam flawless vibrance formulated shiseidos exclusive skingenecell p counteract appearance future aging contains msk innovative brightening ingredient luminous even skin tone refines skin texture enhancing radiant complexion aura radiance powder beautiful finish last throughout day time match powder corrects visible sign aging loss firmness wrinkle dark spot dullness pore maintains essential moisture long lasting healthy brighter look
98102,B01EO5V51O,Inglot HD PERFECT COVERUP FOUNDATION 77 | 35 ml/1.18 US FL OZ,inglot hd perfect coverup foundation ml u fl oz


# Evaluate

In [191]:
def recommend_for_user(user_id, ground_truth, item_ids, lsa_matrix, top_n=5):
    """
    Generate recommendations for a user by recommending items similar to their interacted items.
    """
    recommendations = set() 

    for item_id in ground_truth.get(user_id, []):
        
        similar_items = get_similar_items(item_id, item_ids, lsa_matrix, top_n)
        recommendations.update([item[0] for item in similar_items])  
    
    # return top recommendations excluding items already interacted with
    interacted_items = set(ground_truth.get(user_id, []))
    recommendations = [item for item in recommendations if item not in interacted_items]
    return recommendations[:top_n]


In [197]:
def evaluate_recommender(ground_truth, item_ids, lsa_matrix, top_n=5):
    """
    Evaluate the recommender system using precision, recall, and coverage.

    Parameters:
        ground_truth: Mapping of user_id to items they interacted with.
        item_ids: List of all item ids.
        lsa_matrix: LSA matrix for items.
        top_n: Number of recommendations per user.

    Returns:
        precision, recall, coverage
    """
    total_recommendations = 0
    total_relevant = 0
    total_relevant_recommended = 0
    unique_recommended_items = set()

    for user_id, relevant_items in tqdm(ground_truth.items(), total=len(ground_truth)):
        # get recommendations
        recommended_items = recommend_for_user(user_id, ground_truth, item_ids, lsa_matrix, top_n)

        # calculate relevant items recommended
        relevant_recommended = set(recommended_items) & set(relevant_items)

        # update counts
        total_relevant_recommended += len(relevant_recommended)
        total_recommendations += len(recommended_items)
        total_relevant += len(relevant_items)
        unique_recommended_items.update(recommended_items)

    precision = total_relevant_recommended / total_recommendations if total_recommendations > 0 else 0
    recall = total_relevant_recommended / total_relevant if total_relevant > 0 else 0

    # coverage: unique recommended / total items
    coverage = len(unique_recommended_items) / len(item_ids)

    return precision, recall, coverage


In [200]:
reviews_per_user = df_reviews.groupby('user_id').size()
df_reviews['num_reviews_per_user'] = df_reviews['user_id'].map(reviews_per_user)

# leave only users with at least 5 reviews in order to increase relevant items in the dataset and reduce time for evaluation
df_reviews = df_reviews.loc[df_reviews.num_reviews_per_user >= 5].reset_index(drop=True)

ground_truth = df_reviews.groupby('user_id')['parent_asin'].apply(list).to_dict()
len(ground_truth)

330

In [206]:
precision, recall, coverage = evaluate_recommender(ground_truth, item_ids, lsa_matrix, top_n=20)

100%|███████████████████████████████████████████| 53/53 [03:34<00:00,  4.05s/it]


In [None]:
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Coverage: {coverage:.4f}")