# Content Based Filtering

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from fuzzywuzzy import fuzz, process
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MinMaxScaler
from nltk.stem import PorterStemmer
import string
import tqdm
from nltk.corpus import wordnet
from sklearn.preprocessing import normalize
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import average_precision_score, ndcg_score

In [2]:
customer_data = pd.read_csv('../Dataset/customer_data_final.csv')
purchase_history = pd.read_csv('../Dataset/purchase_history.csv')
df = pd.read_csv('../Dataset/Item_data2.csv')

gt1 = pd.read_csv('./ground_truth/Hair_Product_ground_truth_all_relevant.csv')
gt2 = pd.read_csv('./ground_truth/Household_Product_ground_truth_all_relevant.csv')
gt3 = pd.read_csv('./ground_truth/Beauty_Product_ground_truth_all_relevant.csv')

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Daniel
[nltk_data]     Matias\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Daniel
[nltk_data]     Matias\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
purchase_history.head()

Unnamed: 0,ID,ProdID_List,Rating
0,0.0,585.0,5
1,0.0,6.0,3
2,0.0,1.0,4
3,0.0,532.0,5
4,0.0,20.0,3


### Preprocessing

In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [6]:
def preprocess_text(text):
    if pd.isna(text) or isinstance(text, (int, float)):
        return '' 
    else:
        tokens = text.split()
        
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(word) for word in tokens]
        
        return ' '.join(stemmed_tokens)

# Apply preprocessing to specific columns
df['Name'] = df['Name'].apply(preprocess_text)
df['Category'] = df['Category'].apply(preprocess_text)
df['Description'] = df['Description'].apply(preprocess_text)
df['Tags'] = df['Tags'].apply(preprocess_text)

In [7]:
df['Combined_Text'] = (
        df['Name'].fillna('') + ' ' +
        df['Description'].fillna('') + ' ' +
        df['Tags'].fillna('') + ' ' +
        df['Brand'].fillna('') + ' ' +
        df['price_bin'].fillna('').astype(str) + ' ' +
        df['Category'].fillna('')
    )

## Recommendation System

In [8]:
def content_based_filtering_by_prodid(df, prod_id, prod_id_column='ProdID', name_column='Name', text_column='Combined_Text', 
                                      stop_words='english', ngram_range=(1, 2), top_n=10):
    if prod_id not in df[prod_id_column].values:
        return f"ProdID {prod_id} not found in the dataset."

    product_name = df.loc[df[prod_id_column] == prod_id, name_column].values[0]

    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[text_column])
    input_vector = tfidf_vectorizer.transform([product_name])

    cosine_sim = cosine_similarity(input_vector, tfidf_matrix)
    
    similarities = pd.DataFrame({
        prod_id_column: df[prod_id_column],
        name_column: df[name_column],
        'Rating': df['Rating'],
        'RatingCount': df['RatingCount'],
        'ReviewCount': df['ReviewCount'],
        'Brand': df['Brand'],
        'Price': df['Price'],
        'ImageURL': df['ImageURL'],
        'Tags': df['Tags'],
        'Similarity': cosine_sim[0]
    })
    similarities = similarities[similarities[prod_id_column] != prod_id]

    scaler = MinMaxScaler()

    similarities['Rating_Score'] = scaler.fit_transform(similarities[['Rating']])
    similarities['RatingCount_Score'] = scaler.fit_transform(similarities[['RatingCount']])
    similarities['ReviewCount_Score'] = scaler.fit_transform(similarities[['ReviewCount']])
    
    similarities['Final_Score'] = (
        (0.5 * similarities['Rating_Score']) + 
        (0.3 * similarities['RatingCount_Score']) + 
        (0.2 * similarities['ReviewCount_Score'])
    )

    similarities = similarities.sort_values(by=['Similarity', 'Final_Score'], ascending=[False, False]).head(top_n)
    
    similarities.drop(columns=['Similarity', 'Rating_Score', 'RatingCount_Score', 'ReviewCount_Score', 'Final_Score'], inplace=True)

    return product_name, similarities


In [9]:
product_name, top_similarities = content_based_filtering_by_prodid(df, prod_id=67)
print(f"Product Name: {product_name}")
top_similarities

Product Name: matrix high amplifi hair and duo 33.8 oz each


Unnamed: 0,ProdID,Name,Rating,RatingCount,ReviewCount,Brand,Price,ImageURL,Tags
2453,369.0,matrix total result mega sleek shea shampoo an...,5.0,5.0,1.0,matrix,28.7,https://i5.walmartimages.com/asr/67be115a-22d6...,"matrix, total, results, mega, sleek, shea, sha..."
2200,3.0,matrix total result high amplifi proforma hair...,5.0,175.0,4.0,matrix,22.5,https://i5.walmartimages.com/asr/aef445f9-6173...,"matrix, total, results, high, amplify, proform..."
2456,3.0,matrix biolag colorlast wash for color treat h...,4.0,175.0,4.0,matrix,21.9,https://i5.walmartimages.com/asr/1561e75a-6113...,"matrix, biolage, colorlast, wash, color, treat..."
179,2.0,matrix biolag condit balm 8.5 oz - (pack of 2),3.2,155.0,0.0,matrix,20.36,https://i5.walmartimages.com/asr/b3ba6ed3-62d8...,"matrix, biolage, conditioning, balm, oz, pack,..."
305,45.0,matrix 17628490 matrix biolag oil wonder flash...,2.6,16.0,0.0,matrix,12.03,https://i5.walmartimages.com/asr/f33c87e3-25d6...,"matrix, 17628490, matrix, biolage, oil, wonder..."
318,61.0,matrix biolag hydrasourc daili leave-in,2.9,16.0,10.0,matrix,413.49,https://i5.walmartimages.com/asr/9b3f2f5f-b180...,"matrix, biolage, hydrasource, daily, leave, wa..."
1541,3.0,matrix - biolag - fiberstrong fortifi cream - ...,3.1,175.0,0.0,matrix,10.99,https://i5.walmartimages.com/asr/e08ad533-4650...,"matrix, biolage, fiberstrong, fortifying, crea..."
2896,765.0,matrix cream develop 10 volum - size : 16 oz,4.0,11.0,6.0,matrix,20.69,https://i5.walmartimages.com/asr/73be1f4f-6e2a...,"matrix, cream, developer, 10, volume, size, 16..."
1975,226.0,biolag hydrasourc condit balm for dri hair by ...,5.0,41.0,21.0,matrix,18.99,https://i5.walmartimages.com/asr/4a0904fb-a101...,"biolage, hydrasource, conditioning, balm, dry,..."
2101,64933.0,"pureolog hydrat conditioner, 33.8 oz",5.0,16.0,13.0,pureology,68.57,https://i5.walmartimages.com/asr/909554f4-cc0a...,"pureology, hydrating, conditioner, oz, wal, mart"


## Search System

### Fuzzy Matching

In [10]:
def fuzzy_match_multiple_columns(train_data, query, threshold=80):
    def preprocess(text):
        return re.sub(r'[^A-Za-z0-9\s]+', '', text).strip().lower()

    query_processed = preprocess(query)
    best_match = None
    best_score = -1

    for column in ['Name', 'Brand', 'Tags', 'Description', 'Category']:
        candidates = train_data[column].fillna('').astype(str)
        for candidate in candidates:
            candidate_processed = preprocess(candidate)
            score = fuzz.token_sort_ratio(query_processed, candidate_processed)
            if score > best_score and score >= threshold:
                best_score = score
                best_match = candidate
    
    return best_match, best_score

### Synonyms

In [11]:
 def get_synonyms(term):
        synonyms = set()
        for synset in wordnet.synsets(term):
            for lemma in synset.lemmas():
                synonyms.add(lemma.name().replace('_', ' '))
        return synonyms

In [12]:
def expand_query_with_synonyms(query):
        query_terms = query.lower().split()
        expanded_terms = query_terms.copy()

        for term in query_terms:
            synonyms = get_synonyms(term)
            expanded_terms.extend(synonyms)

        return " ".join(set(expanded_terms))

### System System 1

In [13]:
def content_based_recommendations(train_data, item_name, top_n=50, name_weight=0.2, description_weight=0.2, tags_weight=0.2, brand_weight=0.2, category_weight=0.2):
    item_name_lower = preprocess_text(item_name)
    
    item_name_lower = expand_query_with_synonyms(item_name_lower)
    query_terms = item_name_lower.split()

    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

    combined_matches = pd.DataFrame()

    for term in query_terms:
        name_match = train_data[train_data['Name'].str.lower().str.contains(term, case=False, na=False)]
        brand_match = train_data[train_data['Brand'].str.lower().str.contains(term, case=False, na=False)]
        tags_match = train_data[train_data['Tags'].str.lower().str.contains(term, case=False, na=False)]
        description_match = train_data[train_data['Description'].str.lower().str.contains(term, case=False, na=False)]
        category_match = train_data[train_data['Category'].str.lower().str.contains(term, case=False, na=False)]

        term_matches = pd.concat([name_match, brand_match, tags_match, description_match, category_match]).drop_duplicates()
        
        combined_matches = pd.concat([combined_matches, term_matches]).drop_duplicates()

    if combined_matches.empty:
        print(f"No exact match found for '{item_name_lower}'. Attempting fuzzy matching...")
        fuzzy_match, fuzzy_score = fuzzy_match_multiple_columns(train_data, item_name_lower)
        
        if fuzzy_match and fuzzy_score >= 80:
            print(f"Fuzzy match found: '{fuzzy_match}' with score: {fuzzy_score}")
            
            name_match = train_data[train_data['Name'].str.lower().str.contains(fuzzy_match.lower(), case=False, na=False)]
            brand_match = train_data[train_data['Brand'].str.lower().str.contains(fuzzy_match.lower(), case=False, na=False)]
            tags_match = train_data[train_data['Tags'].str.lower().str.contains(fuzzy_match.lower(), case=False, na=False)]
            description_match = train_data[train_data['Description'].str.lower().str.contains(fuzzy_match.lower(), case=False, na=False)]
            category_match = train_data[train_data['Category'].str.lower().str.contains(fuzzy_match.lower(), case=False, na=False)]

            combined_matches = pd.concat([name_match, brand_match, tags_match, description_match, category_match]).drop_duplicates()

    if combined_matches.empty:
        print(f"No match found for '{item_name_lower}'. Returning most similar result based on combined features.")
    
        tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['Combined_Text'])
        
        input_vector = tfidf_vectorizer.transform([item_name_lower])

        cosine_sim = cosine_similarity(input_vector, tfidf_matrix)

        most_similar_item_index = cosine_sim.argmax() 
        exact_match = train_data.iloc[[most_similar_item_index]] 
        print(f"Most similar item found: '{exact_match['Name'].values[0]}'")
        combined_matches = exact_match

    if not combined_matches.empty:
        print(f"Found match for '{item_name_lower}'. Showing similar items.")
        
        tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['Combined_Text'])
        
        name_vector = tfidf_vectorizer.transform(train_data['Name'].fillna(''))
        description_vector = tfidf_vectorizer.transform(train_data['Description'].fillna(''))
        tags_vector = tfidf_vectorizer.transform(train_data['Tags'].fillna(''))
        brand_vector = tfidf_vectorizer.transform(train_data['Brand'].fillna(''))
        category_vector = tfidf_vectorizer.transform(train_data['Category'].fillna(''))
        
        weighted_matrix = (
            (name_weight * name_vector) + 
            (description_weight * description_vector) + 
            (tags_weight * tags_vector) + 
            (brand_weight * brand_vector) + 
            (category_weight * category_vector)
        )

        weighted_matrix_normalized = normalize(weighted_matrix, norm='l2')

        cosine_sim = cosine_similarity(weighted_matrix_normalized, weighted_matrix_normalized)

        match_indices = combined_matches.index.tolist()
        similar_items = sorted(list(enumerate(cosine_sim[match_indices].mean(axis=0))), key=lambda x: x[1], reverse=True)

        top_similar_items = similar_items[1:top_n+1]
        recommended_items_indices = [x[0] for x in top_similar_items if x[0] < len(train_data)]

        recommended_items = train_data.iloc[recommended_items_indices][['ProdID', 'Name', 'Rating', 'RatingCount', 'ReviewCount', 'Brand', 'Price', 'ImageURL', 'Tags']]
        
        recommended_items['Rating_Score'] = recommended_items['Rating'].rank(ascending=False, method='min')
        recommended_items['RatingCount_Score'] = recommended_items['RatingCount'].rank(ascending=False, method='min')
        recommended_items['ReviewCount_Score'] = recommended_items['ReviewCount'].rank(ascending=False, method='min')

        recommended_items['Final_Score'] = (
            (0.5 * recommended_items['Rating_Score']) + 
            (0.3 * recommended_items['RatingCount_Score']) + 
            (0.2 * recommended_items['ReviewCount_Score'])
        )

        recommended_items = recommended_items.sort_values(by='Final_Score', ascending=True)
        
        return recommended_items.drop(columns=['Rating_Score', 'RatingCount_Score', 'ReviewCount_Score', 'Final_Score'])


In [14]:
item_name = "loreal"  # The item the user is searching for

# Call the content-based recommendation function
recommended_items = content_based_recommendations(
    train_data=df,
    item_name=item_name,
    top_n=10  # Number of top recommendations to retrieve
)

# Display recommended iteams
recommended_items

Found match for 'loreal'. Showing similar items.


Unnamed: 0,ProdID,Name,Rating,RatingCount,ReviewCount,Brand,Price,ImageURL,Tags
3653,7.0,loreal pari excel creme perman tripl protect h...,4.0,12494.0,6494.0,paris,7.91,https://i5.walmartimages.com/asr/5ffb3626-4031...,"paris, excellence, creme, permanent, triple, p..."
2273,0.0,loreal pari superior prefer fade-defi shine pe...,4.0,3348.0,2077.0,paris,8.97,https://i5.walmartimages.com/asr/3eae3d0b-b23e...,"paris, superior, preference, fade, defying, sh..."
632,6249.0,loreal pari colour rich origin satin lipstick ...,4.0,1029.0,794.0,paris,5.97,https://i5.walmartimages.com/asr/6a105452-4523...,"paris, colour, riche, original, satin, lipstic..."
2993,76.0,loreal pari superior prefer fade-defi shine pe...,3.8,1941.0,1582.0,paris,8.97,https://i5.walmartimages.com/asr/762fd9f2-3624...,"paris, superior, preference, fade, defying, sh..."
3890,2.0,"loreal pari colour rich glossi balm, innoc coral",4.0,155.0,41.0,paris,20.99,https://i5.walmartimages.com/asr/a313e50a-5f82...,"paris, colour, riche, glossy, balm, innocent, ..."
1226,8.0,loreal pari excel creme tripl protect hair col...,5.0,141.0,1.0,paris,62.81,https://i5.walmartimages.com/asr/3801e2fa-7a12...,"paris, excellence, creme, triple, protection, ..."
1903,6.0,loreal pari superior prefer fade-defi shine pe...,3.7,2618.0,1506.0,paris,8.97,https://i5.walmartimages.com/asr/826d9eed-5b6d...,"paris, superior, preference, fade, defying, sh..."
1485,76.0,loreal pari colour rich origin satin lipstick ...,3.6,1430.0,794.0,paris,5.82,https://i5.walmartimages.com/asr/76d82d8d-ff90...,"paris, colour, riche, original, satin, lipstic..."
1,6.0,loreal pari excel creme tripl protect color cr...,3.6,1143.0,760.0,paris,7.97,https://i5.walmartimages.com/asr/4becac6c-e293...,"paris, excellence, creme, triple, protection, ..."
1731,4.0,loreal pari colour rich collect exclus lipstick,3.2,140.0,31.0,paris,4.12,https://i5.walmartimages.com/asr/c654a699-1a19...,"paris, colour, riche, collection, exclusive, l..."


In [15]:
item_name = "shampoo"  # The item the user is searching for

# Call the content-based recommendation function
recommended_items = content_based_recommendations(
    train_data=df,
    item_name=item_name,
    top_n=10  # Number of top recommendations to retrieve
)

# Display recommended iteams
recommended_items

Found match for 'shampoo'. Showing similar items.


Unnamed: 0,ProdID,Name,Rating,RatingCount,ReviewCount,Brand,Price,ImageURL,Tags
3216,41667.0,head and shoulder old spice pure sport dandruf...,4.6,534.0,358.0,"head, shoulders",5.24,https://i5.walmartimages.com/asr/a9b3c1ff-e011...,"head, shoulders, old, spice, pure, sport, dand..."
3881,2.0,loreal pari elviv extraordinari clay rebalanc ...,4.4,2008.0,1891.0,paris,3.97,https://i5.walmartimages.com/asr/dd6fa9f0-4180...,"paris, elvive, extraordinary, clay, rebalancin..."
2071,24.0,paul mitchel awapuhi wild ginger moistur lathe...,5.0,11.0,1.0,"paul, mitchell",47.99,https://i5.walmartimages.com/asr/9c2b4e1b-3b34...,"paul, mitchell, awapuhi, wild, ginger, moistur..."
2235,8.0,"panten nutrient blend shampoo, damag repair, 9...",4.6,141.0,96.0,pantene,6.97,https://i5.walmartimages.com/asr/74e506f4-8667...,"pantene, nutrient, blends, shampoo, damage, re..."
2706,3076159000000.0,big sexi hair big volum shampoo sexi hair 33.8...,5.0,5.0,1.0,"sexy, hair",14.99,https://i5.walmartimages.com/asr/149c88e0-b4a7...,"big, sexy, hair, big, volume, shampoo, sexy, h..."
3605,7.0,garnier whole blend smooth shampoo with coconu...,3.3,15073.0,9799.0,garnier,3.47,https://i5.walmartimages.com/asr/70da3e89-1984...,"garnier, blends, smoothing, shampoo, coconut, ..."
1672,5.0,panten pro-v classic clean 2 in 1 shampoo & co...,3.9,571.0,336.0,pantene,36.98,https://i5.walmartimages.com/asr/846e55e8-aade...,"pantene, pro, v, classic, clean, 2, 1, shampoo..."
3729,383.0,finess 2 in 1 textur enhanc shampoo & conditio...,4.0,2.0,1.0,finesse,24.76,https://i5.walmartimages.com/asr/6d1a79fc-45b7...,"finesse, 2, 1, texture, enhancing, shampoo, co..."
2234,71.0,finess 2 in 1 moistur shampoo and condition 24...,2.9,15.0,0.0,finesse,24.15,https://i5.walmartimages.com/asr/4dd8bfba-387a...,"finesse, 2, 1, moisturizing, shampoo, conditio..."
1154,290366.0,rusk sensori pure mandarin and jasmin shampoo ...,2.0,1.0,0.0,rusk,13.99,https://i5.walmartimages.com/asr/736ac3be-6c68...,"rusk, sensories, pure, mandarin, jasmine, sham..."


### Search System 2

In [17]:
def content_based_recommendations2(train_data, item_name, top_n=50, 
                                  exact_match_weight=1.0, 
                                  ai_vector_weight=0.9, 
                                  category_weight=0.7, 
                                  word_search_weight=0.5, 
                                  fuzzy_weight=0.3):
    
    item_name_lower = preprocess_text(item_name)
    item_name_expanded = expand_query_with_synonyms(item_name_lower)
    query_terms = item_name_expanded.split()

    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    train_data['Combined_Text'] = (
        train_data['Name'].fillna('') + ' ' + 
        train_data['Description'].fillna('') + ' ' +
        train_data['Tags'].fillna('') + ' ' + 
        train_data['Brand'].fillna('') + ' ' + 
        train_data['Category'].fillna('')
    )

    # Step 1: Exact Match
    exact_matches = pd.DataFrame()
    for term in query_terms:
        exact_match = train_data[train_data['Name'].str.lower().str.contains(term, case=False, na=False)]
        exact_matches = pd.concat([exact_matches, exact_match]).drop_duplicates()

    if not exact_matches.empty:
        exact_matches['Score'] = exact_match_weight
        combined_matches = exact_matches
    else:
        combined_matches = pd.DataFrame()

    # Step 2: AI Vector Search (TF-IDF with Cosine Similarity)
    tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['Combined_Text'])
    input_vector = tfidf_vectorizer.transform([item_name_expanded])
    cosine_sim = cosine_similarity(input_vector, tfidf_matrix)
    
    ai_vector_matches = train_data.copy()
    ai_vector_matches['Similarity'] = cosine_sim[0]
    ai_vector_matches = ai_vector_matches[ai_vector_matches['Similarity'] > 0]
    ai_vector_matches['Score'] = ai_vector_matches['Similarity'] * ai_vector_weight
    
    combined_matches = pd.concat([combined_matches, ai_vector_matches]).drop_duplicates()

    # Step 3: Category Match
    category_matches = train_data[train_data['Category'].str.lower().str.contains(item_name_lower, case=False, na=False)]
    if not category_matches.empty:
        category_matches.loc[:, 'Score'] = category_weight
        combined_matches = pd.concat([combined_matches, category_matches]).drop_duplicates()

    # Step 4: Word Search (using expanded terms)
    word_search_matches = pd.DataFrame()
    for term in query_terms:
        word_match = train_data[
            train_data['Name'].str.lower().str.contains(term, case=False, na=False) |
            train_data['Brand'].str.lower().str.contains(term, case=False, na=False) |
            train_data['Tags'].str.lower().str.contains(term, case=False, na=False) |
            train_data['Description'].str.lower().str.contains(term, case=False, na=False) |
            train_data['Category'].str.lower().str.contains(term, case=False, na=False)
        ]
        word_search_matches = pd.concat([word_search_matches, word_match]).drop_duplicates()
        
    if not word_search_matches.empty:
        word_search_matches['Score'] = word_search_weight
        combined_matches = pd.concat([combined_matches, word_search_matches]).drop_duplicates()

    # Step 5: Fuzzy Search
    fuzzy_matches = pd.DataFrame()
    for column in ['Name', 'Brand', 'Tags', 'Description', 'Category']:
        candidates = train_data[column].fillna('').astype(str)
        for candidate in candidates:
            candidate_processed = preprocess_text(candidate)
            score = fuzz.token_sort_ratio(item_name_lower, candidate_processed)
            if score >= 80:  # threshold for fuzzy matching
                match = train_data[train_data[column].str.lower() == candidate_processed]
                match['Score'] = score / 100 * fuzzy_weight
                fuzzy_matches = pd.concat([fuzzy_matches, match]).drop_duplicates()

    combined_matches = pd.concat([combined_matches, fuzzy_matches]).drop_duplicates()

    # Combine Scores and Rank by Content-Based Score
    combined_matches['Final_Score'] = combined_matches.groupby(['ProdID'])['Score'].transform('sum')
    recommended_items = combined_matches.sort_values(by='Final_Score', ascending=False).drop_duplicates('ProdID').head(top_n)

    # Additional Ranking Based on Ratings, RatingCount, and ReviewCount
    recommended_items['Rating_Score'] = recommended_items['Rating'].rank(ascending=False, method='min')
    recommended_items['RatingCount_Score'] = recommended_items['RatingCount'].rank(ascending=False, method='min')
    recommended_items['ReviewCount_Score'] = recommended_items['ReviewCount'].rank(ascending=False, method='min')

    recommended_items['Final_Score'] = (
        (0.5 * recommended_items['Rating_Score']) + 
        (0.3 * recommended_items['RatingCount_Score']) + 
        (0.2 * recommended_items['ReviewCount_Score'])
    )

    recommended_items = recommended_items.sort_values(by='Final_Score', ascending=True)
    
    return recommended_items[['ProdID', 'Name', 'Rating', 'RatingCount', 'ReviewCount', 'Brand', 'Price', 'Tags']]


In [18]:
item_name = "loreal"  

recommended_items = content_based_recommendations2(
    train_data=df,
    item_name=item_name,
    top_n=10 
)

recommended_items

Unnamed: 0,ProdID,Name,Rating,RatingCount,ReviewCount,Brand,Price,Tags
3653,7.0,loreal pari excel creme perman tripl protect h...,4.0,12494.0,6494.0,paris,7.91,"paris, excellence, creme, permanent, triple, p..."
63,3.0,"loreal pari infal pro last 2 step lipstick, pe...",4.0,751.0,461.0,paris,9.98,"paris, infallible, pro, 2, step, lipstick, per..."
1034,5.0,etern mauv 520 by loreal for women lipstick,4.8,154.0,53.0,paris,6.47,"eternally, mauve, 520, women, lipstick, wal, mart"
1,6.0,loreal pari excel creme tripl protect color cr...,3.6,1143.0,760.0,paris,7.97,"paris, excellence, creme, triple, protection, ..."
3851,1.0,loreal pari colorista hair makeup temporari 1-...,3.4,1855.0,1006.0,paris,8.97,"paris, colorista, hair, makeup, temporary, 1, ..."
3875,8.0,"loreal pari colour rich lip liner, last plum, ...",4.4,141.0,85.0,paris,5.82,"paris, colour, riche, lip, liner, lasting, plu..."
3890,2.0,"loreal pari colour rich glossi balm, innoc coral",4.0,155.0,41.0,paris,20.99,"paris, colour, riche, glossy, balm, innocent, ..."
2375,9.0,loreal profession majicontrast - red - 1.7 oz ...,2.9,157.0,0.0,professionnel,14.07,"loreal, professional, majicontrast, red, oz, h..."
3124,0.0,"loreal pari excel non-drip creme hair color, r...",2.7,164.0,0.0,paris,31.44,"paris, excellence, non, drip, creme, hair, col..."
2335,4.0,2 pack - loreal pari revitalift doubl lift eye...,3.2,140.0,47.0,professionnel,24.45,"2, pack, paris, revitalift, double, lifting, e..."


In [19]:
item_name = "shampoo"  

recommended_items = content_based_recommendations2(
    train_data=df,
    item_name=item_name,
    top_n=10 
)

recommended_items

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Unnamed: 0,ProdID,Name,Rating,RatingCount,ReviewCount,Brand,Price,Tags
1605,4.0,"loreal pari magic root cover up conceal spray,...",4.0,11721.0,6955.0,paris,9.97,"paris, magic, root, cover, concealer, spray, b..."
2989,8.0,redken color extend magnet sulfate-fre shampoo...,4.5,505.0,326.0,redken,30.0,"redken, color, extend, magnetics, sulfate, fre..."
975,1.00794e+42,suav profession coconut milk infus intens mois...,4.6,246.0,220.0,suave,13.53,"suave, professionals, coconut, milk, infusion,..."
2755,2.0,just for men color gel mustach & beard m-35 me...,3.2,2257.0,2070.0,men,40.68,"men, color, gel, mustache, beard, medium, brow..."
3666,1.0,ogx ever straighten + brazilian keratin therap...,3.2,796.0,425.0,ogx,21.0,"ogx, straightening, brazilian, keratin, therap..."
1672,5.0,panten pro-v classic clean 2 in 1 shampoo & co...,3.9,571.0,336.0,pantene,36.98,"pantene, pro, v, classic, clean, 2, 1, shampoo..."
1701,56.0,splat hair chalk (sugar plum),4.2,56.0,53.0,splat,6.89,"splat, hair, chalk, sugar, plum, wal, mart"
3600,3.0,volum therapi shampoo by biosilk for unisex - ...,3.1,175.0,0.0,biosilk,25.87,"volumizing, therapy, shampoo, biosilk, unisex,..."
2434,0.0,"panten pro-v smooth & sleek conditioner, 23.7 ...",2.7,164.0,142.0,pantene,29.9,"pantene, pro, v, smooth, sleek, conditioner, f..."
3013,9.0,blue neon color pigment powder for craft soap ...,2.9,157.0,0.0,"oils, center",12.19,"blue, neon, colorant, pigment, powder, crafts,..."


In [None]:
item_name = "Hair Product"  

recommended_items = content_based_recommendations2(
    train_data=df,
    item_name=item_name,
    top_n=10 
)

recommended_items

## Search Evaluation

In [19]:
ground_truth_top = gt1.sort_values(by='Rating', ascending=False).head(20)
ground_truth_top2 = gt2.sort_values(by='Rating', ascending=False).head(20)
ground_truth_top3 = gt3.sort_values(by='Rating', ascending=False).head(20)

In [20]:
def evaluate_recommendations(train_data, ground_truth, query, top_n=20):
    """
    Evaluates the content-based recommendation system based on the ground truth.
    
    Parameters:
    - train_data: The full dataset containing product information (df).
    - ground_truth: The ground truth dataset (manually labeled relevant products, df2).
    - query: The search query used to generate recommendations.
    - top_n: Number of top recommendations to evaluate.
    
    Returns:
    - precision: Precision score for the recommendations.
    - recall: Recall score for the recommendations.
    - f1: F1-Score for the recommendations.
    - accuracy: Accuracy score for the recommendations.
    - map_score: Mean Average Precision (MAP) score.
    - ndcg: Normalized Discounted Cumulative Gain (NDCG) score.
    """
    recommended_items = content_based_recommendations(train_data, query, top_n=top_n)
    
    recommended_prod_ids = recommended_items['ProdID'].values
    relevant_prod_ids = ground_truth['ProdID'].values
    
    y_true = [1 if prod_id in relevant_prod_ids else 0 for prod_id in train_data['ProdID']]
    y_pred = [1 if prod_id in recommended_prod_ids else 0 for prod_id in train_data['ProdID']]
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    
    y_pred_scores = [1 if prod_id in recommended_prod_ids else 0 for prod_id in train_data['ProdID']]
    map_score = average_precision_score(y_true, y_pred_scores)

    relevant_labels = [1 if prod_id in relevant_prod_ids else 0 for prod_id in recommended_prod_ids]
    ndcg = ndcg_score([relevant_labels], [y_pred_scores[:top_n]])

    return precision, recall, f1, accuracy, map_score, ndcg

### Hair Product

In [21]:
query = "Hair Product"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'intersection hair hair's-breadth Cartesian product production merchandise mathematical product haircloth fuzz whisker hairsbreadth pilus product tomentum ware'. Showing similar items.
Precision: 0.4078, Recall: 0.4738, F1: 0.4383, Accuracy: 0.7042
MAP: 0.3214, NDCG: 0.5704


In [22]:
query = "Hair mask"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'hair block out cloak dissemble masquerade disguise haircloth fuzz whisker masque hairsbreadth pilus hair's-breadth tomentum masquerade party mask'. Showing similar items.
Precision: 0.4085, Recall: 0.4738, F1: 0.4388, Accuracy: 0.7047
MAP: 0.3217, NDCG: 0.4953


In [23]:
query = "Shampoo"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'shampoo'. Showing similar items.
Precision: 0.5944, Recall: 0.7696, F1: 0.6708, Accuracy: 0.8160
MAP: 0.5136, NDCG: 0.7976


In [24]:
query = "conditioner"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'consideration status term condition stipulate qualify circumstance specify discipline precondition experimental condition train shape stipulation check'. Showing similar items.
Precision: 0.4701, Recall: 0.4738, F1: 0.4719, Accuracy: 0.7417
MAP: 0.3509, NDCG: 0.6871


In [25]:
queries = ["Hair Product", "Hair mask", "Shampoo", "Conditioner"]

total_precision, total_recall, total_f1, total_accuracy, total_map, total_ndcg = 0, 0, 0, 0, 0, 0
num_queries = len(queries)

for query in queries:
    precision, recall, f1, accuracy, map_score, ndcg = evaluate_recommendations(df, ground_truth_top, query, top_n=20)
    
    # Accumulate the metrics
    total_precision += precision
    total_recall += recall
    total_f1 += f1
    total_accuracy += accuracy
    total_map += map_score
    total_ndcg += ndcg

# Calculate the average metrics across all queries
avg_precision = total_precision / num_queries
avg_recall = total_recall / num_queries
avg_f1 = total_f1 / num_queries
avg_accuracy = total_accuracy / num_queries
avg_map = total_map / num_queries
avg_ndcg = total_ndcg / num_queries

# Print the averaged results
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average MAP: {avg_map:.4f}")
print(f"Average NDCG: {avg_ndcg:.4f}")

Found match for 'intersection hair hair's-breadth Cartesian product production merchandise mathematical product haircloth fuzz whisker hairsbreadth pilus product tomentum ware'. Showing similar items.
Found match for 'hair block out cloak dissemble masquerade disguise haircloth fuzz whisker masque hairsbreadth pilus hair's-breadth tomentum masquerade party mask'. Showing similar items.
Found match for 'shampoo'. Showing similar items.
Found match for 'consideration status term condition stipulate qualify circumstance specify discipline precondition experimental condition train shape stipulation check'. Showing similar items.
Average Precision: 0.4702
Average Recall: 0.5478
Average F1-Score: 0.5049
Average Accuracy: 0.7417
Average MAP: 0.3769
Average NDCG: 0.6376


### HouseHolds

In [26]:
query = "Households"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top2, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'household house menage family home'. Showing similar items.
Precision: 0.6804, Recall: 0.8094, F1: 0.7393, Accuracy: 0.8606
MAP: 0.5973, NDCG: 0.7260


In [27]:
query = "Detergent"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top2, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'deterg'. Showing similar items.
Precision: 0.5983, Recall: 0.7934, F1: 0.6822, Accuracy: 0.8194
MAP: 0.5252, NDCG: 0.7163


In [28]:
query = "Cleaning"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top2, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'cleanse fairly light uncontaminating clear fresh strip sporting clean house clean-living uninfected houseclean plumb white clean blank make clean unclouded neat unobjectionable fair sportsmanlike scavenge clean and jerk plum pick sporty'. Showing similar items.
Precision: 0.3072, Recall: 0.2989, F1: 0.3030, Accuracy: 0.6641
MAP: 0.2631, NDCG: 0.4066


In [29]:
query = "Home"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top2, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'base menage plate nursing home abode habitation household family interior national domicile dwelling house rest home place home dwelling house home plate home base internal'. Showing similar items.
Precision: 0.6067, Recall: 0.6189, F1: 0.6127, Accuracy: 0.8089
MAP: 0.4686, NDCG: 0.7862


In [30]:
queries = ["Households", "Detergent", "Cleaning", "Home"]

total_precision, total_recall, total_f1, total_accuracy, total_map, total_ndcg = 0, 0, 0, 0, 0, 0
num_queries = len(queries)

for query in queries:
    precision, recall, f1, accuracy, map_score, ndcg = evaluate_recommendations(df, ground_truth_top2, query, top_n=20)
    
    total_precision += precision
    total_recall += recall
    total_f1 += f1
    total_accuracy += accuracy
    total_map += map_score
    total_ndcg += ndcg

avg_precision = total_precision / num_queries
avg_recall = total_recall / num_queries
avg_f1 = total_f1 / num_queries
avg_accuracy = total_accuracy / num_queries
avg_map = total_map / num_queries
avg_ndcg = total_ndcg / num_queries

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average MAP: {avg_map:.4f}")
print(f"Average NDCG: {avg_ndcg:.4f}")

Found match for 'household house menage family home'. Showing similar items.
Found match for 'deterg'. Showing similar items.
Found match for 'cleanse fairly light uncontaminating clear fresh strip sporting clean house clean-living uninfected houseclean plumb white clean blank make clean unclouded neat unobjectionable fair sportsmanlike scavenge clean and jerk plum pick sporty'. Showing similar items.
Found match for 'base menage plate nursing home abode habitation household family interior national domicile dwelling house rest home place home dwelling house home plate home base internal'. Showing similar items.
Average Precision: 0.5482
Average Recall: 0.6301
Average F1-Score: 0.5843
Average Accuracy: 0.7882
Average MAP: 0.4635
Average NDCG: 0.6588


### Beauty Products

In [31]:
query = "Beauty"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top3, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'beauti'. Showing similar items.
Precision: 0.4005, Recall: 0.5709, F1: 0.4708, Accuracy: 0.7405
MAP: 0.3154, NDCG: 0.6313


In [32]:
query = "makeup"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top3, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'makeup composition war paint make-up constitution physical composition'. Showing similar items.
Precision: 0.5673, Recall: 0.5879, F1: 0.5774, Accuracy: 0.8260
MAP: 0.4168, NDCG: 0.5497


In [33]:
query = "Lotion"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top3, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'lotion application'. Showing similar items.
Precision: 0.5495, Recall: 0.5455, F1: 0.5474, Accuracy: 0.8177
MAP: 0.3916, NDCG: 0.4964


In [34]:
query = "Lipstick"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations(df, ground_truth_top3, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Found match for 'lipstick lip rouge'. Showing similar items.
Precision: 0.4181, Recall: 0.5879, F1: 0.4887, Accuracy: 0.7513
MAP: 0.3291, NDCG: 0.5089


In [35]:
queries = ["Beauty", "Makeup", "Lotion", "Lipstick"]


total_precision, total_recall, total_f1, total_accuracy, total_map, total_ndcg = 0, 0, 0, 0, 0, 0
num_queries = len(queries)

for query in queries:
    precision, recall, f1, accuracy, map_score, ndcg = evaluate_recommendations(df, ground_truth_top3, query, top_n=20)
    
    total_precision += precision
    total_recall += recall
    total_f1 += f1
    total_accuracy += accuracy
    total_map += map_score
    total_ndcg += ndcg

avg_precision = total_precision / num_queries
avg_recall = total_recall / num_queries
avg_f1 = total_f1 / num_queries
avg_accuracy = total_accuracy / num_queries
avg_map = total_map / num_queries
avg_ndcg = total_ndcg / num_queries

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average MAP: {avg_map:.4f}")
print(f"Average NDCG: {avg_ndcg:.4f}")

Found match for 'beauti'. Showing similar items.
Found match for 'makeup composition war paint make-up constitution physical composition'. Showing similar items.
Found match for 'lotion application'. Showing similar items.
Found match for 'lipstick lip rouge'. Showing similar items.
Average Precision: 0.4838
Average Recall: 0.5730
Average F1-Score: 0.5211
Average Accuracy: 0.7839
Average MAP: 0.3632
Average NDCG: 0.5466


### Combined Evaluation

In [36]:
query_sets_with_ground_truth = [
    (["Beauty", "Makeup", "Lotion", "Lipstick"], ground_truth_top),      
    (["Households", "Detergent", "Cleaning", "Home"], ground_truth_top2), 
    (["Hair Product", "Hair mask", "Shampoo", "Conditioner"], ground_truth_top3) 
]

combined_precision = combined_recall = combined_f1 = combined_accuracy = combined_map = combined_ndcg = 0
total_queries = 0

for queries, ground_truth in query_sets_with_ground_truth:
    total_precision = total_recall = total_f1 = total_accuracy = total_map = total_ndcg = 0
    num_queries = len(queries)
    
    for query in queries:
        precision, recall, f1, accuracy, map_score, ndcg = evaluate_recommendations(df, ground_truth, query, top_n=20)
        
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        total_accuracy += accuracy
        total_map += map_score
        total_ndcg += ndcg

    avg_precision = total_precision / num_queries
    avg_recall = total_recall / num_queries
    avg_f1 = total_f1 / num_queries
    avg_accuracy = total_accuracy / num_queries
    avg_map = total_map / num_queries
    avg_ndcg = total_ndcg / num_queries
    
    combined_precision += avg_precision
    combined_recall += avg_recall
    combined_f1 += avg_f1
    combined_accuracy += avg_accuracy
    combined_map += avg_map
    combined_ndcg += avg_ndcg
    total_queries += 1

final_avg_precision = combined_precision / total_queries
final_avg_recall = combined_recall / total_queries
final_avg_f1 = combined_f1 / total_queries
final_avg_accuracy = combined_accuracy / total_queries
final_avg_map = combined_map / total_queries
final_avg_ndcg = combined_ndcg / total_queries

print(f"Final Combined Average Precision: {final_avg_precision:.4f}")
print(f"Final Combined Average Recall: {final_avg_recall:.4f}")
print(f"Final Combined Average F1-Score: {final_avg_f1:.4f}")
print(f"Final Combined Average Accuracy: {final_avg_accuracy:.4f}")
print(f"Final Combined Average MAP: {final_avg_map:.4f}")
print(f"Final Combined Average NDCG: {final_avg_ndcg:.4f}")

Found match for 'beauti'. Showing similar items.
Found match for 'makeup composition war paint make-up constitution physical composition'. Showing similar items.
Found match for 'lotion application'. Showing similar items.
Found match for 'lipstick lip rouge'. Showing similar items.
Found match for 'household house menage family home'. Showing similar items.
Found match for 'deterg'. Showing similar items.
Found match for 'cleanse fairly light uncontaminating clear fresh strip sporting clean house clean-living uninfected houseclean plumb white clean blank make clean unclouded neat unobjectionable fair sportsmanlike scavenge clean and jerk plum pick sporty'. Showing similar items.
Found match for 'base menage plate nursing home abode habitation household family interior national domicile dwelling house rest home place home dwelling house home plate home base internal'. Showing similar items.
Found match for 'intersection hair hair's-breadth Cartesian product production merchandise mathe

## Evaluation 2

In [37]:
def evaluate_recommendations2(train_data, ground_truth, query, top_n=20):
    """
    Evaluates the content-based recommendation system based on the ground truth.
    
    Parameters:
    - train_data: The full dataset containing product information (df).
    - ground_truth: The ground truth dataset (manually labeled relevant products, df2).
    - query: The search query used to generate recommendations.
    - top_n: Number of top recommendations to evaluate.
    
    Returns:
    - precision: Precision score for the recommendations.
    - recall: Recall score for the recommendations.
    - f1: F1-Score for the recommendations.
    - accuracy: Accuracy score for the recommendations.
    - map_score: Mean Average Precision (MAP) score.
    - ndcg: Normalized Discounted Cumulative Gain (NDCG) score.
    """
    recommended_items = content_based_recommendations2(train_data, query, top_n=top_n)
    
    recommended_prod_ids = recommended_items['ProdID'].values
    relevant_prod_ids = ground_truth['ProdID'].values
    
    y_true = [1 if prod_id in relevant_prod_ids else 0 for prod_id in train_data['ProdID']]
    y_pred = [1 if prod_id in recommended_prod_ids else 0 for prod_id in train_data['ProdID']]
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    
    y_pred_scores = [1 if prod_id in recommended_prod_ids else 0 for prod_id in train_data['ProdID']]
    map_score = average_precision_score(y_true, y_pred_scores)

    relevant_labels = [1 if prod_id in relevant_prod_ids else 0 for prod_id in recommended_prod_ids]
    ndcg = ndcg_score([relevant_labels], [y_pred_scores[:top_n]])

    return precision, recall, f1, accuracy, map_score, ndcg

### Hair Products

In [38]:
query = "Hair Product"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Precision: 0.5582, Recall: 0.9648, F1: 0.7072, Accuracy: 0.8054
MAP: 0.5471, NDCG: 0.7603


In [39]:
query = "Hair mask"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

Precision: 0.5566, Recall: 0.9648, F1: 0.7059, Accuracy: 0.8042
MAP: 0.5456, NDCG: 0.5805


In [40]:
query = "Shampoo"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Precision: 0.5675, Recall: 0.9648, F1: 0.7146, Accuracy: 0.8123
MAP: 0.5561, NDCG: 0.7603


In [41]:
query = "conditioner"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Precision: 0.5563, Recall: 0.9497, F1: 0.7016, Accuracy: 0.8032
MAP: 0.5405, NDCG: 0.6454


In [42]:
queries = ["Hair Product", "Hair mask", "Shampoo", "Conditioner"]

total_precision, total_recall, total_f1, total_accuracy, total_map, total_ndcg = 0, 0, 0, 0, 0, 0
num_queries = len(queries)

for query in queries:
    precision, recall, f1, accuracy, map_score, ndcg = evaluate_recommendations2(df, ground_truth_top, query, top_n=20)
    
    # Accumulate the metrics
    total_precision += precision
    total_recall += recall
    total_f1 += f1
    total_accuracy += accuracy
    total_map += map_score
    total_ndcg += ndcg

# Calculate the average metrics across all queries
avg_precision = total_precision / num_queries
avg_recall = total_recall / num_queries
avg_f1 = total_f1 / num_queries
avg_accuracy = total_accuracy / num_queries
avg_map = total_map / num_queries
avg_ndcg = total_ndcg / num_queries

# Print the averaged results
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average MAP: {avg_map:.4f}")
print(f"Average NDCG: {avg_ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Average Precision: 0.5596
Average Recall: 0.9610
Average F1-Score: 0.7073
Average Accuracy: 0.8063
Average MAP: 0.5473
Average NDCG: 0.6866


### Household Products

In [43]:
query = "Households"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top2, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Precision: 0.5675, Recall: 0.9609, F1: 0.7136, Accuracy: 0.8116
MAP: 0.5549, NDCG: 0.7254


In [44]:
query = "Detergent"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top2, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Precision: 0.6349, Recall: 0.9348, F1: 0.7562, Accuracy: 0.8527
MAP: 0.6094, NDCG: 0.8256


In [45]:
query = "Cleaning"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top2, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  match['Score'] = score / 100 * fuzzy_weight


Precision: 0.5480, Recall: 0.9509, F1: 0.6953, Accuracy: 0.7964
MAP: 0.5331, NDCG: 0.7055


In [46]:
query = "Home"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top2, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Precision: 0.5403, Recall: 0.9338, F1: 0.6846, Accuracy: 0.7898
MAP: 0.5207, NDCG: 0.6313


In [47]:
queries = ["Households", "Detergent", "Cleaning", "Home"]

total_precision, total_recall, total_f1, total_accuracy, total_map, total_ndcg = 0, 0, 0, 0, 0, 0
num_queries = len(queries)

for query in queries:
    precision, recall, f1, accuracy, map_score, ndcg = evaluate_recommendations2(df, ground_truth_top2, query, top_n=20)
    
    total_precision += precision
    total_recall += recall
    total_f1 += f1
    total_accuracy += accuracy
    total_map += map_score
    total_ndcg += ndcg

avg_precision = total_precision / num_queries
avg_recall = total_recall / num_queries
avg_f1 = total_f1 / num_queries
avg_accuracy = total_accuracy / num_queries
avg_map = total_map / num_queries
avg_ndcg = total_ndcg / num_queries

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average MAP: {avg_map:.4f}")
print(f"Average NDCG: {avg_ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice fro

Average Precision: 0.5727
Average Recall: 0.9451
Average F1-Score: 0.7124
Average Accuracy: 0.8126
Average MAP: 0.5545
Average NDCG: 0.7219


### Beauty Products

In [48]:
query = "Beauty"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top3, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Precision: 0.4766, Recall: 0.9612, F1: 0.6372, Accuracy: 0.7787
MAP: 0.4659, NDCG: 0.6313


In [49]:
query = "Makeup"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top3, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Precision: 0.4604, Recall: 0.9455, F1: 0.6193, Accuracy: 0.7650
MAP: 0.4464, NDCG: 0.6451


In [50]:
query = "Lotion"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top3, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Precision: 0.4657, Recall: 0.9455, F1: 0.6240, Accuracy: 0.7697
MAP: 0.4513, NDCG: 0.6451


In [51]:
query = "Lipstick"
precision, recall, f1, accuracy, map_score,  ndcg = evaluate_recommendations2(df, ground_truth_top3, query ,top_n=20)


print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
print(f"MAP: {map_score:.4f}, NDCG: {ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight


Precision: 0.4662, Recall: 0.9455, F1: 0.6245, Accuracy: 0.7702
MAP: 0.4518, NDCG: 0.6451


In [52]:
queries = ["Beauty", "Makeup", "Lotion", "Lipstick"]


total_precision, total_recall, total_f1, total_accuracy, total_map, total_ndcg = 0, 0, 0, 0, 0, 0
num_queries = len(queries)

for query in queries:
    precision, recall, f1, accuracy, map_score, ndcg = evaluate_recommendations2(df, ground_truth_top3, query, top_n=20)
    
    total_precision += precision
    total_recall += recall
    total_f1 += f1
    total_accuracy += accuracy
    total_map += map_score
    total_ndcg += ndcg

avg_precision = total_precision / num_queries
avg_recall = total_recall / num_queries
avg_f1 = total_f1 / num_queries
avg_accuracy = total_accuracy / num_queries
avg_map = total_map / num_queries
avg_ndcg = total_ndcg / num_queries

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average MAP: {avg_map:.4f}")
print(f"Average NDCG: {avg_ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice fro

Average Precision: 0.4672
Average Recall: 0.9494
Average F1-Score: 0.6262
Average Accuracy: 0.7709
Average MAP: 0.4539
Average NDCG: 0.6417


In [53]:
query_sets_with_ground_truth = [
    (["Beauty", "Makeup", "Lotion", "Lipstick"], ground_truth_top),      
    (["Households", "Detergent", "Cleaning", "Home"], ground_truth_top2), 
    (["Hair Product", "Hair mask", "Shampoo", "Conditioner"], ground_truth_top3) 
]

combined_precision = combined_recall = combined_f1 = combined_accuracy = combined_map = combined_ndcg = 0
total_queries = 0

for queries, ground_truth in query_sets_with_ground_truth:
    total_precision = total_recall = total_f1 = total_accuracy = total_map = total_ndcg = 0
    num_queries = len(queries)
    
    for query in queries:
        precision, recall, f1, accuracy, map_score, ndcg = evaluate_recommendations2(df, ground_truth, query, top_n=20)
        
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        total_accuracy += accuracy
        total_map += map_score
        total_ndcg += ndcg

    avg_precision = total_precision / num_queries
    avg_recall = total_recall / num_queries
    avg_f1 = total_f1 / num_queries
    avg_accuracy = total_accuracy / num_queries
    avg_map = total_map / num_queries
    avg_ndcg = total_ndcg / num_queries
    
    combined_precision += avg_precision
    combined_recall += avg_recall
    combined_f1 += avg_f1
    combined_accuracy += avg_accuracy
    combined_map += avg_map
    combined_ndcg += avg_ndcg
    total_queries += 1

final_avg_precision = combined_precision / total_queries
final_avg_recall = combined_recall / total_queries
final_avg_f1 = combined_f1 / total_queries
final_avg_accuracy = combined_accuracy / total_queries
final_avg_map = combined_map / total_queries
final_avg_ndcg = combined_ndcg / total_queries

print(f"Final Combined Average Precision: {final_avg_precision:.4f}")
print(f"Final Combined Average Recall: {final_avg_recall:.4f}")
print(f"Final Combined Average F1-Score: {final_avg_f1:.4f}")
print(f"Final Combined Average Accuracy: {final_avg_accuracy:.4f}")
print(f"Final Combined Average MAP: {final_avg_map:.4f}")
print(f"Final Combined Average NDCG: {final_avg_ndcg:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_matches.loc[:, 'Score'] = category_weight
A value is trying to be set on a copy of a slice fro

Final Combined Average Precision: 0.5306
Final Combined Average Recall: 0.9466
Final Combined Average F1-Score: 0.6782
Final Combined Average Accuracy: 0.7939
Final Combined Average MAP: 0.5144
Final Combined Average NDCG: 0.6507
