In [1]:
import numpy as np 
import pandas as pd 

In [5]:
products = pd.read_csv('amazon_sample_products.csv')
category = pd.read_csv('amazon_sample_category.csv')

In [6]:
products.head(1)


Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,0.0,104,False,2000


In [7]:
category.head(1)

Unnamed: 0,id,category_name
0,1,Beading & Jewelry Making


In [8]:
products.isnull().sum()

asin                 0
title                0
imgUrl               0
productURL           0
stars                0
reviews              0
price                0
listPrice            0
category_id          0
isBestSeller         0
boughtInLastMonth    0
dtype: int64

In [9]:
category.isnull().sum()

id               0
category_name    0
dtype: int64

In [10]:
products.duplicated().sum()

0

In [11]:
category.duplicated().sum()

0

In [12]:
category = category.rename(columns={'id': 'category_id'})
prod_cat = products.merge(category, on='category_id')

In [13]:
prod_cat = prod_cat.head(50000)
df = prod_cat.copy()

In [14]:
summary_df = prod_cat.groupby('title', as_index=False).agg(
    avg_stars=('stars', 'mean'),
    review_count=('reviews', 'count')
)

In [15]:
filtered_titles = summary_df[
    (summary_df['avg_stars'] >= 4.5) & 
    (summary_df['review_count'] > 0)
]

filtered_titles.head(1)

Unnamed: 0,title,avg_stars,review_count
0,"""Inspirations"" 14k Gold-Dipped Jerusalem Cross...",4.5,1


In [33]:
final_df = filtered_titles.sort_values(
    'avg_stars', 
    ascending=False
).sample(50)

In [34]:
final_df.shape

(50, 3)

In [35]:
final_df = final_df.merge(products,on='title').drop_duplicates('title')[['title','productURL','imgUrl','avg_stars','review_count']]


In [36]:
final_df


Unnamed: 0,title,productURL,imgUrl,avg_stars,review_count
0,"Lead Time 3"" NMT",https://www.amazon.com/dp/B0BS12LMHJ,https://m.media-amazon.com/images/I/61UhgyLVfh...,5.0,1
1,Men's Rival Fleece Shorts,https://www.amazon.com/dp/B0BGVN9X5Y,https://m.media-amazon.com/images/I/51FMUokKV8...,4.6,1
2,"1 or 2 Pack Men's Thermal Compression Pants, A...",https://www.amazon.com/dp/B096LW8SWY,https://m.media-amazon.com/images/I/512JTY+681...,4.5,1
3,Mens Air Running Shoes Lightweight Athletic Te...,https://www.amazon.com/dp/B0C27T7NVG,https://m.media-amazon.com/images/I/71S4LS7T53...,4.5,1
4,Men's Woven Graphic Wordmark Shorts,https://www.amazon.com/dp/B071LH1N6X,https://m.media-amazon.com/images/I/71WmLGCqLp...,4.5,1
5,1.5 Inch Thank You for Delivering This Happy M...,https://www.amazon.com/dp/B092ZNZQPB,https://m.media-amazon.com/images/I/51u1by0iw-...,5.0,1
6,Replacement Metal D-Pads and Paddles for Xbox ...,https://www.amazon.com/dp/B089ZYSK8T,https://m.media-amazon.com/images/I/51HGyHzjOC...,4.5,1
7,Men's Packable Puffer Jacket Lightweight Hoode...,https://www.amazon.com/dp/B0B8HNWXRM,https://m.media-amazon.com/images/I/71O4nLgi8X...,4.5,1
8,Sonic Unleashed - Xbox 360 (Renewed),https://www.amazon.com/dp/B084G2C8H2,https://m.media-amazon.com/images/I/81DBG9eV8c...,5.0,1
9,Swim Trunks with Compression Liner - Men's Pre...,https://www.amazon.com/dp/B09ZXB6Y41,https://m.media-amazon.com/images/I/81lxH0872y...,4.6,1


# Next part -2 

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
df['title'] = df['title'].fillna('')

In [20]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

print("Creating TF-IDF matrix for 50K sample...")
tfidf_matrix = tfidf_vectorizer.fit_transform(df['title'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

print("Computing cosine similarity... (this should take 2-5 minutes)")
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("✓ Cosine similarity matrix computed successfully!")

Creating TF-IDF matrix for 50K sample...
TF-IDF matrix shape: (50000, 5000)
Computing cosine similarity... (this should take 2-5 minutes)
✓ Cosine similarity matrix computed successfully!


In [21]:
indices = pd.Series(df.index, index=df['asin']).drop_duplicates()

In [22]:
def get_recommendations_by_name(asin, cosine_sim=cosine_sim, df=df, indices=indices, num_recommendations=5):
    """
    Returns recommended products based on similar names using Cosine Similarity.
    """
    try:
        # FIX: Check if the ASIN exists in the current indices map
        if asin not in indices:
            # If not found, it likely means the input ASIN was not in the 10000-row subset.
            # This should be handled by the calling function if using user input.
            raise KeyError(f"ASIN {asin} not in current model indices.")

        idx = indices[asin]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the top (num_recommendations+1) most similar products (+1 to exclude itself)
        sim_scores = sim_scores[1:num_recommendations+1]
        product_indices = [i[0] for i in sim_scores]

        recommended_asins = df['asin'].iloc[product_indices].tolist()
        return recommended_asins
    except Exception as e:
        # print(f"Error in name-based recommendations: {e}")
        return []

In [23]:
def get_recommendations_by_category(asin, df=df, num_recommendations=5):
    """
    Returns random products from the same category as the input ASIN.
    """
    try:
        product_category = df.loc[df['asin'] == asin, 'category_name'].iloc[0]
        same_category_products = df[(df['category_name'] == product_category) & (df['asin'] != asin)]

        if len(same_category_products) == 0:
            return []

        # Sample up to 'num_recommendations' products
        n_sample = min(num_recommendations, len(same_category_products))
        recommended_df = same_category_products.sample(n=n_sample, random_state=None)
        recommended_asins = recommended_df['asin'].tolist()

        return recommended_asins
    except Exception as e:
        # print(f"Error in category-based recommendations: {e}")
        return []

In [24]:
def recommend_products(asin, strategy=None, num_recommendations=5):
    """
    The main recommendation wrapper. 
    If strategy is None, it randomly selects between 'Similar Name' and 'Same Category'.
    """
    # 1. Validate that the ASIN exists in our dataset
    if asin not in df['asin'].values:
        return f"Product ASIN {asin} not found in database."

    result_asins = [asin]
    
    # 2. Strategy Selection
    if strategy is None:
        # Randomly select a strategy (Original code logic)
        strategy_selector = random.randint(0, 1)
        strategy_name = 'Same Category' if strategy_selector == 0 else 'Similar Name'
    else:
        strategy_name = strategy # Used for accuracy testing

    print(f"Strategy: {strategy_name}")

    if strategy_name == 'Same Category':
        recommended = get_recommendations_by_category(asin, num_recommendations=num_recommendations)
    else: # 'Similar Name'
        recommended = get_recommendations_by_name(asin, num_recommendations=num_recommendations)
    
    result_asins.extend(recommended)
    
    # Pad the list with empty strings if fewer than 5 recommendations were found
    if len(result_asins) < num_recommendations + 1:
        result_asins.extend([''] * (num_recommendations + 1 - len(result_asins)))

    return result_asins

In [25]:
def find_best_match_asin(user_input, tfidf_model, df, N=1):
    """
    Finds the ASIN(s) whose title is most similar to the user's text input 
    using the fitted TF-IDF model.
    """
    user_input_tfidf = tfidf_model.transform([user_input])
    similarity_scores = cosine_similarity(user_input_tfidf, tfidf_matrix)
    
    # Get the index of the most similar product(s)
    sim_scores = list(enumerate(similarity_scores[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Return the top N matching ASINs
    top_indices = [i[0] for i in sim_scores[0:N]]
    return df['asin'].iloc[top_indices].tolist()

In [26]:
prod_cat

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth,category_name
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,0.00,104,False,2000,Suitcases
1,B07GDLCQXV,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,https://www.amazon.com/dp/B07GDLCQXV,4.5,0,169.99,209.99,104,False,1000,Suitcases
2,B07XSCCZYG,Platinum Elite Softside Expandable Checked Lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,https://www.amazon.com/dp/B07XSCCZYG,4.6,0,365.49,429.99,104,False,300,Suitcases
3,B08MVFKGJM,Freeform Hardside Expandable with Double Spinn...,https://m.media-amazon.com/images/I/91k6NYLQyI...,https://www.amazon.com/dp/B08MVFKGJM,4.6,0,291.59,354.37,104,False,400,Suitcases
4,B01DJLKZBA,Winfield 2 Hardside Expandable Luggage with Sp...,https://m.media-amazon.com/images/I/61NJoaZcP9...,https://www.amazon.com/dp/B01DJLKZBA,4.5,0,174.99,309.99,104,False,400,Suitcases
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,B0BZVPKZLV,Upgrade Vacuum Hose for Shark Navigator NV22 N...,https://m.media-amazon.com/images/I/71jn6mCx5-...,https://www.amazon.com/dp/B0BZVPKZLV,4.7,11,21.99,0.00,175,False,0,Vacuum Cleaners & Floor Care
49996,B08226SRTY,BLACK+DECKER POWERSERIES Vacuum Filter Replace...,https://m.media-amazon.com/images/I/71TBgl9Ho7...,https://www.amazon.com/dp/B08226SRTY,4.6,517,16.45,17.99,175,False,200,Vacuum Cleaners & Floor Care
49997,B07RQ5965Q,RONGJU 2 Post + 4 Foam & Felt Filters for Shar...,https://m.media-amazon.com/images/I/71cqFhMaIf...,https://www.amazon.com/dp/B07RQ5965Q,4.7,609,16.99,0.00,175,False,0,Vacuum Cleaners & Floor Care
49998,B0BKG11J9F,2156 Vacuum Filter Replacement for Bissell Zin...,https://m.media-amazon.com/images/I/71eoTSHwRV...,https://www.amazon.com/dp/B0BKG11J9F,4.6,81,14.99,0.00,175,False,50,Vacuum Cleaners & Floor Care


In [27]:
prod_cat['category_name'].value_counts()

category_name
Men's Shoes                               17092
Men's Clothing                            17070
Men's Accessories                          8449
Xbox 360 Games, Consoles & Accessories     3433
Vacuum Cleaners & Floor Care               3032
Suitcases                                   924
Name: count, dtype: int64

In [33]:
import random
def run_user_recommendation_search():
    """
    Takes user text input, finds the best matching product, and prints 
    recommendations using a randomly selected strategy.
    """
    print("="*70)
    user_query = input("Enter a product name or keyword (e.g., 'ps5' or 'hiking backpack'):\n>>> ")
    
    if not user_query.strip():
        print("Search query cannot be empty.")
        return

    print("-" * 70)
    print(f"Searching for: '{user_query}'")
    
    # 1. Find the best matching ASIN
    best_match_asins = find_best_match_asin(user_query, tfidf_vectorizer, df, N=1)
    
    if not best_match_asins:
        print("Error: Could not find a product matching your query in the dataset.")
        return

    # Use the best matching ASIN
    input_asin = best_match_asins[0]
    match_title = df[df['asin'] == input_asin]['title'].iloc[0]
    match_category = df[df['asin'] == input_asin]['category_name'].iloc[0]
    
    print(f"✅ Query matched to best product:")
    print(f"   ASIN: {input_asin}")
    print(f"   Title: {match_title[:60]}...")
    print(f"   Category: {match_category}")
    print("-" * 70)

    # 2. Get recommendations (Random strategy is selected inside recommend_products)
    recommendations = recommend_products(input_asin)
    
    if isinstance(recommendations, list) and len(recommendations) > 1:
        # First item is the input ASIN, subsequent are recommendations
        rec_asins = recommendations[1:]
        
        # 3. Print recommendation details
        print("\nRecommendation Details:")
        for j, rec_asin in enumerate(rec_asins, 1):
            if rec_asin == '':
                print(f"  {j}. [No more items available]")
                continue
                
            rec_product = df[df['asin'] == rec_asin]['title'].iloc[0]
            rec_category = df[df['asin'] == rec_asin]['category_name'].iloc[0]
            print(f"  {j}. {rec_asin} - {rec_product[:60]}... | Category: {rec_category}")
    else:
        print(f"Could not generate recommendations: {recommendations}")
    
    print("="*70)

# EXECUTE THE INTERACTIVE SEARCH
run_user_recommendation_search() 
# Uncomment the line above to run the interactive search in your notebook.



Enter a product name or keyword (e.g., 'ps5' or 'hiking backpack'):
>>>  suitcase


----------------------------------------------------------------------
Searching for: 'suitcase'
✅ Query matched to best product:
   ASIN: B08MPXX1RF
   Title: Unisex-Adult's Suitcase, Champagne, zzzz-s...
   Category: Suitcases
----------------------------------------------------------------------
Strategy: Same Category

Recommendation Details:
  1. B07JJBDP68 - Sidewinder Expandable Hardshell Suitcase with TSA Lock, Avai... | Category: Suitcases
  2. B01H5PWGLS - Luggage Collection - 24 Inch Scratch Resistant (ABS+PC) Hard... | Category: Suitcases
  3. B0C3X83DD1 - Eiffel Tower Luggage Cover Protector Vintage Eiffel Tower Lo... | Category: Suitcases
  4. B07GJX8D7V - Frontier Spinner Carry-On Luggage Large Purple Suitcase... | Category: Suitcases
  5. B09JSD6CXV - Travel Softside 28 Inch Luggage with Spinner Wheels Lightwei... | Category: Suitcases


In [34]:
# Assuming numpy (np), pandas (pd), and tqdm are already imported.

def evaluate_model_accuracy(test_asin, df, K=5):
    """
    Calculates Precision@K and Recall@K for a single test ASIN, 
    using the 'Similar Name' strategy and all other items in the same category as ground truth.
    """
    
    # 1. Get the category of the test product
    try:
        input_category = df[df['asin'] == test_asin]['category_name'].iloc[0]
    except IndexError:
        return {'asin': test_asin, 'precision': np.nan, 'recall': np.nan, 'error': 'ASIN not in DataFrame'}

    # 2. Define the Ground Truth (Relevant Set)
    relevant_set = set(df[df['category_name'] == input_category]['asin'].tolist())
    if test_asin in relevant_set:
        relevant_set.remove(test_asin)
        
    total_relevant_items = len(relevant_set)
    
    if total_relevant_items < K:
        # Not enough items in the category to properly calculate Recall@K
        return {'asin': test_asin, 'precision': np.nan, 'recall': np.nan, 'error': f'Only {total_relevant_items} relevant items in category'}

    # 3. Get the Model's Recommendations (calling the name-based function directly)
    recommendations_list = get_recommendations_by_name(test_asin, num_recommendations=K)
    
    if not recommendations_list:
        return {'asin': test_asin, 'precision': 0.0, 'recall': 0.0, 'error': 'Recommender returned empty list'}

    top_k_recommended = set(recommendations_list[:K])
    
    # 4. Count the Hits
    hits = len(top_k_recommended.intersection(relevant_set))
    
    # 5. Calculate Metrics
    precision_k = hits / K
    recall_k = hits / total_relevant_items
    
    return {
        'asin': test_asin,
        'precision': precision_k,
        'recall': recall_k,
        'total_relevant': total_relevant_items
    }


# --- Evaluation Execution ---
N_TEST_PRODUCTS = 100 # Number of random products to test
K_SCORE = 5           # K value for Precision@K and Recall@K (Top 5 recommendations)

# Select a random sample of ASINs from the working DataFrame
test_asins = df['asin'].sample(N_TEST_PRODUCTS, random_state=42).tolist()

results = []
for asin in tqdm(test_asins, desc="Evaluating Model"):
    # Pass the ASIN and K_SCORE to the evaluation function
    result = evaluate_model_accuracy(asin, df, K=K_SCORE) 
    results.append(result)

# Convert results to a DataFrame for analysis
results_df = pd.DataFrame(results)

# Filter out rows that had errors or too few relevant items
valid_results = results_df.dropna(subset=['precision', 'recall'])

# --- Final Output ---
print("\n" + "="*50)
print("             Model Accuracy Results")
print("="*50)
print(f"Strategy: Similar Name (Title Similarity)")
print(f"Tested {len(valid_results)} valid products for K={K_SCORE}")
print("-" * 50)
print(f"Average Precision@{K_SCORE}: {valid_results['precision'].mean():.4f}")
print(f"Average Recall@{K_SCORE}:    {valid_results['recall'].mean():.4f}")
print("-" * 50)

Evaluating Model: 100%|██████████| 100/100 [00:06<00:00, 14.75it/s]


             Model Accuracy Results
Strategy: Similar Name (Title Similarity)
Tested 100 valid products for K=5
--------------------------------------------------
Average Precision@5: 0.8920
Average Recall@5:    0.0006
--------------------------------------------------



