In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

In [7]:
# Read the three Instacart lookup tables from the Kaggle input directory
products = pd.read_csv('/kaggle/input/instacart-market-basket-analysis/products.csv')
aisles = pd.read_csv('/kaggle/input/instacart-market-basket-analysis/aisles.csv')
departments = pd.read_csv('/kaggle/input/instacart-market-basket-analysis/departments.csv')

# Attach the aisle and department rows to each product by joining on the
# shared id columns (pandas' default join type is used for both merges)
product_info = (
    products
    .merge(aisles, on='aisle_id')
    .merge(departments, on='department_id')
)

# Preview the first rows of the merged table
product_info.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,cookies cakes,snacks
2,102,Danish Butter Cookies,61,19,cookies cakes,snacks
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,cookies cakes,snacks
4,285,Mini Nilla Wafers Munch Pack,61,19,cookies cakes,snacks


In [3]:
# Build one text field per product: "<aisle> <department>"
product_info['content'] = product_info['aisle'].str.cat(product_info['department'], sep=' ')

# Turn that text into TF-IDF feature vectors (English stop words removed);
# `tfidf` keeps the fitted vectorizer, `tfidf_matrix` holds one row per product
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(product_info['content'])

In [4]:
# Compute the cosine similarity between every pair of product vectors.
# NOTE(review): with default arguments cosine_similarity returns a dense
# (n_products x n_products) array, so memory grows quadratically with the
# number of rows in tfidf_matrix -- confirm this fits in RAM for the full data.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping from product name to the product's row label in
# product_info (recommend_products uses it as a row position into cosine_sim).
# Series.drop_duplicates() compares the *values* -- here the row labels, which
# are already unique -- so it never removed repeated product names.
# De-duplicate on the index (the names) instead, keeping the first occurrence,
# so that looking up a name always yields a single scalar rather than a Series.
product_indices = pd.Series(product_info.index, index=product_info['product_name'])
product_indices = product_indices[~product_indices.index.duplicated(keep='first')]

In [5]:
def recommend_products(product_name, cosine_sim=cosine_sim, product_indices=product_indices, top_n=10):
    """Return the names of the ``top_n`` products most similar to ``product_name``.

    Parameters
    ----------
    product_name : str
        Name to look up in ``product_indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of product ``i``
        against every product. Defaults to the module-level ``cosine_sim``.
    product_indices : mapping or pandas.Series
        Maps a product name to its row position in ``cosine_sim`` and in
        ``product_info``. Defaults to the module-level ``product_indices``.
    top_n : int
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Product names taken from the module-level ``product_info``, ordered
        from most to least similar. The queried product itself is never
        included.

    Raises
    ------
    KeyError
        If ``product_name`` is not present in ``product_indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Get the row position of the product that matches the product name
    idx = product_indices[product_name]
    if isinstance(idx, pd.Series):
        # The name occurs more than once in the lookup; use its first entry so
        # that a single similarity row is selected below.
        idx = idx.iloc[0]

    # Pair every *other* product with its similarity to the queried product.
    # The query is excluded by position rather than by dropping the first
    # sorted entry: products with identical content tie at the top score and
    # the stable sort keeps them in row order, so the query is not guaranteed
    # to come first.
    ranked = sorted(
        (
            (position, score)
            for position, score in enumerate(cosine_sim[idx])
            if position != idx
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep the row positions of the top_n best-scoring products
    top_positions = [position for position, _ in ranked[:top_n]]

    # Return the names of the top_n most similar products
    return product_info['product_name'].iloc[top_positions]

# Example usage
similar_products = recommend_products("Chocolate Sandwich Cookies")
print(similar_products)

1                     Nutter Butter Cookie Bites Go-Pak
2                                 Danish Butter Cookies
3        Gluten Free All Natural Chocolate Chip Cookies
4                          Mini Nilla Wafers Munch Pack
5                              Organic Lemon Gingersnap
6                             Chips Ahoy! Chewy Cookies
7     Cookie Chips Crunchy Dark Chocolate Chocolate ...
8                                Golden Cupcakes 8 Pack
9                     Crunch Vanilla Sugar Mini Cookies
10                                Vanilla Sugar Cookies
Name: product_name, dtype: object
