In [20]:
!pip install pandas numpy sentence-transformers chromadb -q

In [21]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
import time
import ast

In [22]:
# Read the product catalogue from CSV and report how many rows were loaded.
print("Reading product data...")
data = pd.read_csv('/content/products.csv')
print("Loaded {} products".format(data.shape[0]))

Reading product data...
Loaded 17330 products


In [23]:
# Display the loaded product DataFrame as the cell's rich output.
data

Unnamed: 0,Product_ID,productDisplayName,Description
0,7901086810337,lafaani 100% cotton flap collar shirt,Shirt with a concealed placket and pintuck det...
1,7901084909793,100% cotton jacket style shirt,"Full-sleeve shirt jacket with buttons, panelli..."
2,7914715971809,100% cotton jumpsuit,"Jumpsuit with elasticated short sleeves, side ..."
3,8285102145761,100% cotton maroon haori jacket,Maroon 100% cotton haori jacket with pintucks ...
4,7914716430561,100% cotton shirt with drawstring,Boxy fit shirt with short sleeves and drawstring
...,...,...,...
17325,8926127718625,embroidered pink cutwork dress,Allure the audience with our pink poplin (100%...
17326,8926127784161,cutwork embroidered pink dress,Rock your fashionista vibes in our pink poplin...
17327,8926127882465,pink sleeveless printed top,Style yourself with our pink and multicolor po...
17328,8926127980769,pink printed drawstring top,Upgrade your closet with our pink and multicol...


In [24]:
# Show the identifier and display-name columns for the first five products.
print("\nSample of product data:")
preview_columns = ['Product_ID', 'productDisplayName']
print(data.loc[:, preview_columns].head(5))


Sample of product data:
      Product_ID                     productDisplayName
0  7901086810337  lafaani 100% cotton flap collar shirt
1  7901084909793         100% cotton jacket style shirt
2  7914715971809                   100% cotton jumpsuit
3  8285102145761        100% cotton maroon haori jacket
4  7914716430561      100% cotton shirt with drawstring


In [None]:
# Load the sentence-transformers model used below to embed product text and search queries.
print("\nInitializing embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')  


Initializing embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [26]:
# Build the text that gets embedded: display name followed by the description.
# Missing values are replaced with '' before concatenating, because `str + NaN`
# yields NaN and a NaN text cannot be embedded meaningfully.
data['text'] = (
    data['productDisplayName'].fillna('').astype(str)
    + ' '
    + data['Description'].fillna('').astype(str)
)

In [27]:
# Sanity check: show the combined text of the row labelled 0.
data['text'][0]

'lafaani 100% cotton flap collar shirt Shirt with a concealed placket and pintuck details along with faggoting stitch detail at the back. Intricate Kantha adorns the extra fabric flap on the pocket. This fabric is hand spun, handwoven using traditional weaving techniques, and undyed, so may vary in colour and texture.'

In [28]:
# Encode every product's combined text in one batched call.
# NOTE: the progress message mentions display names, but the input is the
# 'text' column (display name + description).
print("Generating embeddings for product display names...")
product_ids = data['Product_ID'].astype(str).tolist()
product_text = data['text'].tolist()
embeddings = model.encode(product_text)
print("Generated {} embeddings".format(len(embeddings)))

Generating embeddings for product display names...
Generated 17330 embeddings


In [29]:
# Build one record per product from the embeddings that were already computed
# by the single batched `model.encode` call (the `embeddings` array), instead of
# re-encoding every row individually inside an `iterrows()` loop.
if len(embeddings) != len(data):
    raise ValueError(
        f"Expected {len(data)} embeddings but found {len(embeddings)}; "
        "re-run the embedding cell before building the records."
    )

results = [
    {
        'product_id': str(pid),
        'product_name': str(name),
        'embedding': vector,  # 1-D vector for this product (one row of `embeddings`)
    }
    for pid, name, vector in zip(
        data['Product_ID'], data['productDisplayName'], embeddings
    )
]

In [30]:
# Inspect the first record (product id, product name and embedding vector).
results[0]

{'product_id': '7901086810337',
 'product_name': 'lafaani 100% cotton flap collar shirt',
 'embedding': array([-6.80536358e-03,  2.56447005e-03, -1.31142512e-01,  3.54967499e-03,
         8.66520684e-03, -1.17798820e-02,  9.25107449e-02, -5.09590879e-02,
        -1.50729557e-02, -4.86600492e-03,  8.67439732e-02,  1.12575758e-02,
        -2.75956634e-02,  2.50626616e-02,  1.80658344e-02,  3.88845336e-03,
        -5.78992255e-02,  1.13929529e-02, -5.07902950e-02, -3.89026515e-02,
        -2.64000893e-02, -3.68483737e-02,  8.62554386e-02,  1.68413669e-02,
         1.25365900e-02, -6.24984577e-02, -1.55196125e-02, -8.28144699e-02,
         8.49637482e-03, -5.71040548e-02, -2.08583754e-02,  1.10869713e-01,
        -6.51807850e-03,  6.11638557e-03, -6.73999563e-02,  6.54805778e-03,
         7.39977956e-02,  2.53945235e-02,  1.42854294e-02,  6.32283315e-02,
        -1.64352916e-02, -3.34170572e-02,  1.74147519e-03, -3.76257822e-02,
         3.52669396e-02,  7.50585943e-02, -2.79209614e-02,  1

In [31]:
# Collect the per-product records into a DataFrame (one row per record).
embedding_df = pd.DataFrame(results)

In [32]:
# Summarise the columns, non-null counts and dtypes of the embeddings frame.
embedding_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17330 entries, 0 to 17329
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    17330 non-null  object
 1   product_name  17330 non-null  object
 2   embedding     17330 non-null  object
dtypes: object(3)
memory usage: 406.3+ KB


In [33]:
# Display the embeddings DataFrame as the cell's rich output.
embedding_df

Unnamed: 0,product_id,product_name,embedding
0,7901086810337,lafaani 100% cotton flap collar shirt,"[-0.0068053636, 0.00256447, -0.13114251, 0.003..."
1,7901084909793,100% cotton jacket style shirt,"[-0.047693554, 0.12277306, -0.047323346, 0.015..."
2,7914715971809,100% cotton jumpsuit,"[-0.035049167, 0.08176353, -0.0071075605, 0.02..."
3,8285102145761,100% cotton maroon haori jacket,"[-0.038592666, 0.01662371, -0.05382528, 0.0462..."
4,7914716430561,100% cotton shirt with drawstring,"[0.0087339105, 0.11074402, -0.0038278685, 0.01..."
...,...,...,...
17325,8926127718625,embroidered pink cutwork dress,"[0.039596256, 0.03907658, 0.03083358, 0.002765..."
17326,8926127784161,cutwork embroidered pink dress,"[0.010585936, 0.006684779, 0.07908235, -0.0176..."
17327,8926127882465,pink sleeveless printed top,"[-0.0017885191, 0.028902344, 0.02032174, 0.043..."
17328,8926127980769,pink printed drawstring top,"[0.014062271, -0.010578652, 0.02977633, 0.0059..."


In [None]:
# Persist the embeddings DataFrame as a pickle file in the current working directory.
embedding_df.to_pickle('./ss_embeddings.pkl')

In [34]:
# Create the ChromaDB client that owns the "product_search" collection used below.
client = chromadb.Client()

In [None]:
# Check if the collection exists and recreate it, so that re-running this cell
# always starts from an empty "product_search" collection.
try:
    client.delete_collection("product_search")
except Exception:
    # Best effort: deleting a collection that does not exist yet raises, and
    # that case is safe to ignore (the exact exception type depends on the
    # chromadb version -- TODO confirm and narrow). Unlike a bare `except:`,
    # this does not swallow KeyboardInterrupt / SystemExit.
    pass

collection = client.create_collection(name="product_search")

In [None]:
# Push the stored product names and embeddings into the collection chunk by chunk.
print("Adding products to vector database in batches...")

batch_size = 5000
num_products = len(embedding_df)
total_batches = (num_products + batch_size - 1) // batch_size  # ceiling division

for batch_no, start in enumerate(range(0, num_products, batch_size), start=1):
    chunk = embedding_df.iloc[start:start + batch_size]
    names = chunk['product_name'].tolist()

    collection.add(
        documents=names,
        embeddings=chunk['embedding'].tolist(),
        ids=chunk['product_id'].astype(str).tolist(),
        metadatas=[{"name": entry} for entry in names],
    )

    print(f"Added batch {batch_no}/{total_batches}")

print(f"Finished adding {num_products} products to the vector database.")


Adding products to vector database in batches...
Added batch 1/4
Added batch 2/4
Added batch 3/4
Added batch 4/4
Finished adding 17330 products to the vector database.


In [None]:
def search_similar_products(query_text, top_k=4):
    """Search the vector collection for products similar to ``query_text`` and print them.

    The query is embedded with the module-level ``model`` (the same model that
    produced the stored product embeddings) and the resulting vector is passed
    to ``collection.query`` via ``query_embeddings``. Previously the embedding
    was computed but discarded and the raw text was sent as ``query_texts``,
    which leaves the embedding of the query to the collection rather than to
    ``model``.

    Args:
        query_text: Free-text description of the product to look for.
        top_k: Maximum number of results to request (default 4).

    Returns:
        None. Results are printed; for every hit the matching row of the
        module-level ``embedding_df`` is looked up and echoed as well.
    """
    print(f"\nSearching for '{query_text}'...")
    start_time = time.time()

    # Encode a one-element batch and take its single vector.
    query_embedding = model.encode([query_text])[0]

    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=top_k,
        include=['distances', 'documents', 'metadatas']
    )

    end_time = time.time()
    print(f"Search completed in {end_time - start_time:.4f} seconds.")

    if not results or not results['ids'] or not results['ids'][0]:
        print("No similar products found.")
        return

    print(f"\nSearch results for '{query_text}':")

    # Each result field is a list with one entry per query; only one query is sent.
    hits = zip(
        results['ids'][0],
        results['documents'][0],
        results['distances'][0],
        results['metadatas'][0]
    )
    for i, (id_str, document, distance, metadata) in enumerate(hits):
        # NOTE(review): rough heuristic only; how distance maps to similarity
        # depends on the collection's distance metric -- confirm before
        # interpreting this value as a cosine similarity.
        similarity = 1 - distance

        print(f"{i+1}. Product ID: {id_str}, Distance: {distance:.4f}, Heuristic Similarity: {similarity:.4f}")
        print(f"   Name: {document}")

        try:
            # Cross-check the hit against the DataFrame the collection was built from.
            match_row = embedding_df[embedding_df['product_id'].astype(str) == str(id_str)]

            if not match_row.empty:
                row = match_row.iloc[0]
                print(f"   Product ID (original): {row['product_id']}")
                print(f"   Product Name: {row['product_name']}")
            else:
                print(f"   Warning: Product ID {id_str} found in ChromaDB but not in embedding_df.")
        except Exception as e:
            print(f"   Error retrieving product info: {e}")

        print("-" * 40)


In [39]:
# %%timeit

# Example query: search the collection with a free-text product description.
search_similar_products("Linen Blend Casual Blazer A lightweight and breathable linen blend blazer, perfect for smart-casual looks during warmer seasons")


Searching for 'Linen Blend Casual Blazer A lightweight and breathable linen blend blazer, perfect for smart-casual looks during warmer seasons'...
Search completed in 0.3338 seconds.

Search results for 'Linen Blend Casual Blazer A lightweight and breathable linen blend blazer, perfect for smart-casual looks during warmer seasons':
1. Product ID: 7901024616673, Distance: 0.6438, Heuristic Similarity: 0.3562
   Name: oriental printed blazer
   Product ID (original): 7901024616673
   Product Name: oriental printed blazer
----------------------------------------
2. Product ID: 8447546982625, Distance: 0.6520, Heuristic Similarity: 0.3480
   Name: linen beige blazer style dress
   Product ID (original): 8447546982625
   Product Name: linen beige blazer style dress
----------------------------------------
3. Product ID: 7901026255073, Distance: 0.6642, Heuristic Similarity: 0.3358
   Name: ajrakh printed linen blazer
   Product ID (original): 7901026255073
   Product Name: ajrakh printed l

In [None]:
# Provide the Google API key through the environment instead of hard-coding it
# in the notebook. `os` is imported here because no earlier cell imports it.
# An already exported GOOGLE_API_KEY is left untouched; otherwise the key is
# requested interactively without echoing it.
import os
from getpass import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

In [4]:
!pip install  langchain-community langchain-openai \
               faiss-cpu tiktoken python-dotenv



In [5]:
pip install -U langchain-google-genai



In [6]:
from typing import List, Dict
from pydantic import BaseModel, Field
import pandas as pd
from langchain.embeddings import OpenAIEmbeddings , HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.schema import Document

In [7]:
# Chat model that backs the recommendation chain built further down.
llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash-lite",
            temperature=0.2
        )

In [8]:
# Prompt for the complementary-product recommender. Placeholders:
#   {original_product_name} - product to find complements for
#   {category}              - categories the model may pick from (used twice)
#   {format_instructions}   - output-format instructions from the parser
template = """
You are a retail recommendation expert specializing in identifying **complementary products** for items sold on an Ecommerce platform.

CONTEXT:
- Original Product: {original_product_name}
- Available Categories :
{category}

Your task is to suggest **up to 5 complementary products** that are typically purchased, worn, or used **together** with the original product. Your recommendations must be selected **only from the provided categories**. Rank them in **decreasing order of complementary relevance**.

Focus your suggestions on:
- Functional utility (e.g., matching bottomwear, required underlayers)
- Styling and enhancement (e.g., accessories, color coordination)
- Target usage (festive, daily wear, casual, formal, age relevance)
- The original product’s intended gender, style, and cultural context

📌 INSTRUCTIONS:
- Only recommend items from the listed categories within `{category}`
- Do **NOT** include substitutes or near-identical products
- Do **NOT** include items from categories not listed
- Prioritize items that help **complete, accessorize, or enhance** the product
- **For recommended products, use specific item names or types (e.g., "Slim Fit Chinos", "Statement Necklace") rather than just the general category name (e.g., not just "Bottoms - Men", "Necklaces").**

📊 SCORING:
- Assign a **complementary score between 0.80 and 1.00**
- Only include items with a score **≥ 0.85**, unless slightly lower but very relevant
- Fewer than 5 is okay — **precision matters more than quantity**
- List in **descending order of complementary score**

📎 OUTPUT FORMAT:
For each complementary product:
- **Product Name**
- **Brief Description** (1-2 lines describing the item itself)
- **Complementary Score** (e.g., 0.91)

Do not explain how it complements the original product.

🔍 EXAMPLES:

1.  Original: **Luxury Moisturizing Shampoo (500ml)**
    Category List:
    -   Personal Care: ["Conditioners", "Hair Masks", "Serums", "Body Wash"]
    -   Accessories: ["Hairbands", "Clips"]
    -   Appliances: ["Hair Dryers", "Straighteners"]

    Suggested Complementary Products:
    -   **Matching Deep Conditioner**
        -   **Brief Description**: A rich, hydrating conditioner formulated to work with the shampoo.
        -   **Complementary Score**: 0.96
    -   **Leave-In Hair Serum**
        -   **Brief Description**: Lightweight serum that smooths hair and tames frizz post-wash.
        -   **Complementary Score**: 0.89

2.  Original: **Men's Casual Cotton T-Shirt**
    Category List:
    -   Apparel: ["Jeans", "Shorts", "Jackets", "Sweatshirts"]
    -   Accessories: ["Sneakers", "Caps", "Socks", "Belts"]

    Suggested Complementary Products:
    -   **Comfort Fit Denim Jeans**
        -   **Brief Description**: Classic blue jeans with a relaxed fit for everyday comfort.
        -   **Complementary Score**: 0.96
    -   **White Casual Sneakers**
        -   **Brief Description**: Lightweight lace-up shoes that suit casual and semi-casual wear.
        -   **Complementary Score**: 0.91

{format_instructions}
"""

In [None]:
# One recommended product: its name, a description and a score constrained to [0, 1].
class ProductRelationship(BaseModel):
    # Name of the recommended product.
    product_name: str = Field(description="Name of the recommended product.")
    # Free-text description of the product and its use.
    product_description: str = Field(description="Describe the product and explain what it is used for.")
    # Relevance score; validated to lie between 0.0 and 1.0 inclusive (ge/le bounds).
    score: float = Field(ge=0.0, le=1.0, description="A score (0 to 1) indicating how complementary or similar the product is.")

In [10]:
# Top-level response schema: a list of ProductRelationship items.
class ProductRecommendations(BaseModel):
    # The recommended complementary products.
    complementary_products: List[ProductRelationship] = Field(
        description="List of complementary products"
    )

In [11]:
# Output parser tied to the ProductRecommendations schema; it also supplies the
# format instructions that are injected into the prompt.
parser = PydanticOutputParser(pydantic_object=ProductRecommendations)

In [12]:
# Build the chat prompt from `template`. The parser's format instructions are
# pre-filled, so callers only supply `original_product_name` and `category`.
prompt = ChatPromptTemplate.from_template(
    template=template,
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

In [13]:
# Pipeline: fill the prompt, call the LLM, then parse the reply with `parser`.
recommendation_chain = prompt | llm | parser

In [None]:
from typing import Dict, List

def get_complementary(
    product_name: str,
    catagory: dict,
    llm=llm
) -> Dict[str, List["ProductRelationship"]]:
    """Ask the recommendation chain for products that complement ``product_name``.

    Args:
        product_name: Name of the original product.
        catagory: Available categories, passed to the prompt as its ``category``
            variable. (The misspelled parameter name is kept so existing
            callers keep working.)
        llm: Only checked for ``None``; the request itself goes through the
            module-level ``recommendation_chain``.

    Returns:
        ``{"complementary_products": [...]}`` where the list holds the parsed
        ``ProductRelationship`` items sorted by ``score`` in descending order.
        The list is empty when ``llm`` is ``None`` or when the invocation fails.
    """
    if llm is None:
        print("Error: LLM is not initialized.")
        return {"complementary_products": []}

    print(f"Generating complementary products for: {product_name}")

    try:
        llm_response = recommendation_chain.invoke({
            "original_product_name": product_name,
            "category": catagory
        })

        # Return the sorted list; previously the sort result was computed and
        # then dropped in favour of the unsorted list.
        sorted_complementary = sorted(
            llm_response.complementary_products,
            key=lambda item: item.score,
            reverse=True
        )

        return {"complementary_products": sorted_complementary}

    except Exception as e:
        print(f"An error occurred during LLM invocation: {e}")
        # Same shape as the success path: an empty list, not an empty dict.
        return {"complementary_products": []}

In [41]:
# %%timeit

# Example call: request complements for a T-shirt, restricted to two category groups.
complementary_products_out = get_complementary(
    "Men's Casual Cotton T-Shirt",
    {
        "Apparel": ["Jeans", "Shorts", "Jackets", "Sweatshirts"],
        "Accessories": ["Sneakers", "Caps", "Socks", "Belts"]
    }
)


Generating complementary products for: Men's Casual Cotton T-Shirt


In [42]:
# Display the dictionary returned by get_complementary.
complementary_products_out

{'complementary_products': [ProductRelationship(product_name='Slim Fit Denim Jeans', product_description='Dark wash jeans offering a modern, tailored fit for a versatile look.', score=0.97),
  ProductRelationship(product_name='Canvas Sneakers', product_description='Classic low-top sneakers in a neutral color, perfect for casual wear.', score=0.95),
  ProductRelationship(product_name='Casual Belt', product_description='A leather or faux-leather belt in a matching color to the jeans.', score=0.9),
  ProductRelationship(product_name='Baseball Cap', product_description='A cotton baseball cap to complete the casual outfit.', score=0.88)]}

In [43]:
# gemini 1.5 flash - 2.62 s ± 114 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# gemini 2.0 flash - 2.03 s ± 146 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# gemini-2.0-flash-lite - 1.39 s ± 147 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [45]:
%%timeit

for comp in complementary_products_out['complementary_products']:
  comp_name = comp.product_name
  search_similar_products(comp_name)


Searching for 'Slim Fit Denim Jeans'...
Search completed in 0.3077 seconds.

Search results for 'Slim Fit Denim Jeans':
1. Product ID: 7899906539745, Distance: 0.3724, Heuristic Similarity: 0.6276
   Name: black pure cotton slim fit pants
   Product ID (original): 7899906539745
   Product Name: black pure cotton slim fit pants
----------------------------------------
2. Product ID: 7899906801889, Distance: 0.4183, Heuristic Similarity: 0.5817
   Name: white pure cotton slim fit pants
   Product ID (original): 7899906801889
   Product Name: white pure cotton slim fit pants
----------------------------------------
3. Product ID: 8043581178081, Distance: 0.6032, Heuristic Similarity: 0.3968
   Name: white slim fit linen pants
   Product ID (original): 8043581178081
   Product Name: white slim fit linen pants
----------------------------------------
4. Product ID: 8175767978209, Distance: 0.6698, Heuristic Similarity: 0.3302
   Name: black slim fit pants
   Product ID (original): 81757679