## 1. Loading Data

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import re
import json
from dotenv import load_dotenv
from openai import OpenAI

In [2]:
# Read the pickled DataFrame of product metadata + embeddings.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
embedding_df = pd.read_pickle('../data/embedding_with_metadata.pkl')

# Pull variables from a local .env file into the environment, then build the
# OpenAI client from the key found there.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Sanity check: every row's embedding must have the same length
embedding_lengths = embedding_df['embedding'].map(len)
assert embedding_lengths.nunique() == 1, "Inconsistent embedding dimensions"

# All lengths agree, so the first one is the shared dimensionality.
embedding_dim = embedding_lengths.iloc[0]
print(f"Embedding dimension: {embedding_dim}")

Embedding dimension: 1536


In [4]:
# Stack the per-row embeddings into one 2-D float32 matrix (one row per DataFrame row)
embeddings = np.vstack(list(embedding_df['embedding'])).astype('float32')

## 2. Search

### 2.1 Define the search function

In [5]:
# Embed a free-text query with the OpenAI embeddings endpoint
def generate_query_embedding(query_text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``query_text``.

    Sends ``query_text`` to ``openai_client.embeddings.create`` using the given
    ``model`` and returns the ``embedding`` attribute of the first item in the
    response's ``data`` list.
    """
    embedding_response = openai_client.embeddings.create(input=query_text, model=model)
    first_item = embedding_response.data[0]
    return first_item.embedding

In [None]:
# Hybrid Attribute Extraction (Rule + LLM)
def _first_matching_label(patterns, text):
    """Return the label of the first regex in ``patterns`` that matches ``text``, else None."""
    return next(
        (label for label, pattern in patterns.items() if re.search(pattern, text)),
        None,
    )


def _parse_price_filter(content):
    """Turn the LLM reply into a dict holding only numeric ``min_price`` / ``max_price``.

    Anything that cannot be interpreted (empty reply, no JSON object, invalid
    JSON, non-numeric values, unrelated keys) is dropped, so the result is
    always safe to merge into the filter dict.
    """
    if not content:
        return {}

    # The reply may wrap the JSON in Markdown fences or extra prose; keep only
    # the span from the first "{" to the last "}".
    json_span = re.search(r'\{.*\}', content, flags=re.DOTALL)
    if json_span is None:
        return {}

    try:
        payload = json.loads(json_span.group(0))
    except json.JSONDecodeError:
        return {}
    if not isinstance(payload, dict):
        return {}

    price_filter = {}
    for key in ('min_price', 'max_price'):
        value = payload.get(key)
        if isinstance(value, str):
            # Accept numeric strings such as "5000" or "1,000".
            try:
                value = float(value.replace(',', ''))
            except ValueError:
                continue
        # bool is a subclass of int but is never a meaningful price.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            price_filter[key] = value
    return price_filter


def extract_filters(query_text):
    """Extract structured filters from a free-text shopping query.

    Gender and colour group are detected with case-insensitive keyword
    regexes (the first matching entry, in dict order, wins). The price range
    is extracted by asking ``gpt-4o`` through ``openai_client`` for a JSON
    object.

    Args:
        query_text: The raw user query.

    Returns:
        dict that may contain ``'gender'``, ``'color_group'``, ``'min_price'``
        and ``'max_price'``. Price values are always numbers; keys that were
        not detected are absent. The dict is empty when nothing was detected.

    Raises:
        Whatever ``openai_client.chat.completions.create`` raises; only the
        parsing of the reply is best-effort.
    """
    filters = {}

    # Rule-based gender extraction
    gender_patterns = {
        'Women': r'(?i)\b(women|woman|ladies|female)\b',
        'Men': r'(?i)\b(men|man|gentlemen|male)\b',
        'Boys': r'(?i)\b(boys|boy)\b',
        'Girls': r'(?i)\b(girls|girl)\b',
        'Unisex': r'(?i)\bunisex\b'
    }
    gender = _first_matching_label(gender_patterns, query_text)
    if gender is not None:
        filters['gender'] = gender

    # Rule-based color extraction (several shades map onto one colour group)
    color_patterns = {
        'Red': r'(?i)\b(red|burgundy|maroon)\b',
        'Blue': r'(?i)\b(blue|navy)\b',
        'Grey': r'(?i)\b(grey|silver|charcoal)\b',
        'Yellow': r'(?i)\b(yellow|gold|mustard|orange|khaki)\b',
        'Pink': r'(?i)\bpink\b',
        'Black': r'(?i)\bblack\b',
        'White': r'(?i)\bwhite\b'
    }
    color = _first_matching_label(color_patterns, query_text)
    if color is not None:
        filters['color_group'] = color

    # LLM-based price extraction
    system_prompt = """
        You are an e-commerce assistant. Extract price range from user query.
        Return JSON like: {"min_price": xxx, "max_price": yyy}.
        If only 'under 5000' is mentioned, return {"max_price": 5000}.
        If only 'above 1000' is mentioned, return {"min_price": 1000}.
        If no price is mentioned, return empty JSON.
        """

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Query: {query_text}"}
        ],
        temperature=0
    )
    content = response.choices[0].message.content

    # Merge only validated price bounds, so the reply can neither overwrite the
    # rule-based filters nor inject null / non-numeric values.
    filters.update(_parse_price_filter(content))

    return filters


In [None]:
# Search-Then-Rerank Pipeline (vector search first, then a soft filter bonus)
def _is_price_bound(value):
    """Return True when ``value`` is a real number usable as a price bound (bools excluded)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _add_filter_bonus(candidates, filters, bonus_weight):
    """Add ``filter_bonus`` and ``final_score`` columns to ``candidates`` and return it.

    Each filter that a row satisfies adds 1 to ``filter_bonus``;
    ``final_score`` is ``score + bonus_weight * filter_bonus``.
    """
    candidates['filter_bonus'] = 0

    if 'gender' in filters:
        candidates['filter_bonus'] += (candidates['gender'] == filters['gender']).astype(int)
    if 'color_group' in filters:
        candidates['filter_bonus'] += (candidates['color_group'] == filters['color_group']).astype(int)
    # Price bounds come from an LLM reply; skip None / non-numeric values
    # instead of letting the Series comparison raise TypeError.
    if _is_price_bound(filters.get('min_price')):
        candidates['filter_bonus'] += (candidates['price_inr'] >= filters['min_price']).astype(int)
    if _is_price_bound(filters.get('max_price')):
        candidates['filter_bonus'] += (candidates['price_inr'] <= filters['max_price']).astype(int)

    candidates['final_score'] = candidates['score'] + bonus_weight * candidates['filter_bonus']
    return candidates


def filter_search(query_text, prefetch_k=100, top_k=5, bonus_weight=0.1):
    """Run a vector search for ``query_text`` and print the re-ranked top results.

    The query is embedded with ``generate_query_embedding`` and compared by
    cosine similarity against every embedding in ``embedding_df``. The
    ``prefetch_k`` most similar rows become candidates. If ``extract_filters``
    finds any filters, each candidate gets ``bonus_weight`` added to its score
    for every filter it satisfies (a soft boost, not a hard filter), and the
    candidates are re-sorted by that final score.

    Args:
        query_text: The raw user query.
        prefetch_k: Number of nearest candidates kept before re-ranking.
        top_k: Number of rows printed.
        bonus_weight: Score added per satisfied filter.

    Returns:
        None. Results are printed.
    """
    print(f"\nIncoming query: {query_text}")

    # Step 1: Extract filters from user query (gender, color_group, price range)
    filters = extract_filters(query_text)
    print(f"Extracted filters: {filters}")

    # Step 2: Vector search over the full embedding space.
    # NOTE: the matrix is rebuilt from embedding_df on every call.
    query_embedding = generate_query_embedding(query_text)
    embeddings_all = np.vstack(embedding_df['embedding'].tolist()).astype(np.float32)
    query_vector = np.array(query_embedding).reshape(1, -1).astype('float32')
    similarities = cosine_similarity(query_vector, embeddings_all)
    top_indices = np.argsort(similarities[0])[::-1][:prefetch_k]

    # Select top candidates (copy so the new columns never touch embedding_df)
    candidates = embedding_df.iloc[top_indices].copy()
    candidates['score'] = similarities[0][top_indices]

    # Step 3: If no filters detected, return pure vector search results directly
    if not filters:
        print("\nNo filters detected, returning top candidates based on vector search scores.")
        final_results = candidates.sort_values('score', ascending=False).head(top_k)
        print("\nTop search results:")
        print(final_results[['product_id', 'product_name', 'product_brand', 
                              'gender', 'color_group', 'price_inr', 'score']])
        return

    # Step 4: Apply soft filter bonus and combine it with the similarity score
    candidates = _add_filter_bonus(candidates, filters, bonus_weight)

    # Step 5: Sort by final score
    final_results = candidates.sort_values('final_score', ascending=False).head(top_k)

    print("\nTop search results:")
    print(final_results[['product_id', 'product_name', 'product_brand', 'gender', 'color_group', 'price_inr', 'score', 'final_score']])

In [8]:
# Example run: the query contains a gender keyword ("unisex") and a price cap ("under 5k")
filter_search("Casual shoes under 5k for kids unisex")


Incoming query: Casual shoes under 5k for kids unisex
Extracted filters: {'gender': 'Unisex', 'max_price': 5000}

Top search results:
       product_id                                       product_name  \
10453    10248227             Puma Unisex Grey Leather Running Shoes   
804      10018015                         Puma Unisex Black Sneakers   
10289    10253211      Puma Unisex Grey Cappela IDP Slip-on Sneakers   
10163    10253231  Puma Unisex Maroon Auxius V2 IDP Printed Slip-...   
516      10018053                          Puma Unisex Blue Sneakers   

      product_brand  gender color_group  price_inr     score  final_score  
10453          Puma  Unisex        Grey     4674.0  0.831802     1.031802  
804            Puma  Unisex       Black     3199.0  0.825517     1.025517  
10289          Puma  Unisex        Grey     1979.0  0.825307     1.025307  
10163          Puma  Unisex         Red     2799.0  0.824423     1.024423  
516            Puma  Unisex        Blue     2999.0  