In [61]:
import os
import pandas as pd
import torch
import clip
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import joblib
import chromadb

### Let's first check that an image exists for every product ID

In [28]:

# Load the product table that drives the rest of the notebook.
df = pd.read_csv('Fashion_Dataset_V2.csv')

# Directory scanned below for product image files.
image_folder = "images/"


In [33]:
# Collect the ids that have a usable image file. Only "<id>.jpg" files count,
# because FashionDataset opens f"{p_id}.jpg"; a file with any other extension
# would pass a stem-only check and then fail when the dataset tries to load it.
existing_images = {
    stem
    for stem, ext in map(os.path.splitext, os.listdir(image_folder))
    if ext == ".jpg"
}

# p_id is stored as a float (e.g. 17048614.0); cast through int so the string
# form matches the file stem exactly.
df['image_exists'] = df['p_id'].astype(int).astype(str).isin(existing_images)

print(f"Products with missing images: {len(df) - df['image_exists'].sum()}")

Products with missing images: 0


In [30]:
# Table dimensions as (rows, columns).
df.shape

(14220, 9)

In [31]:
# Keep only the products flagged as having an image, then renumber the rows
# from 0 so positional and label indexing agree.
has_image = df['image_exists']
df = df.loc[has_image].reset_index(drop=True)

In [32]:
# Preview the first rows to inspect the available columns.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,p_id,price,brand,log_price,cleaned_description,final_text,image_exists
0,0,0,17048614.0,5099.0,Khushal K,8.536996,Black printed Kurta with Palazzos with dupatta...,A Khushal K Women Black Ethnic Motifs Printed ...,True
1,1,1,16524740.0,5899.0,InWeave,8.682708,Orange solid Kurta with Palazzos with dupattaK...,A InWeave Women Orange Solid Kurta with Palazz...,True
2,2,2,16331376.0,4899.0,Anubhutee,8.49699,Navy blue embroidered Kurta with Trousers with...,A Anubhutee Women Navy Blue Ethnic Motifs Embr...,True
3,3,3,14709966.0,3699.0,Nayo,8.216088,Red printed kurta with trouser and dupattaKurt...,A Nayo Women Red Floral Printed Kurta With Tro...,True
4,4,4,11056154.0,1350.0,AHIKA,7.2086,"Black and green printed straight kurta, has a ...",A AHIKA Women Black & Green Printed Straight K...,True


In [23]:
# Drop the stray 'Unnamed: 0.1' column (presumably a leftover index from an
# earlier CSV export). errors='ignore' keeps this cell re-runnable: without it,
# running the cell a second time raises KeyError because the column is gone.
df = df.drop(columns=['Unnamed: 0.1'], errors='ignore')

### 1. Setup and Model Loading

In [25]:
# Pick the compute backend in priority order: Apple MPS, then CUDA, then CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Load the CLIP ViT-B/32 weights together with the matching image transform.
model, preprocess = clip.load("ViT-B/32", device=device)
print(f"Using device: {device}")

100%|███████████████████████████████████████| 338M/338M [01:02<00:00, 5.69MiB/s]


Using device: mps


In [26]:
# Echo the loaded CLIP model to inspect its layer structure.
model

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

### Dataloader

In [34]:
class FashionDataset(Dataset):
    """Pairs each product image with its tokenized description for CLIP.

    Args:
        df: Product table. Must provide a ``p_id`` column (used to build the
            image file name) and a ``final_text`` column (tokenized up front).
        img_path: Directory holding one ``<p_id>.jpg`` file per product.
        preprocess: Callable applied to the opened PIL image; its result is
            returned as the image half of each sample.
    """

    def __init__(self, df, img_path, preprocess):
        self.df = df
        self.img_path = img_path
        self.preprocess = preprocess
        # Tokenize every description once here rather than per sample.
        # truncate=True is passed so over-long descriptions are cut down by
        # the tokenizer instead of raising.
        self.texts = clip.tokenize(df['final_text'].tolist(), truncate=True)

    def __len__(self):
        """Return the number of products in the table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(preprocessed_image, token_tensor)`` for row ``idx``."""
        # p_id is stored as a float; cast to int so the file name is
        # "17048614.jpg" rather than "17048614.0.jpg". Selecting the column
        # first avoids building a whole mixed-dtype row Series per sample.
        img_id = int(self.df['p_id'].iloc[idx])
        image_file = os.path.join(self.img_path, f"{img_id}.jpg")

        # Image.open keeps the file handle open until the image is closed.
        # Close it deterministically once preprocessing has consumed the
        # pixels, so iterating the dataset does not leak one handle per item.
        with Image.open(image_file) as img:
            image = self.preprocess(img)

        text = self.texts[idx]
        return image, text

In [39]:
# Wrap the product table in a dataset and iterate it in fixed order, so the
# rows of the resulting embeddings line up with the rows of df.
dataset = FashionDataset(df=df, img_path="images/", preprocess=preprocess)
loader = DataLoader(dataset=dataset, batch_size=32, shuffle=False)

### Embedding Loop

In [44]:
# Some of the images in our folder are truncated/corrupted. Nothing is removed
# here: this flag tells Pillow to load whatever data a truncated file contains
# instead of raising, so the embedding loop below does not abort on them.

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [45]:
# Encode every batch with CLIP and collect the unit-length features on the CPU.
image_embeddings, text_embeddings = [], []

model.eval()
with torch.no_grad():
    for images, texts in tqdm(loader):
        img_features = model.encode_image(images.to(device))
        txt_features = model.encode_text(texts.to(device))

        # Scale each row to unit L2 norm so dot products equal cosine similarity.
        img_features = img_features / img_features.norm(dim=-1, keepdim=True)
        txt_features = txt_features / txt_features.norm(dim=-1, keepdim=True)

        image_embeddings.append(img_features.cpu().numpy())
        text_embeddings.append(txt_features.cpu().numpy())

# Stack the per-batch arrays row-wise into one matrix per modality.
image_embeddings = np.concatenate(image_embeddings, axis=0)
text_embeddings = np.concatenate(text_embeddings, axis=0)

print(f"Generated {image_embeddings.shape[0]} embeddings.")

  0%|          | 0/445 [00:00<?, ?it/s]

100%|██████████| 445/445 [05:02<00:00,  1.47it/s]

Generated 14211 embeddings.





In [46]:
# Write both embedding matrices to disk as .npy files.
for _fname, _matrix in (
    ('image_embeddings.npy', image_embeddings),
    ('text_embeddings.npy', text_embeddings),
):
    np.save(_fname, _matrix)

In [47]:
# (number of embedded products, embedding width) of the image matrix.
image_embeddings.shape

(14211, 512)

## Feature Fusion: Concatenation vs. Addition

Now that we have generated 512-dimensional embeddings for both the **Images** and the **Text**, we must combine them into a single "Master Vector" for our KNN model.

### 1. Why Concatenation over Addition?
While CLIP maps images and text to the same coordinate space, they represent different "perspectives" of the product.
* **Addition ($V_{img} + V_{txt}$):** Acts like an average. If the image is highly detailed but the text is generic, adding them "waters down" the visual signal.
* **Concatenation ($[V_{img}, V_{txt}]$):** Keeps the visual and textual data in separate "feature lanes." This allows the KNN model to calculate distances based on both modalities independently. If two items look similar but have different descriptions, the concatenated vector preserves that distinction.

### 2. Weighted Vectors (Visual Primacy)
In fashion recommendation, **visual style** is often a stronger driver of similarity than the written description. 
* By applying a **Weighting Factor** (here, multiplying the Image Vector by 1.5), we effectively increase the "spread" of visual features in the vector space.
* This forces the KNN model to prioritize items that *look* similar, using the text and price as secondary refining features.

### 3. Integrating Structured Metadata (Price)
Since we used `MinMaxScaler` on the `log_price`, the price is now a value between 0 and 1. By appending it to the end of our 1024-dimensional CLIP vector, we ensure that the price acts as a final "nudge" in the recommendation, favoring products in a similar price bracket.

In [62]:
# Per-modality weights for the fused vector. The image block is up-weighted so
# visual similarity contributes more to the distance than text or price.
IMAGE_WEIGHT = 1.5
TEXT_WEIGHT = 1.0
PRICE_WEIGHT = 1.0

# Normalize the log price -- MinMaxScaler maps it into the [0, 1] range.
scaler = MinMaxScaler()
price_features = scaler.fit_transform(df[['log_price']])

# The three blocks are joined row by row, so they must describe the same
# products in the same order. Fail with a clear message if df was reloaded or
# re-filtered after the embeddings were computed.
if not (len(image_embeddings) == len(text_embeddings) == len(price_features)):
    raise ValueError(
        "Row count mismatch: "
        f"{len(image_embeddings)} image embeddings, "
        f"{len(text_embeddings)} text embeddings, "
        f"{len(price_features)} price rows. "
        "Recompute the embeddings from the current df before fusing."
    )

# Concatenate: [Image(512) + Text(512) + Price(1)] = 1025 dimensions
master_vectors = np.hstack([
    image_embeddings * IMAGE_WEIGHT,
    text_embeddings * TEXT_WEIGHT,
    price_features * PRICE_WEIGHT,
])

print("Fusion Complete!")
print(f"Final Feature Matrix Shape: {master_vectors.shape}")

Fusion Complete!
Final Feature Matrix Shape: (14211, 1025)


In [60]:
# Persist the fitted price scaler to disk (presumably so the same scaling can
# be re-applied to new prices later -- confirm against the consumer).
joblib.dump(scaler, 'price_scaler.pkl')

['price_scaler.pkl']

In [None]:
# Write the fused [image | text | price] matrix to disk as a .npy file.
np.save('master_vectors.npy', master_vectors)

### Make ChromaDB

In [64]:
# List the columns currently in df before choosing which to store as metadata.
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'p_id', 'price', 'brand', 'log_price',
       'cleaned_description', 'final_text', 'image_exists'],
      dtype='object')

In [65]:
# 1. Open (or create) the on-disk collection, configured for cosine space
client = chromadb.PersistentClient(path="./fashion_vector_db")
collection = client.get_or_create_collection(name="multimodal_fashion", metadata={"hnsw:space": "cosine"})

# 2. Prepare Data (IDs must be strings)
ids = df['p_id'].astype(int).astype(str).tolist()
# Convert only necessary metadata to save space
metadatas = df[['brand', 'price', 'final_text']].to_dict('records')

# ids/metadata come from df while the vectors were computed earlier; batches
# are paired by position, so a length mismatch would attach vectors to the
# wrong products or silently skip trailing rows. Stop before writing anything.
if len(ids) != len(master_vectors):
    raise ValueError(
        f"df has {len(ids)} rows but master_vectors has {len(master_vectors)}; "
        "rebuild master_vectors from the current df before ingesting."
    )

# 3. Batch Ingestion (ChromaDB works best with batches of ~1000)
BATCH_SIZE = 1000
for i in tqdm(range(0, len(ids), BATCH_SIZE)):
    batch_ids = ids[i : i + BATCH_SIZE]
    batch_vectors = master_vectors[i : i + BATCH_SIZE].tolist() # Must be a list
    batch_metadata = metadatas[i : i + BATCH_SIZE]

    # upsert (rather than add) so re-running this cell overwrites existing
    # entries with the same id instead of failing on them.
    collection.upsert(
        ids=batch_ids,
        embeddings=batch_vectors,
        metadatas=batch_metadata
    )

print(f"✅ Ingestion Complete! Total items: {collection.count()}")

100%|██████████| 15/15 [00:02<00:00,  6.27it/s]

✅ Ingestion Complete! Total items: 14211



