In [8]:
import pandas as pd
# Read the product CSV export into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="amazon_com_ecommerce.csv")

In [9]:
# Drop columns that contain no data at all.
df = df.dropna(axis=1, how="all")
# Drop an additional column that is not useful downstream.
# errors="ignore" keeps this cell re-runnable: `df` is rebound here, so on a
# second execution the column is already gone and a plain drop() would raise
# KeyError.
df = df.drop(columns=["Upc Ean Code"], errors="ignore")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Uniq Id                10002 non-null  object
 1   Product Name           10002 non-null  object
 2   Category               9172 non-null   object
 3   Selling Price          9895 non-null   object
 4   Model Number           8230 non-null   object
 5   About Product          9729 non-null   object
 6   Product Specification  8370 non-null   object
 7   Technical Details      9212 non-null   object
 8   Shipping Weight        8864 non-null   object
 9   Product Dimensions     479 non-null    object
 10  Image                  10002 non-null  object
 11  Variants               2478 non-null   object
 12  Product Url            10002 non-null  object
 13  Is Amazon Seller       10002 non-null  object
dtypes: object(14)
memory usage: 1.1+ MB


In [10]:
def combine_text(row):
    """Join a product's name and description into a single text string.

    Parameters
    ----------
    row : mapping-like
        Any object exposing ``.get(key, default)`` (a DataFrame row
        ``Series`` or a plain ``dict``). The keys ``"Product Name"`` and
        ``"About Product"`` are read; a missing key counts as empty.

    Returns
    -------
    str
        The non-missing, non-blank fields joined by a single space, in the
        order name then description. Empty string if neither is usable.
    """
    parts = []
    for key in ("Product Name", "About Product"):
        value = row.get(key, "")
        # Skip None / NaN before treating the value as text.
        if pd.isna(value):
            continue
        # Convert first: calling .strip() directly on a non-string value
        # (e.g. a numeric cell) would raise AttributeError.
        text = str(value)
        if text.strip():
            # Keep the text as-is; stripping is only used to detect blanks.
            parts.append(text)
    return " ".join(parts)

# Build the "text" column by running combine_text over every row, then print
# the column summary to confirm the new column is present.
df = df.assign(text=df.apply(combine_text, axis=1))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Uniq Id                10002 non-null  object
 1   Product Name           10002 non-null  object
 2   Category               9172 non-null   object
 3   Selling Price          9895 non-null   object
 4   Model Number           8230 non-null   object
 5   About Product          9729 non-null   object
 6   Product Specification  8370 non-null   object
 7   Technical Details      9212 non-null   object
 8   Shipping Weight        8864 non-null   object
 9   Product Dimensions     479 non-null    object
 10  Image                  10002 non-null  object
 11  Variants               2478 non-null   object
 12  Product Url            10002 non-null  object
 13  Is Amazon Seller       10002 non-null  object
 14  text                   10002 non-null  object
dtypes: object(15)
memor

In [11]:
import os
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm

os.makedirs("images", exist_ok=True) # create images folder

valid_entries = []     # one {"image_path", "text"} record per usable image
failed_downloads = []  # (row index, url, error repr) for every skipped row

for idx, row in tqdm(df.iterrows(), total=len(df)):
    url = row["Image"]
    text = row["text"]
    filename = f"images/{row['Uniq Id']}.jpg"

    # Reuse images saved by an earlier run so that re-executing this cell
    # does not repeat the whole (slow) download.
    if os.path.exists(filename):
        valid_entries.append({"image_path": filename, "text": text})
        continue

    try:
        response = requests.get(url, timeout=10)
        # Reject HTTP error responses explicitly instead of relying on the
        # image decoder to choke on an error page.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
        # Write to a temporary name and rename afterwards, so an interrupted
        # save never leaves a truncated file that the cache check above
        # would later accept as valid.
        tmp_path = filename + ".part"
        image.save(tmp_path, format="JPEG")
        os.replace(tmp_path, filename)
    except Exception as e:
        # Best-effort: skip broken URLs / undecodable images, but keep a
        # record so failures are visible rather than silently dropped.
        failed_downloads.append((idx, url, repr(e)))
        continue

    valid_entries.append({"image_path": filename, "text": text})

# Convert to DataFrame
train_df = pd.DataFrame(valid_entries)
print(f"Usable images: {len(valid_entries)}; skipped rows: {len(failed_downloads)}")


100%|██████████| 10002/10002 [1:16:27<00:00,  2.18it/s] 


In [12]:
# save train data to local
train_df.to_csv("clip_train_data.csv", index=False)

# load back train data
# Both columns hold text. dtype=str with keep_default_na=False stops pandas
# from converting empty strings or literal values such as "NA"/"null" into
# float NaN on reload, so every `text` value stays a str.
train_df = pd.read_csv("clip_train_data.csv", dtype=str, keep_default_na=False)

# finetune

In [17]:
from torch.utils.data import Dataset
from PIL import Image
from transformers import CLIPProcessor

# Define PyTorch Dataset
class ProductDataset(Dataset):
    """Image/caption pairs backed by a DataFrame and encoded by a processor.

    ``dataframe`` is indexed positionally and must provide ``image_path``
    and ``text`` columns. ``processor`` is called with one caption and one
    RGB image per item and its result is returned unchanged.
    """

    def __init__(self, dataframe, processor):
        self.processor = processor
        self.data = dataframe

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        path = record["image_path"]
        caption = record["text"]

        rgb_image = Image.open(path).convert("RGB")

        # Pad/truncate the caption to a fixed length of 77 tokens.
        encode_options = {
            "return_tensors": "pt",
            "padding": "max_length",
            "truncation": True,
            "max_length": 77,
        }
        # The caption is passed as a one-element list, exactly as before.
        return self.processor(text=[caption], images=rgb_image, **encode_options)

# Load Pretrained CLIP Model
from transformers import CLIPModel, CLIPProcessor

# Model and processor are loaded from the same pretrained checkpoint so the
# preprocessing matches the weights.
checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(checkpoint)
processor = CLIPProcessor.from_pretrained(checkpoint)


In [19]:
# Fine-Tuning Loop
import torch
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
# Every batch below is moved to `device`, so the model has to live there as
# well; otherwise the forward pass fails on a CUDA machine with a
# device-mismatch error.
model.to(device)

dataset = ProductDataset(train_df, processor)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)

model.train()

for epoch in range(3):  # adjust epochs as needed
    total_loss = 0.0
    for batch in dataloader:
        # Each dataset item carries its own leading dimension of size 1, so
        # after collation it sits at dim 1. Remove it to get plain batches.
        input_ids = batch["input_ids"].squeeze(1).to(device)
        attention_mask = batch["attention_mask"].squeeze(1).to(device)
        pixel_values = batch["pixel_values"].squeeze(1).to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            return_loss=True  # This enables contrastive loss automatically
        )

        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Guard the average against an empty dataloader (no usable images), which
    # would otherwise raise ZeroDivisionError.
    avg_loss = total_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch+1} — Loss: {avg_loss:.4f}")



Epoch 1 — Loss: 0.1067
Epoch 2 — Loss: 0.0423
Epoch 3 — Loss: 0.0361


In [20]:
# save CLIP
# Write the fine-tuned model first, then its processor, into the same
# directory.
for artifact in (model, processor):
    artifact.save_pretrained("clip-finetuned")

# load back
# from transformers import CLIPModel, CLIPProcessor

# model = CLIPModel.from_pretrained("clip-finetuned")
# processor = CLIPProcessor.from_pretrained("clip-finetuned")


[]