##  Basic Library imports

In [3]:
import os
import pandas as pd 
import numpy as np

##  Read Dataset

In [4]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

In [22]:
# Max possible sequence length
lengths = [len(i.split(' ')) for i in train["catalog_content"]]
Q1 = np.percentile(lengths, 25)
Q3 = np.percentile(lengths, 75)
IQR = Q3 - Q1

print(np.mean(lengths))
print(np.median(lengths))
print(f"First Quartile (Q1): {Q1}")
print(f"Third Quartile (Q3): {Q3}")
print(f"Interquartile Range (IQR): {IQR}")

141.84637333333333
97.0
First Quartile (Q1): 37.0
Third Quartile (Q3): 201.0
Interquartile Range (IQR): 164.0


In [3]:
from utils import download_images
download_images(train['image_link'], '../train_images')

 52%|█████▏    | 38792/75000 [01:11<01:00, 601.70it/s]

HTTP Error 404: Not Found


100%|██████████| 75000/75000 [02:09<00:00, 576.95it/s]


In [4]:
from utils import download_images
download_images(test['image_link'], '../test_images')

 56%|█████▌    | 41803/75000 [01:38<01:06, 500.34it/s]

HTTP Error 404: Not Found

 56%|█████▌    | 41907/75000 [01:38<01:34, 351.89it/s]




100%|██████████| 75000/75000 [02:46<00:00, 450.01it/s]


In [5]:
train[:5]

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [6]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.amp import autocast, GradScaler
from tqdm.autonotebook import tqdm
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

  from tqdm.autonotebook import tqdm


In [7]:
device = "cuda"
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [48]:
class ResidualBlock(nn.Module):
    def __init__(self, features):
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(features, features),
            nn.LayerNorm(features),
            nn.GELU(),
            nn.Dropout(0.05),
            nn.Linear(features, features)
        )
    
    def forward(self, x):
        return x + self.block(x)

In [50]:
class CLIPRegressor(nn.Module):
    def __init__(self, base_model_name="openai/clip-vit-base-patch32", num_residual_blocks=1):
        super().__init__()
        self.clip = CLIPModel.from_pretrained(base_model_name)
            
        # Initial projection layer
        self.initial_projection = nn.Linear(512 * 2, 64)
        
        # Stack residual blocks
        self.residual_blocks = nn.Sequential(
            *[ResidualBlock(64) for _ in range(num_residual_blocks)]
        )
        
        # Final output layer
        self.output_layer = nn.Linear(64, 1)

    def forward(self, text_inputs, image_inputs):
        outputs = self.clip(**text_inputs, **image_inputs)
        img_emb = outputs.image_embeds
        txt_emb = outputs.text_embeds
        fused = torch.cat([img_emb, txt_emb], dim=1)
        
        # Process through the regression head
        x = self.initial_projection(fused)
        x = self.residual_blocks(x)
        x = self.output_layer(x)
        
        return x

In [51]:
clip_regressor = CLIPRegressor().to(device)

In [10]:
# Freeze most layers except the last transformer block + projection layers
# for name, param in clip_regressor.clip.text_model.named_parameters():
#     if not any(x in name for x in ["encoder.layers.11", "final_layer_norm"]):
#         param.requires_grad = False

# for name, param in clip_regressor.clip.vision_model.named_parameters():
#     if not any(x in name for x in ["encoder.layers.11", "post_layernorm"]):
#         param.requires_grad = False

In [29]:
class CLIPRegressionDataset(Dataset):
    def __init__(self, csv_path, image_folder, processor_name="openai/clip-vit-base-patch32", max_length=77):
        self.df = pd.read_csv(csv_path)
        self.image_folder = image_folder
        self.processor = CLIPProcessor.from_pretrained(processor_name)
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # --- Text ---
        text = str(row["catalog_content"])
        if len(text.strip()) == 0:
            text = "No description available."

        # --- Image ---
        img_name = row["image_link"].split("/")[-1]
        img_path = os.path.join(self.image_folder, img_name)
        try:
            image = Image.open(img_path).convert("RGB")
        except Exception:
            # Fallback image (solid black) if corrupted or missing
            image = Image.new("RGB", (224, 224), (0, 0, 0))

        # --- Price target ---
        price = torch.tensor(float(row["price"]), dtype=torch.float32)

        # --- Preprocess with CLIP ---
        inputs = self.processor(
            text=[text],
            images=image,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Remove batch dim (CLIPProcessor returns 1-batch)
        text_inputs = {k: v.squeeze(0) for k, v in inputs.items() if k in ["input_ids", "attention_mask"]}
        image_inputs = {k: v.squeeze(0) for k, v in inputs.items() if k in ["pixel_values"]}

        return {
            "text_inputs": text_inputs,
            "image_inputs": image_inputs,
            "price": price,
        }

def collate_fn_train(batch):
    """Custom collator for batching CLIPProcessor outputs"""
    text = {
        "input_ids": torch.stack([b["text_inputs"]["input_ids"] for b in batch]),
        "attention_mask": torch.stack([b["text_inputs"]["attention_mask"] for b in batch]),
    }
    images = {
        "pixel_values": torch.stack([b["image_inputs"]["pixel_values"] for b in batch]),
    }
    prices = torch.stack([b["price"] for b in batch])
    return text, images, prices

def get_train_dataloader(csv_path, image_folder, batch_size=32, num_workers=4, shuffle=True):
    dataset = CLIPRegressionDataset(csv_path, image_folder)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=True,
        collate_fn=collate_fn_train,
    )
    return dataloader

In [34]:
df = pd.read_csv("/teamspace/studios/this_studio/student_resource/dataset/train.csv")
price_mean = df["price"].mean()
price_std = df["price"].std()
df["price"] = (df["price"] - price_mean) / price_std
df.to_csv("train_norm.csv", index=False)

In [30]:
train_dataloader = get_train_dataloader(
    csv_path="/teamspace/studios/this_studio/student_resource/src/train_norm.csv",
    image_folder="train_images",
    batch_size=32,
    num_workers=8
)

In [31]:
class SMAPELoss(torch.nn.Module):
    def __init__(self, eps=1e-2):
        super().__init__()
        self.eps = eps

    def forward(self, preds, targets):
        numerator = torch.abs(preds - targets)
        denominator = (torch.abs(targets) + torch.abs(preds)).clamp(min=self.eps)
        smape = numerator / (denominator / 2.0)
        return smape.mean()

smape_loss_fn = SMAPELoss()
l1_loss_fn = torch.nn.SmoothL1Loss()

def hybrid_loss(preds, targets, alpha=0.7):
    """Blend SMAPE (relative) and L1 (absolute) losses."""
    return alpha * smape_loss_fn(preds, targets) + (1 - alpha) * l1_loss_fn(preds, targets)

* Train from further checkpoints

In [52]:
# checkpoint = torch.load("clip_regressor_smape.pt", weights_only=False) # map_location=device
# price_mean, price_std = checkpoint["price_mean"], checkpoint["price_std"]

# clip_regressor = CLIPRegressor().to(device)
# clip_regressor.load_state_dict(checkpoint["model_state_dict"])

# freezing all the weights, only mlp will be trainable
for name, param in clip_regressor.clip.text_model.named_parameters():
    if not any(x in name for x in ["encoder.layers.11", "final_layer_norm"]):
        param.requires_grad = False

for name, param in clip_regressor.clip.vision_model.named_parameters():
    if not any(x in name for x in ["encoder.layers.11", "final_layer_norm"]):
        param.requires_grad = False

In [53]:
optimizer = AdamW(
    filter(lambda p: p.requires_grad, clip_regressor.parameters()),
    lr=5e-5, weight_decay=1e-6  # small weight decay for regularization
)

scaler = GradScaler('cuda')

In [54]:
epochs = 7  # typically enough for partial CLIP fine-tune
clip_regressor.train()

for epoch in range(epochs):
    running_loss = 0.0
    running_smape = 0.0

    pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}", dynamic_ncols=True)
    
    for step, (text_inputs, image_inputs, prices) in enumerate(pbar):
        text_inputs = {k: v.to(device, non_blocking=True) for k, v in text_inputs.items()}
        image_inputs = {k: v.to(device, non_blocking=True) for k, v in image_inputs.items()}
        prices = prices.to(device).unsqueeze(1)

        optimizer.zero_grad(set_to_none=True)

        with autocast(device_type='cuda', dtype=torch.bfloat16):
            preds = clip_regressor(text_inputs, image_inputs)
            loss = hybrid_loss(preds, prices)

        # Mixed precision backward
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(clip_regressor.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        # Compute running SMAPE in normalized space
        with torch.no_grad():
            batch_smape = smape_loss_fn(preds, prices).item() * 100
        running_smape += batch_smape

        if (step + 1) % 50 == 0:
            avg_loss = running_loss / (step + 1)
            avg_smape = running_smape / (step + 1)
            pbar.set_postfix({"Loss": f"{avg_loss:.4f}", "SMAPE%": f"{avg_smape:.2f}"})

    print(f"Epoch {epoch+1} done | Avg Loss: {avg_loss:.4f} | Avg SMAPE: {avg_smape:.2f}%")

Epoch 1/7:   0%|          | 0/2344 [00:00<?, ?it/s]

Epoch 1 done | Avg Loss: 0.6590 | Avg SMAPE: 84.31%


Epoch 2/7:   0%|          | 0/2344 [00:00<?, ?it/s]

Epoch 2 done | Avg Loss: 0.5961 | Avg SMAPE: 76.96%


Epoch 3/7:   0%|          | 0/2344 [00:00<?, ?it/s]

Epoch 3 done | Avg Loss: 0.5779 | Avg SMAPE: 74.76%


Epoch 4/7:   0%|          | 0/2344 [00:00<?, ?it/s]

Epoch 4 done | Avg Loss: 0.5679 | Avg SMAPE: 73.54%


Epoch 5/7:   0%|          | 0/2344 [00:00<?, ?it/s]

Epoch 5 done | Avg Loss: 0.5619 | Avg SMAPE: 72.80%


Epoch 6/7:   0%|          | 0/2344 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [35]:
# ------------------------------
# 💾 Save Model
# ------------------------------
torch.save({
    "model_state_dict": clip_regressor.state_dict(),
    "price_mean": price_mean,
    "price_std": price_std
}, "clip_regressor_smape.pt")

In [55]:
class CLIPTestDataset(Dataset):
    def __init__(self, csv_path, image_folder, processor_name="openai/clip-vit-base-patch32", max_length=77):
        self.df = pd.read_csv(csv_path)
        self.image_folder = image_folder
        self.processor = CLIPProcessor.from_pretrained(processor_name)
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = str(row["catalog_content"])
        if len(text.strip()) == 0:
            text = "No description available."

        img_name = row["image_link"].split("/")[-1]
        img_path = os.path.join(self.image_folder, img_name)
        try:
            image = Image.open(img_path).convert("RGB")
        except Exception:
            image = Image.new("RGB", (224, 224), (0, 0, 0))

        inputs = self.processor(
            text=[text],
            images=image,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        text_inputs = {k: v.squeeze(0) for k, v in inputs.items() if k in ["input_ids", "attention_mask"]}
        image_inputs = {k: v.squeeze(0) for k, v in inputs.items() if k in ["pixel_values"]}

        return {
            "sample_id": row["sample_id"],
            "text_inputs": text_inputs,
            "image_inputs": image_inputs
        }

def collate_fn(batch):
    text = {
        "input_ids": torch.stack([b["text_inputs"]["input_ids"] for b in batch]),
        "attention_mask": torch.stack([b["text_inputs"]["attention_mask"] for b in batch]),
    }
    images = {
        "pixel_values": torch.stack([b["image_inputs"]["pixel_values"] for b in batch]),
    }
    sample_ids = [b["sample_id"] for b in batch]
    return text, images, sample_ids

def get_test_dataloader(csv_path, image_folder, batch_size=32, num_workers=8):
    dataset = CLIPTestDataset(csv_path, image_folder)
    return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=True, collate_fn=collate_fn)

In [None]:
checkpoint = torch.load("clipreg_smape.pt", weights_only=False) # map_location=device
price_mean, price_std = checkpoint["price_mean"], checkpoint["price_std"]

clip_regressor = CLIPRegressor().to(device)
clip_regressor.load_state_dict(checkpoint["model_state_dclip_regressor"])

<All keys matched successfully>

In [56]:
test_dataloader = get_test_dataloader("/teamspace/studios/this_studio/student_resource/dataset/test.csv", "test_images", batch_size=32)

In [57]:
clip_regressor.eval()
predictions = []
sample_ids = []

with torch.no_grad():
    pbar = tqdm(test_dataloader, desc="Predicting", dynamic_ncols=True)
    for text_inputs, image_inputs, sids in pbar:
        text_inputs = {k: v.to(device, non_blocking=True) for k, v in text_inputs.items()}
        image_inputs = {k: v.to(device, non_blocking=True) for k, v in image_inputs.items()}

        with autocast(device_type='cuda',dtype=torch.bfloat16):
            preds = clip_regressor(text_inputs, image_inputs)

        preds = preds.squeeze(1).cpu()

        # Denormalize
        preds = preds * price_std + price_mean

        # Ensure non-negative prices
        preds = torch.clamp(preds, min=0.0)

        predictions.extend(preds.tolist())
        sample_ids.extend(sids)

submission_df = pd.DataFrame({
    "sample_id": sample_ids,
    "price": predictions
})

submission_df.to_csv("submission.csv", index=False)
print("✅ Saved predictions to submission.csv")


Predicting:   0%|          | 0/2344 [00:00<?, ?it/s]

✅ Saved predictions to submission.csv
