In [1]:
import os
import pandas as pd 
import numpy as np

In [2]:
from huggingface_hub import notebook_login

# Renders the Hugging Face login widget so this kernel holds a Hub token.
# NOTE(review): presumably required to download the checkpoints loaded further down — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# Both competition CSVs live in the same folder, one level above the notebook.
DATASET_FOLDER = '../dataset/'
train, test = (
    pd.read_csv(os.path.join(DATASET_FOLDER, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Number of space-separated tokens in every catalog entry, to gauge how long the texts are.
lengths = [len(entry.split(' ')) for entry in train["catalog_content"]]
Q1, Q3 = (np.percentile(lengths, pct) for pct in (25, 75))
IQR = Q3 - Q1

# Mean, median and maximum, one per line.
print(np.mean(lengths), np.median(lengths), max(lengths), sep="\n")
print(f"First Quartile (Q1): {Q1}")
print(f"Third Quartile (Q3): {Q3}")
print(f"Interquartile Range (IQR): {IQR}")

141.84637333333333
97.0
1322
First Quartile (Q1): 37.0
Third Quartile (Q3): 201.0
Interquartile Range (IQR): 164.0


In [5]:
# Summary statistics of the regression target (the `price` column of the training frame).
train["price"].describe()

count    75000.000000
mean        23.647654
std         33.376932
min          0.130000
25%          6.795000
50%         14.000000
75%         28.625000
max       2796.000000
Name: price, dtype: float64

In [14]:
import math
from pathlib import Path
import random
from typing import Dict, Any

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.amp.autocast_mode import autocast 
from torch.amp.grad_scaler import GradScaler
from torch.utils.data import Dataset, DataLoader, TensorDataset, Subset
# from sentence_transformers import SentenceTransformer

# from torchvision import transforms
# from PIL import Image
# from transformers import AutoProcessor, AutoModel
from tqdm.autonotebook import tqdm

In [4]:
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# training / inference cells still run on a machine without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Lower-case alias used by the embedding-extraction cells.
device = DEVICE

In [5]:
def set_seed(seed=0):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)


set_seed(0)

In [6]:
def smape(y_true, y_pred, eps=1e-8):
    """Return the symmetric mean absolute percentage error, in percent.

    Both inputs are copied into float64 NumPy arrays. ``eps`` is added to the
    denominator ``|y_true| + |y_pred|`` so a pair of zeros does not divide by zero.
    """
    actual = np.array(y_true).astype(np.float64)
    forecast = np.array(y_pred).astype(np.float64)
    abs_error = np.abs(actual - forecast)
    scale = np.abs(actual) + np.abs(forecast) + eps
    per_item = 2.0 * abs_error / scale
    return 100.0 * np.mean(per_item)

In [8]:
class ProductPriceDataset(Dataset):
    """Map-style dataset returning the raw product image and catalog text of one row.

    The frame must provide an ``image_link`` column (only the last ``/``-separated
    component is used, as a file name inside ``image_root``) and a
    ``catalog_content`` column. No tokenisation or image preprocessing happens
    here; items are returned raw so the processor can run on whole batches.

    NOTE: ``Image`` is PIL's module (``from PIL import Image``); that import is
    commented out in the imports cell and must be active for ``__getitem__``.
    """

    # Text substituted when a row has no usable catalog description.
    EMPTY_TEXT = "No description available."

    def __init__(self, df: pd.DataFrame, image_root: str):
        # A fresh 0..n-1 index keeps positional lookups independent of any filtering done upstream.
        self.df = df.reset_index(drop=True)
        self.image_root = image_root

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _clean_text(value) -> str:
        """Return ``value`` as text, or the placeholder when it is missing or blank."""
        # `value != value` is only true for NaN, which is how pandas stores a missing string;
        # without this check str(nan) would yield the literal text "nan".
        missing = (
            value is None
            or value is pd.NA
            or (isinstance(value, float) and value != value)
        )
        if missing:
            return ProductPriceDataset.EMPTY_TEXT
        text = str(value)
        if len(text.strip()) == 0:
            return ProductPriceDataset.EMPTY_TEXT
        return text

    def __getitem__(self, idx):
        """Return ``{'image': <RGB PIL image>, 'text': <str>}`` for row ``idx``."""
        row = self.df.iloc[idx]

        image = None
        link = row["image_link"]
        # A missing link (NaN) has no file name to look up; go straight to the fallback.
        if isinstance(link, str):
            img_path = os.path.join(self.image_root, link.split("/")[-1])
            try:
                # The context manager closes the file once the pixels are decoded by convert().
                with Image.open(img_path) as handle:
                    image = handle.convert("RGB")
            except Exception:
                # Deliberate best-effort: a missing or corrupted file must not abort extraction.
                image = None
        if image is None:
            # Fallback image (solid black) if corrupted or missing
            image = Image.new("RGB", (224, 224), (0, 0, 0))

        return {'image': image, 'text': self._clean_text(row['catalog_content'])}

In [9]:
def mycollate_fn(batch):
    """Gather per-sample dicts into one dict of parallel lists (nothing is stacked into tensors)."""
    collated = {'images': [], 'texts': []}
    for sample in batch:
        collated['images'].append(sample['image'])
        collated['texts'].append(sample['text'])
    return collated

In [19]:
# Raw (image, text) pairs of the training split, batched in file order for embedding extraction.
dataset_train = ProductPriceDataset(train, image_root='train_images')
train_loader = DataLoader(
    dataset_train,
    batch_size=100,
    shuffle=False,
    collate_fn=mycollate_fn,
)

In [20]:
# Raw (image, text) pairs of the test split, batched in file order for embedding extraction.
dataset_test = ProductPriceDataset(test, image_root='test_images')
test_loader = DataLoader(
    dataset_test,
    batch_size=100,
    shuffle=False,
    collate_fn=mycollate_fn,
)

In [11]:
# Frozen encoders used only for feature extraction: SigLIP for images, EmbeddingGemma for text.
# NOTE(review): AutoModel, AutoProcessor and SentenceTransformer are only imported in
# commented-out lines of the imports cell; those imports must be active on a fresh kernel.
# The weights are created on the CPU with flash_attention_2 requested, which triggers the
# warning printed below this cell; the model is moved to `device` on the next line.
siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224", attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
siglip_model.to(device)
emb_text = SentenceTransformer("google/embeddinggemma-300m").to(device)
# Image preprocessor matching the SigLIP checkpoint.
processor_i = AutoProcessor.from_pretrained("google/siglip-base-patch16-224", use_fast=True)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [21]:
def _extract_embeddings(loader):
    """Encode every batch of ``loader`` and return ``(image_embeddings, text_embeddings)``.

    ``loader`` must yield dicts with ``'images'`` and ``'texts'`` lists. Both
    results are float32 arrays with one row per sample, in loader order.
    """
    img_chunks, text_chunks = [], []
    with torch.inference_mode():
        for batch in tqdm(loader):
            pixel_inputs = processor_i(images=batch['images'], return_tensors="pt").to(device)
            img_embd = siglip_model.get_image_features(**pixel_inputs)
            # The image model runs in bfloat16, which NumPy cannot represent: upcast first.
            # No squeeze() here: a final batch holding a single sample must stay 2-D,
            # otherwise its vector would be flattened into individual scalars.
            img_chunks.append(img_embd.to(torch.float32).cpu().numpy())
            text_embd = emb_text.encode_document(batch['texts'])
            text_chunks.append(np.asarray(text_embd, dtype=np.float32))
    return np.concatenate(img_chunks), np.concatenate(text_chunks)


siglip_model.eval()
emb_text.eval()

# Arrays are stored as plain float32 (not pickled object arrays), so they can be read
# with or without allow_pickle.
img_embds_train, text_embds_train = _extract_embeddings(train_loader)
np.savez_compressed(
    "Train_embeddings_image_text.npz",
    img_embds_train=img_embds_train,
    text_embds_train=text_embds_train,
)

# Free the training embeddings before encoding the test split.
del img_embds_train, text_embds_train
print("Train Saved!")

img_embds_test, text_embds_test = _extract_embeddings(test_loader)
np.savez_compressed(
    "Test_embeddings_image_text.npz",
    img_embds_test=img_embds_test,
    text_embds_test=text_embds_test,
)

del img_embds_test, text_embds_test
print("Test Saved!")

  0%|          | 0/750 [00:00<?, ?it/s]

Train Saved!


  0%|          | 0/750 [00:00<?, ?it/s]

Test Saved!


* Using the extracted embeddings

In [7]:
# Loading from numpy npz
# Loading from numpy npz
# allow_pickle is needed for archives written as object arrays; only use it on files
# produced by this notebook, since unpickling can execute arbitrary code.
# The context manager closes the archive's file handle once both arrays are read.
with np.load("Train_embeddings_image_text.npz", allow_pickle=True) as loaded_data:
    img_train = loaded_data['img_embds_train']
    text_train = loaded_data['text_embds_train']
print(img_train.shape, text_train.shape)

(75000, 768) (75000, 768)


* gemini

In [7]:
# EMBED_DIM = 768
# LATENT_DIM = 512    # Dimension for projections and attention
# NUM_HEADS = 8       # MultiHeadAttention heads
# MLP_DIM = 1024      # Inner dimension for MLP blocks
# DROPOUT_RATE = 0.1
# BATCH_SIZE = 150
# LEARNING_RATE = 4e-5
# EPOCHS = 50         # Training for a fixed number of epochs

In [53]:
# Hyper-parameters of the fusion model and its training run.
EMBED_DIM = 768     # Width of each precomputed image / text embedding vector
LATENT_DIM = 512    # Dimension for projections and attention
NUM_HEADS = 8       # MultiHeadAttention heads
MLP_DIM = 2048      # Inner dimension for MLP blocks
DROPOUT_RATE = 0.1  # Dropout inside the MLP blocks
BATCH_SIZE = 150    # Batch size of the embedding DataLoaders
LEARNING_RATE = 4e-5  # Initial learning rate handed to the optimizer
EPOCHS = 100         # Default epoch count of the training functions (callers may override it)

In [65]:
def smape_metric(y_pred_raw, y_true_raw):
    """Return SMAPE (Symmetric Mean Absolute Percentage Error) in percent, as a float.

    Args:
        y_pred_raw: torch tensor of predicted prices on the raw (non-log) scale.
        y_true_raw: true raw prices; a tensor, NumPy array or plain sequence.
            It is converted to the dtype and device of the predictions.

    Predictions are clamped to a minimum of 0.13 before scoring.
    """
    # 0.13 is the smallest price in the training data (see the describe() output above).
    MIN_PRICE_CLAMP = 0.13
    y_pred = torch.clamp(y_pred_raw, min=MIN_PRICE_CLAMP)

    # as_tensor accepts arrays, lists and tensors alike and aligns dtype/device with the
    # predictions, so a CPU target scored against GPU predictions no longer fails.
    y_true = torch.as_tensor(y_true_raw, dtype=y_pred.dtype, device=y_pred.device)

    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0

    # 1e-7 keeps the ratio finite when both values are zero.
    score = 100 * torch.mean(numerator / (denominator + 1e-7))
    return score.item()

In [66]:
class MLPBlock(nn.Module):
    """Pre-norm residual feed-forward block: ``x + MLP(LayerNorm(x))``.

    The MLP expands ``in_features`` to ``mlp_dim``, applies GELU and dropout,
    then projects back to ``in_features`` so the residual sum is shape-compatible.
    """
    def __init__(self, in_features, mlp_dim, dropout_rate=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(in_features)
        expand = nn.Linear(in_features, mlp_dim)
        contract = nn.Linear(mlp_dim, in_features)
        self.net = nn.Sequential(expand, nn.GELU(), nn.Dropout(dropout_rate), contract)

    def forward(self, x):
        # Normalise first, transform, then add the untouched input back.
        update = self.net(self.norm(x))
        return x + update

In [67]:
class PriceFusionTransformer(nn.Module):
    """Bidirectional cross-attention fusion of one image and one text embedding.

    Each modality is projected to ``latent_dim`` and treated as a length-1
    sequence. Text attends to image and image attends to text; each result is
    added to its query and layer-normalised. The two fused vectors are
    concatenated, normalised, passed through two residual MLP blocks and mapped
    to a single value, the predicted ``log1p(price)``.

    Because every sequence holds exactly one token, each attention softmax runs
    over a single key.
    """
    def __init__(self, embed_dim=768, latent_dim=512, num_heads=8, mlp_dim=2048, dropout_rate=0.1):
        super().__init__()
        fused_dim = latent_dim * 2  # width after concatenating both modalities

        # Per-modality input projections into the shared latent space.
        self.img_proj = nn.Linear(embed_dim, latent_dim)
        self.text_proj = nn.Linear(embed_dim, latent_dim)

        # One attention module per direction (query modality -> key/value modality).
        self.cross_attn_img_to_text = nn.MultiheadAttention(latent_dim, num_heads, dropout=0.05, batch_first=True)
        self.cross_attn_text_to_img = nn.MultiheadAttention(latent_dim, num_heads, dropout=0.05, batch_first=True)

        # Post-attention norms, applied after the residual sum.
        self.norm_img = nn.LayerNorm(latent_dim)
        self.norm_text = nn.LayerNorm(latent_dim)

        # Head operating on the concatenated representation.
        self.fusion_norm = nn.LayerNorm(fused_dim)
        self.mlp_block1 = MLPBlock(fused_dim, mlp_dim, dropout_rate)
        self.mlp_block2 = MLPBlock(fused_dim, mlp_dim, dropout_rate)
        self.regression_head = nn.Linear(fused_dim, 1)

    def forward(self, img_emb, text_emb):
        # Project and add a length-1 sequence axis: B x 1 x L.
        img_tok = self.img_proj(img_emb).unsqueeze(1)
        text_tok = self.text_proj(text_emb).unsqueeze(1)

        # Text as query over the image token, then residual + post-norm.
        text_ctx, _ = self.cross_attn_text_to_img(text_tok, img_tok, img_tok)
        fused_text = self.norm_text(text_tok + text_ctx)

        # Image as query over the text token, then residual + post-norm.
        img_ctx, _ = self.cross_attn_img_to_text(img_tok, text_tok, text_tok)
        fused_img = self.norm_img(img_tok + img_ctx)

        # Drop the sequence axis and concatenate: B x (2 * L).
        joint = torch.cat([fused_text.squeeze(1), fused_img.squeeze(1)], dim=-1)
        joint = self.fusion_norm(joint)

        for block in (self.mlp_block1, self.mlp_block2):
            joint = block(joint)

        # B x 1 -> B; the value is log1p(price).
        return self.regression_head(joint).squeeze(1)

In [77]:
def prepare_data(img_train, text_train, prices):
    """Build the training DataLoader over the precomputed embeddings.

    Args:
        img_train: array of image embeddings, one row per sample.
        text_train: array of text embeddings, one row per sample.
        prices: raw prices; they are log1p-transformed to form the regression target.

    Returns:
        ``(dataloader, raw_prices)``: a non-shuffled DataLoader yielding
        ``(image, text, log1p(price))`` float32 batches of ``BATCH_SIZE``, and the
        untransformed prices as a NumPy array in the same order.
    """
    # np.log1p(x) == np.log(1 + x)
    targets_log = np.log1p(prices)

    def as_float_tensor(array):
        return torch.tensor(array.astype(np.float32), dtype=torch.float32)

    full_dataset = TensorDataset(
        as_float_tensor(img_train),
        as_float_tensor(text_train),
        torch.tensor(targets_log, dtype=torch.float32),
    )
    full_dataloader = DataLoader(full_dataset, batch_size=BATCH_SIZE, shuffle=False)

    return full_dataloader, np.array(prices)

In [78]:
def get_predictions_and_smape(model, data_loader, raw_prices):
    """Predict over ``data_loader`` and return the SMAPE against ``raw_prices``.

    The loader's own targets are ignored; predictions are compared positionally
    with ``raw_prices``, so the loader must iterate in the same order as that
    array. The model is switched to eval mode and left there.
    """
    model.eval()

    with torch.no_grad():
        # Each batch result is moved to the CPU before the final concatenation.
        log_chunks = [
            model(img_batch.to(DEVICE), text_batch.to(DEVICE)).cpu()
            for img_batch, text_batch, _ in data_loader
        ]

    # Undo the log1p target transform, then score on DEVICE.
    preds_raw = torch.expm1(torch.cat(log_chunks)).to(DEVICE)
    return smape_metric(preds_raw, raw_prices)

def calculate_smape_on_train_set(model, train_loader, prices_train_raw, val_loader=None, prices_val_raw=None):
    """Score ``model`` on the training data (and optionally a validation set), then resume training mode.

    Args:
        model: network to evaluate.
        train_loader: loader over the training samples.
        prices_train_raw: raw prices aligned with ``train_loader``.
        val_loader: optional loader over validation samples.
        prices_val_raw: raw prices aligned with ``val_loader``.

    Returns:
        The training SMAPE alone when no ``val_loader`` is given (the original
        behaviour), otherwise the tuple ``(train_smape, val_smape)`` that
        ``train_model`` unpacks.
    """
    train_smape = get_predictions_and_smape(model, train_loader, prices_train_raw)

    val_smape = None
    if val_loader is not None:
        val_smape = get_predictions_and_smape(model, val_loader, prices_val_raw)

    model.train()  # the scoring helper leaves the model in eval mode

    if val_loader is None:
        return train_smape
    return train_smape, val_smape

In [71]:
def train_model(dataloader, prices_raw_array, val_loader, price_val_raw, epochs=EPOCHS, lr=LEARNING_RATE):
    """Train a fresh PriceFusionTransformer and print train / validation SMAPE every epoch.

    Args:
        dataloader: yields ``(image_emb, text_emb, log1p_price)`` training batches.
        prices_raw_array: raw prices, in the order ``dataloader`` iterates.
        val_loader: loader over the validation samples (same batch layout).
        price_val_raw: raw prices, in the order ``val_loader`` iterates.
        epochs: number of passes over ``dataloader``.
        lr: initial learning rate.

    Returns:
        The trained model (left in eval mode by the last evaluation).
    """
    model = PriceFusionTransformer(
        embed_dim=EMBED_DIM, latent_dim=LATENT_DIM, num_heads=NUM_HEADS, mlp_dim=MLP_DIM, dropout_rate=DROPOUT_RATE
    ).to(DEVICE)

    # weight_decay=1e-8 makes the decoupled weight decay practically a no-op.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-8)

    # Smooth L1 on log1p(price): quadratic for errors below 0.1, linear above.
    loss_fn = nn.SmoothL1Loss(beta=0.1)

    # Cosine annealing with warm restarts: the LR decays towards eta_min over 20 epochs,
    # then jumps back to `lr` for a cycle twice as long. There is no warm-up phase.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, T_mult=2, eta_min=1e-8)

    print(f"Starting training on full dataset for {epochs} epochs...")

    for epoch in range(epochs):
        model.train()  # evaluation at the end of the previous epoch switched to eval mode
        total_loss = 0

        for img_batch, text_batch, y_true_log in dataloader:
            img_batch = img_batch.to(DEVICE)
            text_batch = text_batch.to(DEVICE)
            y_true_log = y_true_log.to(DEVICE)

            optimizer.zero_grad()

            # Forward pass: model outputs log1p(price)
            y_pred_log = model(img_batch, text_batch)
            loss = loss_fn(y_pred_log, y_true_log)
            loss.backward()

            # Cap the global gradient norm to keep individual updates bounded.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Weight by batch size so a smaller last batch is averaged correctly.
            total_loss += loss.item() * img_batch.size(0)

        scheduler.step()
        avg_loss = total_loss / len(dataloader.dataset)

        # Score both sets directly; the previous call passed five arguments to a
        # three-argument helper and unpacked its single float into two names.
        train_smape = get_predictions_and_smape(model, dataloader, prices_raw_array)
        val_smape = get_predictions_and_smape(model, val_loader, price_val_raw)
        print(f"Epoch: {epoch+1:5d} | {avg_loss:.4f} | {train_smape:.4f}% | {val_smape:.4f} | {scheduler.get_last_lr()[0]:.8f}")

    print("Training complete.")
    return model

In [79]:
def train_model_no_val(dataloader, prices_train_raw, epochs=EPOCHS, lr=LEARNING_RATE):
    """Train a fresh PriceFusionTransformer on all data and print the training SMAPE every epoch.

    Args:
        dataloader: yields ``(image_emb, text_emb, log1p_price)`` training batches.
        prices_train_raw: raw prices, in the order ``dataloader`` iterates.
        epochs: number of passes over ``dataloader``.
        lr: initial learning rate.

    Returns:
        The trained model (left in eval mode by the last evaluation).
    """
    model = PriceFusionTransformer(
        embed_dim=EMBED_DIM, latent_dim=LATENT_DIM, num_heads=NUM_HEADS, mlp_dim=MLP_DIM, dropout_rate=DROPOUT_RATE
    ).to(DEVICE)

    # weight_decay=1e-8 makes the decoupled weight decay practically a no-op.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-8)

    # Smooth L1 on log1p(price): quadratic for errors below 0.1, linear above.
    loss_fn = nn.SmoothL1Loss(beta=0.1)

    # Cosine annealing with warm restarts: the LR decays towards eta_min over 20 epochs,
    # then jumps back to `lr` for a cycle twice as long. There is no warm-up phase.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, T_mult=2, eta_min=1e-8)

    print(f"Starting training on full dataset for {epochs} epochs...")

    for epoch in range(epochs):
        model.train()  # evaluation at the end of the previous epoch switched to eval mode
        total_loss = 0

        for img_batch, text_batch, y_true_log in dataloader:
            img_batch = img_batch.to(DEVICE)
            text_batch = text_batch.to(DEVICE)
            y_true_log = y_true_log.to(DEVICE)

            optimizer.zero_grad()

            # Forward pass: model outputs log1p(price)
            y_pred_log = model(img_batch, text_batch)
            loss = loss_fn(y_pred_log, y_true_log)
            loss.backward()

            # Cap the global gradient norm to keep individual updates bounded.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Weight by batch size so a smaller last batch is averaged correctly.
            total_loss += loss.item() * img_batch.size(0)

        scheduler.step()
        avg_loss = total_loss / len(dataloader.dataset)

        # Evaluate on the loader that was passed in, not on whatever the notebook-level
        # `train_loader` happens to be bound to at call time.
        train_smape = get_predictions_and_smape(model, dataloader, prices_train_raw)

        print(f"Epoch: {epoch+1:5d} | {avg_loss:.4f} | {train_smape:.4f}% | {scheduler.get_last_lr()[0]:.8f}")

    print("Training complete.")
    return model

In [80]:
def predict_test_set(model, img_test, text_test):
    """Return the predicted raw prices for the test embeddings as a Python list.

    Both embedding arrays are converted to float32 tensors on ``DEVICE`` and run
    through ``model`` in batches of ``BATCH_SIZE`` without shuffling, so the
    output order matches the input rows. The model's ``log1p(price)`` outputs
    are inverted with ``expm1`` and floored at 0.13.
    """
    MIN_PRICE = 0.13  # lower bound applied to every prediction

    def to_device_tensor(array):
        return torch.tensor(array.astype(np.float32), dtype=torch.float32).to(DEVICE)

    test_dataloader = DataLoader(
        TensorDataset(to_device_tensor(img_test), to_device_tensor(text_test)),
        batch_size=BATCH_SIZE,
        shuffle=False,
    )

    model.eval()
    with torch.no_grad():
        log_chunks = [
            model(img_batch, text_batch).cpu().numpy()
            for img_batch, text_batch in test_dataloader
        ]

    # price = expm1(log1p(price)); np.expm1(x) == np.exp(x) - 1
    prices = np.expm1(np.concatenate(log_chunks))
    return np.maximum(prices, MIN_PRICE).tolist()

In [81]:
# Raw (untransformed) training prices as a plain Python list; prepare_data applies log1p to them.
prices_data = train["price"].tolist()
# full_dataloader, full_dataset, prices_raw_array, val_set, prices_val_raw = prepare_data_for_kfold(img_train, text_train, prices_data, part_val_indices=, BATCH_SIZE)

In [None]:
from sklearn.model_selection import train_test_split

# Repeated 10% hold-out evaluation.
# NOTE(review): prepare_data_for_kfold is not defined in the visible part of this
# notebook; it must be defined before this cell can run.
all_indices = np.arange(len(train))

for i in range(5):
    print(f"Split: {i}")
    # Vary the seed per repetition; a constant random_state would reproduce the exact
    # same validation indices in all five iterations.
    _, part_val_indices = train_test_split(
        all_indices,
        test_size=0.1,
        random_state=42 + i,
        shuffle=True,
    )
    full_dataloader, prices_raw_array, val_loader, prices_val_raw = prepare_data_for_kfold(
        img_train, text_train, prices_data, part_val_indices=part_val_indices, BATCH_SIZE=BATCH_SIZE
    )
    model = train_model(full_dataloader, prices_raw_array, val_loader, prices_val_raw, epochs=EPOCHS, lr=LEARNING_RATE)

In [82]:
# NOTE: this rebinds `train_loader`, which earlier referred to the raw image/text DataLoader
# used for embedding extraction; from here on it is the loader over precomputed embeddings.
train_loader, prices_train_raw = prepare_data(img_train, text_train, prices_data)

In [88]:
# Final fit on every training row; epochs=60 overrides the EPOCHS default.
# NOTE(review): 60 = 20 + 40, presumably chosen to stop at the end of the second cosine cycle — confirm.
model = train_model_no_val(train_loader, prices_train_raw, epochs=60, lr=LEARNING_RATE)

Starting training on full dataset for 60 epochs...


Epoch:     1 | 0.6685 | 66.1939% | 0.00003975
Epoch:     2 | 0.6185 | 70.0640% | 0.00003902
Epoch:     3 | 0.6004 | 61.9260% | 0.00003782
Epoch:     4 | 0.5906 | 61.2323% | 0.00003618
Epoch:     5 | 0.5860 | 61.7868% | 0.00003414
Epoch:     6 | 0.5833 | 60.4732% | 0.00003176
Epoch:     7 | 0.5794 | 60.5352% | 0.00002908
Epoch:     8 | 0.5762 | 60.7629% | 0.00002618
Epoch:     9 | 0.5745 | 60.1886% | 0.00002313
Epoch:    10 | 0.5707 | 59.9626% | 0.00002001
Epoch:    11 | 0.5650 | 59.4160% | 0.00001688
Epoch:    12 | 0.5591 | 58.8740% | 0.00001383
Epoch:    13 | 0.5533 | 58.5364% | 0.00001093
Epoch:    14 | 0.5492 | 58.2907% | 0.00000825
Epoch:    15 | 0.5447 | 58.4875% | 0.00000587
Epoch:    16 | 0.5414 | 57.9117% | 0.00000383
Epoch:    17 | 0.5382 | 57.5221% | 0.00000219
Epoch:    18 | 0.5345 | 57.3324% | 0.00000099
Epoch:    19 | 0.5323 | 57.1632% | 0.00000026
Epoch:    20 | 0.5311 | 57.1373% | 0.00004000
Epoch:    21 | 0.5551 | 59.9035% | 0.00003994
Epoch:    22 | 0.5497 | 61.6703% |

In [84]:
# allow_pickle is needed for archives written as object arrays; only use it on files
# produced by this notebook, since unpickling can execute arbitrary code.
# The context manager closes the archive's file handle once both arrays are read.
with np.load("Test_embeddings_image_text.npz", allow_pickle=True) as loaded_data:
    img_test = loaded_data['img_embds_test']
    text_test = loaded_data['text_embds_test']
print(img_test.shape, text_test.shape)

(75000, 768) (75000, 768)


In [89]:
# Raw-scale price predictions for the test embeddings, as a list in row order.
preds = predict_test_set(model, img_test, text_test)

In [90]:
# Submission frame: everything in `test` except the two feature columns, plus the predictions.
test_sub = (
    test
    .drop(columns=['catalog_content', 'image_link'])
    .assign(price=preds)
)
test_sub.head(5)

Unnamed: 0,sample_id,price
0,100179,15.122637
1,245611,15.063874
2,146263,22.577444
3,95658,6.786475
4,36806,22.5112


In [91]:
# Write the submission without the DataFrame index column.
test_sub.to_csv("sub3.csv", index=False)

In [92]:
# Persist only the weights; reloading requires rebuilding PriceFusionTransformer with the same hyper-parameters.
torch.save(model.state_dict(), "model2.pth")