In [2]:
!pip install transformers==4.30.2

Collecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.30.2)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.2)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.35.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manyl

In [5]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-tran

In [10]:
%%writefile ddp_price_prediction_final_stable.py

import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# 🔄 CHANGED: Removed Bert specific imports, added SentenceTransformer and kept the scheduler
from sentence_transformers import SentenceTransformer
from transformers import get_linear_schedule_with_warmup
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import os
import pickle

# DDP Imports
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

# Mixed Precision Imports
from torch.cuda.amp import GradScaler, autocast

# ============== 1. CONFIGURATION ==============
# 🔄 CHANGED: Updated the model name
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
MAX_LENGTH = 64
BATCH_SIZE = 128
EPOCHS = 50
LEARNING_RATE = 2e-5
SAVE_DIR = './saved_models'
EMBEDDINGS_PATH = '/kaggle/input/embeddings/embeddings/embeddings_images.pkl'
ID_TO_IGNORE = '279285'
PRETRAINED_MODEL_PATH = '/kaggle/working/saved_models/best_model.pth'
CHECKPOINT_PATH = os.path.join(SAVE_DIR, 'best_model.pth')


# ============== 2. DDP SETUP ==============
def ddp_setup():
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

# ============== 3. PREPROCESSING ==============
def extract_value(text):
    match = re.search(r"Value:\s*(\d+\.?\d*)", str(text))
    return float(match.group(1)) if match else 0.0

def load_embeddings(path):
    if not os.path.exists(path): return {}
    try:
        with open(path, 'rb') as f: return pickle.load(f)
    except Exception as e:
        print(f"Error loading embeddings file: {e}.")
        return {}
        
# ============== 4. PYTORCH DATASET ==============
class ProductTextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length, embeddings_dict, is_test=False):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.embeddings_dict = embeddings_dict
        self.is_test = is_test
        self.numeric_cols = ['extracted_value']

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text, sample_id = str(row['catalog_content']), str(row['sample_id'])
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_length,
            padding='max_length', truncation=True, return_tensors='pt'
        )
        numeric_values = row[self.numeric_cols].values.astype(np.float32)
        numeric_features = torch.tensor(numeric_values, dtype=torch.float32)
        
        embedding_data = self.embeddings_dict.get(sample_id, torch.ones(768, dtype=torch.float32))
        if not isinstance(embedding_data, torch.Tensor):
            image_embedding = torch.tensor(embedding_data, dtype=torch.float32)
        else:
            image_embedding = embedding_data
            
        item = {
            'input_ids': inputs['input_ids'].squeeze(0), 'attention_mask': inputs['attention_mask'].squeeze(0),
            'numeric_features': numeric_features, 'image_embedding': image_embedding
        }
        if self.is_test:
            item['sample_id'] = row['sample_id']
        else:
            item['price'] = torch.tensor(row['price'], dtype=torch.float32)
        return item

# ============== 5. MODEL ARCHITECTURE ==============
# 🔄 CHANGED: Updated the model architecture to use SentenceTransformer
class TextPlusNumericModel(nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(TextPlusNumericModel, self).__init__()
        # Use SentenceTransformer instead of BertModel
        self.sbert = SentenceTransformer(MODEL_NAME)
        # Get embedding dimension dynamically
        text_embedding_dim = self.sbert.get_sentence_embedding_dimension()
        image_embedding_dim = 768 # Assuming image embedding dim is fixed
        numeric_dim = 1
        
        combined_size = text_embedding_dim + numeric_dim + image_embedding_dim
        
        self.regression_head = nn.Sequential(
            nn.Linear(combined_size, 1024), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(512, 1)
        )

    def forward(self, input_ids, attention_mask, numeric_features, image_embedding):
        # The forward pass for SentenceTransformer requires a dictionary
        token_features = {'input_ids': input_ids, 'attention_mask': attention_mask}
        sbert_output = self.sbert(token_features)
        text_features = sbert_output['sentence_embedding']
        
        combined_features = torch.cat([text_features, numeric_features, image_embedding], dim=1)
        price = self.regression_head(combined_features)
        return price

def smape_loss(y_pred, y_true, epsilon=1e-8):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2
    return torch.mean(numerator / (denominator + epsilon)) * 100

# ============== 6. MAIN TRAINING FUNCTION ==============
def main_training_and_prediction():
    ddp_setup()
    rank = int(os.environ["LOCAL_RANK"])
    device = torch.device(f"cuda:{rank}")
    best_val_smape = float('inf')
    
    if rank == 0:
        print("="*60 + "\nStarting Training...\n" + "="*60)
        os.makedirs(SAVE_DIR, exist_ok=True)

    train_df = pd.read_csv('/kaggle/input/amazon-ml-dataset-csv/splits/splits/train.csv')
    val_df = pd.read_csv('/kaggle/input/amazon-ml-dataset-csv/splits/splits/val.csv')
    train_df = train_df[train_df['sample_id'] != ID_TO_IGNORE].reset_index(drop=True)
    val_df = val_df[val_df['sample_id'] != ID_TO_IGNORE].reset_index(drop=True)
    if rank == 0: print(f"Removed sample ID '{ID_TO_IGNORE}'.")

    scaler = MinMaxScaler()
    train_df['extracted_value'] = train_df['catalog_content'].apply(extract_value)
    val_df['extracted_value'] = val_df['catalog_content'].apply(extract_value)
    scaler.fit(train_df[['extracted_value']])
    train_df['extracted_value'] = scaler.transform(train_df[['extracted_value']])
    val_df['extracted_value'] = scaler.transform(val_df[['extracted_value']])
    train_df['price'] = np.log1p(train_df['price'])
    val_df['price'] = np.log1p(val_df['price'])

    embeddings_dict = load_embeddings(EMBEDDINGS_PATH)
    
    # 🔄 CHANGED: Initialize model first to get access to its tokenizer
    # We need to create the model on the correct device before wrapping with DDP
    model = TextPlusNumericModel().to(device)
    tokenizer = model.sbert.tokenizer

    train_dataset = ProductTextDataset(train_df, tokenizer, MAX_LENGTH, embeddings_dict)
    val_dataset = ProductTextDataset(val_df, tokenizer, MAX_LENGTH, embeddings_dict)
    train_sampler, val_sampler = DistributedSampler(train_dataset), DistributedSampler(val_dataset, shuffle=False)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, num_workers=2)

    if PRETRAINED_MODEL_PATH and os.path.exists(PRETRAINED_MODEL_PATH):
        if rank == 0: print(f"--- Loading pretrained model from: {PRETRAINED_MODEL_PATH} ---")
        model.load_state_dict(torch.load(PRETRAINED_MODEL_PATH, map_location=device))
    else:
        if rank == 0: print("--- Starting from scratch. ---")

    model = DDP(model, device_ids=[rank], find_unused_parameters=True)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    
    scaler = GradScaler()
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(0.1 * len(train_loader) * EPOCHS), 
        num_training_steps=len(train_loader) * EPOCHS
    )

    for epoch in range(EPOCHS):
        train_sampler.set_epoch(epoch)
        model.train()
        train_loop = tqdm(train_loader, leave=False, disable=(rank != 0))
        if rank == 0: train_loop.set_description(f"Epoch {epoch+1}/{EPOCHS}")

        for batch in train_loop:
            input_ids, attention_mask, numerics, img_embeds, prices = (
                batch['input_ids'].to(device), batch['attention_mask'].to(device),
                batch['numeric_features'].to(device), batch['image_embedding'].to(device),
                batch['price'].to(device)
            )
            optimizer.zero_grad(set_to_none=True)

            with autocast():
                outputs = model(input_ids, attention_mask, numerics, img_embeds)
                loss = loss_fn(outputs.squeeze(), prices)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            if rank == 0: train_loop.set_postfix(loss=f"{loss.item():.4f}")
        
        model.eval()
        total_val_smape = 0
        with torch.no_grad():
            for batch in val_loader:
                with autocast():
                    log_prices_pred = model(
                        input_ids=batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device),
                        numeric_features=batch['numeric_features'].to(device), image_embedding=batch['image_embedding'].to(device)
                    )
                prices_pred_orig = torch.expm1(log_prices_pred.squeeze())
                prices_true_orig = torch.expm1(batch['price'].to(device))
                total_val_smape += smape_loss(prices_pred_orig, prices_true_orig).item()
        
        val_metrics_tensor = torch.tensor([total_val_smape, len(val_loader)]).to(device)
        dist.all_reduce(val_metrics_tensor, op=dist.ReduceOp.SUM)

        if rank == 0:
            avg_val_smape = val_metrics_tensor[0] / val_metrics_tensor[1]
            print(f"Epoch {epoch+1}/{EPOCHS} -> Avg Validation SMAPE: {avg_val_smape:.4f}%")
            if avg_val_smape < best_val_smape:
                best_val_smape = avg_val_smape
                print(f"** New best model found! Saving to {CHECKPOINT_PATH} **")
                torch.save(model.module.state_dict(), CHECKPOINT_PATH)
            print("-" * 50)
            
    dist.destroy_process_group()

# ============== 7. LAUNCHER & PREDICTION ==============
if __name__ == "__main__":
    main_training_and_prediction()

    if int(os.environ.get("RANK", "0")) == 0:
        print("\n" + "="*60 + "\nTraining finished. Starting prediction...\n" + "="*60)

        if os.path.exists(CHECKPOINT_PATH):
            device = torch.device("cuda:0")
            final_model = TextPlusNumericModel()
            final_model.load_state_dict(torch.load(CHECKPOINT_PATH))
            final_model.to(device)
            final_model.eval()

            test_df = pd.read_csv('/kaggle/input/amazon-ml-dataset-csv/dataset/dataset/test.csv')
            train_df_for_scaler = pd.read_csv('/kaggle/input/amazon-ml-dataset-csv/splits/splits/train.csv')
            test_df = test_df[test_df['sample_id'] != ID_TO_IGNORE].reset_index(drop=True)
            
            scaler = MinMaxScaler()
            train_df_for_scaler['extracted_value'] = train_df_for_scaler['catalog_content'].apply(extract_value)
            test_df['extracted_value'] = test_df['catalog_content'].apply(extract_value)
            scaler.fit(train_df_for_scaler[['extracted_value']])
            test_df['extracted_value'] = scaler.transform(test_df[['extracted_value']])
            
            # 🔄 CHANGED: Get tokenizer from the SentenceTransformer model
            tokenizer = final_model.sbert.tokenizer
            embeddings_dict = load_embeddings(EMBEDDINGS_PATH)
            test_dataset = ProductTextDataset(test_df, tokenizer, MAX_LENGTH, embeddings_dict, is_test=True)
            test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE * 2, shuffle=False, num_workers=2)

            all_predictions, all_sample_ids = [], []
            with torch.no_grad():
                for batch in tqdm(test_loader, desc="Predicting"):
                    with autocast():
                        outputs = final_model(
                            input_ids=batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device), 
                            numeric_features=batch['numeric_features'].to(device), image_embedding=batch['image_embedding'].to(device)
                        )
                    preds = np.expm1(outputs.squeeze().cpu().numpy())
                    all_predictions.extend(preds if preds.ndim > 0 else [preds])
                    all_sample_ids.extend(batch['sample_id'])

            submission_df = pd.DataFrame({'sample_id': all_sample_ids, 'price': all_predictions})
            submission_df.to_csv('output.csv', index=False)
            print("\nPredictions saved to output.csv")

Overwriting ddp_price_prediction_final_stable.py


In [11]:
!torchrun --nproc_per_node=2 ddp_price_prediction_final_stable.py

W1013 14:43:24.852000 3471 torch/distributed/run.py:792] 
W1013 14:43:24.852000 3471 torch/distributed/run.py:792] *****************************************
W1013 14:43:24.852000 3471 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1013 14:43:24.852000 3471 torch/distributed/run.py:792] *****************************************
2025-10-13 14:43:30.642597: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-13 14:43:30.642615: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760366610.664753    3473 cuda_dnn.cc:8310] U