In [None]:
!pip install -U sentence-transformers

In [None]:
!pip install --upgrade transformers

In [None]:
# predict_sbert.py
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import os
import re
import pickle

# Configs
# Identifier handed to SentenceTransformer() when SBERTModel is built.
MODEL_NAME_SBERT = 'sentence-transformers/all-mpnet-base-v2'
# Fine-tuned weights loaded into SBERTModel with load_state_dict().
CHECKPOINT_PATH_SBERT = '/kaggle/input/model-weights-amazon-ml/best_model_all_mp_net_ashish.pth'
# CSV that predictions are generated for.
TEST_CSV_PATH = '/kaggle/input/amazon-ml-dataset-csv/preprocessed/val_split_final.csv'
# CSV used only to fit the MinMaxScaler for the extracted numeric feature.
TRAIN_CSV_PATH = '/kaggle/input/amazon-ml-dataset-csv/preprocessed/train_split_final.csv'
# Pickle file with image embeddings; entries are looked up by str(sample_id).
EMBEDDINGS_PATH = '/kaggle/input/embeddings/embeddings/embeddings_images.pkl'
# sample_id that is filtered out of the test frame before prediction (kept as a string here).
ID_TO_IGNORE = '279285'
# Pad/truncate length passed to the tokenizer as max_length.
MAX_LENGTH = 64
# Batch size passed to the DataLoader.
BATCH_SIZE = 128

def extract_value(text):
    """Return the number that follows a ``Value:`` label in ``text``.

    ``text`` is converted with ``str()`` first, so any object is accepted.
    The first ``Value:`` label followed by digits wins; when there is none,
    0.0 is returned.
    """
    found = re.search(r"Value:\s*(\d+\.?\d*)", str(text))
    if not found:
        return 0.0
    return float(found.group(1))

def load_embeddings(path):
    """Unpickle and return the object stored at ``path``.

    Best-effort loader: a missing file or any error while reading or
    unpickling is reported on stdout and an empty dict is returned instead
    of raising.
    """
    if os.path.exists(path):
        try:
            with open(path, 'rb') as handle:
                print("Image Embeddings Loading")
                return pickle.load(handle)
        except Exception as e:
            print(f"Error loading embeddings file: {e}.")
            return {}

    print("Image Embeddings path doesn't exist")
    return {}

class ProductTextDataset(Dataset):
    """Per-row model inputs built from a product dataframe.

    Each item combines the tokenized ``catalog_content`` text, the row's
    ``extracted_value`` as a one-element float tensor, and the image
    embedding stored under ``str(sample_id)`` in ``embeddings_dict``.
    """

    def __init__(self, dataframe, tokenizer, max_length, embeddings_dict):
        self.dataframe, self.tokenizer = dataframe, tokenizer
        self.max_len, self.embeddings = max_length, embeddings_dict

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the input dict for row ``index``.

        Raises:
            ValueError: if no image embedding is stored for the row's sample_id.
        """
        record = self.dataframe.iloc[index]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        tokens = self.tokenizer.encode_plus(str(record.catalog_content), **tokenizer_options)

        # Embeddings are keyed by the string form of the sample id.
        vector = self.embeddings.get(str(record.sample_id))
        if vector is None:
            raise ValueError(f"Missing image embedding for sample_id: {record.sample_id}")

        item = {}
        # return_tensors='pt' yields a leading batch axis of 1; flatten drops it.
        item['input_ids'] = tokens['input_ids'].flatten()
        item['attention_mask'] = tokens['attention_mask'].flatten()
        item['numeric_features'] = torch.tensor([record.extracted_value], dtype=torch.float)
        item['image_embedding'] = torch.tensor(vector, dtype=torch.float)
        item['sample_id'] = record.sample_id
        return item

class SBERTModel(nn.Module):
    """Price regressor on top of a SentenceTransformer text encoder.

    The sentence embedding is concatenated with one numeric feature and an
    image embedding (768 values are budgeted for it) and passed through a
    three-layer MLP that emits one value per sample.
    """

    def __init__(self, dropout_rate=0.3):
        super().__init__()
        self.sbert = SentenceTransformer(MODEL_NAME_SBERT)
        # sentence embedding + 1 numeric feature + 768-d image embedding
        fused_width = self.sbert.get_sentence_embedding_dimension() + 1 + 768
        head_layers = [
            nn.Linear(fused_width, 1024), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(512, 1),
        ]
        self.regression_head = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, numeric_features, image_embedding):
        """Encode the tokens, fuse with the other features along dim 1, and regress."""
        encoder_inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        sentence_vectors = self.sbert(encoder_inputs)['sentence_embedding']
        fused = torch.cat((sentence_vectors, numeric_features, image_embedding), dim=1)
        return self.regression_head(fused)

def predict(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect one price per sample.

    The model is moved to ``device`` and put in eval mode; gradients are
    disabled. ``expm1`` is applied to the raw outputs before they are stored.

    Returns:
        DataFrame with columns ``sample_id`` and ``price``.
    """
    model.to(device)
    model.eval()

    tensor_keys = ('input_ids', 'attention_mask', 'numeric_features', 'image_embedding')
    collected_prices = []
    collected_ids = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="SBERT Predicting"):
            model_inputs = {key: batch[key].to(device) for key in tensor_keys}
            raw = model(**model_inputs)
            prices = np.expm1(raw.squeeze().cpu().numpy())
            if prices.ndim > 0:
                collected_prices.extend(prices)
            else:
                # A batch of one squeezes down to a 0-d array, which cannot be iterated.
                collected_prices.append(prices)
            collected_ids.extend(batch['sample_id'].cpu().numpy())

    return pd.DataFrame({'sample_id': collected_ids, 'price': collected_prices})

if __name__ == "__main__":
    # Entry point: score TEST_CSV_PATH with the fine-tuned SBERT model and
    # write the predictions to output_sbert.csv.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    test_df = pd.read_csv(TEST_CSV_PATH)
    train_df = pd.read_csv(TRAIN_CSV_PATH)
    # Compare ids as strings. ID_TO_IGNORE is a str, and comparing a str against
    # a numerically-typed sample_id column is never equal, so the row that was
    # supposed to be excluded would silently stay in the frame.
    keep_mask = test_df['sample_id'].astype(str) != str(ID_TO_IGNORE)
    test_df = test_df[keep_mask].reset_index(drop=True)

    # The numeric feature is min-max scaled with statistics from the training
    # split only; the test split is transformed, never fitted.
    scaler = MinMaxScaler()
    train_df['extracted_value'] = train_df['catalog_content'].apply(extract_value)
    test_df['extracted_value'] = test_df['catalog_content'].apply(extract_value)
    scaler.fit(train_df[['extracted_value']])
    test_df['extracted_value'] = scaler.transform(test_df[['extracted_value']])

    embeddings = load_embeddings(EMBEDDINGS_PATH)

    if embeddings:
        print("Embeddings loaded successfully.")
        print(f"Number of embeddings loaded: {len(embeddings)}")
    else:
        print("Embeddings failed to load or are empty.")

    model = SBERTModel()
    model.load_state_dict(torch.load(CHECKPOINT_PATH_SBERT, map_location=device))
    # Reuse the tokenizer bundled with the SentenceTransformer encoder.
    tokenizer = model.sbert.tokenizer
    dataset = ProductTextDataset(test_df, tokenizer, MAX_LENGTH, embeddings)
    # shuffle=False keeps predictions in the same order as test_df.
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    preds_df = predict(model, loader, device)
    preds_df.to_csv("output_sbert.csv", index=False)
    print("Saved: output_sbert.csv")

In [None]:
!pip install transformers==4.30.2

In [1]:
# predict_bert.py
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import os
import re
import pickle


# Configs
# Identifier passed to BertModel.from_pretrained / BertTokenizer.from_pretrained.
MODEL_NAME_BERT = 'bert-base-uncased'
# Fine-tuned weights loaded into BERTModel with load_state_dict().
CHECKPOINT_PATH_BERT = '/kaggle/input/model-weights-amazon-ml/best_model_44_48.pth'
# CSV that predictions are generated for.
TEST_CSV_PATH = '/kaggle/input/amazon-ml-dataset-csv/preprocessed/val_split_final.csv'
# CSV used only to fit the MinMaxScaler for the extracted numeric feature.
TRAIN_CSV_PATH = '/kaggle/input/amazon-ml-dataset-csv/preprocessed/train_split_final.csv'
# Pickle file with image embeddings; entries are looked up by str(sample_id).
EMBEDDINGS_PATH = '/kaggle/input/embeddings/embeddings/embeddings_images.pkl'
# sample_id filtered out of the test frame; presumably -1 is a "skip nothing" sentinel — confirm.
ID_TO_IGNORE = -1
# Pad/truncate length passed to the tokenizer as max_length.
MAX_LENGTH = 64
# Batch size passed to the DataLoader.
BATCH_SIZE = 128

def extract_value(text):
    """Parse the number written after ``Value:`` in ``text``.

    The argument is stringified before matching. Returns the parsed float,
    or 0.0 if no ``Value:`` label followed by digits is present.
    """
    hit = re.search(r"Value:\s*(\d+\.?\d*)", str(text))
    if hit is None:
        return 0.0
    number_text = hit.group(1)
    return float(number_text)

def load_embeddings(path):
    """Unpickle and return the object stored at ``path``.

    Returns an empty dict when the file does not exist, or when reading or
    unpickling fails (the error is printed rather than raised).
    """
    if os.path.exists(path):
        try:
            with open(path, 'rb') as handle:
                return pickle.load(handle)
        except Exception as e:
            print(f"Error loading embeddings file: {e}.")
    return {}

class ProductTextDataset(Dataset):
    """Per-row model inputs built from a product dataframe.

    Each item holds the tokenized ``catalog_content`` text, the row's
    ``extracted_value`` as a one-element float tensor, and the image
    embedding stored under ``str(sample_id)`` in ``embeddings_dict``.
    """

    def __init__(self, dataframe, tokenizer, max_length, embeddings_dict):
        self.dataframe, self.tokenizer = dataframe, tokenizer
        self.max_len, self.embeddings = max_length, embeddings_dict

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the input dict for row ``index``.

        Raises:
            ValueError: if no image embedding is stored for the row's sample_id.
        """
        record = self.dataframe.iloc[index]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        tokens = self.tokenizer(str(record.catalog_content), **tokenizer_options)

        # Embeddings are keyed by the string form of the sample id.
        vector = self.embeddings.get(str(record.sample_id))
        if vector is None:
            raise ValueError(f"Missing image embedding for sample_id: {record.sample_id}")

        item = {}
        # return_tensors='pt' yields a leading batch axis of 1; flatten drops it.
        item['input_ids'] = tokens['input_ids'].flatten()
        item['attention_mask'] = tokens['attention_mask'].flatten()
        item['numeric_features'] = torch.tensor([record.extracted_value], dtype=torch.float)
        item['image_embedding'] = torch.tensor(vector, dtype=torch.float)
        item['sample_id'] = record.sample_id
        return item

class BERTModel(nn.Module):
    """Price regressor on top of a pretrained BERT encoder.

    BERT's ``pooler_output`` is concatenated with one numeric feature and an
    image embedding (768 + 1 + 768 inputs in total) and fed to a three-layer
    MLP that emits one value per sample.
    """

    def __init__(self, dropout_rate=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME_BERT)
        text_width, numeric_width, image_width = 768, 1, 768
        head_layers = [
            nn.Linear(text_width + numeric_width + image_width, 1024), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(512, 1),
        ]
        self.regression_head = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, numeric_features, image_embedding):
        """Encode the tokens, fuse with the other features along dim 1, and regress."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        fused = torch.cat((encoded.pooler_output, numeric_features, image_embedding), dim=1)
        return self.regression_head(fused)

def predict(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect one price per sample.

    The model is moved to ``device`` and put in eval mode; gradients are
    disabled. ``expm1`` is applied to the raw outputs before they are stored.

    Returns:
        DataFrame with columns ``sample_id`` and ``price``.
    """
    model.to(device)
    model.eval()

    feature_names = ('input_ids', 'attention_mask', 'numeric_features', 'image_embedding')
    price_rows = []
    id_rows = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="BERT Predicting"):
            on_device = {name: batch[name].to(device) for name in feature_names}
            raw = model(**on_device)
            prices = np.expm1(raw.squeeze().cpu().numpy())
            if prices.ndim > 0:
                price_rows.extend(prices)
            else:
                # A batch of one squeezes down to a 0-d array, which cannot be iterated.
                price_rows.append(prices)
            id_rows.extend(batch['sample_id'].cpu().numpy())

    return pd.DataFrame({'sample_id': id_rows, 'price': price_rows})

if __name__ == "__main__":
    # Entry point: score TEST_CSV_PATH with the fine-tuned BERT model and
    # write the predictions to output_bert.csv.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    test_df = pd.read_csv(TEST_CSV_PATH)
    train_df = pd.read_csv(TRAIN_CSV_PATH)
    # Drop the row whose sample_id equals ID_TO_IGNORE, if there is one.
    test_df = test_df[test_df['sample_id'] != ID_TO_IGNORE].reset_index(drop=True)

    # The numeric feature is min-max scaled with statistics from the training
    # split only; the test split is transformed, never fitted.
    scaler = MinMaxScaler()
    train_df['extracted_value'] = train_df['catalog_content'].apply(extract_value)
    test_df['extracted_value'] = test_df['catalog_content'].apply(extract_value)
    scaler.fit(train_df[['extracted_value']])
    test_df['extracted_value'] = scaler.transform(test_df[['extracted_value']])

    # An empty dict here makes ProductTextDataset raise ValueError on the first row.
    embeddings = load_embeddings(EMBEDDINGS_PATH)

    model = BERTModel()
    model.load_state_dict(torch.load(CHECKPOINT_PATH_BERT, map_location=device))
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_BERT)
    dataset = ProductTextDataset(test_df, tokenizer, MAX_LENGTH, embeddings)
    # shuffle=False keeps predictions in the same order as test_df.
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    preds_df = predict(model, loader, device)
    preds_df.to_csv("output_bert.csv", index=False)
    print("Saved: output_bert.csv")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
BERT Predicting: 100%|██████████| 88/88 [00:22<00:00,  3.99it/s]

Saved: output_bert.csv





In [2]:
# ensemble_predictions.py
import pandas as pd

# Read both prediction files (each has sample_id and price columns).
sbert_df = pd.read_csv("output_sbert.csv")
bert_df = pd.read_csv("output_bert.csv")

# Give each price column a model-specific name so the merge does not clash.
sbert_df = sbert_df.rename(columns={'price': 'price_sbert'})
bert_df = bert_df.rename(columns={'price': 'price_bert'})

# Default (inner) join: only sample_ids present in both files survive.
merged_df = sbert_df.merge(bert_df, on="sample_id")

# Unweighted mean of the two models (swap in a weighted mean here if needed).
merged_df['price'] = merged_df['price_sbert'].add(merged_df['price_bert']).div(2)

# Write the final predictions.
final_columns = ['sample_id', 'price']
merged_df[final_columns].to_csv("output_ensemble.csv", index=False)
print("Saved ensemble predictions to output_ensemble.csv")


Saved ensemble predictions to output_ensemble.csv


In [4]:
import pandas as pd
import numpy as np

# --- Configuration ---
# Ground-truth CSV; only its sample_id and price columns are read.
VAL_FILE = '/kaggle/input/amazon-ml-dataset-csv/splits/splits/val.csv'
# Predictions CSV to score; must contain sample_id and price columns.
VAL_PRED_FILE = '/kaggle/working/output_ensemble.csv'
# NOTE(review): SEED is not referenced anywhere else in this cell.
SEED = 42

def calculate_smape(actual_prices, predicted_prices):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE) as a percentage.

    Formula: SMAPE = (1/n) * SUM(|P - A| / ((|A| + |P|)/2)) * 100%

    Args:
        actual_prices: array-like of ground-truth values (list, tuple, ndarray, ...).
        predicted_prices: array-like of predictions, broadcastable against
            ``actual_prices``.

    Returns:
        The SMAPE in percent. Pairs where both values are zero contribute an
        error of 0. NaN is returned for empty input.
    """
    # Coerce to float64 up front. This accepts plain lists, and it guarantees
    # the `out` buffer below is floating point: with integer inputs,
    # zeros_like(numerator) would be an integer array and np.divide would
    # refuse to write its float result into it.
    actual = np.asarray(actual_prices, dtype=np.float64)
    predicted = np.asarray(predicted_prices, dtype=np.float64)

    # Absolute difference between predicted and actual
    numerator = np.abs(predicted - actual)

    # Denominator: average of absolute actual and absolute predicted prices
    denominator = (np.abs(actual) + np.abs(predicted)) / 2

    # Nothing to average; return NaN explicitly instead of letting np.mean
    # emit a "mean of empty slice" warning.
    if numerator.size == 0:
        return float("nan")

    # The denominator is zero only when both prices are zero, in which case the
    # numerator is zero too. Those positions are skipped by `where` and keep the
    # 0 they were initialised with in `out`.
    smape_term = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )

    # Calculate the mean and multiply by 100 for the percentage result
    return np.mean(smape_term) * 100

# --- Main execution block ---
# Loads ground truth and predictions, joins them on sample_id, and prints the SMAPE.
if __name__ == "__main__":

    try:
        # 2. Load the data
        df_val = pd.read_csv(VAL_FILE, usecols=['sample_id', 'price'])
        df_pred = pd.read_csv(VAL_PRED_FILE)

        # Rename the price column in val.csv to 'actual_price' for clarity
        df_val = df_val.rename(columns={'price': 'actual_price'})

        # Rename the price column in val-pred.csv to 'predicted_price' for clarity
        df_pred = df_pred.rename(columns={'price': 'predicted_price'})

    except FileNotFoundError as e:
        print(f"Error: One or both files not found. Please ensure '{VAL_FILE}' and '{VAL_PRED_FILE}' are in the same directory.")
        print(e)
        # Raise SystemExit directly instead of calling the `exit()` helper:
        # that helper is installed by the `site` module for interactive use,
        # and calling it without an argument reports success (status 0).
        raise SystemExit(1)

    # 3. Merge the two DataFrames on the 'sample_id'
    # This ensures that each prediction is matched with its correct actual value.
    merged_df = pd.merge(df_val, df_pred, on='sample_id', how='inner')

    if merged_df.empty:
        print("Error: The merged DataFrame is empty. Check if 'sample_id' values match between the two files.")
        # Non-zero status for the same reason as above: this is a failure path.
        raise SystemExit(1)

    print(f"\nSuccessfully loaded and merged {len(merged_df)} samples.")

    # 4. Extract price vectors
    actual_prices = merged_df['actual_price'].values
    predicted_prices = merged_df['predicted_price'].values

    # 5. Calculate SMAPE
    smape_score = calculate_smape(actual_prices, predicted_prices)

    # 6. Output the result
    print("-" * 40)
    print(f"SMAPE Score: {smape_score:.4f}%")
    print(f"Interpretation: On average, the predictions are off by {smape_score:.2f}% relative to the average of the actual and predicted prices.")
    print("-" * 40)




Successfully loaded and merged 11250 samples.
----------------------------------------
SMAPE Score: 43.5951%
Interpretation: On average, the predictions are off by 43.60% relative to the average of the actual and predicted prices.
----------------------------------------


In [None]:
# NOTE(review): this cell reads `embeddings_dict` and `preds_sbert_df` and passes
# `is_test=True` to ProductTextDataset. In this export those are only defined in
# the combined script cell that follows, so presumably this cell is meant to run
# after it — confirm the intended execution order.

# 3. Predict with BERT model
print("\n" + "="*60)
print("Starting prediction with BERT model...")
# Fail early with a clear message instead of an error from inside torch.load.
if not os.path.exists(CHECKPOINT_PATH_BERT):
    raise FileNotFoundError(f"BERT checkpoint not found at: {CHECKPOINT_PATH_BERT}")

bert_model = BERTModel()
bert_model.load_state_dict(torch.load(CHECKPOINT_PATH_BERT, map_location=device))
bert_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_BERT)

bert_test_dataset = ProductTextDataset(test_df, bert_tokenizer, MAX_LENGTH, embeddings_dict, is_test=True)
# shuffle=False keeps predictions in the same order as test_df.
bert_test_loader = DataLoader(bert_test_dataset, batch_size=BATCH_SIZE, shuffle=False)

preds_bert_df = predict(bert_model, bert_test_loader, device)
# The rename happens before saving, so output_bert.csv is written with a
# 'price_bert' column rather than 'price'.
preds_bert_df.rename(columns={'price': 'price_bert'}, inplace=True)
preds_bert_df.to_csv('output_bert.csv', index=False)
print("Prediction with BERT model finished.")
print("="*60)

# 4. Ensemble the predictions
print("\nEnsembling predictions...")
# Merge predictions from both models on 'sample_id'
# (default inner join: only ids present in both frames are kept)
ensemble_df = pd.merge(preds_sbert_df, preds_bert_df, on='sample_id')

# Simple averaging ensemble. You can also use weighted averaging.
# Example weighted average: 0.6 * ensemble_df['price_sbert'] + 0.4 * ensemble_df['price_bert']
ensemble_df['price'] = (ensemble_df['price_sbert'] + ensemble_df['price_bert']) / 2

# Prepare final submission file
submission_df = ensemble_df[['sample_id', 'price']]

# 5. Save the final output
# Note the hyphen: this is a different file from the 'output_ensemble.csv'
# (underscore) read by the SMAPE cell via VAL_PRED_FILE.
output_filename = 'output-ensemble.csv'
submission_df.to_csv(output_filename, index=False)
print(f"\nEnsemble predictions successfully saved to {output_filename}")

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import os
# --- OPTIMIZATION: Added for mixed-precision inference ---
from torch.cuda.amp import autocast

# --- Placeholder Setups ---
# (Please fill these in with your actual values and helper functions)

# 1. Define your model names and checkpoint paths
# Identifier handed to SentenceTransformer() when SBERTModel is built.
MODEL_NAME_SBERT = 'sentence-transformers/all-mpnet-base-v2'
MODEL_NAME_BERT = 'bert-base-uncased' # Or your specific BERT model
# Fine-tuned weights loaded into SBERTModel / BERTModel with load_state_dict().
CHECKPOINT_PATH_SBERT = '/kaggle/input/model-weights-amazon-ml/best_model_all_mp_net_ashish.pth'
CHECKPOINT_PATH_BERT = '/kaggle/input/model-weights-amazon-ml/best_model_44_48.pth'

# 2. Define data paths and constants
# CSV that predictions are generated for.
TEST_CSV_PATH = '/kaggle/input/amazon-ml-dataset-csv/preprocessed/test_split_final.csv'
# CSV used only to fit the MinMaxScaler for the extracted numeric feature.
TRAIN_CSV_PATH = '/kaggle/input/amazon-ml-dataset-csv/preprocessed/train_split_final.csv'
# NOTE(review): still the template placeholder; the other prediction cells point
# at '/kaggle/input/embeddings/embeddings/embeddings_images.pkl'.
EMBEDDINGS_PATH = 'path/to/your/embeddings.pkl' # e.g., image embeddings
ID_TO_IGNORE = -1 # From your original code
# NOTE(review): the standalone SBERT/BERT prediction cells use MAX_LENGTH = 64 with
# the same checkpoints — confirm which length the models were trained with.
MAX_LENGTH = 128
# Batch size passed to both DataLoaders.
BATCH_SIZE = 128

# 3. Define your helper functions (assuming they exist from your code)
def extract_value(text):
    """Return the number that follows a ``Value:`` label in ``text``.

    Uses the same pattern as the standalone SBERT/BERT prediction cells. The
    same checkpoints are loaded here, so the numeric feature has to be
    extracted the same way; grabbing the first number anywhere in the text
    would feed the models a different feature.

    Args:
        text: any object; it is converted with ``str()`` before matching.

    Returns:
        The matched number as a float, or 0.0 when no ``Value:`` label
        followed by digits is present.
    """
    # Function-scope import: this cell's import block does not bring in `re`.
    import re
    match = re.search(r"Value:\s*(\d+\.?\d*)", str(text))
    return float(match.group(1)) if match else 0.0

def load_embeddings(path):
    """Load the pickled image-embedding mapping stored at ``path``.

    Replaces the earlier placeholder, which always returned an empty dict and
    so made every sample fall back to an all-zero image embedding.

    Args:
        path: location of the pickle file.

    Returns:
        The unpickled object (expected to be a mapping from sample id to
        embedding vector). An empty dict is returned when the file is missing
        or cannot be read; the problem is printed rather than raised.
    """
    # Function-scope import: this cell's import block does not bring in `pickle`.
    import pickle

    if not os.path.exists(path):
        print(f"Image embeddings file not found at '{path}'; continuing without image embeddings.")
        return {}
    try:
        with open(path, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code; only load files
            # from a trusted source.
            return pickle.load(f)
    except Exception as e:
        # Best-effort by design, matching the loaders in the other cells.
        print(f"Error loading embeddings file: {e}.")
        return {}

# 4. Define your custom Dataset class
class ProductTextDataset(Dataset):
    """Per-row model inputs built from a product dataframe.

    Each item holds the tokenized ``catalog_content`` text, the row's
    ``extracted_value`` as a one-element float tensor, and the row's image
    embedding (all zeros when none is stored).

    Args:
        dataframe: frame with ``catalog_content``, ``extracted_value`` and
            ``sample_id`` columns.
        tokenizer: tokenizer applied to the text.
        max_length: pad/truncate length passed to the tokenizer.
        embeddings_dict: mapping from sample id to image embedding.
        is_test: stored on the instance; not otherwise used by this class.
    """

    # Length of the zero vector substituted for a missing image embedding.
    _FALLBACK_EMBEDDING_DIM = 768

    def __init__(self, dataframe, tokenizer, max_length, embeddings_dict, is_test=False):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.is_test = is_test
        self.embeddings = embeddings_dict

    def __len__(self):
        return len(self.dataframe)

    def _lookup_embedding(self, sample_id):
        """Return the stored embedding for ``sample_id``, or a zero vector."""
        # The other prediction cells key the embeddings by str(sample_id), so try
        # that first; looking up the raw id alone would miss and silently give
        # zeros. The raw id is still tried for backward compatibility.
        embedding = self.embeddings.get(str(sample_id))
        if embedding is None:
            embedding = self.embeddings.get(sample_id)
        if embedding is None:
            embedding = [0] * self._FALLBACK_EMBEDDING_DIM
        return embedding

    def _tokenize(self, text):
        """Tokenize ``text`` to fixed-length PyTorch tensors."""
        options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Both tokenizers used in this notebook are callable, and calling them
        # yields the same input_ids / attention_mask as encode_plus. encode_plus
        # is kept only for tokenizer objects that are not callable.
        if callable(self.tokenizer):
            return self.tokenizer(text, **options)
        return self.tokenizer.encode_plus(text, return_token_type_ids=False, **options)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        encoding = self._tokenize(str(row.catalog_content))
        image_embedding = self._lookup_embedding(row.sample_id)

        return {
            # return_tensors='pt' yields a leading batch axis of 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'numeric_features': torch.tensor([row.extracted_value], dtype=torch.float),
            'image_embedding': torch.tensor(image_embedding, dtype=torch.float),
            'sample_id': row.sample_id,
        }

# --- Model Architectures ---

# Architecture 1 (using all-mpnet-base-v2)
class SBERTModel(nn.Module):
    """Price regressor on top of a SentenceTransformer text encoder.

    The sentence embedding is concatenated with one numeric feature and a
    768-wide image embedding, then passed through a three-layer MLP that
    emits one value per sample.
    """

    def __init__(self, dropout_rate=0.3):
        super().__init__()
        self.sbert = SentenceTransformer(MODEL_NAME_SBERT)
        # sentence embedding + numeric feature + image embedding
        widths = (self.sbert.get_sentence_embedding_dimension(), 1, 768)
        head_layers = [
            nn.Linear(sum(widths), 1024), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(512, 1),
        ]
        self.regression_head = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, numeric_features, image_embedding):
        """Encode the tokens, fuse with the other features along dim 1, and regress."""
        encoded = self.sbert({'input_ids': input_ids, 'attention_mask': attention_mask})
        fused = torch.cat(
            (encoded['sentence_embedding'], numeric_features, image_embedding),
            dim=1,
        )
        return self.regression_head(fused)

# Architecture 2 (using BERT)
class BERTModel(nn.Module):
    """Price regressor on top of a pretrained BERT encoder.

    BERT's ``pooler_output`` is concatenated with one numeric feature and an
    image embedding (768 + 1 + 768 inputs in total) and fed to a three-layer
    MLP that emits one value per sample.
    """

    def __init__(self, dropout_rate=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME_BERT)
        bert_width, numeric_width, image_width = 768, 1, 768
        head_layers = [
            nn.Linear(bert_width + numeric_width + image_width, 1024), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(512, 1),
        ]
        self.regression_head = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, numeric_features, image_embedding):
        """Encode the tokens, fuse with the other features along dim 1, and regress."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        fused = torch.cat((encoded.pooler_output, numeric_features, image_embedding), dim=1)
        return self.regression_head(fused)

# --- Prediction Function ---

def predict(model, data_loader, device):
    """Runs prediction on a given model and dataloader.

    The model is moved to ``device`` and put in eval mode. The forward pass
    runs without gradients and, on CUDA, under mixed-precision autocast.

    Args:
        model: module called with ``input_ids``, ``attention_mask``,
            ``numeric_features`` and ``image_embedding`` keyword tensors.
        data_loader: yields batches holding those four keys plus ``sample_id``.
        device: ``torch.device`` to run on.

    Returns:
        DataFrame with columns ``sample_id`` and ``price``, where price is
        ``expm1`` of the model output.
    """
    model.to(device)
    model.eval()

    all_predictions = []
    all_sample_ids = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc=f"Predicting with {model.__class__.__name__}"):
            # --- OPTIMIZATION: Use autocast for mixed-precision ---
            with autocast(enabled=device.type == 'cuda'):
                outputs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    numeric_features=batch['numeric_features'].to(device),
                    image_embedding=batch['image_embedding'].to(device)
                )
            # Under autocast the outputs can be float16. Cast to float32 before
            # leaving torch so expm1 is not evaluated in half precision, which
            # overflows for inputs above ~11.09 and rounds coarsely below that.
            # reshape(-1) always gives a 1-D array, even for a batch of one.
            log_prices = outputs.float().reshape(-1).cpu().numpy()
            # Assuming your model outputs log(price + 1), so we use expm1
            all_predictions.extend(np.expm1(log_prices))
            all_sample_ids.extend(batch['sample_id'].cpu().numpy())

    return pd.DataFrame({'sample_id': all_sample_ids, 'price': all_predictions})


# --- Main Ensembling Script ---

if __name__ == "__main__":
    # Entry point: predict TEST_CSV_PATH with both fine-tuned models, average
    # the two prices per sample_id, and write the result to output-ensemble.csv.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # --- OPTIMIZATION: Set num_workers for parallel data loading ---
    # Use 2 or 4 workers if your system can handle it, 0 for debugging.
    NUM_WORKERS = 2 

    # 1. Load and preprocess data
    print("Loading and preprocessing test data...")
    test_df = pd.read_csv(TEST_CSV_PATH)
    train_df_for_scaler = pd.read_csv(TRAIN_CSV_PATH)
    # Drop the row whose sample_id equals ID_TO_IGNORE, if there is one.
    test_df = test_df[test_df['sample_id'] != ID_TO_IGNORE].reset_index(drop=True)
    
    # The numeric feature is min-max scaled with statistics from the training
    # split only; the test split is transformed, never fitted.
    scaler = MinMaxScaler()
    train_df_for_scaler['extracted_value'] = train_df_for_scaler['catalog_content'].apply(extract_value)
    test_df['extracted_value'] = test_df['catalog_content'].apply(extract_value)
    scaler.fit(train_df_for_scaler[['extracted_value']])
    test_df['extracted_value'] = scaler.transform(test_df[['extracted_value']])
    
    # The same embeddings mapping is shared by both datasets below.
    embeddings_dict = load_embeddings(EMBEDDINGS_PATH)

    # 2. Predict with SBERT model
    print("\n" + "="*60)
    print("Starting prediction with SBERT model...")
    # Fail early with a clear message instead of an error from inside torch.load.
    if not os.path.exists(CHECKPOINT_PATH_SBERT):
        raise FileNotFoundError(f"SBERT checkpoint not found at: {CHECKPOINT_PATH_SBERT}")
        
    sbert_model = SBERTModel()
    sbert_model.load_state_dict(torch.load(CHECKPOINT_PATH_SBERT, map_location=device))
    # Reuse the tokenizer bundled with the SentenceTransformer encoder.
    sbert_tokenizer = sbert_model.sbert.tokenizer
    
    sbert_test_dataset = ProductTextDataset(test_df, sbert_tokenizer, MAX_LENGTH, embeddings_dict, is_test=True)
    # --- OPTIMIZATION: Added num_workers to DataLoader ---
    sbert_test_loader = DataLoader(sbert_test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
    
    preds_sbert_df = predict(sbert_model, sbert_test_loader, device)
    # Model-specific column name so the merge in step 4 does not clash.
    preds_sbert_df.rename(columns={'price': 'price_sbert'}, inplace=True)
    print("Prediction with SBERT model finished.")
    print("="*60)

    # 3. Predict with BERT model
    print("\n" + "="*60)
    print("Starting prediction with BERT model...")
    if not os.path.exists(CHECKPOINT_PATH_BERT):
        raise FileNotFoundError(f"BERT checkpoint not found at: {CHECKPOINT_PATH_BERT}")

    # NOTE(review): sbert_model is still referenced (and was moved to `device` by
    # predict) while the BERT model is loaded — presumably both fit in memory; confirm.
    bert_model = BERTModel()
    bert_model.load_state_dict(torch.load(CHECKPOINT_PATH_BERT, map_location=device))
    bert_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_BERT)

    bert_test_dataset = ProductTextDataset(test_df, bert_tokenizer, MAX_LENGTH, embeddings_dict, is_test=True)
    # --- OPTIMIZATION: Added num_workers to DataLoader ---
    bert_test_loader = DataLoader(bert_test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

    preds_bert_df = predict(bert_model, bert_test_loader, device)
    preds_bert_df.rename(columns={'price': 'price_bert'}, inplace=True)
    print("Prediction with BERT model finished.")
    print("="*60)

    # 4. Ensemble the predictions
    print("\nEnsembling predictions...")
    # Merge predictions from both models on 'sample_id'
    # (default inner join: only ids present in both frames are kept)
    ensemble_df = pd.merge(preds_sbert_df, preds_bert_df, on='sample_id')
    
    # Simple averaging ensemble. You can also use weighted averaging.
    # Example weighted average: 0.6 * ensemble_df['price_sbert'] + 0.4 * ensemble_df['price_bert']
    ensemble_df['price'] = (ensemble_df['price_sbert'] + ensemble_df['price_bert']) / 2
    
    # Prepare final submission file
    submission_df = ensemble_df[['sample_id', 'price']]
    
    # 5. Save the final output
    output_filename = 'output-ensemble.csv'
    submission_df.to_csv(output_filename, index=False)
    print(f"\nEnsemble predictions successfully saved to {output_filename}")

