In [17]:
# Run this in a notebook cell (prefix with !). If in VS Code terminal, run without !
!pip install -q pandas numpy scikit-learn transformers sentence_transformers xgboost tqdm pillow requests torchvision accelerate


IMPORTS AND BASIC CONFIG

In [18]:
# Code purpose: Imports and basic config
import os
import re
import time
import math
import json
import requests
from io import BytesIO
from PIL import Image
from tqdm.auto import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

import torch
from transformers import BertModel, BertTokenizerFast, CLIPProcessor, CLIPModel

# Repro
RND = 42
np.random.seed(RND)
torch.manual_seed(RND)


  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x131bb01f0>

PATHS AND USER CHOICES

In [19]:
# Code purpose: file paths and parameters
TRAIN_CSV = 'dataset/train.csv'
TEST_CSV  = 'dataset/test.csv'
SAMPLE_OUT = 'dataset/sample_test_out.csv'
OUT_FILE = 'test_out.csv'

# How many images to download/process for quick multimodal experiments:
IMAGE_LIMIT = 5000  # user-specified limit to check image effect quickly

# CLIP device
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", DEVICE)


Device: cpu


LOAD DATA

In [20]:
# Code purpose: load datasets
train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)

print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()


Train shape: (75000, 4)
Test shape: (75000, 3)


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


Helper functions: text cleaning & quantity extraction

In [21]:
# Code purpose: cleaning and extract numeric signals like quantity and units
def clean_text(s):
    if pd.isna(s): return ''
    s = str(s)
    s = re.sub(r'<[^>]+>', ' ', s)     # remove html
    s = s.replace('-', ' ')
    s = re.sub(r'[^A-Za-z0-9\.\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip().lower()
    return s

def extract_quantity(s):
    if pd.isna(s): return 1
    s = str(s).lower()
    # common patterns: "pack of 12", "12 pack", "12 pcs", "12 pcs.", "pack of 12 pcs"
    m = re.search(r'(\d{1,4})\s*(?:pack|pcs|pieces|pc|units|unit|count)\b', s)
    if m: 
        try: return int(m.group(1))
        except: pass
    m2 = re.search(r'pack of\s*(\d{1,4})', s)
    if m2:
        try: return int(m2.group(1))
        except: pass
    # fallback: search any standalone integer that might indicate quantity
    m3 = re.search(r'\b(\d{1,4})\b', s)
    if m3:
        val = int(m3.group(1))
        if val <= 1000 and val > 1:
            return val
    return 1

def extract_units_value(s):
    if pd.isna(s): return 0.0
    s = str(s).lower()
    # find patterns like "500 ml", "0.5 l", "250g", "250 g"
    m = re.search(r'(\d+\.?\d*)\s*(ml|l|litre|litres|g|kg|gram|grams)\b', s)
    if m:
        num = float(m.group(1))
        unit = m.group(2)
        # normalize units to ml or g (simple heuristics)
        if unit in ['l','litre','litres']:
            return num * 1000.0  # liters -> ml
        if unit == 'ml':
            return num
        if unit in ['kg']:
            return num * 1000.0  # kg -> g
        if unit in ['g','gram','grams']:
            return num
    return 0.0


Apply basic feature engineering on text for all rows

In [22]:
# Code purpose: create features from catalog_content for train and test
for df in (train, test):
    df['catalog_clean'] = df['catalog_content'].fillna('').apply(clean_text)
    df['quantity'] = df['catalog_clean'].apply(extract_quantity)
    df['unit_value'] = df['catalog_clean'].apply(extract_units_value)
    df['text_len'] = df['catalog_clean'].apply(lambda x: len(x))
    # keyword flags
    for kw in ['premium','refill','combo','original','pack','bottle','set','new']:
        df[f'kw_{kw}'] = df['catalog_clean'].apply(lambda s: 1 if kw in s else 0)

# Quick sanity check
train[['sample_id','catalog_clean','quantity','unit_value','text_len','kw_pack']].head()


Unnamed: 0,sample_id,catalog_clean,quantity,unit_value,text_len,kw_pack
0,33127,item name la victoria green taco sauce mild 12...,0,0.0,84,1
1,198967,item name salerno cookies the original butter ...,0,0.0,491,1
2,261251,item name bear creek hearty soup bowl creamy c...,4,0.0,315,1
3,55858,item name judee s blue cheese powder 11.25 oz ...,25,0.0,1279,0
4,292686,item name kedem sherry cooking wine 12.7 ounce...,0,0.0,144,0


TARGET HANDLING AND OUTLIER CLIPPING

In [23]:
# Code purpose: clip extreme prices and compute log target for training stability
# Find 99.5th percentile cap to limit huge outliers
price_99_5 = train['price'].quantile(0.995)
print("99.5 percentile price cap:", price_99_5)

# Create clipped target and log1p transform
train['price_clipped'] = train['price'].clip(1, price_99_5)
train['y_log1p'] = np.log1p(train['price_clipped'])


99.5 percentile price cap: 183.70050000000046


BERT TEXT EMBEDDING


In [25]:
# Code Purpose: Generate or Load MiniLM embeddings for product texts (train + test)

import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os

# Embedding save paths
train_emb_path = 'train_text_emb.npy'
test_emb_path  = 'test_text_emb.npy'

# Check if embeddings already exist
if os.path.exists(train_emb_path) and os.path.exists(test_emb_path):
    print("✅ Loading embeddings from disk...")
    train_text_emb = np.load(train_emb_path)
    test_text_emb  = np.load(test_emb_path)
else:
    print("⚡ Computing MiniLM embeddings (this may take a few minutes)...")
    # Load MiniLM model (fast, good performance)
    text_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)

    # Convert to list
    train_texts = train['catalog_clean'].tolist()
    test_texts  = test['catalog_clean'].tolist()

    # Encode in batches
    train_text_emb = text_model.encode(
        train_texts,
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    test_text_emb = text_model.encode(
        test_texts,
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True
    )

    # Save embeddings for future runs
    np.save(train_emb_path, train_text_emb)
    np.save(test_emb_path, test_text_emb)
    print("✅ Embeddings computed and saved to disk!")

print("MiniLM embeddings shapes:", train_text_emb.shape, test_text_emb.shape)


⚡ Computing MiniLM embeddings (this may take a few minutes)...


Batches: 100%|██████████| 1172/1172 [12:11<00:00,  1.60it/s]
Batches: 100%|██████████| 1172/1172 [13:56<00:00,  1.40it/s]


✅ Embeddings computed and saved to disk!
MiniLM embeddings shapes: (75000, 384) (75000, 384)


CLIP IMAGE EMBEDDINGS

In [36]:
# -----------------------------------------------------------
# Code Purpose: Generate & Cache CLIP Embeddings for All Images
# -----------------------------------------------------------

import os
import numpy as np
import pandas as pd
import torch
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from transformers import CLIPModel, CLIPProcessor

# ================================
# ⚙️ Configuration
# ================================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
clip_model_name = "openai/clip-vit-base-patch32"

# Paths to save precomputed embeddings
train_clip_emb_path = "train_clip_embeddings.npy"
test_clip_emb_path = "test_clip_embeddings.npy"

# ================================
# 📦 Load CLIP model and processor
# ================================
clip = CLIPModel.from_pretrained(clip_model_name).to(DEVICE)
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)

# ================================
# 🖼️ Helper function to download image
# ================================
def download_image(url, timeout=10):
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        return Image.open(BytesIO(resp.content)).convert('RGB')
    except Exception:
        return None

# ================================
# 🧠 Extract CLIP embeddings for image URLs
# ================================
def extract_clip_embeddings_from_links(df, save_path, image_link_col='image_link', batch_size=64):
    # ✅ Load from cache if available
    if os.path.exists(save_path):
        print(f"✅ Found cached embeddings at {save_path}, loading...")
        return np.load(save_path)

    # Otherwise compute embeddings
    total_images = len(df)
    embeddings = np.zeros((total_images, clip.config.projection_dim))
    embeddings[:] = np.nan

    processed = 0
    for i in tqdm(range(total_images), desc="Downloading & encoding images"):
        url = df.iloc[i][image_link_col]
        if pd.isna(url) or not isinstance(url, str) or len(url.strip()) == 0:
            continue
        img = download_image(url)
        if img is None:
            continue
        try:
            inputs = clip_processor(images=img, return_tensors="pt").to(DEVICE)
            with torch.no_grad():
                image_emb = clip.get_image_features(**inputs)  # shape: (1, 512)
                image_emb = image_emb / image_emb.norm(p=2, dim=-1, keepdim=True)
                embeddings[i] = image_emb.cpu().numpy()
            processed += 1
        except Exception:
            continue

    print(f"✅ Processed {processed} / {total_images} images")
    np.save(save_path, embeddings)  # 💾 Save embeddings
    print(f"📁 Embeddings saved to {save_path}")
    return embeddings

# ================================
# 🚀 Generate or load embeddings
# ================================
# Load train_clip_emb from the pre-downloaded file
if os.path.exists(train_clip_emb_path):
    print(f"✅ Loading pre-downloaded train_clip_emb from {train_clip_emb_path}...")
    train_clip_emb = np.load(train_clip_emb_path)
else:
    raise FileNotFoundError(f"train_clip_emb_path not found at {train_clip_emb_path}. Please ensure it is downloaded.")

# Generate or load test_clip_emb
test_clip_emb = extract_clip_embeddings_from_links(test, test_clip_emb_path)

print("Train CLIP embeddings shape:", train_clip_emb.shape)
print("Test CLIP embeddings shape:", test_clip_emb.shape)

✅ Loading pre-downloaded train_clip_emb from train_clip_embeddings.npy...


Downloading & encoding images: 100%|██████████| 75000/75000 [5:03:54<00:00,  4.11it/s]      


✅ Processed 74993 / 75000 images
📁 Embeddings saved to test_clip_embeddings.npy
Train CLIP embeddings shape: (75000, 512)
Test CLIP embeddings shape: (75000, 512)


In [51]:
test_clip_emb.shape

(75000, 512)

In [52]:
np.save('test_clip_2.npy', test_clip_emb)  # Save test embeddings for future use

In [53]:
i = np.load('test_clip_2.npy')
i.shape

(75000, 512)

BUILD FEATURE METRICS

In [37]:
import numpy as np

# Numeric columns for feature engineering
num_cols = ['quantity', 'unit_value', 'text_len'] + [f'kw_{kw}' for kw in [
    'premium', 'refill', 'combo', 'original', 'pack', 'bottle', 'set', 'new'
]]

# Fill NaNs with 0 and convert to NumPy arrays
X_num_train = train[num_cols].fillna(0).values
X_num_test  = test[num_cols].fillna(0).values

print("Numeric feature shapes:", X_num_train.shape, X_num_test.shape)

# 👇 Note: We now use `train_text_emb` and `test_text_emb` (MiniLM) instead of train_bert_emb
X_text_train = np.hstack([train_text_emb, X_num_train])
X_text_test  = np.hstack([test_text_emb, X_num_test])
print("Text + Numeric shapes:", X_text_train.shape, X_text_test.shape)

# 🖼 If using CLIP image embeddings — align array shapes properly.
# Make sure `train_clip_emb` and `test_clip_emb` already exist.
# If some rows don’t have image embeddings, fill with zeros or np.nan (then handle accordingly).

# Example: fill missing image embeddings with zeros to maintain shape
if train_clip_emb is not None and test_clip_emb is not None:
    X_mm_train = np.hstack([train_text_emb, X_num_train, train_clip_emb])
    X_mm_test  = np.hstack([test_text_emb, X_num_test, test_clip_emb])
    print("Multimodal (Text+Num+Image) shapes:", X_mm_train.shape, X_mm_test.shape)
else:
    print("⚠️ No CLIP embeddings found — skipping multimodal combination.")


Numeric feature shapes: (75000, 11) (75000, 11)
Text + Numeric shapes: (75000, 395) (75000, 395)
Multimodal (Text+Num+Image) shapes: (75000, 907) (75000, 907)


SMAPE METRIC

In [38]:
# Code purpose: SMAPE
def smape(actual, pred):
    actual = np.array(actual, dtype=float)
    pred   = np.array(pred, dtype=float)
    denom = (np.abs(actual) + np.abs(pred)) / 2.0
    # avoid div by 0
    mask = denom == 0
    denom[mask] = 1.0
    diff = np.abs(actual - pred) / denom
    return 100.0 * np.mean(diff)


TRAIN TEXT ONLY

In [39]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Ensure required variables are defined
if 'X_text_train' not in locals():
    raise ValueError("X_text_train is not defined. Ensure it is created in the 'BUILD FEATURE METRICS' section.")
if 'train' not in locals() or 'y_log1p' not in train.columns:
    raise ValueError("The 'train' DataFrame or the 'y_log1p' column is not defined.")
if 'RND' not in locals():
    RND = 42  # Default random seed
if 'smape' not in locals():
    def smape(actual, pred):
        actual = np.array(actual, dtype=float)
        pred = np.array(pred, dtype=float)
        denom = (np.abs(actual) + np.abs(pred)) / 2.0
        denom[denom == 0] = 1e-8  # Avoid division by zero
        diff = np.abs(actual - pred) / denom
        return 100.0 * np.mean(diff)

# Define features and target
X = X_text_train
y = train['y_log1p']  # Target variable

# Split data into training and validation sets
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.15, random_state=RND)

# Define XGBoost parameters
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.03,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    'random_state': RND,
    'verbosity': 1,
    'early_stopping_rounds': 50  # Early stopping moved here
}

# Initialize and train the XGBoost model
model_text = xgb.XGBRegressor(**xgb_params)

print("Training XGBoost model...")
model_text.fit(
    X_tr,
    y_tr,
    eval_set=[(X_val, y_val)],
    verbose=50  # Use verbose_eval for newer XGBoost versions
)

# Validate the model
y_val_pred_log = model_text.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)  # Invert log1p transformation
y_val_true = np.expm1(y_val)  # Invert log1p transformation

# Calculate SMAPE
validation_smape = smape(y_val_true, y_val_pred)
print(f"Text-only validation SMAPE: {validation_smape:.4f}%")

Training XGBoost model...
[0]	validation_0-rmse:2.35843
[50]	validation_0-rmse:0.97503
[100]	validation_0-rmse:0.83255
[150]	validation_0-rmse:0.81080
[200]	validation_0-rmse:0.80033
[250]	validation_0-rmse:0.79324
[300]	validation_0-rmse:0.78728
[350]	validation_0-rmse:0.78243
[400]	validation_0-rmse:0.77872
[450]	validation_0-rmse:0.77577
[500]	validation_0-rmse:0.77320
[550]	validation_0-rmse:0.77081
[600]	validation_0-rmse:0.76860
[650]	validation_0-rmse:0.76644
[700]	validation_0-rmse:0.76482
[750]	validation_0-rmse:0.76318
[800]	validation_0-rmse:0.76174
[850]	validation_0-rmse:0.76048
[900]	validation_0-rmse:0.75909
[950]	validation_0-rmse:0.75800
[999]	validation_0-rmse:0.75670
Text-only validation SMAPE: 59.1547%


MULTIMODAL EXPERIMNET ON IMAGES

In [40]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# --- Configuration Constants ---
MIN_SAMPLES_FOR_TRAINING = 200
TEST_SPLIT_RATIO = 0.2
RANDOM_STATE = 42
XGB_N_ESTIMATORS = 800
XGB_EARLY_STOPPING_ROUNDS = 40
XGB_VERBOSE_INTERVAL = 50

# --- Helper Function ---
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error"""
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    denominator[denominator == 0] = 1e-8  # Avoid division by zero
    return np.mean(numerator / denominator) * 100

# --- Data Validation and Preparation ---
if 'train_clip_emb' not in locals() or 'X_mm_train' not in locals() or 'train' not in locals():
    raise ValueError("Ensure 'train_clip_emb', 'X_mm_train', and 'train' are defined before running this block.")

# 1. Handle missing values in `train_clip_emb`
train_clip_emb = np.nan_to_num(train_clip_emb)

# 2. Find rows where all image embedding values are finite
valid_img_mask = np.isfinite(train_clip_emb).all(axis=1)
num_valid_rows = np.sum(valid_img_mask)

print(f"Found {num_valid_rows} rows with complete image embeddings.")

# 3. Check if we have enough valid data to proceed with training
if num_valid_rows >= MIN_SAMPLES_FOR_TRAINING:
    print("Sufficient data available. Preparing multimodal model training...")

    # 4. Prepare dataset using the boolean mask
    X_mm_valid = X_mm_train[valid_img_mask]
    y_mm_valid = train.loc[valid_img_mask, 'y_log1p'].values

    # 5. Split the valid data into training and validation sets
    X_tr_m, X_val_m, y_tr_m, y_val_m = train_test_split(
        X_mm_valid,
        y_mm_valid,
        test_size=TEST_SPLIT_RATIO,
        random_state=RANDOM_STATE
    )

    # --- Model Training and Evaluation ---
    xgb_params = {
        'objective': 'reg:squarederror',
        'learning_rate': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'n_estimators': XGB_N_ESTIMATORS,
        'random_state': RANDOM_STATE,
        'tree_method': 'hist'
    }

    model_mm = xgb.XGBRegressor(**xgb_params)

    print("Training multimodal XGBoost model...")
    model_mm.fit(
        X_tr_m,
        y_tr_m,
        eval_set=[(X_val_m, y_val_m)],
        early_stopping_rounds=XGB_EARLY_STOPPING_ROUNDS,
        verbose=XGB_VERBOSE_INTERVAL
    )

    # 6. Evaluate the model on the validation set
    y_val_mm_pred = np.expm1(model_mm.predict(X_val_m))
    y_val_mm_true = np.expm1(y_val_m)

    validation_smape = smape(y_val_mm_true, y_val_mm_pred)
    print(f"\nMultimodal validation SMAPE (on image subset): {validation_smape:.4f}%")

else:
    print(f"Not enough image-enabled rows to train multimodal model reliably. "
          f"Required: {MIN_SAMPLES_FOR_TRAINING}, Found: {num_valid_rows}")

Found 75000 rows with complete image embeddings.
Sufficient data available. Preparing multimodal model training...
Training multimodal XGBoost model...
[0]	validation_0-rmse:2.31631




[50]	validation_0-rmse:0.81540
[100]	validation_0-rmse:0.77457
[150]	validation_0-rmse:0.76148
[200]	validation_0-rmse:0.75309
[250]	validation_0-rmse:0.74637
[300]	validation_0-rmse:0.74168
[350]	validation_0-rmse:0.73769
[400]	validation_0-rmse:0.73429
[450]	validation_0-rmse:0.73145
[500]	validation_0-rmse:0.72956
[550]	validation_0-rmse:0.72812
[600]	validation_0-rmse:0.72644
[650]	validation_0-rmse:0.72499
[700]	validation_0-rmse:0.72379
[750]	validation_0-rmse:0.72253
[799]	validation_0-rmse:0.72149

Multimodal validation SMAPE (on image subset): 55.9991%


FINAL TRAINING AND TEST PREDICTIONS

In [41]:
# Retrain text-only on full training data and predict on entire test
model_text_full = xgb.XGBRegressor(**{**xgb_params, 'n_estimators': 1200})

# Handle missing values in the training and test sets
X_text_train = np.nan_to_num(X_text_train)
X_text_test = np.nan_to_num(X_text_test)

# Fit the model
model_text_full.fit(
    X_text_train,
    train['y_log1p'].values,
    eval_set=[(X_text_train, train['y_log1p'].values)],
    verbose=50
)

# Predict and invert log
y_test_pred_log = model_text_full.predict(X_text_test)
y_test_pred = np.expm1(y_test_pred_log)

# Clip predictions to avoid extremes
y_test_pred = np.clip(y_test_pred, 1.0, price_99_5)

# Save text-only submission
sub_text = pd.DataFrame({'sample_id': test['sample_id'], 'price': y_test_pred})
sub_text.to_csv('test_out_text_only.csv', index=False)

print("Text-only prediction saved to test_out_text_only.csv")
sub_text.head()

[0]	validation_0-rmse:2.32011
[50]	validation_0-rmse:0.81481
[100]	validation_0-rmse:0.74960
[150]	validation_0-rmse:0.71670
[200]	validation_0-rmse:0.69017
[250]	validation_0-rmse:0.66829
[300]	validation_0-rmse:0.64810
[350]	validation_0-rmse:0.62948
[400]	validation_0-rmse:0.61336
[450]	validation_0-rmse:0.59745
[500]	validation_0-rmse:0.58274
[550]	validation_0-rmse:0.56906
[600]	validation_0-rmse:0.55588
[650]	validation_0-rmse:0.54278
[700]	validation_0-rmse:0.53105
[750]	validation_0-rmse:0.51896
[800]	validation_0-rmse:0.50769
[850]	validation_0-rmse:0.49665
[900]	validation_0-rmse:0.48577
[950]	validation_0-rmse:0.47572
[1000]	validation_0-rmse:0.46568
[1050]	validation_0-rmse:0.45619
[1100]	validation_0-rmse:0.44690
[1150]	validation_0-rmse:0.43792
[1199]	validation_0-rmse:0.42966
Text-only prediction saved to test_out_text_only.csv


Unnamed: 0,sample_id,price
0,100179,14.890229
1,245611,19.925413
2,146263,20.500971
3,95658,13.670382
4,36806,18.845936


trying different models

In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor, early_stopping

# --- Configuration ---
RANDOM_STATE = 42
TEST_SPLIT_RATIO = 0.15

# --- Helper Function ---
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error"""
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    denominator[denominator == 0] = 1e-8  # Avoid division by zero
    return np.mean(numerator / denominator) * 100

# --- Combine Text and Image Features ---
# Ensure train_clip_emb and X_text_train are already loaded
if 'train_clip_emb' not in locals() or 'X_text_train' not in locals():
    raise ValueError("Ensure 'train_clip_emb' and 'X_text_train' are defined before running this block.")

# Combine text and image embeddings
X_mm_train = np.hstack([X_text_train, train_clip_emb])
print("Combined multimodal feature shape:", X_mm_train.shape)

# Define target variable
y = train['y_log1p'].values

# --- Train-Test Split ---
X_tr, X_val, y_tr, y_val = train_test_split(X_mm_train, y, test_size=TEST_SPLIT_RATIO, random_state=RANDOM_STATE)

# --- Train LightGBM Model ---
lgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': RANDOM_STATE
}

model_lgb = LGBMRegressor(**lgb_params)

print("Training LightGBM model on multimodal data...")
model_lgb.fit(
    X_tr,
    y_tr,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=50, verbose=50)]
)

# --- Evaluate the Model ---
y_val_pred_log = model_lgb.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)  # Invert log1p transformation
y_val_true = np.expm1(y_val)  # Invert log1p transformation

# Calculate SMAPE
validation_smape = smape(y_val_true, y_val_pred)
print(f"Multimodal validation SMAPE: {validation_smape:.4f}%")

Combined multimodal feature shape: (75000, 907)
Training LightGBM model on multimodal data...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.159983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 229235
[LightGBM] [Info] Number of data points in the train set: 63750, number of used features: 907
[LightGBM] [Info] Start training from score 2.739550
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.525212
Multimodal validation SMAPE: 56.3305%




In [45]:
!pip install catboost
from catboost import CatBoostRegressor

# --- Train CatBoost Model ---
cat_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 3,
    'random_seed': RANDOM_STATE,
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 50,
    'verbose': 50
}

model_cat = CatBoostRegressor(**cat_params)

print("Training CatBoost model on multimodal data...")
model_cat.fit(
    X_tr,
    y_tr,
    eval_set=(X_val, y_val),
    use_best_model=True
)

# --- Evaluate the Model ---
y_val_pred_log = model_cat.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)  # Invert log1p transformation
y_val_true = np.expm1(y_val)  # Invert log1p transformation

# Calculate SMAPE
validation_smape = smape(y_val_true, y_val_pred)
print(f"Multimodal validation SMAPE with CatBoost: {validation_smape:.4f}%")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting catboost
  Downloading catboost-1.2.8-cp313-cp313-macosx_11_0_universal2.whl.metadata (1.4 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting matplotlib (from catboost)
  Downloading matplotlib-3.10.7-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.3.1-py3-none-any.whl.metadata (8.5 kB)
Collecting contourpy>=1.0.1 (from matplotlib->catboost)
  Using cached contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib->catboost)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->catboost)
  Using cached fonttools-4.60.1-cp313-cp313-macosx_10_13_universal2.whl.metadata (112 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib->catboost)
  Using cached kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pyparsing>=3 (from matplo

In [56]:
!pip install tensorflow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tensorflow
  Downloading tensorflow-2.20.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (4.5 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf>=5.28.0 (from tensorflow)
  Downloa

In [57]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model

# Load the pre-trained model
model_path = "model.h5"  # Path to your model file
model = load_model(model_path)
print(f"✅ Loaded model from {model_path}")

# Load the test image embeddings
test_clip_emb_path = "test_clip_2.npy"  # Path to your test embeddings file
test_clip_emb = np.load(test_clip_emb_path)
print(f"✅ Loaded test embeddings from {test_clip_emb_path}")
print(f"Test embeddings shape: {test_clip_emb.shape}")

# Ensure the test embeddings are valid
if not np.isfinite(test_clip_emb).all():
    raise ValueError("Test embeddings contain NaN or infinite values. Please clean the data.")

# Predict using the loaded model
print("⚡ Making predictions on test data...")
test_predictions_log = model.predict(test_clip_emb)  # Predictions in log scale
test_predictions = np.expm1(test_predictions_log)  # Invert log1p transformation

# Clip predictions to avoid extreme values
price_99_5 = train['price'].quantile(0.995)  # Assuming `train` DataFrame is loaded
test_predictions = np.clip(test_predictions, 1.0, price_99_5)

# Save predictions to a CSV file
output_file = "test_predictions.csv"
test_sample_ids = test['sample_id']  # Assuming `test` DataFrame is loaded
submission = pd.DataFrame({'sample_id': test_sample_ids, 'price': test_predictions.flatten()})
submission.to_csv(output_file, index=False)

print(f"✅ Predictions saved to {output_file}")
submission.head()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Matplotlib is building the font cache; this may take a moment.


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [26]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model

# Correct the path to the model file
model_path = "/Users/rashidixit/Downloads/student_resource/my_model (1).h5"  # Absolute path to your model file
model = load_model(model_path, compile=False)
print(f"✅ Loaded model from {model_path}")

model.summary()

✅ Loaded model from /Users/rashidixit/Downloads/student_resource/my_model (1).h5


In [27]:
X_test_text = np.load('test_text_emb.npy')
X_test_img = np.load('test_clip_2.npy')
X_test_img = np.nan_to_num(X_test_img)  # Handle NaNs

X_test = np.hstack([X_test_text, X_test_img])
print("Test feature shape:", X_test.shape)

#predict using the loaded model
print("⚡ Making predictions on test data...")
test_predictions = model.predict(X_test)  # Predictions in log scale
  # Invert log1p transformation

test_predictions.shape

test_predictions = np.maximum(test_predictions, 0)

t = pd.read_csv('dataset/test.csv')
submission = pd.DataFrame({'sample_id': t['sample_id'], 'price': test_predictions.flatten()})
submission.to_csv('test_out_new.csv', index=False)
print(f"✅ Predictions saved to test_out_new.csv")


Test feature shape: (75000, 896)
⚡ Making predictions on test data...
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
✅ Predictions saved to test_out_new.csv


#Fine tuning


In [20]:
# ===================================================================
# 1. SETUP & DATA LOADING
# ===================================================================
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# File paths
train_csv_path = "/Users/rashidixit/Downloads/student_resource/dataset/train.csv"
train_text_emb_path = "/Users/rashidixit/Downloads/student_resource/train_text_emb.npy"
train_clip_emb_path = "/Users/rashidixit/Downloads/student_resource/train_clip_embeddings.npy"

# Load data
df_train = pd.read_csv(train_csv_path)
text_features = np.load(train_text_emb_path)
image_features = np.load(train_clip_emb_path)

# Combine text and image features
X_text = np.array(text_features)
X_img = np.array(image_features)
X = np.concatenate([X_text, X_img], axis=1)
y_df = df_train[['price']]  # Load target as DataFrame for easier handling

# ===================================================================
# 2. DATA PREPROCESSING
# ===================================================================
print("\n--- Preprocessing Data ---")

# Combine features and target, and drop rows with NaN values
full_df = pd.concat([pd.DataFrame(X), y_df], axis=1)
full_df.dropna(inplace=True)

X = full_df.iloc[:, :-1].values
y_raw = full_df.iloc[:, -1].values
print(f"Data shape after dropping NaNs: {X.shape}")

# Scale the input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
scaler_path = "/Users/rashidixit/Downloads/student_resource/scaler.pkl"
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to {scaler_path}")

# Log-transform the target variable
y = np.log1p(y_raw)
print("Log-transform of target variable complete.")

# ===================================================================
# 3. CUSTOM SMAPE LOSS FUNCTION
# ===================================================================
def smape(y_true, y_pred):
    epsilon = 1e-6
    numerator = tf.abs(y_pred - y_true)
    denominator = (tf.abs(y_true) + tf.abs(y_pred) + epsilon) / 2.0
    return 100 * tf.reduce_mean(numerator / denominator)

# ===================================================================
# 4. BUILD THE MODEL
# ===================================================================
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l2

def build_stable_model(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(1024, activation='relu', kernel_regularizer=l2(1e-5)),
        BatchNormalization(),
        Dropout(0.4),

        Dense(512, activation='relu', kernel_regularizer=l2(1e-5)),
        BatchNormalization(),
        Dropout(0.3),

        Dense(256, activation='relu', kernel_regularizer=l2(1e-5)),
        BatchNormalization(),

        Dense(128, activation='relu', kernel_regularizer=l2(1e-5)),
        BatchNormalization(),

        Dense(1, activation='linear')
    ])
    return model

model = build_stable_model(X_scaled.shape[1])
model.summary()

# ===================================================================
# 5. COMPILE AND TRAIN
# ===================================================================
optimizer = AdamW(learning_rate=1e-2, weight_decay=1e-5)
model.compile(optimizer=optimizer, loss=smape)

# Callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7, verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1)
chk_path = "/Users/rashidixit/Downloads/student_resource/best_model.h5"
chk = ModelCheckpoint(chk_path, monitor='val_loss', save_best_only=True)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print(f"\n--- Starting Training on {len(X_train)} samples ---")
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[reduce_lr, early_stop, chk]
)

# ===================================================================
# 6. FINAL EVALUATION
# ===================================================================
print("\n--- Evaluating final model performance ---")
best_model = tf.keras.models.load_model(chk_path, custom_objects={'smape': smape})

val_preds_log = best_model.predict(X_val)
val_preds_original = np.expm1(val_preds_log.flatten())
y_val_original = np.expm1(y_val)

def smape_numpy(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

final_smape_score = smape_numpy(y_val_original, val_preds_original)

print("\n" + "="*50)
print(f"✅ Final SMAPE Score on Validation Set: {final_smape_score:.4f}%")
print("="*50)

# ===================================================================
# 7. PREDICT ON TEST DATA
# ===================================================================
# Load test data
test_text_emb_path = "/Users/rashidixit/Downloads/student_resource/test_text_emb.npy"
test_clip_emb_path = "/Users/rashidixit/Downloads/student_resource/test_clip_embeddings.npy"
test_csv_path = "/Users/rashidixit/Downloads/student_resource/dataset/test.csv"

test_text_features = np.load(test_text_emb_path)
test_image_features = np.load(test_clip_emb_path)
test_df = pd.read_csv(test_csv_path)

# Combine test features
X_test = np.concatenate([test_text_features, test_image_features], axis=1)
X_test_scaled = scaler.transform(X_test)

# Predict
print("⚡ Making predictions on test data...")
test_preds_log = best_model.predict(X_test_scaled)
test_preds = np.expm1(test_preds_log.flatten())

# Save predictions
output_file = "/Users/rashidixit/Downloads/student_resource/test_predictions.csv"
submission = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': test_preds})
submission.to_csv(output_file, index=False)
print(f"✅ Predictions saved to {output_file}")


--- Preprocessing Data ---
Data shape after dropping NaNs: (74984, 896)
Scaler saved to /Users/rashidixit/Downloads/student_resource/scaler.pkl
Log-transform of target variable complete.



--- Starting Training on 59987 samples ---
Epoch 1/100
[1m465/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - loss: 43.2964



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - loss: 30.2729 - val_loss: 25.5706 - learning_rate: 0.0100
Epoch 2/100
[1m465/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - loss: 23.6113



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - loss: 23.3676 - val_loss: 23.0918 - learning_rate: 0.0100
Epoch 3/100
[1m465/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - loss: 22.3437



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - loss: 22.4197 - val_loss: 22.3812 - learning_rate: 0.0100
Epoch 4/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - loss: 21.8029 - val_loss: 22.7967 - learning_rate: 0.0100
Epoch 5/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - loss: 21.3975 - val_loss: 23.0633 - learning_rate: 0.0100
Epoch 6/100
[1m468/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - loss: 20.9989



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - loss: 21.0092 - val_loss: 21.9419 - learning_rate: 0.0100
Epoch 7/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 20.7750 - val_loss: 22.1192 - learning_rate: 0.0100
Epoch 8/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 20.5395 - val_loss: 22.0654 - learning_rate: 0.0100
Epoch 9/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - loss: 20.4063 - val_loss: 22.2274 - learning_rate: 0.0100
Epoch 10/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 20.2719 - val_loss: 22.0557 - learning_rate: 0.0100
Epoch 11/100
[1m466/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - loss: 19.9617
Epoch 11: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - loss: 20.270



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 19.2798 - val_loss: 21.7161 - learning_rate: 0.0050
Epoch 13/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 18.7615 - val_loss: 21.9148 - learning_rate: 0.0050
Epoch 14/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 18.6913 - val_loss: 22.1212 - learning_rate: 0.0050
Epoch 15/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 18.4863 - val_loss: 22.1903 - learning_rate: 0.0050
Epoch 16/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 18.3766 - val_loss: 22.4088 - learning_rate: 0.0050
Epoch 17/100
[1m468/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - loss: 17.9517
Epoch 17: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 18



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - loss: 15.0760 - val_loss: 21.6886 - learning_rate: 0.0012
Epoch 26/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 14.8872 - val_loss: 21.7014 - learning_rate: 0.0012
Epoch 27/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - loss: 14.7214 - val_loss: 21.6978 - learning_rate: 0.0012
Epoch 28/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - loss: 14.5587 - val_loss: 21.7056 - learning_rate: 0.0012
Epoch 29/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - loss: 14.4013 - val_loss: 21.7805 - learning_rate: 0.0012
Epoch 30/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 14.2232
Epoch 30: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 14



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - loss: 13.9292 - val_loss: 21.6512 - learning_rate: 6.2500e-04
Epoch 32/100
[1m468/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 23ms/step - loss: 13.8420



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - loss: 13.8380 - val_loss: 21.6161 - learning_rate: 6.2500e-04
Epoch 33/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 13.6958 - val_loss: 21.6197 - learning_rate: 6.2500e-04
Epoch 34/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - loss: 13.6105 - val_loss: 21.6271 - learning_rate: 6.2500e-04
Epoch 35/100
[1m467/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - loss: 13.6071



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - loss: 13.5469 - val_loss: 21.6092 - learning_rate: 6.2500e-04
Epoch 36/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - loss: 13.3735 - val_loss: 21.6181 - learning_rate: 6.2500e-04
Epoch 37/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 13.3231 - val_loss: 21.6283 - learning_rate: 6.2500e-04
Epoch 38/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 13.2599 - val_loss: 21.6390 - learning_rate: 6.2500e-04
Epoch 39/100
[1m468/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - loss: 13.2197



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - loss: 13.2047 - val_loss: 21.5958 - learning_rate: 6.2500e-04
Epoch 40/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - loss: 13.1571 - val_loss: 21.6404 - learning_rate: 6.2500e-04
Epoch 41/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 13.0739



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - loss: 13.1133 - val_loss: 21.5705 - learning_rate: 6.2500e-04
Epoch 42/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 13.0117 - val_loss: 21.6489 - learning_rate: 6.2500e-04
Epoch 43/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 12.9807



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - loss: 12.9685 - val_loss: 21.5372 - learning_rate: 6.2500e-04
Epoch 44/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - loss: 12.8768 - val_loss: 21.5634 - learning_rate: 6.2500e-04
Epoch 45/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 12.6716



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - loss: 12.7888 - val_loss: 21.5082 - learning_rate: 6.2500e-04
Epoch 46/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - loss: 12.7571 - val_loss: 21.5348 - learning_rate: 6.2500e-04
Epoch 47/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 12.7110



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - loss: 12.7077 - val_loss: 21.4867 - learning_rate: 6.2500e-04
Epoch 48/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - loss: 12.5223 - val_loss: 21.5623 - learning_rate: 6.2500e-04
Epoch 49/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 12.6004 - val_loss: 21.4877 - learning_rate: 6.2500e-04
Epoch 50/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - loss: 12.5370 - val_loss: 21.5178 - learning_rate: 6.2500e-04
Epoch 51/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 12.4492 - val_loss: 21.4969 - learning_rate: 6.2500e-04
Epoch 52/100
[1m468/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - loss: 12.3813
Epoch 52: ReduceLROnPlateau reducing learning rate to 0.0003124999930150807.
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - loss: 12.1881 - val_loss: 21.4826 - learning_rate: 3.1250e-04
Epoch 55/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 12.1111 - val_loss: 21.5336 - learning_rate: 3.1250e-04
Epoch 56/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 12.1298 - val_loss: 21.5228 - learning_rate: 3.1250e-04
Epoch 57/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 12.0715 - val_loss: 21.4827 - learning_rate: 3.1250e-04
Epoch 58/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 11.9858



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 12.0598 - val_loss: 21.4691 - learning_rate: 3.1250e-04
Epoch 59/100
[1m467/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - loss: 11.8667



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.9954 - val_loss: 21.4691 - learning_rate: 3.1250e-04
Epoch 60/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.9831 - val_loss: 21.4775 - learning_rate: 3.1250e-04
Epoch 61/100
[1m466/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - loss: 11.8342



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.9409 - val_loss: 21.4600 - learning_rate: 3.1250e-04
Epoch 62/100
[1m468/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - loss: 11.9279



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.8653 - val_loss: 21.4487 - learning_rate: 3.1250e-04
Epoch 63/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.9192 - val_loss: 21.4623 - learning_rate: 3.1250e-04
Epoch 64/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.8270 - val_loss: 21.4516 - learning_rate: 3.1250e-04
Epoch 65/100
[1m468/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - loss: 11.8285



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.8253 - val_loss: 21.4016 - learning_rate: 3.1250e-04
Epoch 66/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.7822 - val_loss: 21.4409 - learning_rate: 3.1250e-04
Epoch 67/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.7241 - val_loss: 21.4386 - learning_rate: 3.1250e-04
Epoch 68/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 11.7412 - val_loss: 21.4409 - learning_rate: 3.1250e-04
Epoch 69/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - loss: 11.6271 - val_loss: 21.4292 - learning_rate: 3.1250e-04
Epoch 70/100
[1m466/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 11.6033
Epoch 70: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - loss: 11.5158 - val_loss: 21.4007 - learning_rate: 7.8125e-05
Epoch 78/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - loss: 11.4412 - val_loss: 21.4140 - learning_rate: 7.8125e-05
Epoch 79/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - loss: 11.3867 - val_loss: 21.4051 - learning_rate: 7.8125e-05
Epoch 80/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 11.2723



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - loss: 11.3371 - val_loss: 21.3957 - learning_rate: 7.8125e-05
Epoch 81/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - loss: 11.3970 - val_loss: 21.4167 - learning_rate: 7.8125e-05
Epoch 82/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - loss: 11.3128 - val_loss: 21.4055 - learning_rate: 7.8125e-05
Epoch 83/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - loss: 11.3767 - val_loss: 21.4072 - learning_rate: 7.8125e-05
Epoch 84/100
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - loss: 11.4000 - val_loss: 21.4002 - learning_rate: 7.8125e-05
Epoch 85/100
[1m467/469[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - loss: 11.3989
Epoch 85: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m




--- Evaluating final model performance ---
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

✅ Final SMAPE Score on Validation Set: 50.1018%
⚡ Making predictions on test data...
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
✅ Predictions saved to /Users/rashidixit/Downloads/student_resource/test_predictions.csv


In [23]:
pd.read_csv("test_predictions.csv").isna().sum()
df = pd.read_csv("test_predictions.csv")
df['price'] = df['price'].fillna(df['price'].median())
df.to_csv("test_predictions.csv", index=False)


In [24]:
pd.read_csv("test_predictions.csv").isna().sum()

sample_id    0
price        0
dtype: int64