# Product Price Prediction: Fine-Tuning GPT-4o-mini to Beat $76 Baseline Error

## Project Description

This project fine-tunes OpenAI's GPT-4o-mini model to predict product prices based solely on product descriptions. The goal is to improve upon the baseline mean absolute error of $76 by training the model to estimate prices from textual product information including specifications, features, and descriptions.

**What I Did Differently**
- I used a specially curated open source subset of the Amazon Review Dataset to train the model

The dataset used for this project is the **Amazon Product Price Prediction Dataset** curated specifically for LLM fine-tuning by Jai Keshav Sharma (2024). This dataset contains 400,000 Amazon product listings with detailed descriptions and corresponding prices, making it ideal for training language models on price estimation tasks.

### Dataset Citation
```
@dataset{sharma2024amazon_price_prediction,
  title={Amazon Product Price Prediction Dataset: Curated for LLM Fine-tuning},
  author={Jai Keshav Sharma},
  year={2024},
  publisher={Hugging Face},
  url={https://huggingface.co/datasets/ksharma9719/Amazon-Reviews-2023-curated_for_price_prediction}
}
```

The model is trained to output price estimates in a standardized format, enabling e-commerce applications such as automated pricing, market analysis, and competitive intelligence.


Project Description:

In [None]:
# imports

import os
import re
from google.colab import userdata
import json
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
from openai import OpenAI
from typing import Optional
import re
from datasets import load_dataset
import random


In [None]:
# Read both API credentials from the Colab `userdata` store.
hf_token = userdata.get('HF_TOKEN')
openai_api_key = userdata.get('OPENAI_API_KEY')

# Authenticate with the Hugging Face Hub, then build the OpenAI client.
# Note: `openai` is the client *instance* that every later cell calls.
login(hf_token, add_to_git_credential=True)
openai = OpenAI(api_key=openai_api_key)

In [None]:
# Load the specially curated price-prediction dataset published by ksharma9719.
# Only the single parquet file named below is read; it is exposed as dataset["train"].
dataset = load_dataset(
    "ksharma9719/Amazon-Reviews-2023-curated_for_price_prediction",
    data_files="data/train-00000-of-00001.parquet"
)

In [None]:
# Split the loaded "train" data into a training set and a 2,000-row held-out test set.
total_length = len(dataset["train"])

# Shuffle the row indices with a fixed seed so the same split is produced on every run.
all_indices = list(range(total_length))
random.seed(42)
random.shuffle(all_indices)

# The last 2,000 shuffled indices become the test set; all the others are training rows.
train_indices = all_indices[:-2000]
test_indices = all_indices[-2000:]

train_data = dataset["train"].select(train_indices)
test_data = dataset["train"].select(test_indices)

In [None]:
print(f"Total entries: {total_length}")
print(f"Training entries: {len(train_data)}")
print(f"Test entries: {len(test_data)}")

In [None]:
# OpenAI recommends fine-tuning with populations of 50-100 examples.
# Because each example here is very small, 200 examples are used instead (with 1 epoch).
# Training uses rows 0-199 of train_data; validation uses rows 200-249, so the two do not overlap.

fine_tune_train = train_data.select(range(200))
fine_tune_validation = train_data.select(range(200,250))

## Preparing the data for Fine Tuning Using JSONL

In [None]:
# Builds the chat messages for one item, stripping the price out of the prompt text so that the answer cannot leak into the model's input
def messages_for(item):
    """Build the system / user / assistant chat messages for one dataset row.

    The user message is the row's "text" with every rendering of the row's
    "price" scrubbed out; the assistant message is the price formatted as
    "Price is $X.XX".

    Note that the assistant turn carries the true price. That is what a
    training example needs; a caller that builds an inference prompt from
    this output should drop that turn.
    """
    system_message = "You are a price estimation assistant. Respond only with the estimated price in the format: Price is $X.XX"

    text = item["text"]
    price = item["price"]

    # Drop the boilerplate wording that surrounds the embedded answer.
    text = text.replace(" to the nearest dollar", "").replace("\n\nPrice is $", "")

    # Every way the price might have been written into the text.
    two_decimals = f"{price:.2f}"
    candidates = (
        two_decimals,
        f"{price:.0f}",
        f"{price}",
        f"{int(price)}",
        two_decimals.replace('.', ''),
    )

    for candidate in candidates:
        # An exact suffix match is the common case; once found, stop looking.
        if text.endswith(candidate):
            text = text[:-len(candidate)].strip()
            break
        with_dollar = f"${candidate}"
        if with_dollar in text:
            text = text.replace(with_dollar, "").strip()
        # Same suffix check again, this time ignoring trailing whitespace.
        trimmed = text.rstrip()
        if trimmed.endswith(candidate):
            text = trimmed[:-len(candidate)].strip()

    # Any number still glued to the very end is treated as a leaked price.
    text = re.sub(r'(\d+\.?\d{0,2})$', '', text).strip()

    # Remove every remaining dollar amount, wherever it occurs.
    text = re.sub(r'\$\s*[\d,]+\.?\d{0,2}', '', text)

    # Last pass: the whole-dollar price standing alone at the end.
    trailing_whole_price = re.compile(rf'\b{int(price)}\b\s*$')
    if trailing_whole_price.search(text):
        text = trailing_whole_price.sub('', text).strip()

    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": text.strip()},
        {"role": "assistant", "content": f"Price is ${item['price']:.2f}"},
    ]

In [None]:
messages_for(train_data[0])

In [None]:
# Convert the items into a list of json objects - a "jsonl" string
# Each row represents a message in the form:
# {"messages" : [{"role": "system", "content": "You estimate prices...

def make_jsonl(items):
    """Return the items as one JSONL string.

    Each item becomes a single line of the form {"messages": [...]}, where
    the list comes from messages_for(item). Lines are joined with newlines
    and there is no trailing newline.
    """
    return '\n'.join(
        json.dumps({"messages": messages_for(entry)}) for entry in items
    )

In [None]:
print(make_jsonl(train_data.select(range(3))))

In [None]:

def write_jsonl(items, filename):
    """Write the items to `filename` in JSONL form, replacing any existing file."""
    with open(filename, "w") as out_file:
        out_file.write(make_jsonl(items))

In [None]:
write_jsonl(fine_tune_train, "fine_tune_train.jsonl")

In [None]:
write_jsonl(fine_tune_validation, "fine_tune_validation.jsonl")

In [None]:
# Upload the training JSONL to OpenAI; the returned object's id is passed to the fine-tuning job later.
with open("fine_tune_train.jsonl", "rb") as f:
    train_file = openai.files.create(file=f, purpose="fine-tune")

In [None]:
train_file

In [None]:
# Upload the validation JSONL to OpenAI; the returned object's id is passed to the fine-tuning job later.
with open("fine_tune_validation.jsonl", "rb") as f:
    validation_file = openai.files.create(file=f, purpose="fine-tune")

In [None]:
validation_file

In [None]:
# Integration config handed to the fine-tuning job: report to the Weights & Biases project "gpt-pricer".
# NOTE(review): presumably requires W&B to be linked to the OpenAI account beforehand - confirm.
wandb_integration = {"type": "wandb", "wandb": {"project": "gpt-pricer"}}

In [None]:
# Launch the fine-tuning job on the two uploaded files: gpt-4o-mini base model,
# a single epoch, a fixed seed, and the wandb_integration config defined earlier.
# NOTE(review): the returned job object is not stored here; the job id is set by hand in a later cell.
openai.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=validation_file.id,
    model="gpt-4o-mini-2024-07-18",
    seed=42,
    hyperparameters={"n_epochs": 1},
    integrations = [wandb_integration],
    suffix="pricer"
)

In [None]:
# The job id is hard-coded so that later cells keep pointing at one specific fine-tuning job.
# Alternative (presumably picks the most recently created job - confirm against the API docs):
# job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id
job_id="ftjob-kMWRKdN9t8H0lDAxzHT5kmeB"
print(job_id)

In [None]:
openai.fine_tuning.jobs.retrieve(job_id)

In [None]:
# Name of the model produced by the job; gpt_fine_tuned passes it as `model=`.
# NOTE(review): presumably None until the job has finished successfully - confirm before running inference.
fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model
print(fine_tuned_model_name)

In [None]:
# Redefinition of messages_for: same logic as the earlier version, with explanatory comments


def messages_for(item):
    """Return the three chat messages (system, user, assistant) for one row.

    The user content is item["text"] after removing the price in all the
    forms it may appear in; the assistant content is "Price is $X.XX" built
    from item["price"].

    The assistant turn therefore contains the true price: fine for a
    training example, but a caller assembling an inference prompt should
    leave that turn out.
    """
    prompt = item["text"]
    price = item["price"]

    # Strip the fixed phrases that frame the embedded answer.
    for boilerplate in (" to the nearest dollar", "\n\nPrice is $"):
        prompt = prompt.replace(boilerplate, "")

    # Possible renderings of the price, e.g. 329.00 / 329 / 329.0 / 329 / 32900.
    fixed = f"{price:.2f}"
    renderings = [
        fixed,
        f"{price:.0f}",
        f"{price}",
        f"{int(price)}",
        fixed.replace('.', ''),
    ]

    for shown in renderings:
        # Price sitting exactly at the end: cut it off and stop searching.
        if prompt.endswith(shown):
            prompt = prompt[:-len(shown)].strip()
            break
        # Price written with a dollar sign anywhere in the text.
        if f"${shown}" in prompt:
            prompt = prompt.replace(f"${shown}", "").strip()
        # Price at the end once trailing whitespace is ignored.
        if prompt.rstrip().endswith(shown):
            prompt = prompt.rstrip()[:-len(shown)].strip()

    # A number left at the very end (even glued to a word) is treated as the price.
    prompt = re.sub(r'(\d+\.?\d{0,2})$', '', prompt).strip()

    # Dollar amounts anywhere in the text are removed as well.
    prompt = re.sub(r'\$\s*[\d,]+\.?\d{0,2}', '', prompt)

    # Safety net: the whole-dollar price alone at the end. The substitution is a
    # no-op when nothing matches, and the strip() below tidies up either way.
    prompt = re.sub(rf'\b{int(price)}\b\s*$', '', prompt)

    answer = f"Price is ${item['price']:.2f}"
    return [
        {
            "role": "system",
            "content": "You are a price estimation assistant. Respond only with the estimated price in the format: Price is $X.XX",
        },
        {"role": "user", "content": prompt.strip()},
        {"role": "assistant", "content": answer},
    ]

In [None]:

def get_price(s):
    """Extract the first number found in a model reply and return it as a float.

    Dollar signs and thousands separators are removed first, so "$1,299.50"
    parses as 1299.5.

    Args:
        s: the reply text. May be None or empty (a chat completion's message
            content is not guaranteed to be a string).

    Returns:
        The first number in the text as a float, or 0.0 when there is no
        text or no number in it. The fallback is a float so that every code
        path returns the same type.
    """
    if not s:
        return 0.0
    cleaned = s.replace('$', '').replace(',', '')
    # Prefer a decimal number; fall back to a plain run of digits.
    match = re.search(r"[-+]?\d*\.\d+|\d+", cleaned)
    return float(match.group()) if match else 0.0

In [None]:
get_price("The price is roughly $99.99 because blah blah")

In [None]:
# Prediction function backed by the fine-tuned gpt-4o-mini model

def gpt_fine_tuned(item):
    """Ask the fine-tuned model for a price estimate for one item.

    Args:
        item: a dataset row accepted by messages_for (it reads "text" and
            "price").

    Returns:
        The number parsed from the model's reply by get_price.
    """
    # messages_for() produces a complete *training* example whose assistant
    # turn contains the true price. Sending that turn would hand the model
    # the answer and invalidate the evaluation, so only the system and user
    # turns are sent.
    prompt_messages = [
        message for message in messages_for(item)
        if message["role"] != "assistant"
    ]
    response = openai.chat.completions.create(
        model=fine_tuned_model_name,
        messages=prompt_messages,
        seed=42,
        max_tokens=7
    )
    reply = response.choices[0].message.content
    # The reply content may be None; treat that as "no answer".
    return get_price(reply or "")

In [None]:
# Smoke test on the first test row: print the true price, then the model's estimate.
item = test_data.select([0])[0]  # Select returns a dataset, so index [0] to get the item
print(item["price"])
print(gpt_fine_tuned(item))

In [None]:
import math
import matplotlib.pyplot as plt

# ANSI escape sequences used to colour the per-item lines printed by Tester.
GREEN = "\033[92m"
BLUE = "\033[94m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"  # switches the terminal back to its default colour
# Maps the colour names returned by Tester.color_for to escape sequences.
# Note that "orange" is printed using the yellow sequence.
COLOR_MAP = {"red":RED, "orange": YELLOW, "green": GREEN, "blue":BLUE}

class Tester:
    """Run a price predictor over the first rows of a dataset and report accuracy.

    For every evaluated datapoint the predictor's guess is compared with the
    datapoint's "price": the absolute error and the squared log error are
    recorded and one coloured line is printed. At the end a scatter plot of
    guess against truth is shown, with the aggregate metrics in its title.
    """

    def __init__(self, predictor, data, title=None, size=250):
        """Store the configuration and create empty result lists.

        Args:
            predictor: callable that takes one datapoint and returns a
                numeric price guess.
            data: indexable, sized collection of datapoints; each datapoint
                must provide "price" and "text".
            title: prefix for the chart title. Defaults to a prettified
                version of predictor.__name__.
            size: maximum number of datapoints to evaluate, starting at
                index 0.
        """
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Classify a prediction by its absolute and relative error.

        Returns "blue" (a hit) when the error is under $40 or under 20% of
        the truth, "orange" when it is under $80 or under 40%, else "red".
        """
        # A true price of 0 has no meaningful relative error; treat it as
        # infinite so only the absolute thresholds can apply, instead of
        # raising ZeroDivisionError.
        relative_error = error / truth if truth else math.inf
        if error < 40 or relative_error < 0.2:
            return "blue"
        elif error < 80 or relative_error < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Evaluate datapoint `i`, record its metrics and print one summary line."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint)
        truth = datapoint["price"]
        error = abs(guess - truth)
        # Squared log error; the +1 keeps the logarithm defined for a price of 0.
        log_error = math.log(truth+1) - math.log(guess+1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        # Shorten long item texts so each result fits on one line.
        title = datapoint["text"] if len(datapoint["text"]) <= 40 else datapoint["text"][:40]+"..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Show a scatter plot of guesses against truths with a y = x reference line."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Compute mean absolute error, RMSLE and hit rate, then draw the chart."""
        # Average over the datapoints actually evaluated, which can be fewer
        # than the requested size when the dataset is small.
        evaluated = len(self.errors)
        if evaluated == 0:
            print(f"{self.title}: no datapoints were evaluated")
            return
        average_error = sum(self.errors) / evaluated
        rmsle = math.sqrt(sum(self.sles) / evaluated)
        hits = sum(1 for color in self.colors if color=="blue")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/evaluated*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate up to `size` datapoints in order, then report."""
        self.error = 0
        # Never index past the end of the data.
        for i in range(min(self.size, len(self.data))):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Convenience entry point: build a Tester with default settings and run it."""
        cls(function, data).run()

In [None]:
# Evaluate the fine-tuned model on the test set; with Tester's default size this covers up to the first 250 rows.
Tester.test(gpt_fine_tuned, test_data)