<a href="https://colab.research.google.com/github/dkisselev-zz/llm_engineering/blob/wk6-exerc/week6/community-contributions/dkisselev-zz/Week6-Excerise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Product Pricer Challenge

Establish a baseline with GPT-4o, then attempt to beat it.


## Initialize and load configuration

In [None]:
# imports

import os
import re
import math
import json
import random
import pickle

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

from huggingface_hub import login
from openai import OpenAI

from items import Item
from testing import Tester

In [None]:
# environment
#
# Put the OpenAI and HuggingFace credentials into os.environ.
# Colab secrets are tried first; on any failure the local .env file is used instead.

try:
  from google.colab import userdata
  os.environ['OPENAI_API_KEY']=userdata.get('OPENAI_API_KEY')
  os.environ['HF_TOKEN']=userdata.get('HF_TOKEN')
  print("✅ Using Colab secrets")
except Exception:
  # Not running in Colab (ImportError) or the secrets could not be read.
  # `except Exception` instead of a bare `except:` so that KeyboardInterrupt and
  # SystemExit are not silently swallowed by the fallback.
  from dotenv import load_dotenv
  load_dotenv(override=True)
  # The placeholder default keeps the later os.environ lookups from raising KeyError;
  # it is not a usable credential.
  os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
  os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')
  print("✅ Using local .env file")

In [None]:
# Log in to HuggingFace
# Uses the HF_TOKEN value placed in os.environ by the environment cell;
# os.environ[...] raises KeyError if the variable is missing.

hf_token = os.environ['HF_TOKEN']
login(hf_token)

In [None]:
# Client object used for every OpenAI call in this notebook.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment set earlier - confirm.
openai = OpenAI()

In [None]:
%matplotlib inline

In [None]:
# Reuse the previously curated datasets instead of curating all the data again.
# NOTE: pickle.load can execute arbitrary code - only load pickle files you created or trust.

def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

train = _load_pickle('train2.pkl')
test = _load_pickle('test2.pkl')
validation = _load_pickle('validation2.pkl')

In [None]:
# OpenAI recommends fine-tuning with populations of 50-100 examples.
# Because each example here is very small, use 500 training examples (and 1 epoch);
# the 50 items of `train` that follow them serve as the fine-tuning validation set.

FINE_TUNE_TRAIN_SIZE = 500
FINE_TUNE_VALIDATION_SIZE = 50

fine_tune_train = train[:FINE_TUNE_TRAIN_SIZE]
fine_tune_validation = train[FINE_TUNE_TRAIN_SIZE:FINE_TUNE_TRAIN_SIZE + FINE_TUNE_VALIDATION_SIZE]

In [None]:
# Weights & Biases integration settings, passed to the fine-tuning jobs
wandb_integration = dict(
    type="wandb",
    wandb=dict(project="gpt-pricer-ft"),
)

## Helpers

In [None]:
# A utility function to extract the price from a string

def get_price(s):
    """Return the first number found in `s` as a float.

    Dollar signs and commas (thousands separators) are removed before searching.

    Args:
        s: Text that may contain a price, e.g. "Price is $1,234.56". May be None
            or empty; the callers in this notebook pass a chat completion's
            message content, which is not guaranteed to be a string.

    Returns:
        The first numeric value as a float, or 0 when `s` is None/empty or
        contains no number.
    """
    # A missing reply is scored as a price of 0 rather than aborting a whole test run
    if not s:
        return 0
    s = s.replace('$','').replace(',','')
    match = re.search(r"[-+]?\d*\.\d+|\d+", s)
    return float(match.group()) if match else 0

In [None]:
# Prompt
def messages_for(item):
    system_message = "You estimate prices of items. Reply only with the price, no explanation"
    user_prompt = item.test_prompt().replace(" to the nearest dollar","").replace("\n\nPrice is $","")
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "Price is $"}
    ]

In [None]:
def messages_with_price(item):
    system_message = "You estimate prices of items. Reply only with the price, no explanation"
    user_prompt = item.test_prompt().replace(" to the nearest dollar","").replace("\n\nPrice is $","")
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": f"Price is ${item.price:.2f}"}
    ]

## Baseline *gpt4o*

**Error:** $78.02 RMSLE 0.81 Hits 55.6%

In [None]:
def gpt_4o_frontier(item):
    """Estimate the price of `item` with the gpt-4o-2024-08-06 model.

    Sends the messages built by messages_for and returns whatever get_price
    extracts from the first choice of the reply.
    """
    request = dict(
        model="gpt-4o-2024-08-06",
        messages=messages_for(item),
        seed=42,
        max_tokens=5,
    )
    completion = openai.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return get_price(first_choice.message.content)

In [None]:
# Evaluate the gpt-4o baseline on the test set with the Tester helper from the local `testing` module
Tester.test(gpt_4o_frontier, test)

## Fine-tuned baseline *gpt-4o-mini*

**Error:** $105.37 RMSLE 0.84 Hits 41.2%

### Data Preprocessing

Prepare our data for fine-tuning in JSONL (JSON Lines) format and upload to OpenAI

In [None]:
# Convert the items into a list of json objects - a "jsonl" string
# Each row represents a message in the form:
# {"messages" : [{"role": "system", "content": "You estimate prices...

def make_jsonl(items, message_builder=None):
    """Serialize `items` as JSON Lines for chat fine-tuning.

    Args:
        items: Iterable of items to convert.
        message_builder: Optional callable taking one item and returning its list
            of chat messages. Defaults to messages_with_price, so existing
            one-argument calls behave as before.

    Returns:
        A string with one {"messages": [...]} JSON object per line and no
        trailing newline; the empty string when `items` is empty.
    """
    if message_builder is None:
        message_builder = messages_with_price
    # join() avoids repeated string concatenation and the final strip()
    return "\n".join(
        json.dumps({"messages": message_builder(item)}) for item in items
    )

In [None]:
# Convert the items into jsonl and write them to a file

def write_jsonl(items, filename):
    """Write `items` to `filename` in the JSONL form produced by make_jsonl.

    Args:
        items: Items to serialize.
        filename: Path of the file to create or overwrite.
    """
    # Build the payload before opening the file, so a serialization error
    # does not leave an empty or truncated file behind.
    jsonl = make_jsonl(items)
    # Explicit encoding keeps the output independent of the platform default
    with open(filename, "w", encoding="utf-8") as f:
        f.write(jsonl)

In [None]:
# Write the fine-tuning training and validation examples to JSONL files in the working directory
write_jsonl(fine_tune_train, "fine_tune_train.jsonl")
write_jsonl(fine_tune_validation, "fine_tune_validation.jsonl")

In [None]:
# Upload the training JSONL to OpenAI for fine-tuning; train_file.id is used when creating the job
with open("fine_tune_train.jsonl", "rb") as f:
    train_file = openai.files.create(file=f, purpose="fine-tune")

In [None]:
# Display the object returned by the training-file upload
train_file

In [None]:
# Upload the validation JSONL to OpenAI for fine-tuning; validation_file.id is used when creating the job
with open("fine_tune_validation.jsonl", "rb") as f:
    validation_file = openai.files.create(file=f, purpose="fine-tune")

In [None]:
# Display the object returned by the validation-file upload
validation_file

### Fine Tune the model

In [None]:
# Start a fine-tuning job for gpt-4o-mini on the uploaded files:
# a single epoch, fixed seed, metrics sent to Weights & Biases via wandb_integration.
# The returned job object is only displayed, not stored in a variable.
openai.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=validation_file.id,
    model="gpt-4o-mini-2024-07-18",
    seed=42,
    hyperparameters={"n_epochs": 1},
    integrations = [wandb_integration],
    suffix="pricer"
)

In [None]:
# Show a single entry from the fine-tuning job list (limit=1)
openai.fine_tuning.jobs.list(limit=1)

In [None]:
# Take the id of the first job in the listing.
# NOTE(review): this assumes the listing returns the job created above first; if another
# job was started on the account in the meantime this picks up the wrong one - confirm.
job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id

In [None]:
# Display the selected job id
job_id

In [None]:
# Fetch and display the current state of the fine-tuning job
openai.fine_tuning.jobs.retrieve(job_id)

In [None]:
# Show up to 10 events for the fine-tuning job (re-run to follow progress)
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data

### Run inference on the fine tune model

In [None]:
# Name of the model produced by the job, read from the job's fine_tuned_model field.
# NOTE(review): presumably empty until the job has finished - confirm before running inference.
fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model

In [None]:
# Try this out: show the messages built for one test item

messages_for(test[237])

In [None]:
# Price an item with the fine-tuned model

def gpt_fine_tuned(item):
    """Estimate the price of `item` with the model named by the global `fine_tuned_model_name`.

    The name is looked up at call time, so the call uses whichever fine-tuned
    model that global refers to when this function runs. Returns whatever
    get_price extracts from the first choice of the reply.
    """
    request = dict(
        model=fine_tuned_model_name,
        messages=messages_for(item),
        seed=42,
        max_tokens=7,
    )
    completion = openai.chat.completions.create(**request)
    answer = completion.choices[0].message.content
    return get_price(answer)

In [None]:
# Spot check on one test item: actual price, then the fine-tuned model's estimate
print(test[237].price)
print(gpt_fine_tuned(test[237]))

In [None]:
# Evaluate the fine-tuned model on the test set
Tester.test(gpt_fine_tuned, test)

## gpt 4.1 base
**Error:** $70.36 RMSLE=0.522 Hits=64.4%

In [None]:
def gpt_4_1_frontier(item):
    """Estimate the price of `item` with the gpt-4.1-2025-04-14 model.

    Sends the messages built by messages_for and returns whatever get_price
    extracts from the first choice of the reply.
    """
    request = dict(
        model="gpt-4.1-2025-04-14",
        messages=messages_for(item),
        seed=42,
        max_completion_tokens=7,
    )
    completion = openai.chat.completions.create(**request)
    answer = completion.choices[0].message.content
    return get_price(answer)

In [None]:
# Evaluate the gpt-4.1 baseline on the test set
Tester.test(gpt_4_1_frontier, test)

### gpt 4.1 hypertuned, extended dataset
**Error:** $67.93 RMSLE=0.47 Hits=68.8%

In [None]:
def gpt_4_1_hypertuned(item):
    """Estimate the price of `item` with gpt-4.1-2025-04-14 using the v2 prompt.

    Uses messages_v2 (without the price) and a temperature of 0.2, and returns
    whatever get_price extracts from the first choice of the reply.
    """
    request = dict(
        model="gpt-4.1-2025-04-14",
        messages=messages_v2(item, with_price=False),
        seed=42,
        temperature=0.2,
        max_completion_tokens=7,
    )
    completion = openai.chat.completions.create(**request)
    answer = completion.choices[0].message.content
    return get_price(answer)

In [None]:
# Evaluate gpt-4.1 with the v2 prompt on the test set.
# NOTE: gpt_4_1_hypertuned calls messages_v2, which is only defined further down in the
# "gpt 4.1 fine-tuned with extended dataset" section. On a fresh kernel, run that
# definition first, otherwise this cell fails with NameError.
Tester.test(gpt_4_1_hypertuned, test)

## gpt-5 base
**Error:** $77.97 RMSLE=0.53 Hits=61.6% (reasoning_effort="minimal")

**Error:** $70.63 RMSLE=0.51 Hits=61.6% (reasoning_effort="low")

In [None]:
def gpt_5_frontier(item):
    """Estimate the price of `item` with the gpt-5-2025-08-07 model.

    Requests reasoning_effort="low" with a budget of 800 completion tokens, and
    returns whatever get_price extracts from the first choice of the reply.
    """
    request = dict(
        model="gpt-5-2025-08-07",
        messages=messages_for(item),
        seed=42,
        reasoning_effort="low",
        max_completion_tokens=800,
    )
    completion = openai.chat.completions.create(**request)
    answer = completion.choices[0].message.content
    return get_price(answer)

In [None]:
# Evaluate the gpt-5 baseline on the test set
Tester.test(gpt_5_frontier, test)

## gpt 4.1 fine-tuned with extended dataset

### Data Preprocessing

In [None]:

def messages_v2(item, with_price=True):
    system_message = \
        "Role: You are a retail price estimator.\n" \
        "Market: United States; Currency: USD.\n" \
        "Scope: Predict the most likely new retail price. Ignore taxes, shipping, coupons, bundles, used/renewed.\n" \
        "Output: Only a number with two decimals (e.g., 129.99). No $ sign. No words.\n" \
        "Think silently; do not reveal reasoning."

    user_prompt = item.test_prompt().replace(" to the nearest dollar","").replace("\n\nPrice is $","")
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": str({
            "query":"price_estimate",
            "locale":"en_US",
            "currency":"USD",
            "category":item.category,
            "description":user_prompt,
            "brand":json.loads(item.details).get("Brand","Unknown")
            })
        },
        {"role": "assistant", "content": f"Price is ${item.price:.2f}" if with_price else "Price is $"}
    ]

In [None]:
# Show the v2 inference messages (no price) built for one test item
messages_v2(test[237], with_price=False)


In [None]:
def make_jsonl(items, message_builder=None):
    """Serialize `items` as JSON Lines for chat fine-tuning, using the v2 messages by default.

    This definition replaces the earlier make_jsonl for the rest of the notebook.

    Args:
        items: Iterable of items to convert.
        message_builder: Optional callable taking one item and returning its list
            of chat messages. Defaults to messages_v2, so existing one-argument
            calls behave as before.

    Returns:
        A string with one {"messages": [...]} JSON object per line and no
        trailing newline; the empty string when `items` is empty.
    """
    if message_builder is None:
        message_builder = messages_v2
    # join() avoids repeated string concatenation and the final strip()
    return "\n".join(
        json.dumps({"messages": message_builder(item)}) for item in items
    )

In [None]:
# Convert the items into jsonl and write them to a file

def write_jsonl(items, filename):
    """Write `items` to `filename` in the JSONL form produced by make_jsonl.

    Args:
        items: Items to serialize.
        filename: Path of the file to create or overwrite.
    """
    # Build the payload before opening the file, so a serialization error
    # does not leave an empty or truncated file behind.
    jsonl = make_jsonl(items)
    # Explicit encoding keeps the output independent of the platform default
    with open(filename, "w", encoding="utf-8") as f:
        f.write(jsonl)

In [None]:
# Write the v2-format training and validation examples to their own JSONL files
write_jsonl(fine_tune_train, "fine_tune_train_v2.jsonl")
write_jsonl(fine_tune_validation, "fine_tune_validation_v2.jsonl")

In [None]:
# Upload the v2 training file for fine-tuning
with open("fine_tune_train_v2.jsonl", "rb") as f:
    train_file_v2 = openai.files.create(file=f, purpose="fine-tune")

# Upload the v2 validation file for fine-tuning
with open("fine_tune_validation_v2.jsonl", "rb") as f:
    validation_file_v2 = openai.files.create(file=f, purpose="fine-tune")

### Fine tune the model

In [None]:
# Start a fine-tuning job for gpt-4.1 on the v2 files:
# a single epoch, fixed seed, metrics sent to Weights & Biases via wandb_integration.
# The returned job object is only displayed, not stored in a variable.
openai.fine_tuning.jobs.create(
    training_file=train_file_v2.id,
    validation_file=validation_file_v2.id,
    model="gpt-4.1-2025-04-14",
    seed=42,
    hyperparameters={"n_epochs": 1},
    integrations = [wandb_integration],
    suffix="pricer"
)

In [None]:
# Show a single entry from the fine-tuning job list (limit=1)
openai.fine_tuning.jobs.list(limit=1)

In [None]:
# Take the id of the first job in the listing. This rebinds job_id, which the
# earlier gpt-4o-mini section also assigned.
# NOTE(review): assumes the listing returns the gpt-4.1 job created above first - confirm.
job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id

In [None]:
# Fetch and display the current state of the fine-tuning job
openai.fine_tuning.jobs.retrieve(job_id)

In [None]:
# Show up to 10 events for the fine-tuning job (re-run to follow progress)
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data

### Inference

In [None]:
# Name of the model produced by this job, read from the job's fine_tuned_model field.
# This rebinds fine_tuned_model_name, so gpt_fine_tuned (defined earlier) will also
# use this model if it is called after this cell has run.
fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model

In [None]:
def gpt_41fine_tuned(item):
    """Estimate the price of `item` with the fine-tuned model, using the v2 prompt.

    The model is the one named by the global `fine_tuned_model_name` at call
    time. Uses messages_v2 (without the price) and a temperature of 1.0, and
    returns whatever get_price extracts from the first choice of the reply.
    """
    request = dict(
        model=fine_tuned_model_name,
        messages=messages_v2(item, with_price=False),
        seed=42,
        temperature=1.0,
        max_tokens=7,
    )
    completion = openai.chat.completions.create(**request)
    answer = completion.choices[0].message.content
    return get_price(answer)

In [None]:
# Evaluate the fine-tuned gpt-4.1 model on the test set
Tester.test(gpt_41fine_tuned, test)

###