In [1]:
import os
import re
import json
from dotenv import load_dotenv

from huggingface_hub import login
from openai import OpenAI
from pricer.items import Item
from pricer.evaluator import evaluate

In [2]:
# Switch between the "lite" and "full" datasets; read below when the dataset name is built.
LITE_MODE = False

# Load settings from a .env file.
# NOTE(review): override=True presumably lets .env values replace variables already set
# in the process environment — confirm against the python-dotenv documentation.
load_dotenv(override=True)
# Raises KeyError if HF_TOKEN is not defined in the environment.
hf_token = os.environ['HF_TOKEN']
# Log in to the Hugging Face Hub with that token, also requesting it be added as a git credential.
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# Choose the dataset repository: "<username>/items_lite" when LITE_MODE is set,
# otherwise "<username>/items_full".
username = "ed-donner"
dataset = f"{username}/items_lite" if LITE_MODE else f"{username}/items_full"

# Item.from_hub yields three collections, unpacked here as train / validation / test.
train, val, test = Item.from_hub(dataset)

# Report split sizes with thousands separators.
print(f"Loaded {len(train):,} training items, {len(val):,} validation items, {len(test):,} test items")

Loaded 800,000 training items, 10,000 validation items, 10,000 test items


In [4]:
# OpenAI API client used by every files / fine_tuning / chat call below.
# NOTE(review): constructed with no arguments, so credentials presumably come from the
# environment (e.g. a key loaded by load_dotenv above) — confirm.
openai = OpenAI()

In [5]:
# Fine-tune on a subset only: the first 20,000 training items and the first 50 validation items.
fine_tune_train = train[:20_000]
fine_tune_validation = val[:50]

In [6]:
len(fine_tune_train)

20000

In [7]:
def messages_for(item):
    """Build the two-message training conversation for one item.

    The user turn carries the pricing instruction followed by ``item.summary``;
    the assistant turn carries ``item.price`` rendered as dollars with two decimals.

    Returns:
        A list ``[user_message, assistant_message]`` of role/content dicts.
    """
    user_turn = {
        "role": "user",
        "content": f"Estimate the price of this product.  Respond with the price, no explanation\n\n{item.summary}",
    }
    assistant_turn = {"role": "assistant", "content": f"${item.price:.2f}"}
    return [user_turn, assistant_turn]

In [8]:
messages_for(fine_tune_train[0])

[{'role': 'user',
  'content': 'Estimate the price of this product.  Respond with the price, no explanation\n\nTitle: Schlage F59 & 613 Andover Interior Knob (Deadbolt Included)  \nCategory: Home Hardware  \nBrand: Schlage  \nDescription: A single‑piece oil‑rubbed bronze knob that mounts to a deadbolt for secure, easy interior door use.  \nDetails: Designed for a 4" minimum center‑to‑center door prep, it offers a lifetime mechanical and finish warranty and comes ready for quick installation.'},
 {'role': 'assistant', 'content': '$64.30'}]

In [9]:
# Convert items into a JSONL string: one JSON object per line.
# Each line holds one training conversation in the format:
# {"messages": [{"role": "user", "content": "Estimate the price..."}, {"role": "assistant", "content": "$64.30"}]}

def make_jsonl(items):
    """Serialize items into JSONL text for fine-tuning.

    Each item becomes one line of the form ``{"messages": [...]}``, where the
    list is whatever ``messages_for(item)`` returns.

    Args:
        items: Iterable of items accepted by ``messages_for``.

    Returns:
        The lines joined with ``"\\n"`` and no trailing newline; an empty
        string when ``items`` is empty.
    """
    # Serialize the whole record with json.dumps rather than splicing JSON text by hand,
    # and join once instead of growing a string with repeated concatenation.
    return "\n".join(
        json.dumps({"messages": messages_for(item)}) for item in items
    )

In [10]:
print(make_jsonl(train[:3]))

{"messages": [{"role": "user", "content": "Estimate the price of this product.  Respond with the price, no explanation\n\nTitle: Schlage F59 & 613 Andover Interior Knob (Deadbolt Included)  \nCategory: Home Hardware  \nBrand: Schlage  \nDescription: A single\u2011piece oil\u2011rubbed bronze knob that mounts to a deadbolt for secure, easy interior door use.  \nDetails: Designed for a 4\" minimum center\u2011to\u2011center door prep, it offers a lifetime mechanical and finish warranty and comes ready for quick installation."}, {"role": "assistant", "content": "$64.30"}]}
{"messages": [{"role": "user", "content": "Estimate the price of this product.  Respond with the price, no explanation\n\nTitle: Mini Electric Air Duster Fan  \nCategory: Electronics  \nBrand: Kica  \nDescription: Ultra\u2011compact 86,000\u202fRPM electric air duster with 11\u202fm/s wind speed for precise cleaning and inflation.  \nDetails: Powered by a 9.99\u202fWh motor, adjustable in four speed levels, it uses thre

In [11]:
def write_jsonl(items, filename):
    """Write items to ``filename`` as JSONL text produced by ``make_jsonl``.

    Args:
        items: Iterable of items to serialize.
        filename: Destination path; any missing parent directory is created.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    # Serialize before opening the file so a failure here cannot leave a
    # truncated or empty file behind.
    jsonl = make_jsonl(items)

    # Create the target directory on first use (e.g. "jsonl/" in a fresh checkout).
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Pin the encoding so the bytes written do not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(jsonl)

In [12]:
write_jsonl(fine_tune_train, "jsonl/fine_tune_train.jsonl")

In [13]:
write_jsonl(fine_tune_validation, "jsonl/fine_tune_validation.jsonl")

In [None]:
# Upload the local training JSONL file to OpenAI, tagged with purpose "fine-tune".
# The file is opened in binary mode; the returned object's id is what the fine-tuning job references.
with open("jsonl/fine_tune_train.jsonl", "rb") as f:
    train_file = openai.files.create(file=f, purpose="fine-tune")

In [15]:
train_file

FileObject(id='file-2ycB6XcF75NBQyrYYUuxHG', bytes=11064889, created_at=1772216300, filename='fine_tune_train.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)

In [16]:
# Upload the local validation JSONL file to OpenAI, tagged with purpose "fine-tune".
# The file is opened in binary mode; the returned object's id is what the fine-tuning job references.
with open("jsonl/fine_tune_validation.jsonl", "rb") as f:
    validation_file = openai.files.create(file=f, purpose="fine-tune")

In [17]:
validation_file

FileObject(id='file-Gpuh1GkgmgpkyZ9JFswBWm', bytes=27736, created_at=1772216499, filename='fine_tune_validation.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)

In [18]:
# Dashboard URL where the uploaded files can be inspected.
openai_file_location = "https://platform.openai.com/storage/files"

In [19]:
# Supervised fine-tuning
# Start a fine-tuning job on the uploaded training/validation files:
# one epoch, batch size 16, fixed seed, and "pricer" as the model-name suffix.
# The call's return value is not assigned; as the last expression it is displayed as the cell output.
openai.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=validation_file.id,
    model="gpt-4.1-nano-2025-04-14",
    seed=42,
    hyperparameters={'n_epochs': 1, "batch_size": 16},
    suffix="pricer"
)

FineTuningJob(id='ftjob-t1LNeEtegygzlkluYTibAaef', created_at=1772216923, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=16, learning_rate_multiplier='auto', n_epochs=1), model='gpt-4.1-nano-2025-04-14', object='fine_tuning.job', organization_id='org-2O8if7lKykThaapcM0yUh6Qu', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-2ycB6XcF75NBQyrYYUuxHG', validation_file='file-Gpuh1GkgmgpkyZ9JFswBWm', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=16, learning_rate_multiplier='auto', n_epochs=1))), user_provided_suffix='pricer', usage_metrics=None, shared_with_openai=False, eval_id=None, internal_worker_backend=None)

In [20]:
openai.fine_tuning.jobs.list(limit=1)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-t1LNeEtegygzlkluYTibAaef', created_at=1772216923, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=16, learning_rate_multiplier='auto', n_epochs=1), model='gpt-4.1-nano-2025-04-14', object='fine_tuning.job', organization_id='org-2O8if7lKykThaapcM0yUh6Qu', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-2ycB6XcF75NBQyrYYUuxHG', validation_file='file-Gpuh1GkgmgpkyZ9JFswBWm', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=16, learning_rate_multiplier='auto', n_epochs=1))), user_provided_suffix='pricer', usage_metrics=None, shared_with_openai=False, eval_id=None, internal_worker_backend=None)], has_more=False, object='list')

In [21]:
# Take the id of the first job returned when listing with limit=1.
# NOTE(review): this presumably is the most recently created job, i.e. the one started above;
# it would pick up a different job if another was created in between — capturing the
# return value of jobs.create would avoid that. Confirm listing order against the API docs.
job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id

In [22]:
openai.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-t1LNeEtegygzlkluYTibAaef', created_at=1772216923, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=16, learning_rate_multiplier=0.1, n_epochs=1), model='gpt-4.1-nano-2025-04-14', object='fine_tuning.job', organization_id='org-2O8if7lKykThaapcM0yUh6Qu', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-2ycB6XcF75NBQyrYYUuxHG', validation_file='file-Gpuh1GkgmgpkyZ9JFswBWm', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=16, learning_rate_multiplier=0.1, n_epochs=1))), user_provided_suffix='pricer', usage_metrics=None, shared_with_openai=False, eval_id=None, internal_worker_backend=None)

In [23]:
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data

[FineTuningJobEvent(id='ftevent-ZM7FRYoZosBSlq6wkklt8IZg', created_at=1772216923, level='info', message='Validating training file: file-2ycB6XcF75NBQyrYYUuxHG and validation file: file-Gpuh1GkgmgpkyZ9JFswBWm', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-j2EgBZrFfLLV7stEFBbzoX7N', created_at=1772216923, level='info', message='Created fine-tuning job: ftjob-t1LNeEtegygzlkluYTibAaef', object='fine_tuning.job.event', data={}, type='message')]

In [24]:
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data

[FineTuningJobEvent(id='ftevent-kbuU09Nn3B0MZg5vbHJutzHQ', created_at=1772217213, level='info', message='Fine-tuning job started', object='fine_tuning.job.event', data=None, type='message'),
 FineTuningJobEvent(id='ftevent-lr7bhqNgHUxCh5ggpM62JoV9', created_at=1772217202, level='info', message='Files validated, moving job to queued state', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-ZM7FRYoZosBSlq6wkklt8IZg', created_at=1772216923, level='info', message='Validating training file: file-2ycB6XcF75NBQyrYYUuxHG and validation file: file-Gpuh1GkgmgpkyZ9JFswBWm', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-j2EgBZrFfLLV7stEFBbzoX7N', created_at=1772216923, level='info', message='Created fine-tuning job: ftjob-t1LNeEtegygzlkluYTibAaef', object='fine_tuning.job.event', data={}, type='message')]

In [25]:
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data

[FineTuningJobEvent(id='ftevent-kbuU09Nn3B0MZg5vbHJutzHQ', created_at=1772217213, level='info', message='Fine-tuning job started', object='fine_tuning.job.event', data=None, type='message'),
 FineTuningJobEvent(id='ftevent-lr7bhqNgHUxCh5ggpM62JoV9', created_at=1772217202, level='info', message='Files validated, moving job to queued state', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-ZM7FRYoZosBSlq6wkklt8IZg', created_at=1772216923, level='info', message='Validating training file: file-2ycB6XcF75NBQyrYYUuxHG and validation file: file-Gpuh1GkgmgpkyZ9JFswBWm', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-j2EgBZrFfLLV7stEFBbzoX7N', created_at=1772216923, level='info', message='Created fine-tuning job: ftjob-t1LNeEtegygzlkluYTibAaef', object='fine_tuning.job.event', data={}, type='message')]

In [26]:
# Dashboard URL where fine-tuning runs can be monitored.
openai_fine_tuning_run_location = "https://platform.openai.com/finetune/"

In [27]:
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data

[FineTuningJobEvent(id='ftevent-X7ZoTQWwRpejyEzP6wTBJ89S', created_at=1772219734, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-JdLT3w1vfAPxUWwbM2HhiCQw', created_at=1772219729, level='info', message='Usage policy evaluations completed, model is now enabled for sampling', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-AvDWtU08J7CRT8G3OUjYeFns', created_at=1772219729, level='info', message='Moderation checks for snapshot ft:gpt-4.1-nano-2025-04-14:personal:pricer:DDxPPQf5 passed.', object='fine_tuning.job.event', data={'blocked': False, 'results': [{'flagged': False, 'category': 'harassment/threatening', 'enforcement': 'blocking'}, {'flagged': False, 'category': 'sexual', 'enforcement': 'blocking'}, {'flagged': False, 'category': 'sexual/minors', 'enforcement': 'blocking'}, {'flagged': False, 'category': 'propaganda', 'enforcement': 'blocking

In [28]:
# Re-fetch the job and read the name of the resulting model.
# The job's fine_tuned_model was None in the retrieve output captured while the job was still
# validating files, so run this only after the events show the job has completed.
fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model

In [29]:
fine_tuned_model_name

'ft:gpt-4.1-nano-2025-04-14:personal:pricer:DDxPPQf5'

In [35]:
def test_message_for(item):
    message = f"Estimate the price of this product.  Respond with the price, no explanation\n\n{item.summary}"
    return [
        {"role": "user", "content": message},
    ]

In [36]:
test_message_for(test[0])

[{'role': 'user',
  'content': 'Estimate the price of this product.  Respond with the price, no explanation\n\nTitle: Excess V2 Distortion/Modulation Pedal  \nCategory: Music Pedals  \nBrand: Old Blood Noise  \nDescription: A versatile pedal offering distortion and three modulation modes—delay, chorus, and harmonized fifths—with full control over signal routing and expression.  \nDetails: Features include separate gain, tone, and volume controls; time, depth, and volume per modulation; order switching, soft‑touch bypass, and expression jack for dynamic control.'}]

In [37]:
# Inference

def gpt_4__1_nano_fine_tuned(item):
    """Ask the fine-tuned model to price one item and return its raw text reply.

    Sends the prompt from ``test_message_for(item)`` to the model named by
    ``fine_tuned_model_name``, with a fixed seed and a 7-token cap on the reply.

    Returns:
        The content of the first choice's message, unparsed (e.g. ``"$189.95"``).
    """
    completion = openai.chat.completions.create(
        model=fine_tuned_model_name,
        messages=test_message_for(item),
        seed=42,
        max_tokens=7,
    )
    reply = completion.choices[0].message
    return reply.content

In [39]:
# Spot check on the first test item: true price first, then the model's raw reply.
print(test[0].price)
print(gpt_4__1_nano_fine_tuned(test[0]))

219.0
$189.95


In [40]:
# Score the fine-tuned predictor with the project's evaluator over the test split.
# NOTE(review): the progress bar in the captured output counts to 200, so evaluate presumably
# samples a subset of the test items rather than all of them — confirm in pricer.evaluator.
evaluate(gpt_4__1_nano_fine_tuned, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[92m$29 [91m$220 [92m$10 [93m$45 [91m$142 [92m$16 [92m$3 [93m$82 [92m$14 [91m$406 [91m$463 [93m$120 [92m$13 [92m$11 [92m$25 [92m$18 [92m$10 [92m$15 [92m$26 [93m$49 [92m$3 [93m$49 [92m$7 [91m$365 [91m$159 [91m$216 [91m$245 [92m$0 [91m$210 [93m$60 [92m$8 [92m$15 [93m$41 [93m$63 [93m$63 [92m$87 [91m$203 [92m$23 [93m$84 [92m$19 [91m$184 [93m$59 [92m$14 [92m$28 [91m$115 [92m$3 [93m$48 [92m$3 [93m$61 [92m$34 [92m$4 [91m$99 [91m$323 [92m$21 [93m$64 [91m$117 [92m$13 [93m$80 [92m$4 [92m$9 [93m$50 [93m$45 [93m$63 [93m$56 [91m$629 [92m$20 [91m$102 [91m$305 [92m$24 [93m$64 [92m$13 [92m$23 [93m$105 [92m$13 [92m$10 [92m$7 [92m$25 [92m$3 [92m$7 [92m$0 [92m$10 [92m$19 [92m$35 [92m$33 [92m$5 [92m$0 [91m$209 [91m$185 [92m$30 [93m$59 [92m$38 [91m$118 [92m$7 [92m$5 [92m$13 [92m$42 [92m$4 [92m$33 [93m$66 [91m$402 [92m$25 [92m$2 [92m$8 [92m$9 [93m$66 [93m$72 [92m$8 [91m$346 [92m$15 [92m$16 [92m$32 