# The Product Pricer

A model that can estimate how much something costs from its description.


In [None]:
! uv -q pip install langchain-ollama

In [None]:
# imports

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
import matplotlib.pyplot as plt
import pickle
import re
from langchain_ollama import OllamaLLM
from openai import OpenAI
from testing import Tester
import json


In [None]:
# Load variables from a local .env file; override=True lets values from .env
# replace any that are already set in the process environment.
load_dotenv(override=True)
hf_token = os.getenv("HF_TOKEN")  # None when HF_TOKEN is not defined
# Log in to the Hugging Face Hub; add_to_git_credential=True also hands the
# token to the git credential helper.
login(hf_token, add_to_git_credential=True)

In [None]:
from items import Item
from loaders import ItemLoader

%matplotlib inline

In [None]:
# Names of the datasets to load; each one is handed to ItemLoader
dataset_names = ["Appliances"]

In [None]:
# Load every configured dataset and pool the results into one flat list
items = []
for dataset_name in dataset_names:
    loader = ItemLoader(dataset_name)
    items += loader.load()

In [None]:
# Report how many items were loaded in total (with thousands separators)
print(f"A grand total of {len(items):,} items")

In [None]:
# Histogram of per-item token counts, with bin edges every 10 tokens from 0 to 290

tokens = [item.token_count for item in items]
plt.figure(figsize=(15, 6))
plt.hist(tokens, bins=range(0, 300, 10), rwidth=0.7, color="skyblue")
plt.title(f"Token counts: Avg {sum(tokens)/len(tokens):,.1f} and highest {max(tokens):,}\n")
plt.xlabel('Length (tokens)')
plt.ylabel('Count')
plt.show()

In [None]:
# Histogram of item prices, with bin edges every $10 from 0 to 990

prices = [item.price for item in items]
plt.figure(figsize=(15, 6))
plt.hist(prices, bins=range(0, 1000, 10), rwidth=0.7, color="blueviolet")
plt.title(f"Prices: Avg {sum(prices)/len(prices):,.1f} and highest {max(prices):,}\n")
plt.xlabel('Price ($)')
plt.ylabel('Count')
plt.show()

In [None]:
# Scatter prompt length (in characters) against price to look for an obvious trend

sample = items  # the same list object as `items`, not a copy

sizes = [len(item.prompt) for item in sample]
prices = [item.price for item in sample]

# Tiny markers (s=0.2) keep the plot readable when there are many points
plt.figure(figsize=(15, 8))
plt.scatter(sizes, prices, s=0.2, color="red")

# Title and axis labels
plt.title('Is there a simple correlation?')
plt.xlabel('Size')
plt.ylabel('Price')

plt.show()

In [None]:
def report(item):
    """Print an item's prompt, then its last ten token ids and their decoded text."""
    tail = Item.tokenizer.encode(item.prompt)[-10:]
    print(item.prompt)
    print(tail)
    print(Item.tokenizer.batch_decode(tail))

In [None]:
# Spot-check one item (index 50): its prompt and how the end of it tokenizes
report(sample[50])

In [None]:
import random


# Fixed seed so the shuffle, and therefore the split, is reproducible across runs
random.seed(42)
# Shuffles in place; `sample` is bound to the same list as `items`, so `items` is reordered too
random.shuffle(sample)
# Slicing never raises: with fewer than 27,000 items these sets are silently smaller (or empty)
train = sample[:25_000]
test = sample[25_000:27_000]
print(f"Divided into a training set of {len(train):,} items and test set of {len(test):,} items")

In [None]:
# Show the prompt of the first training item
print(train[0].prompt)

In [None]:
# Show the prompt produced by test_prompt() for the first test item
print(test[0].test_prompt())

In [None]:
# Histogram of prices for the first 250 test items, with bin edges every $10 from 0 to 990

prices = [float(item.price) for item in test[:250]]
plt.figure(figsize=(15, 6))
plt.hist(prices, bins=range(0, 1000, 10), rwidth=0.7, color="darkblue")
plt.title(f"Avg {sum(prices)/len(prices):.2f} and highest {max(prices):,.2f}\n")
plt.xlabel('Price ($)')
plt.ylabel('Count')
plt.show()

In [None]:
# Test prices strictly above 99.999, converted to float; no upper bound is applied here
filtered_prices = [float(item.price) for item in test if item.price > 99.999]

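### Confirm that the tokenizer tokenizes all 3 digit prices into 1 token (the check below encodes each price as a float string such as `123.0`, so it asserts 3 tokens for the whole string)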

In [None]:
# Each price is a float, so f"{price}" renders with a decimal part (e.g. "123.0");
# the assertion checks that the whole string encodes to exactly 3 tokens.
# NOTE: this rebinds the notebook-level name `tokens` used by the token-count histogram cell.
for price in filtered_prices:
    tokens = Item.tokenizer.encode(f"{price}", add_special_tokens=False)
    assert len(tokens) == 3


## Helpers

In [None]:
def messages_for(item):
    """Build the chat messages used to ask a model for an item's price.

    The user turn is the item's test prompt with every occurrence of the
    " to the nearest dollar" phrase and of the "Price is $" cue (together with
    the two newlines before it) removed. The cue is supplied instead as the
    content of the assistant turn.
    """
    question = item.test_prompt()
    for fragment in (" to the nearest dollar", "\n\nPrice is $"):
        question = question.replace(fragment, "")

    system_turn = {
        "role": "system",
        "content": "You estimate prices of items. Reply only with the price, no explanation",
    }
    user_turn = {"role": "user", "content": question}
    assistant_turn = {"role": "assistant", "content": "Price is $"}
    return [system_turn, user_turn, assistant_turn]

In [None]:
# A utility function to extract the price from a string

def get_price(s):
    """Return the first number found in `s` as a float, or 0.0 if there is none.

    Dollar signs and commas are removed before searching, so "$1,299.99"
    yields 1299.99. A leading sign is only honoured on numbers that contain
    a decimal point ("-5.5" gives -5.5, while "-5" gives 5.0).
    """
    cleaned = s.replace('$', '').replace(',', '')
    match = re.search(r"[-+]?\d*\.\d+|\d+", cleaned)
    # Always return a float so callers never get a mix of int and float
    return float(match.group()) if match else 0.0

In [None]:
# Convert the items into a list of json objects - a "jsonl" string
# Each row represents one conversation in the form:
# {"messages" : [{"role": "system", "content": "You estimate prices...


def make_jsonl(items):
    """Serialize items as JSON Lines, one {"messages": [...]} object per item.

    Lines are separated by a single newline with no trailing newline; an
    empty `items` produces an empty string.
    """
    # join avoids rebuilding the growing string on every iteration, and
    # json.dumps on the whole record avoids assembling JSON by hand
    return "\n".join(
        json.dumps({"messages": messages_for(item)}) for item in items
    )

In [None]:
# Convert the items into jsonl and write them to a file

def write_jsonl(items, filename):
    """Write the JSON Lines form of `items` to `filename`, replacing any existing file."""
    jsonl = make_jsonl(items)
    # Explicit encoding so the file does not depend on the platform default
    with open(filename, "w", encoding="utf-8") as f:
        f.write(jsonl)

## Load data

In [None]:
# Load the previously pickled train/test collections from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# load files that you created yourself or otherwise trust.
with open('train_lite.pkl', 'rb') as f:
    train_lite = pickle.load(f)

with open('test_lite.pkl', 'rb') as f:
    test_lite = pickle.load(f)


In [None]:
# Preview the chat messages built for the first test item
messages_for(test_lite[0])

In [None]:
# Sanity check of the price parser: this should evaluate to 99.99
get_price("The price is roughly $99.99 because blah blah")

## Models

In [None]:
# Model names passed to Ollama for the three models being compared
MODEL_LLAMA3_2 = "llama3.2"
MODEL_MISTRAL = "mistral"
MODEL_TINY_LLAMA = "tinyllama"

# One LangChain Ollama client per model.
# NOTE(review): presumably needs a reachable Ollama server with these models already pulled - confirm.
llm3_2 = OllamaLLM(model=MODEL_LLAMA3_2)
llmMistral = OllamaLLM(model=MODEL_MISTRAL)
llmTinyLlama = OllamaLLM(model=MODEL_TINY_LLAMA)


## Model Tests

In [None]:
def _estimate_price(llm, item):
    """Send the item's chat messages to `llm` and parse a price from its reply."""
    response = llm.invoke(messages_for(item))
    return get_price(response)


def llama3_2_model(item):
    """Return the price estimated for `item` by the llama3.2 client."""
    return _estimate_price(llm3_2, item)


def mistral_model(item):
    """Return the price estimated for `item` by the mistral client."""
    return _estimate_price(llmMistral, item)


def tinyllama_model(item):
    """Return the price estimated for `item` by the tinyllama client."""
    return _estimate_price(llmTinyLlama, item)


In [None]:
# Price stored on the first test item, for comparison with the model estimates
test_lite[0].price

In [None]:
# Run the project's Tester over test_lite using the llama3.2 predictor
Tester.test(llama3_2_model, test_lite)

In [None]:
# Run the project's Tester over test_lite using the mistral predictor
Tester.test(mistral_model, test_lite)

In [None]:
# Run the project's Tester over test_lite using the tinyllama predictor
Tester.test(tinyllama_model, test_lite)