# Fine Tuning with OpenAI

## Utilities

In [None]:
# ANSI escape sequences used to print colored text to stdout

RESET = "\033[0m"
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"

# Color names mapped to their escape sequences; "orange" is shown using yellow
COLOR_MAP = dict(red=RED, orange=YELLOW, green=GREEN)

### Item

In [None]:
from typing import Optional
from transformers import AutoTokenizer
import re

# Model id whose tokenizer Item loads to count and truncate tokens
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"

MIN_TOKENS = 150 # Any less than this, and we don't have enough useful content
MAX_TOKENS = 160 # Truncate after this many tokens. Then after adding in prompt text, we will get to around 180 tokens

# Item.parse skips datapoints whose combined text is not longer than this many characters
MIN_CHARS = 300
# Item.parse cuts the combined text to this many characters before tokenizing it
CEILING_CHARS = MAX_TOKENS * 7

class Item:
    """
    An Item is a cleaned, curated datapoint of a Product with a Price

    The datapoint is parsed as soon as the Item is constructed. When the cleaned
    text is long enough, `prompt` and `token_count` are filled in and `include`
    is set to True; otherwise `prompt` stays None and `include` stays False.
    """

    # Loaded once, when the class body is executed, and shared by every instance
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    PREFIX = "Price is $"
    QUESTION = "How much does this cost to the nearest dollar?"
    # Fragments stripped from the details text by scrub_details
    REMOVALS = ['"Batteries Included?": "No"', '"Batteries Included?": "Yes"', '"Batteries Required?": "No"', '"Batteries Required?": "Yes"', "By Manufacturer", "Item", "Date First", "Package", ":", "Number of", "Best Sellers", "Number", "Product "]

    title: str
    price: float
    category: str
    token_count: int = 0
    details: Optional[str]
    prompt: Optional[str] = None
    include = False

    def __init__(self, data, price):
        """
        Build an Item from a raw datapoint and its price.

        data: mapping read for the keys 'title', 'description', 'features' and 'details'
        price: the price recorded for this product
        """
        self.title = data['title']
        self.price = price
        self.parse(data)

    def scrub_details(self):
        """
        Clean up the details string by removing common text that doesn't add value
        """
        details = self.details
        for remove in self.REMOVALS:
            details = details.replace(remove, "")
        return details

    def scrub(self, stuff):
        """
        Clean up the provided text by removing unnecessary characters and whitespace
        Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers
        """
        # Collapse runs of brackets, quotes, colons and whitespace into a single space
        stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
        stuff = stuff.replace(" ,", ",").replace(",,,",",").replace(",,",",")
        words = stuff.split(' ')
        select = [word for word in words if len(word)<7 or not any(char.isdigit() for char in word)]
        return " ".join(select)

    def parse(self, data):
        """
        Parse this datapoint and if it fits within the allowed Token range,
        then set include to True
        """
        # Combine description, features and scrubbed details, one section per line
        contents = '\n'.join(data['description'])
        if contents:
            contents += '\n'
        features = '\n'.join(data['features'])
        if features:
            contents += features + '\n'
        self.details = data['details']
        if self.details:
            contents += self.scrub_details() + '\n'
        if len(contents) > MIN_CHARS:
            # Limit the amount of text handed to the tokenizer
            contents = contents[:CEILING_CHARS]
            text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
            tokens = self.tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) > MIN_TOKENS:
                # Truncate to MAX_TOKENS and turn the kept tokens back into text
                tokens = tokens[:MAX_TOKENS]
                text = self.tokenizer.decode(tokens)
                self.make_prompt(text)
                self.include = True

    def make_prompt(self, text):
        """
        Set the prompt instance variable to be a prompt appropriate for training

        The prompt is the question, the product text and the price rounded to
        the nearest whole number followed by ".00". token_count is set to the
        number of tokens in the finished prompt.
        """
        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
        self.prompt += f"{self.PREFIX}{str(round(self.price))}.00"
        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

    def test_prompt(self):
        """
        Return a prompt suitable for testing, with the actual price removed

        The prompt is cut right after the LAST occurrence of PREFIX, which is
        the one make_prompt appends. Cutting at the first occurrence would
        truncate the product text whenever the text itself contains PREFIX.
        Only meaningful once make_prompt has set the prompt.
        """
        head, prefix, _ = self.prompt.rpartition(self.PREFIX)
        if not prefix:
            # PREFIX not present at all: append it to the whole prompt
            return self.prompt + self.PREFIX
        return head + prefix

    def __repr__(self):
        """
        Return a String version of this Item
        """
        return f"<{self.title} = ${self.price}>"


### Tester

In [None]:
class Tester:
    """
    Evaluate a price predictor on the first `size` datapoints of a dataset.

    For each datapoint the guess, true price, absolute error, squared log error
    and a color rating are recorded and printed. `report` then summarizes the
    results and draws a scatter chart of guesses against true prices.
    Relies on the names `math`, `plt`, `COLOR_MAP` and `RESET` being in scope.
    """

    def __init__(self, predictor, data, title=None, size=250):
        """
        predictor: callable that takes a datapoint and returns a numeric guess
        data: indexable, sized collection of datapoints with `price` and `title`
        title: report title; defaults to the predictor's name in title case
        size: maximum number of datapoints to evaluate
        """
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        # Cap at the available data so run() never indexes past the end
        self.size = min(size, len(data))
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Rate an absolute error as "green", "orange" or "red" using absolute and relative limits."""
        if error<40 or error/truth < 0.2:
            return "green"
        elif error<80 or error/truth < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Predict datapoint `i`, record its metrics and print a colored one-line summary."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint)
        truth = datapoint.price
        error = abs(guess - truth)
        # Squared difference of log(1 + value); averaged and rooted in report()
        log_error = math.log(truth+1) - math.log(guess+1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        # Keep the printed line short by trimming long titles
        title = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40]+"..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter the guesses against the true prices, with a diagonal marking perfect guesses."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Compute average error, RMSLE and the share of "green" results, then chart them."""
        if self.size == 0:
            # Nothing was evaluated; the averages below would divide by zero
            print(f"{self.title}: no datapoints to evaluate")
            return
        average_error = sum(self.errors) / self.size
        rmsle = math.sqrt(sum(self.sles) / self.size)
        hits = sum(1 for color in self.colors if color=="green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate datapoints 0..size-1 in order, then produce the report."""
        self.error = 0
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Convenience entry point: build a Tester with default settings and run it."""
        cls(function, data).run()

In [None]:
# A utility function to extract the price from a string

def get_price(s):
    """
    Return the first number found in `s` as a float, or 0 when there is none.

    Dollar signs and commas are dropped before searching, so "$1,234.50"
    is read as 1234.5.
    """
    cleaned = s.translate(str.maketrans('', '', '$,'))
    found = re.search(r'[-+]?\d*\.?\d+', cleaned)
    if found is None:
        return 0
    return float(found.group())

## Data Curation

In [None]:
%pip install --upgrade --quiet  jupyterlab ipython ipywidgets huggingface_hub datasets transformers

%matplotlib notebook


### Load Dataset from Hugging Face

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer


dataset = load_dataset('ranskills/Amazon-Reviews-2023-raw_meta_All_Beauty', split='full')

In [None]:
from IPython.display import display, JSON


# Show how many rows were loaded and the dataset's feature schema
print(f'Number of datapoints: {dataset.num_rows:,}')
display(JSON(dataset.features.to_dict()))

In [None]:
def non_zero_price_filter(datapoint: dict) -> bool:
    """
    Return True when the datapoint's 'price' parses as a number greater than zero.

    A missing, None or non-numeric price counts as "no price" and yields False.
    """
    try:
        price = float(datapoint['price'])
    except (KeyError, TypeError, ValueError):
        # Missing key, None, or text that is not a number
        return False
    return price > 0

# Keep only the datapoints whose price parses to a number above zero
filtered_dataset = dataset.filter(non_zero_price_filter)

# Report how many rows survived, and their share of the full dataset
print(f'Prices with non-zero prices:{filtered_dataset.num_rows:,}  = {filtered_dataset.num_rows / dataset.num_rows * 100:,.2f}%')

In [None]:
from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt


data = defaultdict(lambda: [])
for datapoint in filtered_dataset:
    price = float(datapoint['price'])
    contents = datapoint["title"] + str(datapoint["description"]) + str(datapoint["features"]) + str(datapoint["details"])

    data['price'].append(price)
    data['characters'].append(len(contents))

%matplotlib inline

df = pd.DataFrame(data)

combined_describe = pd.concat(
    [df['price'].describe(), df['characters'].describe()],
    axis=1
)

display(combined_describe)

prices = data['price']
lengths = data['characters']

plt.figure(figsize=(15, 6))
plt.title(f"Prices: Avg {df['price'].mean():,.2f} and highest {df['price'].max():,}\n")
plt.xlabel('Length (chars)')
plt.ylabel('Count')
plt.hist(prices, rwidth=0.7, color="orange", bins=range(0, 300, 10))
plt.show()

plt.figure(figsize=(15, 6))
plt.title(f"Characters: Avg {sum(lengths)/len(lengths):,.0f} and highest {max(lengths):,}\n")
plt.xlabel('Length (characters)')
plt.ylabel('Count')
plt.hist(lengths, rwidth=0.7, color="lightblue", bins=range(0, 2500, 50))
plt.show()

In [None]:
# Same model id as the BASE_MODEL defined alongside Item
BASE_MODEL = 'meta-llama/Meta-Llama-3.1-8B'
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Tokenize a sample number; the result is not stored or used below
tokenizer.encode('114', add_special_tokens=False)

# Wrap every priced datapoint in an Item; Item.parse decides whether it gets a prompt
items = []
for datapoint in filtered_dataset:
    price = float(datapoint['price'])
    items.append(Item(datapoint, price))


In [None]:
print(items[0].test_prompt())

In [None]:
# Drop items that never received a prompt (their content was below the minimum threshold)
valid_items = list(filter(lambda candidate: candidate.prompt is not None, items))

data_size = len(valid_items)


# The first 90% of the valid items, in their existing order, form the training set
training_size = int(data_size * 0.9)
train, test = valid_items[:training_size], valid_items[training_size:]


In [None]:
# Training rows keep the full prompt (price included); test rows use the prompt with the price removed
train_prompts = [item.prompt for item in train]
train_prices = [item.price for item in train]
test_prompts = [item.test_prompt() for item in test]
test_prices = [item.price for item in test]

In [None]:
# Create a Dataset from the lists

train_dataset = Dataset.from_dict({"text": train_prompts, "price": train_prices})
test_dataset = Dataset.from_dict({"text": test_prompts, "price": test_prices})
# NOTE(review): this rebinds `dataset`, which until here held the raw dataset
# returned by load_dataset; earlier cells that read `dataset` will see the
# DatasetDict if they are re-run after this one.
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

### Export Data

In [None]:
import pickle
from pathlib import Path

DATA_DIR = 'data'

# Nothing in the cells above creates DATA_DIR, so make sure it exists before
# opening files inside it (no error if it is already there).
Path(DATA_DIR).mkdir(parents=True, exist_ok=True)


def train_storage_file(ext):
    """Return the path of the training export for the given extension, e.g. '.pkl'."""
    return f'{DATA_DIR}/all_beauty_train{ext}'


def test_storage_file(ext):
    """Return the path of the test export for the given extension, e.g. '.pkl'."""
    return f'{DATA_DIR}/all_beauty_test{ext}'


# Persist the Item lists themselves so they can be reloaded without re-parsing
with open(train_storage_file('.pkl'), 'wb') as file:
    pickle.dump(train, file)

with open(test_storage_file('.pkl'), 'wb') as file:
    pickle.dump(test, file)

In [None]:
# Write both splits of the DatasetDict as Parquet files in the same directory as the pickles
dataset['train'].to_parquet(train_storage_file('.parquet'))
dataset['test'].to_parquet(test_storage_file('.parquet'))

# How to load back the data
# loaded_dataset = load_dataset("parquet", data_files=train_storage_file('.parquet'))

### Predictions

#### Random Pricer

In [None]:
import random
import math


def random_pricer(item):
    """Baseline predictor: ignore the item and return a random integer from 1 to 199."""
    return random.randrange(1,200)

# Seed the generator so the random guesses are reproducible
random.seed(42)

# Run the Tester on the test set
Tester.test(random_pricer, test)

#### Constant Pricer

In [None]:
# Mean price over the training items, used as the guess for every item
training_prices = [item.price for item in train]
training_average = sum(training_prices) / len(training_prices)

def constant_pricer(item):
    """Baseline predictor: ignore the item and return the training-set average price."""
    return training_average

Tester.test(constant_pricer, test)

In [None]:
train[0].details

### Prepare Fine-Tuning Data

In [None]:
# The first 100 training items are used for fine-tuning, the next 25 for validation
fine_tune_train = train[:100]
fine_tune_validation = train[100:125]

In [None]:
def messages_for(item):
    """
    Build the chat messages used as one fine-tuning example for `item`.

    The user message is the item's test prompt with " to the nearest dollar"
    and the trailing "Price is $" marker removed; the assistant message states
    the item's price with two decimals.
    """
    question = item.test_prompt()
    for fragment in (" to the nearest dollar", "\n\nPrice is $"):
        question = question.replace(fragment, "")
    answer = f"Price is ${item.price:.2f}"
    conversation = (
        ("system", "You estimate prices of items. Reply only with the price, no explanation"),
        ("user", question),
        ("assistant", answer),
    )
    return [{"role": role, "content": content} for role, content in conversation]

messages_for(train[0])

In [None]:
import json
from pathlib import Path
DATA_DIR = 'data'

data_path = Path(DATA_DIR)

def make_jsonl(items):
    """
    Return the fine-tuning examples for `items` as JSON Lines text.

    Each line is a JSON object of the form {"messages": [...]} built from
    messages_for(item). Lines are separated by a newline and there is no
    trailing newline; an empty `items` gives an empty string.
    """
    # Serialize each example as a whole object and join once, rather than
    # concatenating hand-built JSON fragments inside the loop
    return "\n".join(
        json.dumps({"messages": messages_for(item)}) for item in items
    )

# print(make_jsonl(train[:3]))
# The absolute path is computed but not stored or used
data_path.absolute()
# Create the data directory (and any missing parents) the first time through
if not data_path.exists():
    data_path.mkdir(parents=True)



# Locations of the JSONL files written for fine-tuning and validation
train_jsonl_path = f'{data_path}/pricer_train.jsonl'
validation_jsonl_path = f'{data_path}/pricer_validation.jsonl'

In [None]:
def write_jsonl(items, filename):
    """
    Write the JSON Lines produced by make_jsonl(items) to `filename`.

    An existing file is overwritten.
    """
    # Build the payload before opening the file, so a failure while formatting
    # does not leave a truncated file behind
    jsonl = make_jsonl(items)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(jsonl)

In [None]:

write_jsonl(fine_tune_train, train_jsonl_path)

In [None]:
write_jsonl(fine_tune_validation, validation_jsonl_path)

## Training

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from google.colab import userdata

# Load environment variables via python-dotenv
load_dotenv()
# Copy the API key from Colab's userdata store into the process environment
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

# NOTE(review): the client is built without arguments; presumably it reads
# OPENAI_API_KEY from the environment — confirm. The name `openai` here is the
# client instance, not the module.
openai = OpenAI()

In [None]:
# Upload the training JSONL (opened in binary mode) with purpose 'fine-tune'
with open(train_jsonl_path, 'rb') as f:
    train_file = openai.files.create(file=f, purpose='fine-tune')

In [None]:
train_file

In [None]:
# Upload the validation JSONL (opened in binary mode) with purpose 'fine-tune'
with open(validation_jsonl_path, 'rb') as f:
    validation_file = openai.files.create(file=f, purpose='fine-tune')

validation_file

In [None]:
wandb_integration = {"type": "wandb", "wandb": {"project": "gpt-pricer"}}

In [None]:
# Start a fine-tuning job on the uploaded files: one epoch, fixed seed,
# with the Weights & Biases integration and the model suffix "pricer"
openai.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=validation_file.id,
    model="gpt-4o-mini-2024-07-18",
    seed=42,
    hyperparameters={"n_epochs": 1},
    integrations = [wandb_integration],
    suffix="pricer"
)

In [None]:
openai.fine_tuning.jobs.list(limit=1)

In [None]:
# Take the id of the first job returned by the listing
# NOTE(review): presumably the most recently created job — confirm the listing order
job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id
job_id

In [None]:
openai.fine_tuning.jobs.retrieve(job_id)

In [None]:
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data

In [None]:
import wandb
from wandb.integration.openai.fine_tuning import WandbLogger


# Log in to Weights & Biases before syncing
wandb.login()
# Sync the fine-tuning job with Weights & Biases.
WandbLogger.sync(fine_tune_job_id=job_id, project="gpt-pricer")

In [None]:
# Name of the model produced by the fine-tuning job
# NOTE(review): presumably unset until the job has finished — confirm before using it
fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model
fine_tuned_model_name

In [None]:
def messages_for(item):
    """
    Build the chat messages used to ask the model for the price of `item`.

    The user message is the item's test prompt with " to the nearest dollar"
    and the trailing "Price is $" marker removed. The assistant message holds
    only "Price is $", with no price after it.
    Defining this function replaces any `messages_for` defined earlier in the session.
    """
    question = item.test_prompt()
    for fragment in (" to the nearest dollar", "\n\nPrice is $"):
        question = question.replace(fragment, "")
    conversation = (
        ("system", "You estimate prices of items. Reply only with the price, no explanation"),
        ("user", question),
        ("assistant", "Price is $"),
    )
    return [{"role": role, "content": content} for role, content in conversation]

In [None]:
# Predictor that asks the fine-tuned model for a price

def gpt_fine_tuned(item):
    """
    Ask the fine-tuned model to price `item` and return the number in its reply.

    The request uses a fixed seed and allows at most 7 reply tokens; the reply
    text is converted to a number by get_price.
    """
    request = {
        "model": fine_tuned_model_name,
        "messages": messages_for(item),
        "seed": 42,
        "max_tokens": 7,
    }
    completion = openai.chat.completions.create(**request)
    return get_price(completion.choices[0].message.content)

In [None]:
print(test[0].price)
print(gpt_fine_tuned(test[0]))

In [None]:
Tester.test(gpt_fine_tuned, test)