# Fine Tuning

In [None]:
!pip install gensim
!pip install --upgrade datasets==3.6.0

In [None]:
import os
import math
import random
import json
import pickle
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from openai import OpenAI
from datetime import datetime
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from huggingface_hub import login
from sklearn.svm import LinearSVR
from gensim.models import Word2Vec
from IPython.display import display
from transformers import AutoTokenizer
from gensim.utils import simple_preprocess
from collections import Counter, defaultdict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from concurrent.futures import ProcessPoolExecutor
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load variables from a local .env file; override=True lets the file win over
# values already present in the process environment.
load_dotenv(override=True)
openai_key = os.environ.get("OPENAI_API_KEY")

#anthropic_key = os.environ.get("ANTHROPIC_API_KEY")

hf_token = os.environ.get("HF_TOKEN")
# Never echo the secret itself: notebook output is stored with the file and is
# easily shared or committed. Report only whether a token was found.
print(f"HF_TOKEN {'found' if hf_token else 'not set'}")

if hf_token:
    print("Logging in...")
    login(hf_token, add_to_git_credential=True)

In [None]:
# ANSI escape sequences used to colour console output; RESET restores the default.
GREEN, YELLOW, RED, RESET = "\033[92m", "\033[93m", "\033[91m", "\033[0m"

# Maps a result colour name to its escape sequence ("orange" is shown as yellow).
COLOR_MAP = dict(red=RED, orange=YELLOW, green=GREEN)

In [None]:
# Hugging Face model id; Item loads its tokenizer from this checkpoint.
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"

# Item.parse() only keeps a datapoint whose combined text is longer than this
# many characters...
MIN_CHARS = 300
# ...and whose scrubbed text encodes to more than this many tokens.
MIN_TOKENS = 150
# Kept text is truncated to at most this many tokens before the prompt is built.
MAX_TOKENS = 160
# Character cap applied to the raw text before tokenizing (7 characters per
# allowed token).
CEILING_CHARS = MAX_TOKENS * 7

class Item:
    """A product datapoint with its price and, when usable, a training prompt.

    Construction immediately runs parse(). Only datapoints whose text passes
    the MIN_CHARS and MIN_TOKENS thresholds receive a prompt and have
    ``include`` set to True; all others keep ``prompt`` as None.
    """

    # Loaded once, when the class body executes, and shared by every instance.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    # Text that introduces the price at the end of every prompt.
    PREFIX = "Price is $"
    QUESTION = "How much does this cost to the nearest dollar?"
    # Substrings stripped from the raw details text by scrub_details().
    REMOVALS = ['"Batteries Included?": "No"', '"Batteries Included?": "Yes"', '"Batteries Required?": "No"', '"Batteries Required?": "Yes"', "By Manufacturer", "Item", "Date First", "Package", ":", "Number of", "Best Sellers", "Number", "Product "]

    def __init__(self, data, price):
        """Build an item from a raw datapoint mapping and its numeric price.

        ``data`` must contain "title"; "category", "description", "features"
        and "details" are read when present.
        """
        self.title = data["title"]
        self.price = price
        self.category = data.get("category", "Unknown")
        self.token_count = 0
        self.details = None
        self.prompt = None
        self.include = False
        self.parse(data)

    def scrub_details(self):
        """Return self.details with every REMOVALS substring deleted."""
        details = self.details

        for remove in self.REMOVALS:
            details = details.replace(remove, "")

        return details

    def scrub(self, text):
        """Normalise punctuation and whitespace and drop long digit-bearing words.

        Runs of colons, brackets, quotes, braces and whitespace collapse to a
        single space, stray comma sequences are tidied, and any word of seven
        or more characters that contains a digit is removed.
        """
        text = re.sub(r'[:\[\]"{}【】\s]+', ' ', text).strip()
        text = text.replace(" ,", ",").replace(",,,",",").replace(",,",",")
        words = text.split(" ")
        select = [word for word in words if len(word) < 7 or not any(char.isdigit() for char in word)]
        return " ".join(select)

    def parse(self, data):
        """Assemble the item's text and, if it is long enough, build its prompt.

        Description, features and scrubbed details are concatenated. The text
        must exceed MIN_CHARS characters and, after scrubbing, MIN_TOKENS
        tokens; it is then truncated to MAX_TOKENS tokens and turned into a
        prompt, and ``include`` becomes True.
        """
        contents = '\n'.join(data.get("description", []))

        if contents:
            contents += '\n'

        features = '\n'.join(data.get("features", []))
        if features:
            contents += features + '\n'

        self.details = data.get("details")
        if self.details:
            contents += self.scrub_details() + '\n'

        if len(contents) > MIN_CHARS:
            # Cap the raw text before tokenizing.
            contents = contents[:CEILING_CHARS]
            text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
            tokens = self.tokenizer.encode(text, add_special_tokens=False)

            if len(tokens) > MIN_TOKENS:
                tokens = tokens[:MAX_TOKENS]
                text = self.tokenizer.decode(tokens)
                self.make_prompt(text)
                self.include = True

    def make_prompt(self, text):
        """Store the full training prompt (question, text, rounded price) and its token count."""
        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
        self.prompt += f"{self.PREFIX}{str(round(self.price))}.00"
        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

    def test_prompt(self):
        """Return the prompt with the price removed, ending in PREFIX.

        The price always follows the LAST occurrence of PREFIX, so the prompt
        is cut there. Cutting at the first occurrence would truncate prompts
        whose product text happens to contain "Price is $" itself.
        """
        head, prefix, _ = self.prompt.rpartition(self.PREFIX)
        if not prefix:
            # No PREFIX in the prompt at all: append it, as before.
            return self.prompt + self.PREFIX
        return head + prefix

    def __repr__(self):
        return f"<{self.title} = ${self.price}>"

In [None]:
# Only datapoints priced within [MIN_PRICE, MAX_PRICE] are turned into Items.
MIN_PRICE = 0.5
# Number of dataset rows handed to a worker process at a time.
CHUNK_SIZE = 1000
MAX_PRICE = 999.49

class ItemLoader:
    """Load one category of the Amazon Reviews 2023 metadata and curate it into Items."""

    def __init__(self, name):
        """``name`` is the category label, e.g. "Appliances"."""
        self.name = name
        self.dataset = None

    def from_datapoint(self, datapoint):
        """Return an Item for ``datapoint``, or None if it should be skipped.

        A datapoint is skipped when its price is missing, not a number,
        outside [MIN_PRICE, MAX_PRICE], or when the resulting Item is not
        marked for inclusion.
        """
        price_str = datapoint.get("price")
        if not price_str:
            return None

        try:
            price = float(price_str)
        except (TypeError, ValueError):
            # Unparseable price (e.g. free text or an unexpected type).
            return None

        if not MIN_PRICE <= price <= MAX_PRICE:
            return None

        item = Item(datapoint, price)
        return item if item.include else None

    def from_chunk(self, chunk):
        """Return the list of usable Items found in ``chunk``."""
        batch = []
        for datapoint in chunk:
            item = self.from_datapoint(datapoint)

            if item:
                batch.append(item)

        return batch

    def chunk_generator(self):
        """Yield consecutive slices of the dataset, CHUNK_SIZE rows each."""
        size = len(self.dataset)
        for start in range(0, size, CHUNK_SIZE):
            yield self.dataset.select(range(start, min(start + CHUNK_SIZE, size)))

    def _chunk_count(self):
        """Return how many chunks chunk_generator() will yield.

        Ceiling division: the previous ``size // CHUNK_SIZE + 1`` over-counted
        by one whenever the size was an exact multiple of CHUNK_SIZE.
        """
        return math.ceil(len(self.dataset) / CHUNK_SIZE)

    def load_in_parallel(self, workers):
        """Process every chunk in a pool of ``workers`` processes and return all Items."""
        results = []

        with ProcessPoolExecutor(max_workers=workers) as pool:
            batches = pool.map(self.from_chunk, self.chunk_generator())
            for batch in tqdm(batches, total=self._chunk_count()):
                results.extend(batch)

        # Tag every item with the category this loader was created for.
        for result in results:
            result.category = self.name

        return results

    def load(self, workers=8):
        """Download the category's metadata split and return its curated Items."""
        self.dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_meta_{self.name}", split="full", trust_remote_code=True)
        start = datetime.now()
        print(f"Loading {self.dataset}")
        results = self.load_in_parallel(workers)
        duration = (datetime.now() - start).total_seconds() / 60
        print(f"Completed {self.name} with {len(results):,} items in {duration:.1f} mins")
        return results

In [None]:
class Tester:
    """Run a price predictor over a sample of items and report its accuracy.

    Every datapoint is printed in a colour reflecting how close the guess was.
    At the end a scatter chart is drawn, titled with the mean absolute error,
    the RMSLE and the share of "green" guesses.
    """

    def __init__(self, predictor, data, title=None, size=250):
        """Prepare a test run.

        Args:
            predictor: callable taking one datapoint and returning a numeric guess.
            data: sized, indexable collection of datapoints with ``price`` and
                ``title`` attributes.
            title: chart title; defaults to the predictor's name, prettified.
            size: maximum number of datapoints to evaluate.
        """
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        # Never ask for more datapoints than were supplied; otherwise run()
        # would raise IndexError on a short test set.
        self.size = min(size, len(data))
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Classify a guess as "green", "orange" or "red" from its absolute and relative error."""
        if error < 40 or error / truth < 0.2:
            return "green"

        if error < 80 or error / truth < 0.4:
            return "orange"

        return "red"

    @staticmethod
    def _squared_log_error(guess, truth):
        """Return the squared difference between log(truth + 1) and log(guess + 1).

        Negative guesses are treated as 0 here: the logarithm is undefined for
        guess <= -1, and unclamped regressors can produce such values.
        """
        log_error = math.log(truth + 1) - math.log(max(guess, 0) + 1)
        return log_error ** 2

    def run_datapoint(self, index):
        """Predict one datapoint, record its metrics and print a coloured summary line."""
        datapoint = self.data[index]
        guess = self.predictor(datapoint)
        truth = datapoint.price
        error = abs(guess - truth)
        sle = self._squared_log_error(guess, truth)
        color = self.color_for(error, truth)
        name = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40] + "..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{index + 1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {name}{RESET}")

    def chart(self, title):
        """Scatter guesses against truths, with the diagonal marking a perfect prediction."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color="deepskyblue", lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel("Ground Truth")
        plt.ylabel("Model Estimate")
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Compute the summary metrics and draw the chart titled with them."""
        if not self.size:
            # Nothing was evaluated; avoid dividing by zero and charting empty data.
            print("No datapoints were evaluated; nothing to report.")
            return

        average_error = sum(self.errors) / self.size
        rmsle = math.sqrt(sum(self.sles) / self.size)
        hits = sum(1 for color in self.colors if color == "green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits / self.size * 100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first ``size`` datapoints, then report."""
        for index in range(self.size):
            self.run_datapoint(index)

        self.report()

    @classmethod
    def test(cls, function, data):
        """Convenience entry point: build a Tester with default settings and run it."""
        cls(function, data).run()

def get_price(s):
    """Extract the first number from ``s`` as a float, ignoring "$" and thousands commas.

    Returns 0.0 when the text contains no number.
    """
    cleaned = s.replace("$", "").replace(",", "")
    found = re.search(r"[-+]?\d*\.?\d+", cleaned)
    if found is None:
        return 0.0
    return float(found.group())

## Data

### Load Catalogs

In [None]:
# Categories to load; the commented-out entries are currently skipped.
catalog_labels = [
    "All_Beauty",
    # "Automotive",
    # "Electronics",
    # "Office_Products",
    # "Tools_and_Home_Improvement",
    # "Cell_Phones_and_Accessories",
    # "Toys_and_Games",
    "Appliances",
    "Musical_Instruments",
    "Software",
    "Handmade_Products",
]

# Every curated item from every selected category, in load order.
curated_pool = []
for label in catalog_labels:
    print(f"Loading {label}")
    loader = ItemLoader(label)
    curated_pool += loader.load()

print(f"Total curated items: {len(curated_pool):,}")

In [None]:
# Per-item price and prompt-token figures, plus a per-category item count.
category_tally = Counter(entry.category for entry in curated_pool)
price_series = [entry.price for entry in curated_pool]
token_series = [entry.token_count for entry in curated_pool]
summary_frame = pd.DataFrame({"price": price_series, "tokens": token_series})

display(summary_frame.describe())

# Categories listed from most to least populated.
category_frame = pd.DataFrame.from_dict(category_tally, orient="index", columns=["count"])
display(category_frame.sort_values("count", ascending=False))

In [None]:
# Group items by their price rounded to a whole dollar, keeping only 1..999.
price_slots = defaultdict(list)
for item in curated_pool:
    key = round(item.price)
    if key < 1 or key > 999:
        continue
    price_slots[key].append(item)

slot_counts = {dollars: len(members) for dollars, members in price_slots.items()}
print(f"Slots populated: {len(slot_counts)}")

In [None]:
# Seed both generators so the sampled bundle is reproducible.
random.seed(123)
np.random.seed(123)
balanced_bundle = []

for price in range(1, 1000):
    bucket = price_slots.get(price, [])

    # Keep the whole bucket for prices of 240 and up, or when it holds at
    # most 1200 items.
    if price >= 240 or len(bucket) <= 1200:
        balanced_bundle.extend(bucket)
        continue

    # Oversized bucket: draw 1200 items without replacement, weighting
    # "Automotive" items 1 and every other category 5.
    weights = np.array([1 if item.category == "Automotive" else 5 for item in bucket], dtype=float)
    weights = weights / weights.sum()
    indices = np.random.choice(len(bucket), size=1200, replace=False, p=weights)
    balanced_bundle.extend(bucket[idx] for idx in indices)

print(f"Balanced bundle size: {len(balanced_bundle):,}")

In [None]:
# Price, token and category figures for the balanced bundle.
bundle_categories = Counter(entry.category for entry in balanced_bundle)
bundle_tokens = [entry.token_count for entry in balanced_bundle]
bundle_prices = [entry.price for entry in balanced_bundle]

display(pd.Series(bundle_prices).describe())

bundle_category_frame = pd.DataFrame.from_dict(bundle_categories, orient="index", columns=["count"])
display(bundle_category_frame.sort_values("count", ascending=False))

In [None]:
# One histogram per series, in 10-unit bins: prices over 0..1000, tokens over 0..300.
for series, upper_bound, colour, axis_label in (
    (bundle_prices, 1000, "midnightblue", "Price"),
    (bundle_tokens, 300, "forestgreen", "Tokens"),
):
    plt.figure(figsize=(12, 5))
    plt.hist(series, bins=range(0, upper_bound, 10), color=colour, rwidth=0.8)
    plt.xlabel(axis_label)
    plt.ylabel("Count")

plt.show()

In [None]:
# Shuffle reproducibly, then cut the training set from the front and the test
# set from the items that immediately follow it.
random.seed(123)
random.shuffle(balanced_bundle)

bundle_size = len(balanced_bundle)
# Test set: 5% of the bundle, at least 1 and at most 2000 items.
test_target = min(2000, max(1, bundle_size // 20))
# Training set: everything else, capped at 400,000 items.
train_target = min(400_000, bundle_size - test_target)

split_end = train_target + test_target
train_items = balanced_bundle[:train_target]
test_items = balanced_bundle[train_target:split_end]

print(f"Training set: {len(train_items):,}")
print(f"Test set: {len(test_items):,}")

In [None]:
# Training rows keep the full prompt (price included); test rows use the
# prompt with the price removed.
train_prices = [entry.price for entry in train_items]
train_prompts = [entry.prompt for entry in train_items]
test_prices = [entry.price for entry in test_items]
test_prompts = [entry.test_prompt() for entry in test_items]

In [None]:
# Wrap both splits as Hugging Face datasets with "text" and "price" columns.
train_columns = {"text": train_prompts, "price": train_prices}
test_columns = {"text": test_prompts, "price": test_prices}

train_dataset = Dataset.from_dict(train_columns)
test_dataset = Dataset.from_dict(test_columns)
pricing_dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

### Persist

In [None]:
# Everything persisted by this notebook goes under ./data.
storage_dir = Path("data")
storage_dir.mkdir(exist_ok=True)

# Pickle the curated Item objects so later sessions can skip the curation step.
for filename, payload in (
    ("balanced_train.pkl", train_items),
    ("balanced_test.pkl", test_items),
):
    with open(storage_dir / filename, "wb") as f:
        pickle.dump(payload, f)

In [None]:
# Write each split of the dataset to its own parquet file.
for split_name in ("train", "test"):
    pricing_dataset[split_name].to_parquet(storage_dir / f"balanced_{split_name}.parquet")

## Baselines

### Stochastic Anchor

In [None]:
def stochastic_anchor(item):
    """Ignore the item and guess a random whole-dollar price from 1 to 999."""
    return random.randint(1, 999)

# Seed first so the random guesses are the same on every run.
random.seed(123)
Tester.test(stochastic_anchor, test_items[:250])

### Global Mean

In [None]:
# Mean training price, used as a constant guess for every item.
train_price_values = [entry.price for entry in train_items]
train_price_total = sum(train_price_values)
global_mean_price = train_price_total / len(train_price_values)

def global_mean_estimator(item):
    """Ignore the item and return the mean price of the training set."""
    return global_mean_price

Tester.test(global_mean_estimator, test_items[:250])

In [None]:
def parse_features(raw):
    """Decode a JSON details string into a dict of product features.

    Returns an empty dict when ``raw`` is empty, is not valid JSON, or decodes
    to something other than a JSON object (e.g. ``null`` or a list), so that
    callers can always use ``.get`` on the result. A value that is already a
    dict is returned unchanged.
    """
    if not raw:
        return {}
    if isinstance(raw, dict):
        return raw
    try:
        parsed = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers inputs that are neither str, bytes nor bytearray.
        return {}
    return parsed if isinstance(parsed, dict) else {}
# Attach the decoded details to every item, training items first, then test items.
for collection in (train_items, test_items):
    for item in collection:
        item.features = parse_features(item.details)

### Feature Engineering

In [None]:
# Divisor that converts an amount in the given unit to pounds.
_POUND_DIVISORS = {
    "pounds": 1,
    "ounces": 16,
    "grams": 453.592,
    "milligrams": 453592,
    "kilograms": 0.453592,
}

def infer_weight(item):
    """Return the item's "Item Weight" feature in pounds, or None if unavailable.

    The feature is expected to look like "<amount> <unit>" (e.g. "2.5 pounds",
    "12 ounces") or "<amount> hundredths pounds". None is returned when the
    feature is missing, is not a string, has no unit, has a non-numeric
    amount, or uses an unrecognised unit.
    """
    payload = item.features.get("Item Weight")
    if not payload or not isinstance(payload, str):
        return None

    # split() without an argument tolerates repeated or surrounding whitespace.
    parts = payload.split()
    if len(parts) < 2:
        return None

    try:
        # Drop thousands separators such as "1,200".
        amount = float(parts[0].replace(",", ""))
    except ValueError:
        return None

    unit = parts[1].lower()

    if unit == "hundredths":
        if len(parts) > 2 and parts[2].lower() == "pounds":
            return amount / 100
        return None

    divisor = _POUND_DIVISORS.get(unit)
    if divisor is None:
        return None
    return amount / divisor

In [None]:
def infer_rank(item):
    """Return the mean of the item's "Best Sellers Rank" values, or None.

    The feature is expected to be a mapping whose values are numeric ranks.
    Non-numeric values are ignored; None is returned when the feature is
    missing, is not a mapping, or holds no numeric value.
    """
    payload = item.features.get("Best Sellers Rank")
    if not isinstance(payload, dict):
        return None

    # bool is a subclass of int, so exclude it explicitly.
    values = [
        value
        for value in payload.values()
        if isinstance(value, (int, float)) and not isinstance(value, bool)
    ]
    if not values:
        return None

    return sum(values) / len(values)

# Lower-case brand names treated as "top" brands ("nvidea" was a misspelling
# that could never match a real brand value).
top_brands = {"nvidia","hp","dell","lenovo","samsung","asus","sony","canon","apple","intel"}

def is_top_brand(item):
    """Return 1 if the item's "Brand" feature is one of top_brands (case-insensitive), else 0."""
    brand = item.features.get("Brand")
    if not isinstance(brand, str):
        return 0
    return 1 if brand.strip().lower() in top_brands else 0

In [None]:
# Training-set averages, used later as fill-in values when an item has no
# weight or rank. The literals are fallbacks for when no training item has one.
train_weights = [weight for weight in map(infer_weight, train_items) if weight is not None]
if train_weights:
    average_weight = sum(train_weights) / len(train_weights)
else:
    average_weight = 1.0

train_ranks = [rank for rank in map(infer_rank, train_items) if rank is not None]
if train_ranks:
    average_rank = sum(train_ranks) / len(train_ranks)
else:
    average_rank = 1_000_000.0

In [None]:
def build_features(item):
    """Return the numeric feature dict for one item.

    Missing weight or rank values are replaced by the training-set averages.
    """
    weight = infer_weight(item)
    if weight is None:
        weight = average_weight

    rank = infer_rank(item)
    if rank is None:
        rank = average_rank

    return dict(
        weight=weight,
        rank=rank,
        text_length=len(item.test_prompt()),
        top_brand=is_top_brand(item),
    )

In [None]:
# Feature tables for all training items and for the first 250 test items,
# each with the true price as an extra column.
train_frame = pd.DataFrame(list(map(build_features, train_items)))
train_frame["price"] = [entry.price for entry in train_items]

evaluation_items = test_items[:250]
test_frame = pd.DataFrame(list(map(build_features, evaluation_items)))
test_frame["price"] = [entry.price for entry in evaluation_items]

In [None]:
# Columns fed to the feature-based models, in the order used for training.
feature_columns = ["weight", "rank", "text_length", "top_brand"]
X_train = train_frame[feature_columns]
y_train = train_frame["price"]
X_test = test_frame[feature_columns]
y_test = test_frame["price"]
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

def linear_baseline(item):
    """Predict an item's price with the linear model fitted on the engineered features.

    The feature row is put in the training column order explicitly, and the
    prediction is floored at 0: a linear model can extrapolate to negative
    prices, which are meaningless and break log-based error metrics.
    """
    row = pd.DataFrame([build_features(item)])[feature_columns]
    return max(float(linear_model.predict(row)[0]), 0.0)

Tester.test(linear_baseline, test_items[:250])

### NLP Baselines

In [None]:
# Bag-of-words baseline: counts of the 1000 most frequent non-stop-words in
# each price-free prompt, fed to a linear regression.
document_texts = [entry.test_prompt() for entry in train_items]
price_targets = np.array([entry.price for entry in train_items])

vectorizer = CountVectorizer(max_features=1000, stop_words="english")
X_matrix = vectorizer.fit_transform(document_texts)

bow_model = LinearRegression()
bow_model.fit(X_matrix, price_targets)

def bow_predictor(item):
    """Predict a price from the item's bag-of-words vector, floored at 0."""
    counts = vectorizer.transform([item.test_prompt()])
    estimate = float(bow_model.predict(counts)[0])
    return max(estimate, 0)

Tester.test(bow_predictor, test_items[:250])

In [None]:
# Word2Vec baseline: each document is the mean of its word vectors, and a
# linear SVR maps that vector to a price.
processed_docs = [simple_preprocess(text) for text in document_texts]
word2vec_model = Word2Vec(sentences=processed_docs, vector_size=400, window=5, min_count=1, workers=4)

def document_vector(text):
    """Return the mean Word2Vec vector of the known words in ``text``.

    A zero vector is returned when none of the words is in the vocabulary.
    """
    words = simple_preprocess(text)
    vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]

    if not vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

w2v_features = np.array([document_vector(text) for text in document_texts])
# Seeded like the other models in this notebook so the fit is repeatable for
# a given set of embeddings.
svr_model = LinearSVR(random_state=123)
svr_model.fit(w2v_features, price_targets)

def w2v_predictor(item):
    """Predict a price from the item's mean word vector, floored at 0.

    The floor keeps the output a valid price; the regressor itself is
    unbounded and can return negative values.
    """
    estimate = float(svr_model.predict([document_vector(item.test_prompt())])[0])
    return max(estimate, 0.0)

Tester.test(w2v_predictor, test_items[:250])

In [None]:
# Random forest over the same engineered features as the linear baseline.
forest_model = RandomForestRegressor(n_estimators=200, random_state=123)
forest_model.fit(X_train, y_train)

def forest_predictor(item):
    """Predict an item's price with the random forest fitted on the engineered features."""
    feature_row = pd.DataFrame([build_features(item)])
    prediction = forest_model.predict(feature_row[feature_columns])
    return float(prediction[0])

Tester.test(forest_predictor, test_items[:250])

In [None]:
# First 200 training items are used for fine-tuning, the next 50 for validation.
fine_tune_train, fine_tune_validation = train_items[:200], train_items[200:250]

In [None]:
def compose_messages(item, include_price=True):
    """Build the chat messages (system, user, assistant) for one item.

    The user message is the item's price-free prompt without the
    "to the nearest dollar" wording and without the trailing price prefix.
    The assistant message carries the price to two decimals when
    ``include_price`` is true, and only the bare prefix otherwise.
    """
    user_prompt = item.test_prompt()
    user_prompt = user_prompt.replace(" to the nearest dollar", "")
    user_prompt = user_prompt.replace("\n\nPrice is $", "")

    if include_price:
        assistant_content = f"Price is ${item.price:.2f}"
    else:
        assistant_content = "Price is $"

    system = {"role": "system", "content": "You estimate prices of items. Reply only with the price"}
    user = {"role": "user", "content": user_prompt}
    assistant = {"role": "assistant", "content": assistant_content}
    return [system, user, assistant]

In [None]:
def build_jsonl(items):
    """Serialise ``items`` as JSON Lines: one {"messages": [...]} object per line.

    Lines are joined with newlines and there is no trailing newline.
    """
    return "\n".join(
        json.dumps({"messages": compose_messages(item)}) for item in items
    )

In [None]:
# Write the fine-tuning and validation examples to JSONL files in the storage directory.
train_jsonl = storage_dir.joinpath("balanced_pricer_train.jsonl")
validation_jsonl = storage_dir.joinpath("balanced_pricer_validation.jsonl")

train_payload = build_jsonl(fine_tune_train)
validation_payload = build_jsonl(fine_tune_validation)

train_jsonl.write_text(train_payload)
validation_jsonl.write_text(validation_payload)

In [None]:
openai_client = OpenAI()

# Upload both JSONL files for fine-tuning, training file first.
with train_jsonl.open("rb") as f:
    train_file = openai_client.files.create(file=f, purpose="fine-tune")

with validation_jsonl.open("rb") as f:
    validation_file = openai_client.files.create(file=f, purpose="fine-tune")

# Show the two returned file objects as the cell's output.
(train_file, validation_file)

In [None]:
# Send training metrics to the "balanced-pricer" Weights & Biases project.
wandb_integration = {"type": "wandb", "wandb": {"project": "balanced-pricer"}}

# One epoch over the uploaded files, with a fixed seed.
job_settings = {
    "training_file": train_file.id,
    "validation_file": validation_file.id,
    "model": "gpt-4o-mini-2024-07-18",
    "seed": 123,
    "hyperparameters": {"n_epochs": 1},
    "integrations": [wandb_integration],
    "suffix": "balanced-pricer",
}
fine_tune_job = openai_client.fine_tuning.jobs.create(**job_settings)
fine_tune_job

In [None]:
# Fetch the job's current state and its 10 most recent events; re-run this
# cell to refresh both.
job_status = openai_client.fine_tuning.jobs.retrieve(fine_tune_job.id)
job_events = openai_client.fine_tuning.jobs.list_events(fine_tuning_job_id=fine_tune_job.id, limit=10)
job_status, job_events

In [None]:
# Read the fine-tuned model's name from the job record.
# NOTE(review): presumably this is None until the job has finished — confirm
# before running the prediction cells below.
fine_tuned_model_name = openai_client.fine_tuning.jobs.retrieve(fine_tune_job.id).fine_tuned_model
print(fine_tuned_model_name)

In [None]:
def tuned_predictor(item):
    """Ask the fine-tuned model for the item's price and return it as a float.

    The messages end with an assistant turn containing only the price prefix,
    and the reply is limited to 7 tokens. Returns 0.0 when the reply holds no
    number or no text at all.
    """
    messages = compose_messages(item, include_price=False)
    response = openai_client.chat.completions.create(
        model=fine_tuned_model_name,
        messages=messages,
        seed=123,
        max_tokens=7,
    )
    answer = response.choices[0].message.content
    # The content can be None (e.g. a refusal); treat that as "no price found"
    # instead of letting get_price fail on a non-string.
    return get_price(answer or "")

In [None]:
# Smoke test: print the true price of the first test item, then the
# fine-tuned model's guess for it. Skipped when there are no test items.
if test_items:
    sample_item = test_items[0]
    print(sample_item.price)
    print(tuned_predictor(sample_item))

In [None]:
# Evaluate the fine-tuned model on the same 250-item test slice as the baselines.
Tester.test(tuned_predictor, test_items[:250])