In [None]:
!pip install tqdm huggingface_hub numpy sentence-transformers datasets chromadb catboost peft torch bitsandbytes

In [None]:
import os
import re
import zipfile
import chromadb
import joblib
import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from google.colab import userdata
from huggingface_hub import HfApi, hf_hub_download, login
from openai import OpenAI
from peft import PeftModel
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from catboost import CatBoostRegressor

In [4]:
# Mount Google Drive inside the Colab VM. Later cells read the persisted
# Chroma store and the pickled CatBoost model from /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# Pull both secrets from Colab's `userdata` store so no key is hard-coded here.
openai_api_key = userdata.get("OPENAI_API_KEY")
# The client instance is bound to the name `openai`; later cells call
# `openai.chat.completions.create(...)` on this object.
openai = OpenAI(api_key=openai_api_key)

hf_token = userdata.get("HF_TOKEN")
# Log in to the Hugging Face Hub with the token read above; the flag additionally
# requests that the token be saved as a git credential.
login(hf_token, add_to_git_credential=True)

In [6]:
# Configuration
# Hugging Face account name; it is interpolated into DATASET_NAME to locate
# the "<user>/pricer-data" dataset.
HF_USER = "qshaikh"

In [None]:
# Load the pricing dataset through `load_dataset` and keep only its "test" split.
# Note: every later cell (both the ensemble's training sample and its hold-out
# sample) indexes into this single `test` split.
DATASET_NAME = f"{HF_USER}/pricer-data"
dataset = load_dataset(DATASET_NAME)
test = dataset["test"]

In [None]:
def description(item):
    """Turn a dataset row's prompt text into the bare product description.

    Every occurrence of the leading question is removed from ``item["text"]``,
    everything from the first "\\n\\nPrice is $" marker onwards is discarded, and
    the remainder is returned prefixed with "passage: ".

    Args:
        item: Mapping with a ``"text"`` string entry.

    Returns:
        The string ``"passage: <description>"``.
    """
    question = "How much does this cost to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"

    without_question = item["text"].replace(question, "")
    # partition() keeps everything before the first marker, or the whole
    # string when the marker is absent.
    body, _, _ = without_question.partition(price_marker)
    return f"passage: {body}"

In [None]:
# Location of the persisted Chroma store on the mounted Drive and the name of
# the collection that holds the priced items.
CHROMA_PATH = "/content/drive/MyDrive/chroma"
COLLECTION_NAME = "price_items"

print(f"Attempting to load ChromaDB from: {CHROMA_PATH}")

client = chromadb.PersistentClient(path=CHROMA_PATH)
# get_or_create_collection never fails on a missing collection: a wrong path or
# name silently yields a brand-new, empty collection. Check the item count so an
# empty store is reported instead of being announced as a successful load.
collection = client.get_or_create_collection(name=COLLECTION_NAME)
collection_size = collection.count()

if collection_size == 0:
    print(
        f"WARNING: ChromaDB collection '{COLLECTION_NAME}' at {CHROMA_PATH} is empty; "
        "similarity queries will return no context. Check the path and collection name."
    )
else:
    print(f"Successfully loaded ChromaDB collection '{COLLECTION_NAME}'.")
    print(f"Collection contains {collection_size} items.")

In [None]:
# Sentence-embedding model placed on the GPU. It is shared by `catboost_price`
# (to build the feature vector) and `gpt4o_price` (to build the Chroma query embedding).
embedding_model = SentenceTransformer("intfloat/e5-small-v2", device="cuda")

In [None]:
# Base checkpoint, the fine-tuned adapter repository, and the exact adapter
# revision to load (pinned by commit hash).
BASE_MODEL = "meta-llama/Llama-3.1-8B"
FINETUNED_MODEL = "ed-donner/pricer-2024-09-13_13.04.39"
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

# NOTE(review): trust_remote_code=True allows repository-supplied code to run while
# loading the tokenizer; confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse the EOS token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the quantized base model, letting `device_map="auto"` decide placement.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, quantization_config=quant_config, device_map="auto"
)

# Attach the fine-tuned PEFT adapter (at the pinned revision) to the base model.
fine_tuned_model = PeftModel.from_pretrained(
    base_model, FINETUNED_MODEL, revision=REVISION
)

# Keep generation's pad token id in sync with the tokenizer's pad token set above.
fine_tuned_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Footprint is reported in MB (bytes / 1e6).
print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")

In [None]:
# Pre-trained CatBoost model, persisted on the mounted Drive.
# SECURITY: joblib.load is pickle-based and can execute arbitrary code while
# deserializing; only load this file if it comes from a trusted source.
catboost_model_path = "/content/drive/MyDrive/catboost_model.pkl"
catboost_model = joblib.load(catboost_model_path)
print(f"Successfully loaded CatBoost model from {catboost_model_path}")

In [24]:
def extract_tagged_price(output: str):
    """Pull the number that follows the first "Price is $" tag out of ``output``.

    Only the text between the first and (if present) second tag is inspected.
    Thousands separators are removed before the first numeric token is parsed.

    Args:
        output: Text expected to contain "Price is $<number>".

    Returns:
        The parsed price as a float, or 0.0 when the tag or a number is missing,
        or when ``output`` cannot be processed at all (e.g. it is not a string).
    """
    number_pattern = r"[-+]?\d*\.\d+|\d+"
    try:
        after_tag = output.split("Price is $")[1]
        found = re.search(number_pattern, after_tag.replace(",", ""))
    except Exception:
        # Missing tag (IndexError) or unusable input: fall back to zero.
        return 0.0

    if found is None:
        return 0.0
    return float(found.group())

In [25]:
def ft_llama_price(description: str):
    """Ask the fine-tuned Llama model for a price and parse it from the output.

    The description is wrapped in the pricing prompt, tokenized, moved to
    "cuda", and at most 5 new tokens are generated. The full decoded sequence
    (prompt included) is then passed to ``extract_tagged_price``.

    Args:
        description: Product description inserted into the prompt.

    Returns:
        Whatever ``extract_tagged_price`` returns for the decoded text.
    """
    prompt = (
        f"How much does this cost to the nearest dollar?\n\n{description}\n\nPrice is $"
    )
    encoded_prompt = tokenizer(prompt, return_tensors="pt").to("cuda")

    # NOTE(review): no sampling options are passed, so generation follows the
    # model's own generation config; confirm whether that samples (non-deterministic).
    generated = fine_tuned_model.generate(
        **encoded_prompt, max_new_tokens=5, num_return_sequences=1
    )

    decoded_text = tokenizer.decode(generated[0])
    return extract_tagged_price(decoded_text)

In [26]:
def catboost_price(description: str):
    """Predict a price with the CatBoost model from the description's embedding.

    Args:
        description: Text to embed with ``embedding_model`` (normalized embeddings).

    Returns:
        The model's prediction as a float, floored at 0 and rounded to 2 decimals.
    """
    embeddings = embedding_model.encode([description], normalize_embeddings=True)
    raw_prediction = catboost_model.predict([embeddings[0]])[0]
    # Negative predictions are clamped to zero.
    non_negative = max(0, raw_prediction)
    return round(float(non_negative), 2)

In [27]:
def gpt4o_price(item):
    """Estimate an item's price with gpt-4o-mini, using similar items as context.

    The item's description (from ``description(item)``) is embedded and used to
    query the Chroma ``collection`` for the 5 nearest items. Those documents and
    their stored prices are added to the prompt, and the first number in the
    model's reply is returned.

    Args:
        item: Dataset row accepted by ``description``.

    Returns:
        The parsed price as a float, or 0.0 when the reply is empty or contains
        no number. Previously this case raised AttributeError and aborted the
        whole prediction loop.
    """

    def get_embedding(text):
        # encode() is given a one-element list, so the result is a batch of one vector.
        return embedding_model.encode([text], normalize_embeddings=True)

    def find_similars(text):
        results = collection.query(
            query_embeddings=get_embedding(text).astype(float).tolist(), n_results=5
        )
        # Results are grouped per query; there is exactly one query here.
        docs = results["documents"][0]
        prices = [m["price"] for m in results["metadatas"][0]]
        return docs, prices

    def format_context(similars, prices):
        context = (
            "To provide some context, here are similar products and their prices:\n\n"
        )
        for sim, price in zip(similars, prices):
            context += f"Product:\n{sim}\nPrice is ${price:.2f}\n\n"
        return context

    def build_messages(item_description, similars, prices):
        # The parameter is deliberately not called `description`, which would
        # shadow the module-level `description()` helper.
        system_message = (
            "You are a pricing expert. "
            "Given a product description and a few similar products with their prices, "
            "estimate the most likely price. "
            "Respond ONLY with a number, no words."
        )
        context = format_context(similars, prices)
        user_prompt = (
            "Estimate the price for the following product:\n\n"
            + item_description
            + "\n\n"
            + context
        )
        return [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": "Price is $"},
        ]

    def parse_price(reply):
        # The reply may be None/empty or contain no digits at all; treat both as
        # "no estimate" instead of calling .group() on a failed match.
        if not reply:
            return 0.0
        cleaned = reply.replace("$", "").replace(",", "")
        match = re.search(r"[-+]?\d*\.\d+|\d+", cleaned)
        return float(match.group()) if match else 0.0

    # Compute the description once and reuse it for retrieval and for the prompt.
    item_description = description(item)
    docs, prices = find_similars(item_description)
    messages = build_messages(item_description, docs, prices)
    response = openai.chat.completions.create(
        model="gpt-4o-mini", messages=messages, seed=42, max_tokens=5
    )
    return parse_price(response.choices[0].message.content)

In [None]:
# Reproducibly shuffle every row index of `test`, cut the shuffled list 80/20,
# then keep only a small sample of each part for the ensemble experiment.
print("Splitting entire dataset...")
np.random.seed(42)
all_indices = [*range(len(test))]
np.random.shuffle(all_indices)

train_split_size = int(0.8 * len(all_indices))

# First 250 indices of the 80% part and first 50 indices of the 20% part.
train_indices = all_indices[:train_split_size][:250]
test_indices = all_indices[train_split_size:][:50]

In [None]:
# Per-model predictions and ground-truth prices for the ensemble's training
# sample. The four lists stay aligned: position k in each refers to the same item.
ft_llama_preds_train = []
gpt4omini_preds_train = []
catboost_preds_train = []
true_prices_train = []

for i in tqdm(train_indices):
    item = test[i]
    # `text` is the cleaned description; gpt4o_price derives it again from `item`.
    text = description(item)
    true_prices_train.append(item["price"])
    ft_llama_preds_train.append(ft_llama_price(text))
    gpt4omini_preds_train.append(gpt4o_price(item))
    catboost_preds_train.append(catboost_price(text))

In [None]:
# Dump the ground truth and each model's raw predictions, one labelled line per list.
for label, values in (
    ("True Prices:", true_prices_train),
    ("FT-LLaMA Predictions:", ft_llama_preds_train),
    ("GPT-4o-mini Predictions:", gpt4omini_preds_train),
    ("CatBoost Predictions:", catboost_preds_train),
):
    print(label, values)

In [None]:
# One (FT-LLaMA, GPT-4o-mini, CatBoost) triple per training item; both derived
# features below are computed from these triples.
prediction_rows_train = list(
    zip(ft_llama_preds_train, gpt4omini_preds_train, catboost_preds_train)
)
maxes_train = [max(row) for row in prediction_rows_train]
means_train = [np.mean(row) for row in prediction_rows_train]

# Feature matrix for the blender: the three base predictions plus their
# per-item max and mean. Column order matters for the fitted coefficients.
stacked_features_train = {
    "FT_LLaMA": ft_llama_preds_train,
    "GPT4oMini": gpt4omini_preds_train,
    "CatBoost": catboost_preds_train,
    "Max": maxes_train,
    "Mean": means_train,
}
X_train = pd.DataFrame(stacked_features_train)

# Regression target: the true prices, aligned with the rows of X_train.
y_train = pd.Series(true_prices_train)

In [None]:
# Fit a linear regression that blends the base-model predictions, then report
# one coefficient per feature column and the intercept.
np.random.seed(42)
lr = LinearRegression()
lr.fit(X_train, y_train)

feature_columns = X_train.columns.tolist()
for position, column_name in enumerate(feature_columns):
    print(f"{column_name}: {lr.coef_[position]:.2f}")
print(f"Intercept={lr.intercept_:.2f}")

In [None]:
# Per-model predictions and ground-truth prices for the hold-out sample,
# gathered exactly like the training lists and kept aligned by position.
ft_llama_preds_test = []
gpt4omini_preds_test = []
catboost_preds_test = []
true_prices_test = []

print("Processing TEST data (50 items)...")
for i in tqdm(test_indices):
    item = test[i]
    # `text` is the cleaned description; gpt4o_price derives it again from `item`.
    text = description(item)
    true_prices_test.append(item["price"])
    ft_llama_preds_test.append(ft_llama_price(text))
    gpt4omini_preds_test.append(gpt4o_price(item))
    catboost_preds_test.append(catboost_price(text))

# One (FT-LLaMA, GPT-4o-mini, CatBoost) triple per hold-out item; both derived
# features below are computed from these triples.
prediction_rows_test = list(
    zip(ft_llama_preds_test, gpt4omini_preds_test, catboost_preds_test)
)
maxes_test = [max(row) for row in prediction_rows_test]
means_test = [np.mean(row) for row in prediction_rows_test]

# Same columns, in the same order, as the training feature matrix.
stacked_features_test = {
    "FT_LLaMA": ft_llama_preds_test,
    "GPT4oMini": gpt4omini_preds_test,
    "CatBoost": catboost_preds_test,
    "Max": maxes_test,
    "Mean": means_test,
}
X_test = pd.DataFrame(stacked_features_test)

# True prices for the hold-out items, aligned with the rows of X_test.
y_test = pd.Series(true_prices_test)

In [None]:
# Score the fitted blender on the hold-out features: R², RMSE and MAPE.
print("Evaluating model...")
y_pred = lr.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R² score: {r2:.4f}")

squared_error_mean = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(squared_error_mean)
print(f"RMSE: {rmse:.2f}")

# NOTE(review): the error is divided by the true price, so a true price of 0
# would make this metric infinite; confirm the dataset contains none.
relative_errors = (y_test - y_pred) / y_test
mape = np.mean(np.abs(relative_errors)) * 100
print(f"MAPE: {mape:.2f}%")