## Dataset Curator Amazon Appliance project

This notebook has been moved to a Google Colab:
https://colab.research.google.com/drive/1_7UXT4dp2Cr0RiRzMkMFxKgiGSt5QpVU#scrollTo=MDyR63OTNUJ6

In [None]:
import os
import glob
import re
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
from datasets import Dataset
from dotenv import load_dotenv
from huggingface_hub import login
import transformers
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict
from openai import OpenAI

In [None]:
# Hugging Face model id; its tokenizer is what Item uses to count prompt tokens
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"

In [None]:
# Read settings from a local .env file, then make sure HF_TOKEN is present in
# the process environment (a placeholder value is stored when it is not).
load_dotenv()
os.environ.setdefault('HF_TOKEN', 'your-hf-token-if-not-using-env')

In [None]:
# Load the "full" split of the raw_meta_Appliances configuration of Amazon-Reviews-2023
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Appliances", split="full", trust_remote_code=True)

In [None]:
# Show which fields the first record provides
print(dataset[0].keys())

In [None]:
# Inspect the first record: print the fields used for prompting, then check
# whether its price is the literal string "None".
item = dataset[0]
for field in ('title', 'description', 'features', 'price'):
    print(item[field])
print(item['price'] == "None")

In [None]:
class Item:
    """A single product record with prompt-building and token-counting helpers.

    The title, description, features and price attributes are copied
    unchanged from the matching keys of the source record.
    """

    # Tokenizer for BASE_MODEL; loaded once when the class is defined and
    # shared by every instance.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    # Common words that words() filters out.
    stop = {'the', 'and', 'for', 'is', 'to', 'this', 'with', 'a', 'of', 'your', 'are', 'in', 'from', 'you', 'or', 'an'}

    def __init__(self, data):
        """Copy the fields used for prompting out of a record.

        Args:
            data: mapping with 'title', 'description', 'features' and
                'price' keys.
        """
        self.title = data['title']
        self.description = data['description']
        self.features = data['features']
        self.price = data['price']
        # Filled in lazily by token_count()
        self._token_count = None

    def inference_prompt(self):
        """Return the prompt text, ending right after "Predicted price=$"."""
        prompt = "Predict the price of this item.\n"
        prompt += f"Title: {self.title}\n"
        prompt += f"Description: {self.description}\n"
        prompt += f"Features: {self.features}\n"
        prompt += "The answer: Predicted price=$"
        return prompt

    def train_prompt(self):
        """Return the inference prompt with the item's price appended."""
        return f"{self.inference_prompt()}{self.price}"

    def token_count(self):
        """Return the number of tokens in the training prompt (cached)."""
        if self._token_count is None:
            self._token_count = len(self.tokenizer.encode(self.train_prompt()))
        return self._token_count

    def tokens_between(self, low, high):
        """Return True when low <= token_count() < high."""
        return low <= self.token_count() < high

    def words(self):
        """Return the lower-cased words of the item's text, minus stop words.

        The text is title, description and features joined by spaces, with
        brackets, braces, commas, quotes and hyphens treated as separators.
        An empty list is returned when no words remain.
        """
        text = f"{self.title} {self.description} {self.features}"
        text = re.sub(r'[()\[\]{},\'"-]', ' ', text)
        # split() without an argument splits on any run of whitespace and
        # never yields '', whereas split(' ') returns [''] for blank text.
        return [word for word in text.lower().split() if word not in self.stop]

In [None]:
# Keep only the records whose price parses as a positive number.
items = []
for data in tqdm(dataset):
    try:
        price = float(data['price'])
    except (TypeError, ValueError):
        # float() raises ValueError for non-numeric text and TypeError for
        # None; either way the record has no usable price, so skip it.
        continue
    if price > 0:
        items.append(Item(data))

In [None]:
# Report how many records had usable prices, then show the first two training prompts
print(f"There are {len(items):,} out of {len(dataset):,} with prices")
for index in range(2):
    print(f"\nItem {index} has {items[index].token_count()} tokens:")
    print(items[index].train_prompt())

In [None]:
# Count items whose stringified description / features exceed 8 characters,
# and items where both do.
descriptions = features = both = 0
for item in items:
    has_description = len(str(item.description)) > 8
    has_features = len(str(item.features)) > 8
    if has_description:
        descriptions += 1
    if has_features:
        features += 1
    if has_description and has_features:
        both += 1
print(len(items))
print(descriptions, features, both)

In [None]:
token_counts = [item.token_count() for item in tqdm(items)]
%matplotlib inline
fig, ax = plt.subplots(1, 1)
ax.set_xlabel('Number of tokens')
ax.set_ylabel('Count of items');
_ = ax.hist(token_counts, rwidth=0.7, color="orange", bins=range(0, 2000, 100))

In [None]:
# Keep the items whose token count lies in [low_cutoff, high_cutoff) and
# report what share of the priced items that is.
low_cutoff, high_cutoff = 100, 300
subset = []
for candidate in tqdm(items):
    if candidate.tokens_between(low_cutoff, high_cutoff):
        subset.append(candidate)
subset_count, count = len(subset), len(items)
print(f"Between {low_cutoff} and {high_cutoff}, we get {subset_count:,} out of {count:,} which is {subset_count/count*100:.1f}%")

In [None]:
token_counts = [item.token_count() for item in subset]
%matplotlib inline
fig, ax = plt.subplots(1, 1)
ax.set_xlabel('Number of tokens')
ax.set_ylabel('Count of items');
_ = ax.hist(token_counts, rwidth=0.7, color="purple", bins=range(0, 400, 20))

In [None]:
# Shuffle reproducibly (fixed seed), then use the first 95% for training and
# the remainder for testing.
random.seed(42)
random.shuffle(subset)
split_index = int(len(subset) * 0.95)
train, test = subset[:split_index], subset[split_index:]

In [None]:
from collections import Counter

# Tally word frequencies across the training items and show the 20 most common
words = Counter(word for entry in train for word in entry.words())
top_20_words = words.most_common(20)
print("Top 20 words:", top_20_words)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import numpy as np

# Baseline model: each training item becomes one document (its filtered words
# joined by spaces) and the target is its price as a float.
documents = []
prices = []
for entry in train:
    documents.append(" ".join(entry.words()))
    prices.append(float(entry.price))
labels = np.array(prices)

# TF-IDF features (CountVectorizer() is the imported alternative)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Linear-kernel support vector regression (LinearRegression() is the imported alternative)
regressor = SVR(kernel='linear')
regressor.fit(X, labels)


In [None]:


# Score the baseline regressor on up to the first `size` test items, reporting
# mean absolute error and root mean squared log error.
error_total = 0
squared_log_error_total = 0
size = 50
sample = test[:size]
for item in sample:
    x = vectorizer.transform([" ".join(item.words())])
    truth = float(item.price)
    # Clamp at zero: a negative prediction would make the log below undefined
    prediction = max(regressor.predict(x)[0], 0)
    error_total += abs(truth - prediction)
    squared_log_error_total += (math.log(truth+1) - math.log(prediction+1)) ** 2
    print(f"Result: Truth={truth} Prediction={prediction}")
# Average over the items actually scored: `test` may hold fewer than `size`
# items, and dividing by `size` would then understate both metrics.
scored = len(sample) or 1
error = error_total / scored
rmsle = math.sqrt(squared_log_error_total / scored)
print(f'Average error: {error:.2f}')
print(f'Root mean squared log error: {rmsle:.2f}')

In [None]:
# Make sure OPENAI_API_KEY is present in the environment (placeholder when absent)
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
class Frontier:
    """Ask an OpenAI chat model to price one item and score its guess.

    Attributes:
        guess: the model's predicted price; 0 until run() has been called.
        item: the item being priced.
        truth: the item's actual price as a float.
    """

    # Chat model used by run()
    model = 'gpt-4o-mini'
    # Client shared by all instances; created when the class is defined
    gpt = OpenAI()
    system_message = "You predict product prices"

    def __init__(self, item):
        """Store the item and its true price; the guess starts at 0."""
        self.guess = 0
        self.item = item
        self.truth = float(item.price)

    @staticmethod
    def extract_float_from_string(s):
        """Return the first number found in s as a float, or 0 if there is none.

        Thousands separators ("1,299.99") are removed before matching, and an
        optional leading sign is honoured for integers and decimals alike.
        A None or empty s yields 0.
        """
        # Drop commas only where they sit between a digit and a group of
        # exactly three digits, so "1,299.99" parses as 1299.99.
        text = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", s or "")
        match = re.search(r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)", text)
        return float(match.group()) if match else 0

    def run(self):
        """Query the model with this item's inference prompt and store the guess."""
        # Use the item held by this instance rather than whatever global
        # variable happens to be called `item` at call time.
        user_prompt = self.item.inference_prompt()
        prompts = [
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": user_prompt}
        ]
        completion = self.gpt.chat.completions.create(
            model=self.model,
            messages=prompts,
            max_tokens=8
        )
        response = completion.choices[0].message.content
        # Clamp at zero so squared_log_error() never takes the log of a
        # non-positive number.
        self.guess = max(self.extract_float_from_string(response), 0)

    def error(self):
        """Return the absolute difference between truth and guess."""
        return abs(self.truth - self.guess)

    def squared_log_error(self):
        """Return (log(truth + 1) - log(guess + 1)) squared."""
        log_error = math.log(self.truth+1) - math.log(self.guess+1)
        return log_error ** 2

    def __repr__(self):
        return f"Truth={self.truth} Guess={self.guess} Error={self.error():.2f} SLE={self.squared_log_error():.2f}"

In [None]:
# Score the frontier model on up to the first `size` test items, reporting
# mean absolute error and root mean squared log error.
error_total = 0
squared_log_error_total = 0
size = 50
sample = test[:size]
for item in sample:
    frontier = Frontier(item)
    frontier.run()
    error_total += frontier.error()
    squared_log_error_total += frontier.squared_log_error()
    print(frontier)
# Average over the items actually scored: `test` may hold fewer than `size`
# items, and dividing by `size` would then understate both metrics.
scored = len(sample) or 1
error = error_total / scored
rmsle = math.sqrt(squared_log_error_total / scored)
print(f'Average error: {error:.2f}')
print(f'Root mean squared log error: {rmsle:.2f}')

In [None]:
# Training rows use the prompt that includes the price; test rows use the
# inference prompt. The numeric price is kept alongside in both cases.
train_prompts, train_prices = [], []
for example in train:
    train_prompts.append(example.train_prompt())
    train_prices.append(float(example.price))

test_prompts, test_prices = [], []
for example in test:
    test_prompts.append(example.inference_prompt())
    test_prices.append(float(example.price))

In [None]:
# Wrap the prompt and price lists as one Dataset per split
train_columns = {"text": train_prompts, "price": train_prices}
test_columns = {"text": test_prompts, "price": test_prices}
train_dataset = Dataset.from_dict(train_columns)
test_dataset = Dataset.from_dict(test_columns)

# Bundle both splits in a DatasetDict (this rebinds the name `dataset`)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# Repository id the dataset is uploaded to
DATASET_NAME = "ed-donner/appliances"
# Authenticate with the token held in the HF_TOKEN environment variable
login(token=os.environ['HF_TOKEN'])
# Upload both splits as a private repository
dataset.push_to_hub(DATASET_NAME, private=True)

## Next, head over to a Google Colab for fine-tuning in the cloud

Follow this link for the Colab: https://colab.research.google.com/drive/19E9hoAzWKvn9c9SHqM4Xan_Ph4wNewHS?usp=sharing


In [None]:
import matplotlib.pyplot as plt

# RMSLE recorded for each model, in display order
rmsle_by_model = {
    "Traditional": 2.02,
    "GPT-4o-mini": 1.15,
    "Llama 3.1 base": 1.60,
    "Llama 3.1 finetuned": 0.56,
}
models = list(rmsle_by_model)
rmsle = list(rmsle_by_model.values())

# Accuracy-like transform of RMSLE (computed here but not plotted below)
accuracy = [1 / (value + 1) for value in rmsle]

# Bar chart of the error per model
plt.figure(figsize=(10, 6))
plt.bar(models, rmsle, width=0.6, color="lightblue")
plt.xlabel('Models')
plt.ylabel('Error (RMSLE)')
plt.title('Model Error')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Average absolute price difference recorded for each model, in display order
error_by_model = {
    "Traditional ML": 67.57,
    "GPT-4o": 63.12,
    "Claude-3.5-Sonnet": 58.49,
    "Llama 3.1 base": 83.44,
    "Llama 3.1 finetuned": 23.95,
}
models = list(error_by_model)
errors = list(error_by_model.values())

# Bar chart of the average difference per model
plt.figure(figsize=(10, 6))
plt.bar(models, errors, width=0.6, color="lightblue")
plt.xlabel('Models')
plt.ylabel('Average Difference ($)')
plt.title('Average Price Difference between Prediction and Actual across 50 Appliances')
plt.show()