In [None]:
import sys
sys.path.append("../..")

import os
import pickle
import json
from openai import OpenAI
from items import Item
import tiktoken
from dotenv import load_dotenv
import math
import matplotlib.pyplot as plt
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import ast

In [None]:
# Load credentials from a local .env file. override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)

# Fail fast with a readable message when a credential is missing. Writing
# os.getenv(...) straight back into os.environ raises an opaque
# "TypeError: str expected, not NoneType" for an unset variable and is a
# no-op for a set one, so validate explicitly instead.
_missing_keys = [key for key in ("OPENAI_API_KEY", "HF_TOKEN") if not os.getenv(key)]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
        + ". Define them in your .env file or shell before running this notebook."
    )

# Guard flag read by the outlier-removal cell so the price filter is applied
# only once. Re-running this configuration cell resets it.
OUTLIER_EXECUTED = False
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"

# This is my fine-tuned model; you can use it or train your own
# (see the commented-out fine-tuning cell further down).
FINE_TUNED_MODEL = "ft:gpt-4o-mini-2024-07-18:quicksearch-plus::CV6dqS5l"

# ANSI escape sequences used to colour the per-item evaluation output.
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
COLOR_MAP = {"red": RED, "orange": YELLOW, "green": GREEN}

In [None]:
# Authenticate with the Hugging Face Hub using the token taken from the environment.
hf_token = os.environ["HF_TOKEN"]
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Load the "full" split of the Appliances product-metadata configuration.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Appliances",
    split="full",
    trust_remote_code=True,
)

In [None]:
# Keep only the metadata columns that the rest of the notebook works with.
selected_columns = ["main_category", "title", "description", "features", "details", "price"]
data = pd.DataFrame(dataset, columns=selected_columns)

In [None]:
# Stringify the text columns so placeholders can be matched by plain equality.
for column in ("title", "description", "features"):
    data[column] = data[column].apply(str)

# Turn placeholder values ("None", "" and "[]") into missing values so that a
# later dropna() removes those rows.
# Series.mask is used instead of Series.replace(placeholder, None): with
# replace, an explicit None was ignored before pandas 1.4 and the call
# forward-filled the previous row's value instead of blanking the cell.
for column, placeholder in (
    ("price", "None"),
    ("title", ""),
    ("description", "[]"),
    ("features", "[]"),
):
    data[column] = data[column].mask(data[column] == placeholder)


In [None]:
# Drop rows with any missing field, then convert price to a float column.
data = data.dropna().copy()

# errors="coerce" turns an unparsable price string into NaN instead of
# aborting the whole cell with ValueError; those rows are dropped right after.
# astype keeps the column float64 even if every parsed value is integral.
data["price"] = pd.to_numeric(data["price"], errors="coerce").astype("float64")
data = data.dropna(subset=["price"])

In [None]:
# Treat rows sharing the same title, description and price as one listing.
dedup_keys = ["title", "description", "price"]
data = data.drop_duplicates(subset=dedup_keys)


In [None]:
# Remove price outliers with the interquartile-range (IQR) rule:
# keep prices inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] that are also strictly positive.
# The filter overwrites `data`, so OUTLIER_EXECUTED makes sure it is applied
# only once even if this cell is run again.

q1, q3 = (data["price"].quantile(level) for level in (0.25, 0.75))
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
higher_bound = q3 + 1.5 * iqr

if OUTLIER_EXECUTED:
    print("Outlier already executed")
else:
    OUTLIER_EXECUTED = True
    within_fences = (data["price"] >= lower_bound) & (data["price"] <= higher_bound)
    positive_price = data["price"] > 0
    data = data[within_fences & positive_price]


In [None]:
#Further cleansing of the data (dealing with lists and dicts)
def clean_list_string(field):
    """Convert the string representation of a list into one clean string.

    Args:
        field: Value to clean, typically ``str(some_list)``.

    Returns:
        The list items joined with single spaces when ``field`` is a string
        that starts with ``[``, ends with ``]`` and parses as a Python
        literal; otherwise ``str(field)``.
    """
    looks_like_list = (
        isinstance(field, str) and field.startswith('[') and field.endswith(']')
    )
    if not looks_like_list:
        return str(field)

    try:
        parsed = ast.literal_eval(field)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a valid literal: keep the raw text. Only parse failures are
        # caught so KeyboardInterrupt/SystemExit still propagate.
        return str(field)

    return ' '.join(str(item) for item in parsed)

def clean_dict_string(field):
    """Convert the string representation of a dict into one clean string.

    Args:
        field: Value to clean, typically ``str(some_dict)``.

    Returns:
        ``"key: value"`` pairs joined with ``" | "`` when ``field`` is a
        string that starts with ``{``, ends with ``}`` and parses as a dict
        literal. A value that is itself a dict is flattened to
        ``"k: v, k: v"``. In every other case ``str(field)`` is returned.
    """
    looks_like_dict = (
        isinstance(field, str) and field.startswith('{') and field.endswith('}')
    )
    if not looks_like_dict:
        return str(field)

    try:
        parsed = ast.literal_eval(field)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a valid literal: keep the raw text. Only parse failures are
        # caught so KeyboardInterrupt/SystemExit still propagate.
        return str(field)

    if not isinstance(parsed, dict):
        # Braces also delimit set literals such as "{1, 2}"; leave those as-is.
        return str(field)

    parts = []
    for key, value in parsed.items():
        if isinstance(value, dict):
            value = ', '.join(f"{k}: {v}" for k, v in value.items())
        parts.append(f"{key}: {value}")
    return ' | '.join(parts)


# Flatten the stringified list/dict columns into plain text, one cleaner per column.
for column, cleaner in (
    ("description", clean_list_string),
    ("features", clean_list_string),
    ("details", clean_dict_string),
):
    data[column] = data[column].apply(cleaner)


In [None]:
# System message used both in the fine-tuning examples built by
# generate_message() and in the inference request sent by predictor().
SYSTEM_PROMPT = """
You are a price prediction expert. Given a product's title, description, features, or details, predict its price in USD.

Rules:
1. Analyze all available product information carefully
2. If information is incomplete or truncated, use your knowledge of similar products and market pricing to make informed predictions
3. Consider product quality indicators, brand reputation, features, and typical market values
4. Return ONLY the numeric price (e.g., "29.99") 
5. Do not include currency symbols, explanations, or additional text 
6. Return just the raw float number
"""

In [None]:
def truncate_by_tokens(text, max_tokens=300):
    """Limit ``text`` to at most ``max_tokens`` tokens.

    Tokens are counted with the tiktoken encoding for "gpt-4o-mini". Text
    that already fits is returned unchanged; longer text is cut to its first
    ``max_tokens`` tokens and decoded back to a string.
    """
    encoder = tiktoken.encoding_for_model("gpt-4o-mini")
    token_ids = encoder.encode(text)

    if len(token_ids) > max_tokens:
        return encoder.decode(token_ids[:max_tokens])
    return text

def generate_prompt(data):
    """
    Build the user prompt for one product from its title, description and
    features, truncated with truncate_by_tokens().
    """
    title, description, features = data['title'], data['description'], data['features']

    untruncated = f"""
    Below are the details of the product: 
    Title: {title}
    Description: {description}
    Features: {features}
    """
    return truncate_by_tokens(untruncated)

def generate_message(data):
    """
    Build one chat-format example: the system prompt, the row's prompt as the
    user turn, and the row's price (as a string) as the assistant turn.
    """
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", data["prompt"]),
        ("assistant", str(data['price'])),
    )
    return [{"role": role, "content": content} for role, content in turns]


In [None]:
# Build the (token-truncated) user prompt for every row.
data["prompt"] = data.apply(generate_prompt, axis=1)


In [None]:
# Draw a reproducible 200-row sample, then split it 80/20 into training and validation rows.
train_data = data.sample(n=200, random_state=42)
train_set = train_data.sample(frac=0.8, random_state=42)
# Validation rows are the sampled rows that did not land in the training set.
validation_set = train_data[~train_data.index.isin(train_set.index)]

In [None]:
# Write the training and validation splits as JSON Lines files: one
# {"messages": [...]} object per line, built by generate_message().
# A single loop replaces the two copy-pasted writers, and the explicit UTF-8
# encoding stops the files from depending on the platform's default encoding.
for jsonl_path, split in (
    ("training_data.jsonl", train_set),
    ("validation_data.jsonl", validation_set),
):
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for _, row in split.iterrows():
            f.write(json.dumps({"messages": generate_message(row)}) + "\n")


In [None]:
# Shared OpenAI API client; predictor() uses it for the chat completion requests.
client = OpenAI()

# Uncomment the following code to train your own model

# print("Uploading training file...")
# training_file = client.files.create(
#     file=open('training_data.jsonl', 'rb'),
#     purpose='fine-tune'
# )
# print(f"File uploaded: {training_file.id}")

# print("Uploading validation file...")
# validation_file = client.files.create(
#     file=open('validation_data.jsonl', 'rb'),
#     purpose='fine-tune'
# )
# print(f"Validation file uploaded: {validation_file.id}")

# print("Starting fine-tuning...")
# job = client.fine_tuning.jobs.create(
#     validation_file=validation_file.id,
#     training_file=training_file.id,
#     model='gpt-4o-mini-2024-07-18'
# )
# print(f"Job created: {job.id}")

# status = client.fine_tuning.jobs.retrieve(job.id)
# print(f"Status: {status.status}")

# import time
# while status.status not in ['succeeded', 'failed']:
#     time.sleep(60)
#     status = client.fine_tuning.jobs.retrieve(job.id)
#     print(f"Status: {status.status}")

# if status.status == 'succeeded':
#     print(f"Model ready: {status.fine_tuned_model}")
# else:
#     print(f"Training failed: {status.error}")

In [None]:
class PriceTester:
    """Evaluate a price predictor against ground-truth prices.

    Each tested row is passed to ``predictor``. Replies that parse as a
    finite number are scored with absolute error and squared log error,
    colour-coded, printed, and finally summarised by ``report()``. Replies
    that cannot be parsed are skipped and excluded from every metric.
    """

    def __init__(self, predictor, data, title="Price Prediction Model", size=None):
        """
        predictor: function that takes a row and returns predicted price
        data: pandas DataFrame with test data
        title: label used in the printed report and in the chart title
        size: number of leading rows to test; defaults to every row and is
              capped at len(data) so iloc never runs past the end
        """
        self.predictor = predictor
        self.data = data
        self.title = title
        self.size = min(size, len(data)) if size else len(data)
        self._reset_results()

    def _reset_results(self):
        """Clear the per-datapoint accumulators."""
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Classify a prediction as "green", "orange" or "red".

        Green: absolute error below 40 or relative error below 20%.
        Orange: absolute error below 80 or relative error below 40%.
        Red: everything else.
        """
        # A zero truth has no meaningful relative error; treat it as infinite
        # so only the absolute thresholds apply instead of dividing by zero.
        relative_error = error / truth if truth else float("inf")
        if error < 40 or relative_error < 0.2:
            return "green"
        if error < 80 or relative_error < 0.4:
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """Score the row at position ``i``; skip it if the reply is not a finite number."""
        row = self.data.iloc[i]
        predict = self.predictor(row)
        try:
            guess = float(predict)
            if not math.isfinite(guess):
                raise ValueError("non-finite prediction")
        except (ValueError, TypeError):
            # str() first: the reply may be None or another non-sliceable object.
            print(f"{YELLOW}{i+1}: Skipped - Non-numeric response: {str(predict)[:50]}...{RESET}")
            return

        truth = float(row['price'])
        error = abs(guess - truth)
        # math.log raises for arguments <= 0, so a negative guess is clamped
        # to 0 for the log term only; the absolute error keeps the raw guess.
        log_error = math.log(truth + 1) - math.log(max(guess, 0.0) + 1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        title = row['title'] if len(row['title']) <= 40 else row['title'][:40] + "..."

        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:.4f} Item: {title}{RESET}")

    def chart(self, title):
        """Create scatter plot of predictions vs truth"""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        # Diagonal reference line: points on it are perfect predictions.
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth Price ($)', fontsize=12)
        plt.ylabel('Predicted Price ($)', fontsize=12)
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title, fontsize=14)
        plt.show()

    def report(self):
        """Print the summary, draw the chart and return the metrics.

        Averages are taken over the datapoints that were actually scored, so
        skipped replies neither shrink the average error nor lower the hit
        rate. With no scored datapoints the three aggregate metrics are NaN
        and the chart is skipped.

        Returns:
            dict with 'average_error', 'rmsle', 'hit_rate' (percent), 'hits'
            and the raw 'guesses', 'truths', 'errors', 'sles', 'colors' lists.
        """
        count = len(self.errors)
        skipped = max(self.size - count, 0)
        hits = sum(1 for color in self.colors if color == "green")

        if count:
            average_error = sum(self.errors) / count
            rmsle = math.sqrt(sum(self.sles) / count)
            hit_rate = hits / count * 100
        else:
            average_error = rmsle = hit_rate = float("nan")

        # Print summary
        print(f"\n{'='*60}")
        print(f"FINAL REPORT: {self.title}")
        print(f"{'='*60}")
        print(f"Total Predictions: {count}")
        if skipped:
            print(f"Skipped (non-numeric responses): {skipped}")
        print(f"Average Error: ${average_error:,.2f}")
        print(f"RMSLE: {rmsle:.4f}")
        print(f"Hit Rate (Green): {hit_rate:.1f}% ({hits}/{count})")
        print(f"{'='*60}\n")

        # Create chart (needs at least one point to compute the axis limits)
        if count:
            chart_title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:.2f} Hits={hit_rate:.1f}%"
            self.chart(chart_title)
        else:
            print("No numeric predictions were collected; chart skipped.")

        # Return metrics
        return {
            'average_error': average_error,
            'rmsle': rmsle,
            'hit_rate': hit_rate,
            'hits': hits,
            'guesses': self.guesses,
            'truths': self.truths,
            'errors': self.errors,
            'sles': self.sles,
            'colors': self.colors
        }

    def run(self):
        """Run test on all datapoints"""
        print(f"Testing {self.size} predictions...\n")

        # Start from a clean slate so calling run() twice does not double-count.
        self._reset_results()
        for i in range(self.size):
            self.run_datapoint(i)

        return self.report()

    @classmethod
    def test(cls, predictor, data, title="Price Prediction Model", size=None):
        """Quick test method: build a tester, run it and return the metrics."""
        return cls(predictor, data, title, size).run()

In [None]:
def predictor(data):
    """Ask the fine-tuned model for the price of one product.

    Args:
        data: Row-like mapping with a "description" entry.

    Returns:
        The model's raw reply text. An empty string is returned when the
        description is empty or the model sends no text; PriceTester cannot
        parse it as a number and therefore skips the row.
    """
    user_prompt = data["description"]
    if not user_prompt or user_prompt.strip() == "":
        print("Warning: Empty prompt!")
        # Never fall back to data["price"]: that is the ground-truth label,
        # and returning it would score a perfect prediction for a row the
        # model never saw.
        return ""

    # NOTE(review): this prompt differs from the one generate_prompt() builds
    # for the fine-tuning examples (title/description/features) - confirm the
    # mismatch is intended.
    user_prompt = f"""
    Return the price of the product in USD.
    Return just the raw float number.

    Product Description: {user_prompt}
    Note: Numbers in this description show product specifications like:
    - Dimensions (size measurements)
    - Weight (ounces/pounds)
    - Rankings (popularity/sales rank)
    - Part/model numbers
    
    Price prediction:
    """

    response = client.chat.completions.create(
        # uncomment this line to use your own model
        # model=status.fine_tuned_model,
        model=FINE_TUNED_MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ]
    )

    # The content may be None; normalise to "" so callers always get a string.
    return response.choices[0].message.content or ""


#

In [None]:
# Build the evaluation set from test_lite.pkl: convert the pickled list of
# objects to a pandas DataFrame, then drop missing values and duplicates.

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load a test_lite.pkl that you created yourself or got from a trusted source.
with open('../../test_lite.pkl', 'rb') as file:
    test = pickle.load(file)

# The description is the third and fourth lines of each item's prompt
# (presumably the product text - confirm against the Item prompt layout).
test_set_in_obj_format = [
    {
        "description": " ".join(t.prompt.split("\n")[2:4]),
        "price": t.price,
        "title": t.title,
    }
    for t in test
]

test_set = pd.DataFrame(test_set_in_obj_format)

for column in ("title", "description"):
    test_set[column] = test_set[column].apply(str)

# Turn placeholder values ("None", "" and "[]") into missing values.
# Series.mask is used instead of Series.replace(placeholder, None): with
# replace, an explicit None was ignored before pandas 1.4 and the call
# forward-filled the previous row's value instead of blanking the cell.
for column, placeholder in (("price", "None"), ("title", ""), ("description", "[]")):
    test_set[column] = test_set[column].mask(test_set[column] == placeholder)

test_set = test_set.dropna()
test_set["price"] = test_set["price"].apply(float)

test_set = test_set.drop_duplicates(subset=["title", "description", "price"])

In [None]:
# Evaluate the fine-tuned model on the prepared test set; prints one line per
# item plus a final report, and returns the metrics dict.
result = PriceTester.test(predictor, test_set, title="GPT-4o-mini Fine-tuned")