In [None]:
# imports

import os
import re
import math
import json
import random
from dotenv import load_dotenv
from huggingface_hub import login
import matplotlib.pyplot as plt
import numpy as np
import pickle
from collections import Counter, defaultdict
from openai import OpenAI
import pandas as pd


In [None]:
# environment
# Loads credentials, logs in to HuggingFace, makes the project modules
# importable and creates the OpenAI client used by the rest of the notebook.

# Read variables from a .env file (override=True asks python-dotenv to let
# .env values win over variables already present in the environment).
load_dotenv(override=True)
# NOTE(review): when a variable is missing, the placeholder string below is
# exported as if it were a real credential, so the failure only surfaces later
# (at login() or on the first API call) instead of here.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')

# Log in to HuggingFace
hf_token = os.environ['HF_TOKEN']
# NOTE(review): add_to_git_credential=True presumably also stores the token in
# the local git credential helper - confirm this is wanted on shared machines.
login(hf_token, add_to_git_credential=True)

# Import custom classes
# These imports live here rather than in the imports cell because they only
# resolve after the path tweak on the next lines.
import sys
sys.path.append('../../')  # items.py / testing.py are expected two directories up
from items import Item
from testing import Tester

# Setup
# The client instance is bound to the name `openai`; later cells call
# openai.files, openai.fine_tuning and openai.chat on this object.
openai = OpenAI()
%matplotlib inline


In [None]:
# Configuration
# Tunable settings for the run; later cells read these module-level constants.

TRAINING_SIZE = 1000  # Balanced sets are pre-built below for 500, 1000 and 2000 examples
PROMPT_STRATEGY = 'expert'  # Must be a key of PROMPT_STRATEGIES: 'baseline', 'detailed', 'concise', 'range_aware', 'expert'
N_EPOCHS = 1  # Sent to the fine-tuning job as the n_epochs hyperparameter
VALIDATION_SPLIT = 0.1  # 10% for validation

# Echo the active settings so the cell output records what the run used.
print(f"""
Configuration:
Training Size: {TRAINING_SIZE} examples
Prompt Strategy: {PROMPT_STRATEGY}
Epochs: {N_EPOCHS}
Validation: {VALIDATION_SPLIT*100:.0f}%
""")


In [None]:
# Prompt Strategies
# Maps a strategy name to:
#   'system'      - the system message used for both the training examples and
#                   the inference calls of that strategy
#   'description' - a short human-readable label shown when listing strategies
# NOTE(review): 'baseline' asks for the bare price, while the training targets
# built by messages_for are always formatted as "Price is $XX.XX".

PROMPT_STRATEGIES = {
    'baseline': {
        'system': "You estimate prices of items. Reply only with the price, no explanation",
        'description': 'Original baseline prompt'
    },
    'detailed': {
        'system': """You are an expert price estimator for retail products. 
Analyze the product description carefully and estimate its market price in USD. 
Consider factors like brand, features, specifications, and category. 
Reply only with the price in format: Price is $XX.XX""",
        'description': 'Detailed instruction with context'
    },
    'concise': {
        'system': "Estimate product price from description. Return only: Price is $XX.XX",
        'description': 'Ultra-concise instruction'
    },
    'range_aware': {
        'system': """You estimate retail product prices (typically $1-999). 
Analyze the description and estimate the most likely market price. 
Reply only with: Price is $XX.XX""",
        'description': 'Includes price range context'
    },
    'expert': {
        'system': """You are a pricing analyst with expertise in consumer electronics, appliances, and retail products.
Based on product features, brand, and specifications, estimate the typical retail price.
Format: Price is $XX.XX""",
        'description': 'Expert persona with domain knowledge'
    }
}

# Fail fast on a misspelt strategy: otherwise the mistake only shows up later
# as a bare KeyError when the training messages are built.
if PROMPT_STRATEGY not in PROMPT_STRATEGIES:
    raise ValueError(
        f"Unknown PROMPT_STRATEGY {PROMPT_STRATEGY!r}; "
        f"choose one of {sorted(PROMPT_STRATEGIES)}"
    )

# List every strategy and point at the one selected in the configuration cell.
print("Available Prompt Strategies:\n")
for name, config in PROMPT_STRATEGIES.items():
    # "\U0001F449" is the pointing-hand emoji, written as an escape so the
    # marker cannot be corrupted by a wrong file encoding again.
    indicator = "\U0001F449 " if name == PROMPT_STRATEGY else "   "
    print(f"{indicator}{name.upper()}: {config['description']}")


## Step 1: Load and Analyze Data


In [None]:
# Load the pickle files
# NOTE: pickle.load can execute code embedded in the file - only load pickles
# that come from a trusted source.

with open('../../train.pkl', 'rb') as train_fh:
    train = pickle.load(train_fh)

with open('../../test.pkl', 'rb') as test_fh:
    test = pickle.load(test_fh)

for split_label, split_items in (("training", train), ("test", test)):
    print(f"Loaded {len(split_items):,} {split_label} items")

# Quick stats: mean and median price of each split
train_prices = [entry.price for entry in train]
test_prices = [entry.price for entry in test]

for stats_label, split_prices in (("\nTraining", train_prices), ("Test", test_prices)):
    print(f"{stats_label}: Mean=${np.mean(split_prices):.2f}, Median=${np.median(split_prices):.2f}")


In [None]:
# Visualize price distributions
# One histogram per split, drawn side by side with identical styling.

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

panels = (
    (axes[0], train_prices, f'Training Price Distribution (n={len(train):,})', 'skyblue'),
    (axes[1], test_prices, f'Test Price Distribution (n={len(test):,})', 'lightcoral'),
)

for ax, panel_prices, panel_title, fill_colour in panels:
    ax.hist(panel_prices, bins=50, color=fill_colour, edgecolor='black', alpha=0.7)
    ax.set_title(panel_title)
    ax.set_xlabel('Price ($)')
    ax.set_ylabel('Count')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()


## Step 2: Create Balanced Training Sets


In [None]:
# Helper functions

def categorize_price(price):
    """Return the label of the price band that `price` falls into.

    Bands exclude their upper bound: below 50, 50 up to 100, 100 up to 300,
    and '$300+' for everything that is not below 300.
    """
    # Bands are checked in ascending order; the first bound the price is below wins.
    for upper_bound, label in ((50, '$0-50'), (100, '$50-100'), (300, '$100-300')):
        if price < upper_bound:
            return label
    return '$300+'

def create_balanced_dataset(items, size, validation_split=0.1, seed=42):
    """Draw a price-balanced sample from `items` and split it into train/validation.

    Items are grouped by the band returned by categorize_price(item.price), and
    the same number of items (size // number of non-empty bands) is sampled
    from every band. A band holding fewer items than that contributes all of
    its items, so the result can be smaller than `size`; the shortfall is not
    redistributed to other bands.

    Args:
        items: iterable of objects exposing a numeric `.price` attribute.
        size: target total number of sampled items (before the split).
        validation_split: fraction of the sampled items returned as validation.
        seed: seed of the private random generator, for reproducible draws.

    Returns:
        (train_items, validation_items) as two lists. Both are empty when
        `items` is empty.
    """
    # A private generator produces the same draws as seeding the module-level
    # RNG with `seed`, without resetting the global `random` state for
    # everything else running in the notebook.
    rng = random.Random(seed)

    # Group by price range
    price_buckets = defaultdict(list)
    for item in items:
        price_buckets[categorize_price(item.price)].append(item)

    # Nothing to sample from: without this guard the per-bucket division
    # below would raise ZeroDivisionError.
    if not price_buckets:
        return [], []

    # Sample equally from each bucket
    items_per_bucket = size // len(price_buckets)
    selected_items = []
    for bucket_items in price_buckets.values():
        sample_size = min(items_per_bucket, len(bucket_items))
        selected_items.extend(rng.sample(bucket_items, sample_size))

    # Shuffle so the validation slice is not dominated by a single bucket
    rng.shuffle(selected_items)
    val_size = int(len(selected_items) * validation_split)

    return selected_items[val_size:], selected_items[:val_size]

def show_balance(dataset, name):
    """Print, under the heading `name`, the count and share of items per price band."""
    total = len(dataset)
    tally = Counter(categorize_price(entry.price) for entry in dataset)

    print(f"\n{name}:")
    for band in ('$0-50', '$50-100', '$100-300', '$300+'):
        in_band = tally[band]
        # An empty dataset reports 0% instead of dividing by zero.
        if total > 0:
            share = in_band / total * 100
        else:
            share = 0
        print(f"  {band}: {in_band:,} ({share:.1f}%)")


In [None]:
# Create different sized datasets

print("Creating balanced training sets...\n")

train_500, val_500 = create_balanced_dataset(train, 500, VALIDATION_SPLIT)
print(f"500 examples: {len(train_500)} train + {len(val_500)} val")

train_1000, val_1000 = create_balanced_dataset(train, 1000, VALIDATION_SPLIT)
print(f"1000 examples: {len(train_1000)} train + {len(val_1000)} val")

train_2000, val_2000 = create_balanced_dataset(train, 2000, VALIDATION_SPLIT)
print(f"2000 examples: {len(train_2000)} train + {len(val_2000)} val")

# Select based on configuration
prebuilt_sets = {
    500: (train_500, val_500),
    1000: (train_1000, val_1000),
    2000: (train_2000, val_2000),
}
if TRAINING_SIZE in prebuilt_sets:
    selected_train, selected_val = prebuilt_sets[TRAINING_SIZE]
else:
    # Build the requested size on demand. Silently falling back to the
    # 2000-example set would train on a different amount of data than the
    # configuration (and the label printed below) claims.
    selected_train, selected_val = create_balanced_dataset(train, TRAINING_SIZE, VALIDATION_SPLIT)
    print(f"{TRAINING_SIZE} examples: {len(selected_train)} train + {len(selected_val)} val")

# "\U0001F449" is the pointing-hand emoji, written as an escape so it survives
# any file-encoding mix-up.
print(f"\n\U0001F449 Using {len(selected_train)} train + {len(selected_val)} val")
show_balance(selected_train, f"{TRAINING_SIZE}-example Training Set")


## Step 3: Prepare JSONL Files for Fine-Tuning


In [None]:
# JSONL conversion functions

def messages_for(item, strategy='baseline'):
    """Build the system/user/assistant chat turns for one training example.

    The system turn comes from PROMPT_STRATEGIES[strategy]; the user turn is
    item.test_prompt() with the rounding hint and the trailing "Price is $"
    cue removed; the assistant turn states the item's price with two decimals.
    """
    system_turn = {"role": "system", "content": PROMPT_STRATEGIES[strategy]['system']}

    prompt_text = item.test_prompt()
    for fragment in (" to the nearest dollar", "\n\nPrice is $"):
        prompt_text = prompt_text.replace(fragment, "")
    user_turn = {"role": "user", "content": prompt_text}

    answer_turn = {"role": "assistant", "content": f"Price is ${item.price:.2f}"}
    return [system_turn, user_turn, answer_turn]

def make_jsonl(items, strategy='baseline'):
    """Serialise `items` as JSONL text: one {"messages": [...]} object per line.

    Lines are separated by a newline with no trailing newline; an empty
    `items` gives an empty string.
    """
    records = [json.dumps({"messages": messages_for(entry, strategy)}) for entry in items]
    return "\n".join(records)

def write_jsonl(items, filename, strategy='baseline'):
    """Save the JSONL rendering of `items` to `filename` and report the item count."""
    with open(filename, "w") as out_file:
        # The text is built after the file is opened, exactly one write call.
        jsonl_text = make_jsonl(items, strategy)
        out_file.write(jsonl_text)
    print(f"Written {len(items)} items to {filename}")

# Test: preview the first 200 characters of one formatted training example
print("Example message:")
example_preview = json.dumps(messages_for(selected_train[0], PROMPT_STRATEGY), indent=2)
print(f"{example_preview[:200]}...")


In [None]:
# Write JSONL files
# Training split first, then validation, both with the configured strategy.

print(f"Using prompt strategy: '{PROMPT_STRATEGY}'\n")
for split_items, jsonl_path in (
    (selected_train, "fine_tune_train.jsonl"),
    (selected_val, "fine_tune_validation.jsonl"),
):
    write_jsonl(split_items, jsonl_path, PROMPT_STRATEGY)


## Step 4: Upload Files to OpenAI


In [None]:
# Upload training file
# Sends the training JSONL written earlier to OpenAI; the id of the returned
# file object is what the fine-tuning job is given as its training_file.
# Re-running this cell uploads the file again.

with open("fine_tune_train.jsonl", "rb") as f:
    train_file = openai.files.create(file=f, purpose="fine-tune")

print(f"Training file uploaded: {train_file.id}")
# Bare expression: the notebook displays the file object as the cell output.
train_file


In [None]:
# Upload validation file
# Sends the validation JSONL written earlier to OpenAI; the id of the returned
# file object is what the fine-tuning job is given as its validation_file.
# Re-running this cell uploads the file again.

with open("fine_tune_validation.jsonl", "rb") as f:
    validation_file = openai.files.create(file=f, purpose="fine-tune")

print(f"Validation file uploaded: {validation_file.id}")
# Bare expression: the notebook displays the file object as the cell output.
validation_file


## Step 5: Create Fine-Tuning Job

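**Optional**: Set up Weights & Biases at https://wandb.ai to monitor training. Note that the job created below requests the W&B integration, so your W&B API key must first be added to your OpenAI organization settings; if you skip W&B, submit the job without the integration.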


In [None]:
# Create fine-tuning job
# NOTE: every run of this cell submits a new fine-tuning job.

# Single source of truth for the base model, so the log line and the API call
# cannot drift apart.
BASE_MODEL = "gpt-4o-mini-2024-07-18"

# Weights & Biases monitoring is optional: set to False to submit the job
# without the integration (e.g. when no W&B key is configured for the org).
USE_WANDB = True

wandb_integration = {"type": "wandb", "wandb": {"project": "product-pricer-improved"}}

print("Starting fine-tuning:")
print(f"  Model: {BASE_MODEL}")
print(f"  Training: {len(selected_train)} examples")
print(f"  Validation: {len(selected_val)} examples")
print(f"  Epochs: {N_EPOCHS}\n")

job_kwargs = {
    "training_file": train_file.id,
    "validation_file": validation_file.id,
    "model": BASE_MODEL,
    "seed": 42,
    "hyperparameters": {"n_epochs": N_EPOCHS},
    "suffix": "pricer-improved",
}
if USE_WANDB:
    # Only sent when enabled; the argument is omitted entirely otherwise.
    job_kwargs["integrations"] = [wandb_integration]

fine_tune_job = openai.fine_tuning.jobs.create(**job_kwargs)

job_id = fine_tune_job.id
print(f"Job created: {job_id}")
print(f"   Status: {fine_tune_job.status}")

# Save configuration so the job can be identified after a kernel restart
config = {
    "job_id": job_id,
    "training_size": TRAINING_SIZE,
    "prompt_strategy": PROMPT_STRATEGY,
    "n_epochs": N_EPOCHS,
    "base_model": BASE_MODEL
}

with open("training_config.json", "w") as f:
    json.dump(config, f, indent=2)
with open("job_id.txt", "w") as f:
    f.write(job_id)

print("\n Config saved to training_config.json")
# Bare expression: the notebook displays the job object as the cell output.
fine_tune_job


In [None]:
# Check job status (run this cell to monitor progress)

status = openai.fine_tuning.jobs.retrieve(job_id)
print(f"Job Status: {status.status}\n")

# Show recent events, printed in the reverse of the order the API returned them
events = openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=5)
print("Recent events:")
for event in reversed(events.data):
    print(f"  {event.message}")

# Bare expression: the notebook displays the job object as the cell output.
status


In [None]:
# Get fine-tuned model name and final status

job_status = openai.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_name = job_status.fine_tuned_model

if fine_tuned_model_name:
    print(f" Model ready: {fine_tuned_model_name}")
    if job_status.trained_tokens:
        print(f"   Trained tokens: {job_status.trained_tokens:,}")
elif job_status.status in ("failed", "cancelled"):
    # A job in a terminal failure state never produces a model; telling the
    # user to "run this cell again" would have them wait forever.
    print(f"Job ended without a model. Status: {job_status.status}")
    job_error = getattr(job_status, "error", None)
    error_message = getattr(job_error, "message", None)
    if error_message:
        print(f"   Error: {error_message}")
else:
    print(f"Still training... Status: {job_status.status}")
    print(" Run this cell again in a few minutes")


## Step 6: Evaluate the Fine-Tuned Model


In [None]:
# Prediction function

def get_price(s):
    """Extract the first number found in a model reply as a float.

    Dollar signs and thousands separators are removed before searching, so
    "Price is $1,299.99" yields 1299.99.

    Args:
        s: the reply text; may be None or empty (e.g. a reply with no content).

    Returns:
        The first number in `s` as a float, or 0.0 when `s` is None/empty or
        contains no digits.
    """
    # A reply can carry no text at all; treat that like "no price found"
    # instead of failing on None.replace.
    if not s:
        return 0.0
    cleaned = s.replace('$', '').replace(',', '')
    # Decimal numbers (optionally signed) are tried first, then bare integers.
    match = re.search(r"[-+]?\d*\.\d+|\d+", cleaned)
    return float(match.group()) if match else 0.0

def gpt_fine_tuned_improved(item):
    """Ask the fine-tuned model for the price of `item` and return it as a number.

    The system and user turns are built the same way as for the training
    examples (configured strategy prompt, item.test_prompt() with the rounding
    hint and the "Price is $" cue removed), followed by an assistant turn
    containing only "Price is $".

    Raises:
        RuntimeError: if no fine-tuned model name is available yet.
    """
    # Without a model name the API call would fail with an unrelated-looking
    # error; say what is actually wrong.
    if not fine_tuned_model_name:
        raise RuntimeError(
            "No fine-tuned model available yet; re-run the status cell until the job has succeeded."
        )

    system_message = PROMPT_STRATEGIES[PROMPT_STRATEGY]['system']
    user_prompt = item.test_prompt().replace(" to the nearest dollar", "").replace("\n\nPrice is $", "")

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "Price is $"}
    ]

    response = openai.chat.completions.create(
        model=fine_tuned_model_name,
        messages=messages,
        seed=42,
        max_tokens=10,
        temperature=0
    )

    # The reply content can be None; fall back to an empty string so parsing
    # reports "no price" rather than raising.
    reply_text = response.choices[0].message.content or ""
    return get_price(reply_text)

# Test on one item: compare the model's estimate with the known price
sample = test[0]
pred = gpt_fine_tuned_improved(sample)
report_lines = (
    f"Product: {sample.title[:60]}...",
    f"Predicted: ${pred:.2f}",
    f"Actual: ${sample.price:.2f}",
    f"Error: ${abs(pred - sample.price):.2f}",
)
for report_line in report_lines:
    print(report_line)


In [None]:
# Run full evaluation with Tester
# NOTE(review): the original note says 250 test items - presumably Tester's
# default sample size; confirm in testing.py.

print(
    "Running comprehensive evaluation...\n",
    "This will take a few minutes.\n",
    "=" * 80 + "\n",
    sep="\n",
)

Tester.test(gpt_fine_tuned_improved, test)
