# Getting Started

Before running this notebook, please ensure you have the following:

1.  **Local Modules:** Upload the necessary local Python files (`items.py`, `loaders.py`, `testing.py`) to the Colab runtime's temporary storage. You can do this by clicking the folder icon on the left sidebar, then the upload icon, and selecting the files.
2.  **Hugging Face Access Token:** Add your Hugging Face access token to Colab's user data secrets. Click the key icon on the left sidebar, click "New secret", and add your token with the name `HF_TOKEN`.
3.  **Install Dependencies:** Run the first code cell to install the required libraries with the specified versions.

Once these steps are completed, you can run the rest of the notebook cells sequentially.

In [None]:
# Upgrade pip itself before installing the pinned packages below
!pip install --upgrade pip

# Install pinned versions of the required libraries to match the course's environment.
# NOTE(review): each package is installed by its own pip invocation, so the pins are
# resolved independently of one another — confirm the resulting set is consistent.
# NOTE(review): psutil is imported by a later cell but is not installed here;
# presumably the Colab image already provides it — confirm.
!pip install datasets==3.6.0
!pip install transformers==4.51.3
!pip install huggingface_hub==0.31.2
!pip install matplotlib==3.10.3
!pip install numpy==1.26.4
!pip install python-dotenv==1.1.0
!pip install tqdm==4.67.1

In [None]:
# Import necessary libraries
import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import pickle

In [None]:
# Retrieve the Hugging Face access token from Colab's user data secrets and use it
# to log in to the Hugging Face Hub.
# The token is bound to a variable instead of being left as the cell's last
# expression, so the notebook does not echo the secret into the cell output.
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

In [None]:
# Import custom classes from local files (items.py and loaders.py)
# These files were manually added to the Colab runtime's temporary storage
from loaders import ItemLoader
from items import Item

In [None]:
# Set the backend for matplotlib to display plots inline in the notebook
%matplotlib inline

In [None]:
# Load a single dataset ("Appliances") using the custom ItemLoader
# NOTE(review): this looks like an initial test or example loading step — `items` is
# rebuilt from an empty list by the multi-dataset loading cell further down
items = ItemLoader("Appliances").load()

In [None]:
# Define a list of dataset names (Amazon product categories) to be loaded
dataset_names = [
    "Automotive",
    "Electronics",
    "Office_Products",
    "Tools_and_Home_Improvement",
    "Cell_Phones_and_Accessories",
    "Toys_and_Games",
    "Appliances",
    "Musical_Instruments",
]

In [None]:
# Report the CPU core count and the currently available RAM (in GiB) of the runtime,
# as a guide to how much parallelism and memory the data processing can use
import psutil
core_count = psutil.cpu_count()
available_gib = psutil.virtual_memory().available / (1024**3)
print(f"CPU cores: {core_count}")
print(f"Available RAM: {available_gib:.1f} GB")

In [None]:
# Load every category listed in `dataset_names` (8 workers each) and pool the results.
# `items` is reset first, discarding whatever an earlier cell put there.
items = []
for category_name in dataset_names:
    items.extend(ItemLoader(category_name).load(workers=8))

# This takes a while — a good moment for a coffee break!
# The biggest datasets are listed first, so loading gets faster as it goes.

In [None]:
# Print the total number of items loaded from all datasets
print(f"A grand total of {len(items):,} items")

In [None]:
# Collect the token count of every loaded item, plus the stats shown in the title
tokens = [entry.token_count for entry in items]
mean_tokens = sum(tokens) / len(tokens)
top_tokens = max(tokens)

# Histogram of the token counts, with bin edges every 10 tokens from 0 to 290
plt.figure(figsize=(15, 6))
plt.hist(tokens, rwidth=0.7, color="skyblue", bins=range(0, 300, 10))
plt.title(f"Token counts: Avg {mean_tokens:,.1f} and highest {top_tokens:,}\n")
plt.xlabel('Length (tokens)')
plt.ylabel('Count')
plt.show()

In [None]:
# Collect the price of every loaded item, plus the stats shown in the title
prices = [entry.price for entry in items]
mean_price = sum(prices) / len(prices)
top_price = max(prices)

# Histogram of the prices, with bin edges every $10 from 0 to 990
plt.figure(figsize=(15, 6))
plt.hist(prices, rwidth=0.7, color="blueviolet", bins=range(0, 1000, 10))
plt.title(f"Prices: Avg {mean_price:,.1f} and highest {top_price:,}\n")
plt.xlabel('Price ($)')
plt.ylabel('Count')
plt.show()

In [None]:
# Tally how many loaded items belong to each category
category_counts = Counter(entry.category for entry in items)

# Category labels and their counts, in matching order, for plotting
categories = category_counts.keys()
counts = list(category_counts.values())

# Bar chart with one bar per category
plt.figure(figsize=(15, 6))
plt.bar(categories, counts, color="goldenrod")
plt.title('How many in each category')
plt.xlabel('Categories')
plt.ylabel('Count')

# Tilt the x-axis labels for readability
plt.xticks(rotation=30, ha='right')

# Write each bar's count just above the bar
for position, count in enumerate(counts):
    plt.text(position, count, f"{count:,}", ha='center', va='bottom')

plt.show()

In [None]:
# Group the items into price buckets for sampling: each key of `slots` is a price
# passed through round(), and its value is the list of items with that rounded price
slots = defaultdict(list)
for product in items:
    price_bucket = round(product.price)
    slots[price_bucket].append(product)

In [None]:
# Build a curated sample with a more even price distribution and less bias towards
# the 'Automotive' category. For each rounded price from 1 to 999:
#   - buckets priced at $240 or more, or holding at most 1200 items, are taken whole;
#   - larger buckets are cut down to 1200 items by a weighted random draw without
#     replacement, in which non-Automotive items weigh 5 times as much as Automotive.

# Seed both random number generators for reproducibility
np.random.seed(42)
random.seed(42)

sample = []
for price in range(1, 1000):
    slot = slots[price]
    if price >= 240 or len(slot) <= 1200:
        sample.extend(slot)
        continue

    # Weight 1 for 'Automotive' and 5 for any other category, normalized to sum to 1
    raw_weights = np.array([1 if entry.category == 'Automotive' else 5 for entry in slot])
    probabilities = raw_weights / np.sum(raw_weights)

    # Draw 1200 distinct positions within the bucket according to those probabilities
    chosen_positions = np.random.choice(len(slot), size=1200, replace=False, p=probabilities)
    sample.extend(slot[position] for position in chosen_positions)

# Report the size of the curated sample
print(f"There are {len(sample):,} items in the sample")

In [None]:
# Collect the prices of the curated sample as floats, plus the stats shown in the title
prices = [float(entry.price) for entry in sample]
mean_price = sum(prices) / len(prices)
top_price = max(prices)

# Histogram of the sample's prices (bin edges every $10 from 0 to 990), to show
# how the sampling step changed the price distribution
plt.figure(figsize=(15, 10))
plt.hist(prices, rwidth=0.7, color="darkblue", bins=range(0, 1000, 10))
plt.title(f"Avg {mean_price:.2f} and highest {top_price:,.2f}\n")
plt.xlabel('Price ($)')
plt.ylabel('Count')
plt.show()

In [None]:
# Tally how many items of the curated sample belong to each category
category_counts = Counter(entry.category for entry in sample)

# Category labels and their counts, in matching order, for plotting
categories = category_counts.keys()
counts = list(category_counts.values())

# Bar chart with one bar per category, to show how the weighted sampling
# changed the category distribution
plt.figure(figsize=(15, 6))
plt.bar(categories, counts, color="lightgreen")
plt.title('How many in each category')
plt.xlabel('Categories')
plt.ylabel('Count')

# Tilt the x-axis labels for readability
plt.xticks(rotation=30, ha='right')

# Write each bar's count just above the bar
for position, count in enumerate(counts):
    plt.text(position, count, f"{count:,}", ha='center', va='bottom')

plt.show()

In [None]:
# Pie chart of each category's percentage share, drawn from the `counts` and
# `categories` already in the notebook's namespace
plt.figure(figsize=(12, 10))
plt.pie(counts, labels=categories, autopct='%1.0f%%', startangle=90)

# A white disc drawn over the centre turns the pie into a donut
donut_hole = plt.Circle((0, 0), 0.70, fc='white')
plt.gca().add_artist(donut_hole)
plt.title('Categories')

# An equal aspect ratio keeps the pie circular
plt.axis('equal')
plt.show()

In [None]:
# Markdown cell indicates that the dataset curation is complete and ready for final checks
# Dataset Curated!

# We've crafted an excellent dataset.

# Let's do some final checks

In [None]:
# Prompt length in characters and price for every item of the curated sample
sizes = [len(entry.prompt) for entry in sample]
prices = [entry.price for entry in sample]

# Scatter plot of price against prompt size, to check by eye for a simple correlation
plt.figure(figsize=(15, 8))
plt.scatter(sizes, prices, s=0.2, color="red")
plt.title('Is there a simple correlation?')
plt.xlabel('Size')
plt.ylabel('Price')
plt.show()

In [None]:
# Helper for inspecting how an item's prompt is tokenized
def report(item):
    """Print an item's prompt, the ids of its last 10 tokens, and those tokens decoded.

    The prompt is encoded with ``Item.tokenizer``; only the final 10 token ids are
    printed, first as ids and then as the strings ``batch_decode`` returns for them.
    """
    prompt = item.prompt
    tail_ids = Item.tokenizer.encode(prompt)[-10:]
    print(prompt)
    print(tail_ids)
    print(Item.tokenizer.batch_decode(tail_ids))

In [None]:
# Use the report function to display information about a specific item in the sample
# This helps inspect the data and the tokenizer's behavior
# NOTE(review): the hard-coded index requires the sample to hold more than 398,000
# items; a smaller sample raises IndexError here
report(sample[398000])

## Observation

An interesting thing about the Llama tokenizer is that every number from 1 to 999 gets mapped to 1 token, much as we saw with gpt-4o. The same is not true of qwen2, gemma and phi3, which all map individual digits to tokens. This does turn out to be a bit useful for our project, although it's not an essential requirement.

# Finally

It's time to break our data down into a training dataset and a test dataset.

It's typical to use 5%-10% of your data for testing purposes, but actually we have far more than we need at this point. We'll take 400,000 points for training, and we'll reserve 2,000 for testing, although we won't use all of them.


In [None]:
# Shuffle a copy of the curated sample with a fixed seed, then split it.
# A copy is shuffled (rather than `sample` itself) so that this cell is idempotent:
# shuffling in place would reorder the already-shuffled `sample` on every re-run,
# and the same seed would then produce a different train/test split each time.
random.seed(42)
shuffled = list(sample)
random.shuffle(shuffled)
# The first 400,000 items form the training set and the next 2,000 the test set
train = shuffled[:400_000]
test = shuffled[400_000:402_000]
# Print the sizes of the training and testing sets
print(f"Divided into a training set of {len(train):,} items and test set of {len(test):,} items")

In [None]:
# Collect the prices of the first 250 test items as floats, plus the title stats
prices = [float(entry.price) for entry in test[:250]]
mean_price = sum(prices) / len(prices)
top_price = max(prices)

# Histogram of those prices (bin edges every $10 from 0 to 990), for a quick look
# at the price distribution in a small portion of the test data
plt.figure(figsize=(15, 6))
plt.hist(prices, rwidth=0.7, color="darkblue", bins=range(0, 1000, 10))
plt.title(f"Avg {mean_price:.2f} and highest {top_price:,.2f}\n")
plt.xlabel('Price ($)')
plt.ylabel('Count')
plt.show()

In [None]:
# Training columns: each training item's `prompt` attribute and its price
train_prompts = [example.prompt for example in train]
train_prices = [example.price for example in train]
# Test columns: the result of each test item's test_prompt() method and its price
test_prompts = [example.test_prompt() for example in test]
test_prices = [example.price for example in test]

In [None]:
# Wrap the training and test columns in Hugging Face Dataset objects,
# each with a "text" column and a "price" column
train_dataset = Dataset.from_dict({"text": train_prompts, "price": train_prices})
test_dataset = Dataset.from_dict({"text": test_prompts, "price": test_prices})
# Bundle both splits into a single DatasetDict keyed by split name
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# Push the created DatasetDict to the Hugging Face Hub
# Replace "aaron-official" with your Hugging Face username
# The dataset will be named "your-username/pricer-data" and will be private
# HF_USER = "aaron-official" # Uncomment and replace with your HF username
# DATASET_NAME = f"{HF_USER}/pricer-data" # Uncomment
# dataset.push_to_hub(DATASET_NAME, private=True) # Uncomment to push to hub

In [None]:
# Serialize the training and test item lists to train.pkl and test.pkl in the
# current working directory, so the processed data can be reloaded quickly later
for filename, split_items in (('train.pkl', train), ('test.pkl', test)):
    with open(filename, 'wb') as file:
        pickle.dump(split_items, file)

In [None]:
# Mount Google Drive to access files in your Drive
from google.colab import drive
drive.mount('/content/drive')

Once your Google Drive is mounted, you can copy the pickled files to your Drive. The cells below save them to the top level of `My Drive`; to save them in a different folder, edit `destination_path` in each cell.

In [None]:
# shutil provides the file-copy operation used below
import shutil

# Source: the pickled training data in the Colab runtime.
# Destination: the top level of the mounted Google Drive — edit this path to save
# the file into a different Drive folder.
source_path = '/content/train.pkl'
destination_path = '/content/drive/My Drive/train.pkl'

# Copy the file's contents to Google Drive, then confirm
shutil.copyfile(source_path, destination_path)
print(f"Copied {source_path} to {destination_path}")

In [None]:
# shutil provides the file-copy operation used below
import shutil

# Source: the pickled test data in the Colab runtime.
# Destination: the top level of the mounted Google Drive — edit this path to save
# the file into a different Drive folder.
source_path = '/content/test.pkl'
destination_path = '/content/drive/My Drive/test.pkl'

# Copy the file's contents to Google Drive, then confirm
shutil.copyfile(source_path, destination_path)
print(f"Copied {source_path} to {destination_path}")