<a href="https://colab.research.google.com/github/dkisselev-zz/llm_engineering/blob/wk7/Week_7_Excersise_fine_tuned_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict Product Prices

Model evaluation and inference tuning



## Libraries and configuration

In [None]:
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q --upgrade requests==2.32.3 bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0 peft==0.14.0 trl==0.14.0 matplotlib wandb

In [None]:
import os
import re
import math
import numpy as np
from google.colab import userdata
from huggingface_hub import login
import wandb
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed
from datasets import load_dataset
from peft import PeftModel
import matplotlib.pyplot as plt

In [None]:
# Models

# Where the fine-tuned adapter is loaded from: "HF" (Hugging Face Hub) or "WB" (Weights & Biases).
# Any value other than "HF" takes the W&B branch below; the download cell later checks for "WB" explicitly.
ARTIFCAT_LOCATTION="HF"

# Base checkpoint that the adapter is applied to
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"

PROJECT_NAME = "pricer"

# Timestamp identifying the fine-tuning run; alternatives are kept commented out for quick switching
# RUN_NAME = "2025-10-23_23.41.24" # - Fine tuned 16 batches / 8 bit run
# RUN_NAME = "2025-10-25_05.02.00" # - Fine tuned 4 batches / 4 bit / LoRA 64/128 / Gradient 8
RUN_NAME = "2024-09-13_13.04.39" # Ed's model run

# Hugging Face
HF_USER = "dkisselev"

if ARTIFCAT_LOCATTION=="HF":
  PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
  # Revision passed as `revision=` when the adapter is loaded; a falsy value loads without a pinned revision
  # REVISION = None
  REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"


  # FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"

  # Ed's model
  FINETUNED_MODEL = f"ed-donner/{PROJECT_RUN_NAME}"
else:
  # Weights and Biases
  # NOTE: MODEL_ARTIFACT_NAME, REVISION_TAG and WANDB_ARTIFACT_PATH exist only on this branch;
  # later cells reference them only when ARTIFCAT_LOCATTION is "WB".
  WANDB_ENTITY = "dkisselev"
  os.environ["WANDB_API_KEY"]=userdata.get('WANDB_API_KEY')

  MODEL_ARTIFACT_NAME = f"model-{RUN_NAME}"
  REVISION_TAG="v22"
  WANDB_ARTIFACT_PATH = f"{WANDB_ENTITY}/{PROJECT_NAME}/{MODEL_ARTIFACT_NAME}:{REVISION_TAG}"

# Data set

# DATASET_NAME = f"{HF_USER}/pricer-data2"
DATASET_NAME = "ed-donner/pricer-data"

# Inference settings
# QUANT_4_BIT: True selects the 4-bit quantization config, False the 8-bit one
QUANT_4_BIT = True
# K_SEARCH_LIMIT: how many of the most probable next tokens are inspected per prompt,
# and the largest k evaluated by Search_K
K_SEARCH_LIMIT = 900

# Used for writing to output in color (ANSI escape sequences)
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
BLUE = "\033[94m"
RESET = "\033[0m"
# Maps the colour names returned by color_for() to escape codes.
# Note that "orange" is rendered with BLUE; YELLOW is defined above but not used in this map.
COLOR_MAP = {"red":RED, "orange": BLUE, "green": GREEN}

### Load Data

The data is loaded from Hugging Face.


In [None]:
# Log in to Hugging Face
# The token is read from the Colab `userdata` store under the key 'HF_TOKEN',
# so it never appears in the notebook itself.
hf_token = userdata.get('HF_TOKEN')
login(hf_token)

In [None]:
# Fetch the dataset and expose its two splits under the names later cells use
dataset = load_dataset(DATASET_NAME)
train, test = dataset['train'], dataset['test']

## Load Tokenizer and Model

In [None]:
# 4-bit or 8-bit quantization, selected by QUANT_4_BIT
quant_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    )
    if QUANT_4_BIT
    else BitsAndBytesConfig(load_in_8bit=True)
)

In [None]:
# Download the model artifact from Weights & Biases (only when ARTIFCAT_LOCATTION is "WB").
# `artifact_dir` is defined only inside this branch; the model-loading cell reads it
# in its non-"HF" branch.
if ARTIFCAT_LOCATTION=="WB":
  artifact = wandb.Api().artifact(WANDB_ARTIFACT_PATH, type='model')
  artifact_dir = artifact.download() # Downloads to a local cache dir

In [None]:
# Load the Tokenizer and the Model

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to run;
# presumably not needed for this base model - confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token, padding on the right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantized base model, using the quant_config built in the earlier cell
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Keep the generation config's pad token in line with the tokenizer's
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Wrap the base model with the fine-tuned PEFT adapter from the configured location
if ARTIFCAT_LOCATTION=="HF":
  # Load the fine-tuned model with PEFT
  if REVISION:
    # Pin the adapter to the configured Hub revision
    fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)
  else:
    fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)
else:
  # Model at W&B: load from the directory downloaded by the W&B artifact cell
  fine_tuned_model = PeftModel.from_pretrained(base_model, artifact_dir)

# get_memory_footprint() is divided by 1e6 to report megabytes
print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")

## Hyperparameter helpers

In [None]:
def calculate_weighted_price(prices, probabilities):
    """
    Return the average of `prices` weighted by `probabilities`.

    The weights do not need to sum to one; the average is normalized by their total.

    Args:
        prices (list or np.array): Candidate prices.
        probabilities (list or np.array): One weight per price.
    Returns:
        float: The weighted average. When the weights sum to zero, the plain mean of
        the prices is returned instead, or 0.0 if there are no prices at all.
    """
    values = np.array(prices)
    weights = np.array(probabilities)

    # Normal case: the weights carry some mass, so a weighted average is well defined
    if np.sum(weights) != 0:
        return np.average(values, weights=weights)

    # Zero total weight: fall back to an unweighted mean, or 0.0 for empty input
    return np.mean(values) if len(values) > 0 else 0.0

In [None]:
def get_top_k_predictions(prompt, device="cuda"):
    """
    Gets the top K price/probability pairs from the model.

    Runs a single forward pass of `fine_tuned_model` over `prompt`, applies softmax
    to the logits at the last position, and inspects the K_SEARCH_LIMIT most
    probable next tokens. A token is kept only if its decoded text parses as a
    finite number greater than zero.

    Args:
        prompt (str): Prompt text to encode and feed to the model.
        device (str): Device the encoded prompt and attention mask are moved to.

    Returns:
        (list, list): A tuple containing (prices, probabilities). Both lists have
        the same length, follow the order returned by `topk`, and are empty when no
        inspected token is a valid price.
    """
    set_seed(42)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    attention_mask = torch.ones(inputs.shape, device=device)

    with torch.no_grad():
        outputs = fine_tuned_model(inputs, attention_mask=attention_mask)
        next_token_logits = outputs.logits[:, -1, :].to('cpu')

    next_token_probs = F.softmax(next_token_logits, dim=-1)

    # topk raises if k exceeds the size of the last dimension, so clamp it
    k = min(K_SEARCH_LIMIT, next_token_probs.shape[-1])
    top_prob, top_token_id = next_token_probs.topk(k)

    prices = []
    probabilities = []

    # Convert to plain Python values once instead of calling .item() per element
    for token_id, probability in zip(top_token_id[0].tolist(), top_prob[0].tolist()):
        price = _parse_price_token(tokenizer.decode(token_id))
        if price is not None:
            prices.append(price)
            probabilities.append(probability)

    return prices, probabilities


def _parse_price_token(token_text):
    """Return `token_text` as a finite float greater than zero, or None if it is not one."""
    try:
        price = float(token_text)
    except ValueError:
        return None
    # float() also accepts text such as "inf" or "Infinity"; an infinite "price"
    # would make every weighted average infinite, so only finite values pass.
    if math.isfinite(price) and price > 0:
        return price
    return None

In [None]:
def make_prompt(text):
  """
  Build the prompt sent to the model from a dataset item's text.

  When the artifact location is "HF" the text is returned unchanged. Otherwise the
  text is split on newlines and rebuilt as: line 2 (used as the title), line 3 with
  all digits removed (used as the description), a "Question: " line holding the
  reworded first line, a blank line, and line 5 (used as the price line).
  """
  if ARTIFCAT_LOCATTION=="HF":
      return text

  lines = text.split("\n")
  question = lines[0].replace(
      "How much does this cost to the nearest dollar?",
      "What is the price of this item?",
  )
  title = lines[2]
  description = re.sub(r'\d', '', lines[3])
  price_line = lines[5]

  # The empty element produces the blank line between the question and the price line
  return "\n".join([title, description, "Question: " + question, "", price_line])

In [None]:
%matplotlib inline

class Tester:
    """
    Evaluate a price predictor against ground truth.

    Runs `predictor` on the first `size` items of `data`, prints one colour-coded
    line per item, and finally draws a scatter chart of estimates against true
    prices with the average error, RMSLE and hit rate in the title.

    Args:
        predictor: Callable taking a prompt string and returning a numeric price.
        data: Indexable collection of items with "text" and "price" entries.
        title: Chart title prefix; defaults to the predictor's name in title case.
        size: Number of items, starting at index 0, to evaluate.
    """

    def __init__(self, predictor, data, title=None, size=250):
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """
        Classify an error as "green", "orange" or "red".

        Green: error under 40 or under 20% of the truth. Orange: error under 80 or
        under 40% of the truth. Red otherwise.
        """
        # A relative error is undefined for a zero truth; treat it as infinite so
        # only the absolute thresholds apply instead of raising ZeroDivisionError.
        relative_error = error / truth if truth else math.inf
        if error < 40 or relative_error < 0.2:
            return "green"
        elif error < 80 or relative_error < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Predict item `i`, record its error metrics, and print a summary line."""
        datapoint = self.data[i]

        base_prompt = datapoint["text"]
        prompt = make_prompt(base_prompt)

        guess = self.predictor(prompt)

        truth = datapoint["price"]
        error = abs(guess - truth)
        # Squared log error, later aggregated into RMSLE
        log_error = math.log(truth+1) - math.log(guess+1)
        sle = log_error ** 2
        color = self.color_for(error, truth)

        # The item title is taken from the second blank-line-separated section;
        # fall back to the first section when the text has no such separator.
        sections = datapoint["text"].split("\n\n")
        title = (sections[1] if len(sections) > 1 else sections[0])[:20] + "..."

        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter the estimates against the true prices, with a y = x reference line."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Summarise the recorded datapoints and draw the chart."""
        # Average over what was actually recorded rather than the configured size,
        # so a partial run is not under-reported and an empty one does not crash.
        count = len(self.errors)
        if count == 0:
            print("No datapoints recorded, nothing to report.")
            return
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        hits = sum(1 for color in self.colors if color=="green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/count*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate items 0..size-1 in order, then report."""
        # Legacy attribute: set here but not read anywhere in this class
        self.error = 0
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Convenience entry point: build a Tester with default settings and run it."""
        cls(function, data).run()

In [None]:
class Search_K:
    """
    Search for the optimal 'k' value.

    For each of the first `size` items of `data`, the predictor returns candidate
    prices with their probabilities. For every k from 1 to `max_k` the
    probability-weighted price over the first k candidates is compared with the
    true price. The k with the lowest average absolute error is reported, together
    with plots of error versus k and of the spread of the probabilities.

    Args:
        predictor: Callable taking a prompt and returning (prices, probabilities).
        data: Indexable collection of items with "text" and "price" entries.
        title: Label used in the report; defaults to the predictor's name in title case.
        size: Number of items, starting at index 0, to evaluate.
    """
    def __init__(self, predictor, data, title=None, size=250):
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self.truths = []

        # One row per datapoint; each row holds the error for k = 1..max_k
        self.all_k_errors = []
        self.max_k = K_SEARCH_LIMIT

        # Store the list of probabilities for each inference
        self.all_prob_lists = []
        # Store the standard deviation of probs for each inference
        self.prob_std_devs = []

    def color_for(self, error, truth):
        """
        Classify an error as "green", "orange" or "red".

        Green: error under 40 or under 20% of the truth. Orange: error under 80 or
        under 40% of the truth. Red otherwise.
        """
        # A relative error is undefined for a zero truth; treat it as infinite so
        # only the absolute thresholds apply instead of raising ZeroDivisionError.
        relative_error = error / truth if truth else math.inf
        if error < 40 or relative_error < 0.2:
            return "green"
        elif error < 80 or relative_error < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Evaluate item `i` for every k and record its errors and probability spread."""
        datapoint = self.data[i]
        base_prompt = datapoint["text"]
        prompt = make_prompt(base_prompt)
        truth = datapoint["price"]
        self.truths.append(truth)

        # Get the raw lists of prices and probabilities
        prices, probabilities = self.predictor(prompt)

        self.all_prob_lists.append(probabilities)

        if probabilities:
            # Calculate and store the spread (std dev) of this prob list
            self.prob_std_devs.append(np.std(probabilities))
        else:
            # No probabilities, append 0 for spread
            self.prob_std_devs.append(0.0)

        errors_for_this_datapoint = []

        if not prices:
            # With no candidates the guess is taken to be 0 for every k
            print(f"{i+1}: No valid prices found. Truth: ${truth:,.2f}.")
            error = np.abs(0 - truth)
            errors_for_this_datapoint = [error] * self.max_k
            self.all_k_errors.append(errors_for_this_datapoint)
            return

        # Iterate from k=1 up to max_k. When fewer than k candidates exist the
        # slices simply contain all of them, so the error stays constant from there.
        for k in range(1, self.max_k + 1):
            k_prices = prices[:k]
            k_probabilities = probabilities[:k]

            # Calculate the weighted price just for this k
            guess = calculate_weighted_price(k_prices, k_probabilities)

            # Calculate and store the error for this k
            error = np.abs(guess - truth)
            errors_for_this_datapoint.append(error)

        # Store the list of errors (for k=1 to max_k)
        self.all_k_errors.append(errors_for_this_datapoint)

        # Print a summary for this datapoint. The item title is taken from the second
        # blank-line-separated section, falling back to the first if there is none.
        sections = datapoint["text"].split("\n\n")
        title = (sections[1] if len(sections) > 1 else sections[0])[:20] + "..."

        # Sample the errors at k=1, a mid point and k=max_k. The mid point is k=20,
        # clamped so that a max_k below 20 does not index past the end of the list.
        mid_k = min(20, self.max_k)
        k_1_err = errors_for_this_datapoint[0]
        k_mid_err = errors_for_this_datapoint[mid_k - 1]
        k_max_err = errors_for_this_datapoint[-1]

        color = self.color_for(k_1_err, truth)
        print(f"{COLOR_MAP[color]}{i+1}: Truth: ${truth:,.2f}. "
              f"Errors (k=1, k={mid_k}, k={self.max_k}): "
              f"(${k_1_err:,.2f}, ${k_mid_err:,.2f}, ${k_max_err:,.2f}) "
              f"Item: {title}{RESET}")

    def plot_k_vs_error(self, k_values, avg_errors_by_k, best_k, min_error):
        """
        Plots the Average Error vs. k and saves the figure to k_vs_error_plot.png.
        """
        plt.figure(figsize=(12, 8))
        plt.plot(k_values, avg_errors_by_k, label='Average Error vs. k')

        # Highlight the best k
        plt.axvline(x=best_k, color='red', linestyle='--',
                    label=f'Best k = {best_k} (Avg Error: ${min_error:,.2f})')

        plt.xlabel('Number of Top Probabilities/Prices (k)')
        plt.ylabel('Average Absolute Error ($)')
        plt.title(f'Optimal k Analysis for {self.title}')
        plt.legend()
        plt.grid(True, which='both', linestyle='--', linewidth=0.5)
        # Set x-axis to start at 1
        plt.xlim(left=1)
        plt.savefig("k_vs_error_plot.png")
        plt.show()


    def plot_probability_spread(self, idx_min_std, idx_med_std, idx_max_std):
        """
        Plots the probabilities of three example inferences (lowest, median and
        highest spread) as strip plots and saves the figure to spread_examples_plot.png.
        """
        probs_min = self.all_prob_lists[idx_min_std]
        probs_med = self.all_prob_lists[idx_med_std]
        probs_max = self.all_prob_lists[idx_max_std]
        std_min = self.prob_std_devs[idx_min_std]
        std_med = self.prob_std_devs[idx_med_std]
        std_max = self.prob_std_devs[idx_max_std]

        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 7), sharey=True)
        fig.suptitle('Probability Distribution Spread Analysis (Examples)', fontsize=16)

        def plot_strip(ax, probs, title):
            if not probs:
                ax.set_title(f"{title}\n(No probabilities found)")
                return
            # Small random horizontal offsets so overlapping points stay visible
            jitter = np.random.normal(0, 0.01, size=len(probs))
            ax.scatter(jitter, probs, alpha=0.5, s=10)
            ax.set_title(title)
            ax.set_xlabel("Jitter")
            ax.get_xaxis().set_ticks([])

        plot_strip(ax1, probs_min,
                   f'Inference {idx_min_std} (Lowest Spread)\nStd Dev: {std_min:.6f}')
        ax1.set_ylabel('Probability')
        plot_strip(ax2, probs_med,
                   f'Inference {idx_med_std} (Median Spread)\nStd Dev: {std_med:.6f}')
        plot_strip(ax3, probs_max,
                   f'Inference {idx_max_std} (Highest Spread)\nStd Dev: {std_max:.6f}')

        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.savefig("spread_examples_plot.png")
        plt.show()

    def plot_all_std_devs(self):
        """
        Plots a histogram and a line plot of the standard deviation
        for ALL inferences and saves the figure to all_std_devs_plot.png.
        """
        if not self.prob_std_devs:
            print("No probability spreads recorded, skipping all-std plot.")
            return

        last_index = len(self.prob_std_devs) - 1

        # Create a figure with two subplots
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        fig.suptitle('Full Spread Analysis for All Inferences', fontsize=16)

        # --- Plot Histogram ---
        ax1.hist(self.prob_std_devs, bins=50, edgecolor='black')
        ax1.set_title('Distribution of Probability Standard Deviations')
        ax1.set_xlabel('Standard Deviation')
        ax1.set_ylabel('Frequency (Number of Inferences)')

        mean_std = np.mean(self.prob_std_devs)
        ax1.axvline(mean_std, color='red', linestyle='--',
                    label=f'Mean Std Dev: {mean_std:.6f}')
        ax1.legend()

        # --- Plot Line Plot ---
        ax2.plot(self.prob_std_devs, marker='o', linestyle='-',
                 markersize=3, alpha=0.7, label='Std Dev per Inference')
        ax2.set_title('Probability Standard Deviation per Inference')
        # Label reflects the number of inferences actually recorded
        ax2.set_xlabel(f'Inference Index (0 to {last_index})')
        ax2.set_ylabel('Standard Deviation')

        ax2.axhline(mean_std, color='red', linestyle='--',
                    label=f'Mean Std Dev: {mean_std:.6f}')
        ax2.legend()
        ax2.set_xlim(0, last_index)

        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.savefig("all_std_devs_plot.png") # Save the plot
        plt.show()

    def report(self):
        """
        Prints the optimal-k and spread summaries and calls all three plotting functions.

        Returns:
            int or None: The best k (1-based), or None when no datapoints were recorded.
        """
        if not self.all_k_errors:
             print("\nNo data to report on. Exiting.")
             return

        # Optimal k Analysis ---
        errors_array = np.array(self.all_k_errors)
        # Column j holds the errors for k = j + 1, so average down the rows
        avg_errors_by_k = np.mean(errors_array, axis=0)
        best_k_index = np.argmin(avg_errors_by_k)
        min_error = avg_errors_by_k[best_k_index]
        # Plain int so callers can use it directly as a count or slice bound
        best_k = int(best_k_index) + 1

        print("\n" + "="*40)
        print("--- Optimal k Analysis Report ---")
        print(f"Model: {self.title}")
        print(f"Inferences Run: {len(self.all_k_errors)}")
        print(f"Analyzed k from 1 to {self.max_k}")
        print("===================================")
        print(f"==> Best k: {best_k}")
        print(f"==> Minimum Average Error: ${min_error:,.2f}")
        print("="*40 + "\n")

        k_values = np.arange(1, self.max_k + 1)
        self.plot_k_vs_error(k_values, avg_errors_by_k, best_k, min_error)

        # Probability Spread Analysis ---
        if not self.prob_std_devs:
            print("\nNo probability spreads recorded, skipping spread plots.")
            # best_k has already been computed, so still hand it back
            return best_k

        print("\n" + "="*40)
        print("--- Probability Spread Analysis ---")

        # Find indices for examples
        std_sorted_indices = np.argsort(self.prob_std_devs)
        idx_min_std = std_sorted_indices[0]
        idx_med_std = std_sorted_indices[len(std_sorted_indices) // 2]
        idx_max_std = std_sorted_indices[-1]

        print(f"Lowest spread (std):  {self.prob_std_devs[idx_min_std]:.6f} (Inference {idx_min_std})")
        print(f"Median spread (std): {self.prob_std_devs[idx_med_std]:.6f} (Inference {idx_med_std})")
        print(f"Highest spread (std): {self.prob_std_devs[idx_max_std]:.6f} (Inference {idx_max_std})")
        print("="*40 + "\n")

        # Plot example spreads
        self.plot_probability_spread(idx_min_std, idx_med_std, idx_max_std)

        # Plot all spreads
        self.plot_all_std_devs()

        return best_k

    def run(self):
        """Evaluate items 0..size-1 in order, then report. Returns the best k."""
        for i in range(self.size):
            self.run_datapoint(i)
        best_k=self.report()
        return best_k

    @classmethod
    def test(cls, function, data):
        """Convenience entry point: build a Search_K with default settings, run it, return the best k."""
        return cls(function, data).run()

In [None]:
# Search best K
# Runs Search_K over the `test` split with get_top_k_predictions as the predictor.
# The title names the W&B artifact when loading from "WB"; otherwise title=None and
# Search_K falls back to the predictor's function name.
search_k = Search_K(get_top_k_predictions, test, title=f"{MODEL_ARTIFACT_NAME}:{REVISION_TAG}" if ARTIFCAT_LOCATTION=="WB" else None)
# best_k is read by the next cell to set top_K
best_k = search_k.run()

In [None]:
# Number of valid candidate prices to average over, as found by the k search
top_K = best_k

def improved_model_predict(prompt, device="cuda"):
    """
    Predict a price as the probability-weighted average of the top candidate prices.

    Candidates come from get_top_k_predictions, which keeps only tokens that parse
    as valid prices; the first `top_K` of those are averaged. Search_K evaluates k
    over that same filtered list, so the k it selects has the same meaning here.

    Args:
        prompt (str): Prompt text to feed to the model.
        device (str): Device passed through to get_top_k_predictions.

    Returns:
        float: The weighted average price, or 0.0 when no valid price was found.
    """
    prices, probabilities = get_top_k_predictions(prompt, device=device)

    if not prices:
      # If no valid prices were found, return 0.0
      return 0.0

    # Slicing past the end is harmless: fewer than top_K candidates means all are used
    final_price = calculate_weighted_price(prices[:top_K], probabilities[:top_K])

    return float(final_price) # Return as a standard python float

In [None]:
# Smoke check on a single test item (index 80): show the prompt, then the predicted price
prompt=make_prompt(test[80]['text'])
print(prompt)

# Last expression of the cell, so the notebook displays the returned price
improved_model_predict(prompt)

In [None]:
# Run Estimate vs Ground Truth
# Evaluates improved_model_predict on the `test` split. The title names the W&B
# artifact when loading from "WB"; otherwise Tester falls back to the predictor's name.
tester = Tester(improved_model_predict, test, title=f"{MODEL_ARTIFACT_NAME}:{REVISION_TAG}" if ARTIFCAT_LOCATTION=="WB" else None)
tester.run()