In [9]:
import os
import torch
import wandb
from datetime import datetime
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, EarlyStoppingCallback, set_seed
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
import math
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [None]:
# Load the lite pricer dataset and carve a validation split off the training rows.
DATASET_NAME = "qshaikh/lite-pricer-data"
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']
split_ratio = 0.10  # 10% for validation

# Upper bound on the number of training rows used. The bound is clamped to the
# rows actually available so select() is never asked for out-of-range indices.
TRAIN_SIZE = 15000
train = train.select(range(min(TRAIN_SIZE, len(train))))

total_size = len(train)
val_size = int(total_size * split_ratio)

# The first val_size rows become the validation set; the remainder is used for training.
val_data = train.select(range(val_size))
train_data = train.select(range(val_size, total_size))

In [None]:
# Report how many examples ended up in each split.
print(
    f"Train data size     : {len(train_data)}",
    f"Validation data size: {len(val_data)}",
    f"Test data size      : {len(test)}",
    sep="\n",
)

In [4]:
# Identifiers for this run: the project name, a timestamped run name that also
# records the training-set size, and the names derived from them.
PROJECT_NAME = "llama3-new-pricer"
RUN_NAME = datetime.now().strftime("%Y-%m-%d_%H.%M.%S") + f"-size{total_size}"
PROJECT_RUN_NAME = "-".join((PROJECT_NAME, RUN_NAME))
HUB_MODEL_NAME = "qshaikh/" + PROJECT_RUN_NAME

In [None]:
# Weights & Biases configuration is passed through environment variables;
# a run is only opened when logging is switched on.
LOG_TO_WANDB = True
os.environ.update(
    {
        "WANDB_PROJECT": PROJECT_NAME,
        "WANDB_LOG_MODEL": "checkpoint" if LOG_TO_WANDB else "end",
        "WANDB_WATCH": "gradients",
    }
)

if LOG_TO_WANDB:
    wandb.init(project=PROJECT_NAME, name=RUN_NAME)

In [None]:
BASE_MODEL = "meta-llama/Llama-3.2-1B"

# 4-bit NF4 quantisation with double quantisation; compute is done in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

# trust_remote_code is deliberately left at its default (off): the Llama
# tokenizer is supported natively, so there is no reason to allow code shipped
# in the Hub repository to run locally.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse the end-of-sequence token for padding, and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Keep generation padding consistent with the tokenizer's pad token.
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB")

In [7]:
# Marker text handed to the completion-only collator as its response template.
response_template = "Price is $"
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)

In [8]:
# LoRA adapter settings; these are forwarded to LoraConfig.
LORA_R = 32                  # passed as `r`
LORA_ALPHA = 2 * LORA_R      # passed as `lora_alpha` (evaluates to 64)
LORA_DROPOUT = 0.1           # passed as `lora_dropout`
# Attention projection layers that receive adapters.
TARGET_MODULES = [f"{prefix}_proj" for prefix in ("q", "v", "k", "o")]

In [9]:
# Assemble the PEFT configuration from the LoRA constants defined for this notebook.
lora_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

In [None]:
# --- Training schedule ---
EPOCHS = 1
# Started at 4 to be on the safe side, but that was too slow, so it was raised to 8.
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 1
MAX_SEQUENCE_LENGTH = 182

# --- Optimisation ---
LEARNING_RATE = 0.0001
LR_SCHEDULER_TYPE = "cosine"
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"

# --- Cadence ---
# STEPS is used for both evaluation and logging; checkpoints are saved every
# SAVE_STEPS, and at most save_total_limit of them are kept.
STEPS = 20
SAVE_STEPS = 10 * STEPS
save_total_limit = 10


In [None]:
# Full training configuration for the supervised fine-tuning run.
train_parameters = SFTConfig(
    output_dir=PROJECT_RUN_NAME,
    run_name=RUN_NAME,
    dataset_text_field="text",
    max_seq_length=MAX_SEQUENCE_LENGTH,

    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    max_steps=-1,
    group_by_length=True,

    eval_strategy="steps",
    eval_steps=STEPS,
    per_device_eval_batch_size=1,

    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    optim=OPTIMIZER,
    weight_decay=0.001,
    max_grad_norm=0.3,

    fp16=False,
    bf16=True,

    logging_steps=STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=save_total_limit,
    # The string "none" disables reporting; passing None is treated as
    # "use every installed integration", which is the opposite of the intent.
    report_to="wandb" if LOG_TO_WANDB else "none",

    push_to_hub=True,
    # Push to the repo name that is printed after training, instead of relying
    # on a name derived from output_dir.
    hub_model_id=HUB_MODEL_NAME,
    hub_strategy="every_save",
    # Best-checkpoint selection uses eval_loss, where lower is better.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [None]:
# Wire the model, data splits, LoRA config, collator and training arguments
# into the trainer, with early stopping (patience of 5) attached.
fine_tuning = SFTTrainer(
    args=train_parameters,
    model=base_model,
    peft_config=lora_parameters,
    data_collator=collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

In [None]:
# Run the fine-tuning loop.
fine_tuning.train()

# Pushes during training upload whichever checkpoint was saved most recently.
# The best checkpoint is only reloaded into the trainer once training ends, so
# push explicitly here to make the message below true.
fine_tuning.push_to_hub()
print(f"Best model pushed to HF Hub: {HUB_MODEL_NAME}")

## Evaluating Model Performance

In [None]:
# ANSI escape sequences for coloured terminal text, plus the reset code.
RESET = "\033[0m"
RED = "\033[91m"
YELLOW = "\033[93m"
GREEN = "\033[92m"
# Colour label -> escape sequence; the "orange" label maps to the YELLOW code.
COLOR_MAP = dict(red=RED, orange=YELLOW, green=GREEN)

class Tester:
    """Evaluate a price predictor on a dataset and chart the results.

    ``predictor`` is called with each datapoint's ``"text"`` field and must
    return a number; that guess is compared with the datapoint's ``"price"``.
    Per-datapoint guesses, truths, absolute errors, squared log errors and
    colour labels are accumulated on the instance.
    """

    def __init__(self, predictor, data, title=None, size=250):
        """Store the predictor and data and start with empty result lists.

        Args:
            predictor: Callable taking the prompt text and returning a number.
            data: Indexable collection of mappings with "text" and "price".
            title: Chart title prefix; defaults to a title-cased version of
                the predictor's ``__name__``.
            size: Maximum number of datapoints evaluated by ``run``.
        """
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self._reset()

    def _reset(self):
        """Clear all accumulated results so a fresh run starts from scratch."""
        self.error = 0  # kept for backward compatibility; not read by this class
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Classify an absolute error as "green", "orange" or "red".

        A guess is green when it is off by less than 40 or by less than 20% of
        the truth, orange when off by less than 80 or less than 40%, and red
        otherwise. When ``truth`` is 0 the relative test is skipped (it would
        divide by zero) and only the absolute thresholds apply.
        """
        has_ratio = truth != 0
        if error < 40 or (has_ratio and error / truth < 0.2):
            return "green"
        elif error < 80 or (has_ratio and error / truth < 0.4):
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Predict datapoint ``i`` and append its metrics to the result lists."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])
        truth = datapoint["price"]
        error = abs(guess - truth)
        log_error = math.log(truth+1) - math.log(guess+1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)

    def chart(self, title):
        """Scatter-plot guesses against truths, coloured by error band."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        # Diagonal reference line: points on it are perfect predictions.
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)

        from matplotlib.lines import Line2D
        legend_elements = [
            Line2D([0], [0], marker='o', color='w', label='Accurate (green)', markerfacecolor='green', markersize=8),
            Line2D([0], [0], marker='o', color='w', label='Medium error (orange)', markerfacecolor='orange', markersize=8),
            Line2D([0], [0], marker='o', color='w', label='High error (red)', markerfacecolor='red', markersize=8)
        ]
        plt.legend(handles=legend_elements, loc='upper right')

        plt.show()


    def report(self):
        """Summarise the accumulated results in a chart title and draw the chart.

        Averages are taken over the datapoints actually evaluated, not over
        the requested ``size``.

        Raises:
            ValueError: If no datapoint has been evaluated yet.
        """
        evaluated = len(self.errors)
        if evaluated == 0:
            raise ValueError("No datapoints have been evaluated; call run() first.")
        average_error = sum(self.errors) / evaluated
        rmsle = math.sqrt(sum(self.sles) / evaluated)
        hits = sum(1 for color in self.colors if color=="green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/evaluated*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate up to ``size`` datapoints from the start of ``data`` and report.

        Results from any previous run are discarded first, so re-running the
        cell does not double-count. The count is capped at ``len(data)``.
        """
        self._reset()
        for i in range(min(self.size, len(self.data))):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Build a Tester for ``function`` over ``data`` and run it."""
        cls(function, data).run()

In [None]:
# Display the first record of the test split as a quick sanity check of its contents.
test[0]

In [None]:
# Hub repo of the adapter to evaluate; it is attached on top of base_model.
FINETUNED_MODEL = "qshaikh/llama3-new-pricer-2025-10-29_00.32.54-size15000"
# NOTE(review): base_model is the same object that was handed to SFTTrainer
# earlier in this notebook. If training ran in this kernel, confirm it is still
# a clean base before attaching another adapter — TODO confirm.
fine_tuned_model = PeftModel.from_pretrained(
    model=base_model,
    model_id=FINETUNED_MODEL,
)
print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")
# A bare last expression renders the wrapped model's structure as the cell output.
fine_tuned_model

In [None]:
top_K = 3  # how many of the most probable next tokens are blended into one estimate

def improved_model_predict(prompt, device="cuda"):
    """Estimate a price as a probability-weighted mean of the top next tokens.

    Runs a single forward pass of ``fine_tuned_model`` over ``prompt`` and
    takes the ``top_K`` most probable next tokens. Tokens whose decoded text
    parses as a positive, finite number are kept and averaged, weighted by
    their probabilities renormalised over the kept candidates.

    Args:
        prompt: Text encoded with the module-level ``tokenizer``.
        device: Torch device the encoded prompt and attention mask are placed on.

    Returns:
        float: The weighted price estimate, or ``0.0`` when none of the
        candidate tokens is a usable number. The result is always a single
        float so that callers can do arithmetic on it.
    """
    set_seed(42)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    attention_mask = torch.ones(inputs.shape, device=device)

    with torch.no_grad():
        outputs = fine_tuned_model(inputs, attention_mask=attention_mask)
        # Only the distribution for the token after the prompt is needed.
        next_token_logits = outputs.logits[:, -1, :].to('cpu')

    next_token_probs = F.softmax(next_token_logits, dim=-1)
    top_prob, top_token_id = next_token_probs.topk(top_K)

    # Collect (price, probability) pairs for candidates that look like real prices.
    candidates = []
    for token_id, probability in zip(top_token_id[0], top_prob[0]):
        try:
            price = float(tokenizer.decode(token_id))
        except ValueError:
            continue  # not a number at all
        # float() also accepts "inf" and "nan"; neither is a usable price.
        if price > 0 and math.isfinite(price):
            candidates.append((price, probability.item()))

    total = sum(weight for _, weight in candidates)
    if not candidates or total <= 0:
        return 0.0

    return sum(price * weight for price, weight in candidates) / total

In [None]:
# Smoke-test the predictor on the first test record's prompt text; inputs are placed on "cuda".
improved_model_predict(test[0]["text"], device="cuda")

In [None]:
# Evaluate the predictor over the test split with Tester's default size and draw the summary chart.
Tester.test(improved_model_predict, test)