In [1]:
from datetime import datetime
import os
import torch
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, PeftModel
from pathlib import Path
import gc  
import re

In [3]:
from base_llm import BaseLLM
from data import Dataset, benchmark

In [4]:
def tokenize(tokenizer, question: str, answer: str):
    """
    Tokenizes data elements.
    1. Adds <EOS> token to question/answer pairs.
    2. Tokenizes and constructs answer labels.
    3. Sets labels[i] == -100 for question parts or masked parts so that only answers are learned.
    """
    full_text = f"{question} {answer}{tokenizer.eos_token}"

    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    full = tokenizer(
        full_text, 
        padding="max_length", 
        truncation=True, 
        max_length=256  # Set maximum length to 256
    )

    input_ids = full["input_ids"]
    question_len = len(tokenizer(question)["input_ids"])

    # Create labels: mask prompt parts
    labels = [-100] * question_len + input_ids[question_len:]
    for i in range(len(labels)):
        if full["attention_mask"][i] == 0:
            labels[i] = -100

    full["labels"] = labels
    return full


def format_example(prompt: str, answer: float) -> dict[str, str]:
    """
    Constructs question/answer pairs. Consider rounding the answer for easier processing by the LLM.
    """
    formatted_answer = f"<answer>{answer}</answer>"
    
    return {
        "question": prompt,
        "answer": formatted_answer
    }

In [5]:
class TokenizedDataset:
    def __init__(self, tokenizer, data: Dataset, format_fn):
        """
        Uses the following:
        - BaseLLM.tokenizer
        - Dataset
        - format_fn: A function that converts data elements into a dictionary with:
          - question: str
          - answer: str
        """
        self.format_fn = format_fn
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        formated_data = self.format_fn(*self.data[idx])
        return tokenize(self.tokenizer, **formated_data)


In [6]:
# Helper function for memory management
def clear_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        torch.mps.empty_cache()
        torch.mps.synchronize()


def test_model(ckpt_path: str, testset="valid"):
    """Test the model."""
    testset = Dataset(testset)

    if ckpt_path.startswith("HuggingFaceTB/SmolLM2-360M"):
        llm = BaseLLM("HuggingFaceTB/SmolLM2-360M-Instruct")
    elif ckpt_path.startswith("HuggingFaceTB/SmolLM2-1.7B"):
        llm = BaseLLM("HuggingFaceTB/SmolLM2-1.7B-Instruct")
    else:
        llm = BaseLLM()
        # Load model with LoRA adapter
        llm.model = PeftModel.from_pretrained(llm.model, ckpt_path).to(llm.device)

    benchmark_result = benchmark(llm, testset, 100)
    print(f"{benchmark_result.answer_rate=}  {benchmark_result.bleu_score=}  {benchmark_result.rouge_score=}  {benchmark_result.bertscore=}")
    return benchmark_result

In [7]:

from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
rouge = Rouge()

In [8]:
# Define global LLM variable
_base_llm = None

def set_base_llm(base_model_path: str):
    """Set global LLM"""
    global _base_llm
    _base_llm = BaseLLM(base_model_path)

def compute_metrics(eval_pred):
    """Function to calculate evaluation metrics"""
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    
    # Utility function to extract answers
    def extract_pred_answer(answer: str) -> str:
        
        # Check if the answer contains <answer> tags
        if "<answer>" in answer:
            # Find the last occurrence of <answer> tag
            last_answer_pos = answer.rfind("<answer>")
            
            # Extract content after the last <answer> tag
            content_after_tag = answer[last_answer_pos + len("<answer>"):]
            
            # Check if there's a closing </answer> tag after the last opening tag
            if "</answer>" in content_after_tag:
                end_pos = content_after_tag.find("</answer>")
                return content_after_tag[:end_pos].strip()
            else:
                # If no closing tag, return everything after the last <answer>
                return content_after_tag.strip()
        else:
            return answer.strip() if answer else ""
    
    # Load actual answer dataset
    valid_dataset = Dataset("valid")
    true_answers = [valid_dataset[i][1] for i in range(len(valid_dataset))][:len(labels)]
    
    sample_correct = 0
    valid_samples = 0
    
    # Create BaseLLM instance to use tokenizer
    """Function to calculate evaluation metrics"""
    global _base_llm
    if _base_llm is None:
        raise ValueError("BaseLLM is not set. Please call set_base_llm() first.")
    llm = _base_llm
        
    bleu_score = 0
    rouge_score = 0

    for i in range(len(labels)):
        # Decode prediction results
        pred_tokens = predictions[i][labels[i] != -100]
        pred_text = llm.tokenizer.decode(pred_tokens, skip_special_tokens=True)
        
        # Extract numbers from predictions
        pred_value = extract_pred_answer(pred_text)
        true_value = true_answers[i]

        bleu_score += sentence_bleu([true_value], pred_value) if pred_value else 0
        rouge_score += rouge.get_scores(pred_value, true_value)[0]['rouge-l']['f'] if pred_value else 0
        valid_samples += 1
    
    # accuracy = sample_correct / valid_samples if valid_samples > 0 else 0.0
    bleu_score = bleu_score / valid_samples if valid_samples > 0 else 0.0
    rouge_score = rouge_score / valid_samples if valid_samples > 0 else 0.0
                
    return {
        "sample_bleu": bleu_score,
        "sample_rouge": rouge_score,
        "valid_samples": valid_samples
    }


def train_model(output_dir: str = "./output", final_model_path="sft_model_360M"):
    """Train the model."""
    # Load base model and dataset
    if final_model_path.startswith("HuggingFaceTB/SmolLM2-360M"):
        llm = BaseLLM("HuggingFaceTB/SmolLM2-360M-Instruct")
    elif final_model_path.startswith("HuggingFaceTB/SmolLM2-1.7B"):
        llm = BaseLLM("HuggingFaceTB/SmolLM2-1.7B-Instruct")
    else:
        llm = BaseLLM()
    train_data = Dataset("train")
    
    # Set model to training mode
    llm.model.train()
    
    # Enable gradient checkpointing
    llm.model.config.use_cache = False  # Required: not compatible with gradient checkpointing
    llm.model.gradient_checkpointing_enable()
    
    # MPS (Apple Silicon) optimization settings
    is_mps = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    
    # LoRA configuration
    lora_config = LoraConfig(
        r=4,  # rank
        lora_alpha=32,  # alpha = 8*r (recommended 4-5x)
        target_modules="all-linear",  # Apply to all linear layers
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False,
    )
    
    # Apply LoRA
    peft_model = get_peft_model(llm.model, lora_config)
    
    # GPU settings if available
    if torch.cuda.is_available() or is_mps:
        peft_model.enable_input_require_grads()
    
    # Tokenize dataset    
    tokenized_dataset = TokenizedDataset(llm.tokenizer, train_data, format_example)
    valid_dataset = TokenizedDataset(llm.tokenizer, Dataset("valid"), format_example)
    
    # Create unique experiment name
    run_id = datetime.now().strftime('%Y%m%d_%H%M%S')
    experiment_name = f"run_{run_id}"
    experiment_dir = os.path.join(output_dir, experiment_name)
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir=experiment_dir,
        logging_dir=experiment_dir,
        report_to="tensorboard",
        run_name=f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        learning_rate=5e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        weight_decay=0.01,
        gradient_checkpointing=True,
        optim="adamw_torch",
        
        # Memory optimization settings
        gradient_accumulation_steps=8,
        
        # Validation & Logging
        evaluation_strategy="epoch",
        per_device_eval_batch_size=4,
        logging_steps=20,
        
        # Model saving settings
        save_strategy="epoch",
        save_total_limit=1,
        
        # Early stopping
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        
        # MPS optimization
        fp16=False,
        bf16=False,
        torch_compile=False,
        
        # Other settings
        remove_unused_columns=True,
        max_grad_norm=0.5,
        dataloader_num_workers=0,
    )

    set_base_llm("HuggingFaceTB/SmolLM2-1.7B-Instruct")  # Set here
    
    
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Set here
    )
    
    # Calculate LoRA adapter size (trainable parameters only)
    adapter_size = sum(
        p.numel() * p.element_size()
        for n, p in peft_model.named_parameters()
        if 'lora_' in n
    ) / (1024 ** 2)

    print(f"LoRA Adapter Memory Usage: {adapter_size:.2f}MB")
    assert adapter_size < 20, "Adapter size exceeds 20MB limit"
    
    # Clear memory before training
    clear_memory()
    
    # Start training
    trainer.train()
    
    # Save final model after training
    # final_model_path = "./sft_model"  # Modify according to notebook path
    trainer.save_model(final_model_path)
    
    # Clear memory and test
    clear_memory()
    test_model(final_model_path, "valid")

In [11]:
# Run the training
# Create output directory
os.makedirs("./output", exist_ok=True)

# Execute model training
train_model("./output", "sft_model_1.7B")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


LoRA Adapter Memory Usage: 8.28MB


Epoch,Training Loss,Validation Loss,Sample Bleu,Sample Rouge,Valid Samples
0,2.0338,1.736998,0.619947,0.50589,182
1,1.7154,1.532657,0.651079,0.541656,182
2,1.5195,1.42182,0.677206,0.577026,182
3,1.3768,1.387189,0.685971,0.592285,182
4,1.2072,1.373743,0.694081,0.60053,182
5,1.099,1.391988,0.69876,0.60601,182
6,0.9852,1.428161,0.702341,0.608588,182
7,0.9133,1.452175,0.703814,0.610636,182


python(6746) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
LLM Running on Micro Batches 16:   0%|          | 0/7 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
LLM Running on Micro Batches 16: 100%|██████████| 7/7 [02:05<00:00, 17.90s/it]


generations: ["What is (are) Hyperthermia for treating cancer ? <answer>Hyperthermia is treatment that uses heat to kill cancer cells. It is used to treat many types of cancer.  The type of heat used is called microwave energy.  The goal of hyperthermia is to damage the cancer cells so that the body's natural defenses (such as the immune system) can destroy the cancer cells.  Hyperthermia is often used in combination with other treatments, such as chemotherapy or radiation.  This article discusses microwave energy.)</answer>", 'Do you have information about Am I in labor? <answer>You are going into labor. Labor is the process of your uterus (womb) moving out of the vagina and opening to allow the baby to leave.  Labor is usually 10 to 16 days long.  Labor is divided into three stages:  - Dilation of the cervix (the opening of the uterus)  - Rupture of the membranes (the opening of the uterus)  - Labor pain (the opening of the uterus)</answer>', 'What to do in case of emergency or overd

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

benchmark_result.answer_rate=1.0  benchmark_result.bleu_score=0.29250041873719523  benchmark_result.rouge_score=0.34454022413951657  benchmark_result.bertscore=0.8643306118249893


In [12]:
orig_result = test_model("HuggingFaceTB/SmolLM2-1.7B-Instruct", "test")

LLM Running on Micro Batches 16: 100%|██████████| 7/7 [03:31<00:00, 30.26s/it] 


generations: ['What is (are) Central sleep apnea ?\n\nA:', "What are the symptoms of Acetaminophen and codeine overdose ?\n\nThe symptoms of an acetaminophen and codeine overdose can vary depending on the severity of the overdose and the individual's overall health. Common symptoms include:\n\nNausea and vomiting\nAbdominal pain\nDizziness\nDrowsiness\nConfusion\nSlow or irregular heartbeat\nRespiratory depression\n\nIn severe cases, an acetaminophen and codeine overdose can lead to more serious symptoms, such as:\n\nRespiratory failure\nKidney failure\nComa\nDeath\n\nIt's essential to seek immediate medical attention if you suspect an acetaminophen and codeine overdose, as prompt treatment can help minimize the risk of serious complications.\n\nWhat are the treatment options for an acetaminophen and codeine overdose ?\n\nTreatment for an acetaminophen and codeine overdose typically involves administering activated charcoal to absorb the excess medication in the stomach, followed by su

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

benchmark_result.answer_rate=1.0  benchmark_result.bleu_score=0.12057346409853502  benchmark_result.rouge_score=0.1285958316349848  benchmark_result.bertscore=0.8256665998697281


In [13]:
print(f"{orig_result.samples[0].question=}")
print(f"{orig_result.samples[0].answer=}")
print(f"{orig_result.samples[0].correct_answer=}")
print(f"{orig_result.samples[0].bleu_score=}")
print(f"{orig_result.samples[0].rouge_score=}")
print(f"{orig_result.samples[0].bertscore=}")


orig_result.samples[0].question='What is (are) Central sleep apnea ?'
orig_result.samples[0].answer='What is (are) Central sleep apnea ?\n\nA:'
orig_result.samples[0].correct_answer='Central sleep apnea is a sleep disorder in which breathing stops over and over during sleep.)'
orig_result.samples[0].bleu_score=0.15319585047649734
orig_result.samples[0].rouge_score=0.2727272680991736
orig_result.samples[0].bertscore=0.8210733532905579


In [14]:
# Test trained model
finetuned_result = test_model("sft_model_1.7B", "test")

LLM Running on Micro Batches 16: 100%|██████████| 7/7 [02:21<00:00, 20.27s/it]


generations: ['What is (are) Central sleep apnea ? <answer>Central sleep apnea is a sleep disorder in which a person stops breathing for short periods of time while sleeping. The breathing stops because the brain fails to send signals to the muscles that control breathing.)</answer>', 'What are the symptoms of Acetaminophen and codeine overdose ? <answer>Symptoms of acetaminophen and codeine overdose may include:  - Abdominal pain  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

benchmark_result.answer_rate=1.0  benchmark_result.bleu_score=0.26917179047806256  benchmark_result.rouge_score=0.29075810118424295  benchmark_result.bertscore=0.8573109579086303


In [15]:
print(f"{finetuned_result.samples[0].question=}")
print(f"{finetuned_result.samples[0].answer=}")
print(f"{finetuned_result.samples[0].correct_answer=}")
print(f"{finetuned_result.samples[0].bleu_score=}")
print(f"{finetuned_result.samples[0].rouge_score=}")
print(f"{finetuned_result.samples[0].bertscore=}")


finetuned_result.samples[0].question='What is (are) Central sleep apnea ?'
finetuned_result.samples[0].answer='Central sleep apnea is a sleep disorder in which a person stops breathing for short periods of time while sleeping. The breathing stops because the brain fails to send signals to the muscles that control breathing.)'
finetuned_result.samples[0].correct_answer='Central sleep apnea is a sleep disorder in which breathing stops over and over during sleep.)'
finetuned_result.samples[0].bleu_score=0.3550660770709003
finetuned_result.samples[0].rouge_score=0.4999999956611571
finetuned_result.samples[0].bertscore=0.9371243119239807
