In [1]:
from datetime import datetime
import os
import torch
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, PeftModel
from pathlib import Path
import gc  
import re

In [3]:
from base_llm import BaseLLM
from data import Dataset, benchmark

In [4]:
def tokenize(tokenizer, question: str, answer: str):
    """
    Tokenizes data elements.
    1. Adds <EOS> token to question/answer pairs.
    2. Tokenizes and constructs answer labels.
    3. Sets labels[i] == -100 for question parts or masked parts so only answers are learned.
    """
    full_text = f"{question} {answer}{tokenizer.eos_token}"

    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    full = tokenizer(
        full_text, 
        padding="max_length", 
        truncation=True, 
        max_length=256  # Set maximum length to 256
    )

    input_ids = full["input_ids"]
    question_len = len(tokenizer(question)["input_ids"])

    # Create labels: mask prompt parts
    labels = [-100] * question_len + input_ids[question_len:]
    for i in range(len(labels)):
        if full["attention_mask"][i] == 0:
            labels[i] = -100

    full["labels"] = labels
    return full


def format_example(prompt: str, answer: float) -> dict[str, str]:
    """
    Constructs question/answer pairs. Consider rounding the answer for easier processing by the LLM.
    """
    formatted_answer = f"<answer>{answer}</answer>"
    
    return {
        "question": prompt,
        "answer": formatted_answer
    }

In [5]:
class TokenizedDataset:
    def __init__(self, tokenizer, data: Dataset, format_fn):
        """
        Uses the following:
        - BaseLLM.tokenizer
        - Dataset
        - format_fn: A function that converts data elements into a dictionary with:
          - question: str
          - answer: str
        """
        self.format_fn = format_fn
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        formated_data = self.format_fn(*self.data[idx])
        return tokenize(self.tokenizer, **formated_data)


In [6]:
# Helper function for memory management
def clear_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        torch.mps.empty_cache()
        torch.mps.synchronize()


def test_model(ckpt_path: str, testset="valid"):
    """Tests the model."""
    testset = Dataset(testset)

    if ckpt_path.startswith("HuggingFaceTB/SmolLM2-360M"):
        llm = BaseLLM("HuggingFaceTB/SmolLM2-360M-Instruct")
    elif ckpt_path.startswith("HuggingFaceTB/SmolLM2-1.7B"):
        llm = BaseLLM("HuggingFaceTB/SmolLM2-1.7B-Instruct")
    else:
        llm = BaseLLM()
        # Load model with LoRA adapter
        llm.model = PeftModel.from_pretrained(llm.model, ckpt_path).to(llm.device)

    benchmark_result = benchmark(llm, testset, 100)
    print(f"{benchmark_result.answer_rate=}  {benchmark_result.bleu_score=}  {benchmark_result.rouge_score=}  {benchmark_result.bertscore=}")
    return benchmark_result

In [7]:

from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
rouge = Rouge()

In [8]:
# Define global llm variable
_base_llm = None

def set_base_llm(base_model_path: str):
    """Set global llm"""
    global _base_llm
    _base_llm = BaseLLM(base_model_path)

def compute_metrics(eval_pred):
    """Function to calculate evaluation metrics"""
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    
    # Utility function to extract answers
    def extract_pred_answer(answer: str) -> str:
        
        # Check if the answer contains <answer> tags
        if "<answer>" in answer:
            # Find the last occurrence of <answer> tag
            last_answer_pos = answer.rfind("<answer>")
            
            # Extract content after the last <answer> tag
            content_after_tag = answer[last_answer_pos + len("<answer>"):]
            
            # Check if there's a closing </answer> tag after the last opening tag
            if "</answer>" in content_after_tag:
                end_pos = content_after_tag.find("</answer>")
                return content_after_tag[:end_pos].strip()
            else:
                # If no closing tag, return everything after the last <answer>
                return content_after_tag.strip()
        else:
            return answer.strip() if answer else ""
    
    # Load actual answer dataset
    valid_dataset = Dataset("valid")
    true_answers = [valid_dataset[i][1] for i in range(len(valid_dataset))][:len(labels)]
    
    sample_correct = 0
    valid_samples = 0
    
    # Create BaseLLM instance to use tokenizer
    """Function to calculate evaluation metrics"""
    global _base_llm
    if _base_llm is None:
        raise ValueError("BaseLLM is not set. Please call set_base_llm() first.")
    llm = _base_llm
        
    bleu_score = 0
    rouge_score = 0

    for i in range(len(labels)):
        # Decode prediction results
        pred_tokens = predictions[i][labels[i] != -100]
        pred_text = llm.tokenizer.decode(pred_tokens, skip_special_tokens=True)
        
        # Extract number from prediction
        pred_value = extract_pred_answer(pred_text)
        true_value = true_answers[i]

        bleu_score += sentence_bleu([true_value], pred_value) if pred_value else 0
        rouge_score += rouge.get_scores(pred_value, true_value)[0]['rouge-l']['f'] if pred_value else 0
        valid_samples += 1
    
    # accuracy = sample_correct / valid_samples if valid_samples > 0 else 0.0
    bleu_score = bleu_score / valid_samples if valid_samples > 0 else 0.0
    rouge_score = rouge_score / valid_samples if valid_samples > 0 else 0.0
                
    return {
        "sample_bleu": bleu_score,
        "sample_rouge": rouge_score,
        "valid_samples": valid_samples
    }


def train_model(output_dir: str = "./output", final_model_path="sft_model_360M"):
    """Train the model."""
    # Load base model and dataset
    if final_model_path.startswith("HuggingFaceTB/SmolLM2-360M"):
        llm = BaseLLM("HuggingFaceTB/SmolLM2-360M-Instruct")
    elif final_model_path.startswith("HuggingFaceTB/SmolLM2-1.7B"):
        llm = BaseLLM("HuggingFaceTB/SmolLM2-1.7B-Instruct")
    else:
        llm = BaseLLM()
    train_data = Dataset("train")
    
    # Set model to training mode
    llm.model.train()
    
    # Enable gradient checkpointing
    llm.model.config.use_cache = False  # Required: not compatible with gradient checkpointing
    llm.model.gradient_checkpointing_enable()
    
    # MPS (Apple Silicon) optimization settings
    is_mps = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    
    # LoRA configuration
    lora_config = LoraConfig(
        r=4,  # rank
        lora_alpha=32,  # alpha = 8*r (recommended 4-5x)
        target_modules="all-linear",  # Apply to all linear layers
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False,
    )
    
    # Apply LoRA
    peft_model = get_peft_model(llm.model, lora_config)
    
    # GPU settings
    if torch.cuda.is_available() or is_mps:
        peft_model.enable_input_require_grads()
    
    # Tokenize dataset    
    tokenized_dataset = TokenizedDataset(llm.tokenizer, train_data, format_example)
    valid_dataset = TokenizedDataset(llm.tokenizer, Dataset("valid"), format_example)
    
    # Create unique experiment name
    run_id = datetime.now().strftime('%Y%m%d_%H%M%S')
    experiment_name = f"run_{run_id}"
    experiment_dir = os.path.join(output_dir, experiment_name)
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir=experiment_dir,
        logging_dir=experiment_dir,
        report_to="tensorboard",
        run_name=f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        learning_rate=5e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        weight_decay=0.01,
        gradient_checkpointing=True,
        optim="adamw_torch",
        
        # Memory optimization settings
        gradient_accumulation_steps=8,
        
        # Validation & Logging
        evaluation_strategy="epoch",
        per_device_eval_batch_size=4,
        logging_steps=20,
        
        # Model saving settings
        save_strategy="epoch",
        save_total_limit=1,
        
        # Early stopping
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        
        # MPS optimization
        fp16=False,
        bf16=False,
        torch_compile=False,
        
        # Other settings
        remove_unused_columns=True,
        max_grad_norm=0.5,
        dataloader_num_workers=0,
    )

    set_base_llm("HuggingFaceTB/SmolLM2-360M-Instruct")  # Set here
    
    
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Set here
    )
    
    # Calculate LoRA adapter size (trainable parameters only)
    adapter_size = sum(
        p.numel() * p.element_size()
        for n, p in peft_model.named_parameters()
        if 'lora_' in n
    ) / (1024 ** 2)

    print(f"LoRA Adapter Memory Usage: {adapter_size:.2f}MB")
    assert adapter_size < 20, "Adapter size exceeds 20MB limit"
    
    # Clear memory before training
    clear_memory()
    
    # Start training
    trainer.train()
    
    # Save final model after training
    # final_model_path = "./sft_model"  # Modify according to notebook path
    trainer.save_model(final_model_path)
    
    # Clear memory and test
    clear_memory()
    test_model(final_model_path, "valid")

In [11]:
# Run training
# Create output directory
os.makedirs("./output", exist_ok=True)

# Execute model training
train_model("./output", "sft_model_360M")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


LoRA Adapter Memory Usage: 8.28MB


Epoch,Training Loss,Validation Loss,Sample Bleu,Sample Rouge,Valid Samples
0,2.0326,1.738324,0.611702,0.501197,182
1,1.7163,1.53559,0.650061,0.543812,182
2,1.5219,1.424317,0.676704,0.579522,182
3,1.3796,1.390839,0.685899,0.592727,182
4,1.2105,1.380463,0.692843,0.60133,182
5,1.1023,1.396223,0.698548,0.608036,182
6,0.9867,1.435567,0.697342,0.606046,182
7,0.9147,1.467019,0.696573,0.603566,182


python(31544) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
LLM Running on Micro Batches 16:   0%|          | 0/7 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
LLM Running on Micro Batches 16: 100%|██████████| 7/7 [02:04<00:00, 17.80s/it]




Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

benchmark_result.answer_rate=1.0  benchmark_result.bleu_score=0.3083121774421056  benchmark_result.rouge_score=0.3446031458391489  benchmark_result.bertscore=0.8631736457347869


In [12]:
orig_result = test_model("HuggingFaceTB/SmolLM2-360M-Instruct", "test")

LLM Running on Micro Batches 16: 100%|██████████| 7/7 [01:07<00:00,  9.59s/it]


generations: ['What is (are) Central sleep apnea ?\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral slee

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

benchmark_result.answer_rate=1.0  benchmark_result.bleu_score=0.13776801739345138  benchmark_result.rouge_score=0.1604196905770521  benchmark_result.bertscore=0.8155777990818024


In [13]:
print(f"{orig_result.samples[0].question=}")
print(f"{orig_result.samples[0].answer=}")
print(f"{orig_result.samples[0].correct_answer=}")
print(f"{orig_result.samples[0].bleu_score=}")
print(f"{orig_result.samples[0].rouge_score=}")
print(f"{orig_result.samples[0].bertscore=}")


orig_result.samples[0].question='What is (are) Central sleep apnea ?'
orig_result.samples[0].answer='What is (are) Central sleep apnea ?\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep disorder in which the brain fails to regulate breathing during sleep.\n\nCentral sleep apnea is a sleep

In [14]:
# Test trained model
finetuned_result = test_model("sft_model_360M", "test")

LLM Running on Micro Batches 16: 100%|██████████| 7/7 [01:43<00:00, 14.76s/it]


generations: ["What is (are) Central sleep apnea ? <answer>Central sleep apnea is a condition in which a person stops breathing for short periods of time (apneas) without taking a breath. This is because the brain does not send signals to the breathing muscles.  Central sleep apnea is a common condition. It is more common in older people.  Central sleep apnea is often confused with sleep apnea due to airway obstruction. However, the main difference is that people with central sleep apnea have a problem with the brain's control of breathing, whereas people with sleep apnea due to airway obstruction have a problem with the airway.)</answer>", 'What are the symptoms of Acetaminophen and codeine overdose ? <answer>Symptoms of acetaminophen and codeine overdose may include:  - Abdominal pain  - Abdominal tenderness  - Abdominal pain  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal tenderness  - Abdominal ten

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

benchmark_result.answer_rate=1.0  benchmark_result.bleu_score=0.2782720444128082  benchmark_result.rouge_score=0.30205028165680864  benchmark_result.bertscore=0.8590093117952347


In [15]:
print(f"{finetuned_result.samples[0].question=}")
print(f"{finetuned_result.samples[0].answer=}")
print(f"{finetuned_result.samples[0].correct_answer=}")
print(f"{finetuned_result.samples[0].bleu_score=}")
print(f"{finetuned_result.samples[0].rouge_score=}")
print(f"{finetuned_result.samples[0].bertscore=}")


finetuned_result.samples[0].question='What is (are) Central sleep apnea ?'
finetuned_result.samples[0].answer="Central sleep apnea is a condition in which a person stops breathing for short periods of time (apneas) without taking a breath. This is because the brain does not send signals to the breathing muscles.  Central sleep apnea is a common condition. It is more common in older people.  Central sleep apnea is often confused with sleep apnea due to airway obstruction. However, the main difference is that people with central sleep apnea have a problem with the brain's control of breathing, whereas people with sleep apnea due to airway obstruction have a problem with the airway.)"
finetuned_result.samples[0].correct_answer='Central sleep apnea is a sleep disorder in which breathing stops over and over during sleep.)'
finetuned_result.samples[0].bleu_score=0.12253343606116894
finetuned_result.samples[0].rouge_score=0.29850745938070844
finetuned_result.samples[0].bertscore=0.87914884090