## Setup

In [1]:
### CHOOSE GPU ###

import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(torch.cuda.device_count())

1


In [2]:
### SETUP CONSTANTS ###

import torch
from unsloth import is_bfloat16_supported

RESPONSE_REGEX = r"### Response:\s*(.*?)(?=\n###|\Z)"

BASE = False    # Load the base model or fine-tuned model
MODE = "BASE" if BASE else "FT" 

MODEL_LABEL = "qwen-7B"     # used to create results file name
ITERATION = 20      # iteration to target for fine-tuned model
CHECKPOINT_DIR = "checkpoint-522"

BASE_MODEL_PATH = "unsloth/Qwen2.5-Coder-7B"    # used to load the base model
FINE_TUNED_MODEL_PATH = f"./{MODEL_LABEL}/{MODEL_LABEL}-{ITERATION}/checkpoints/{CHECKPOINT_DIR}"       # can also sub with any path to a fine-tuned model
EMBEDDING_MODEL_PATH = "nvidia/NV-Embed-v2"

METRICS_DIR = "./summary_stats/metrics"

LOAD_IN_4BIT = False

# LoRA Parameters
LORA_ALPHA = 64
LORA_R = 32
LORA_DROPOUT = 0

# File paths
TRAIN_FILE = "../train.json"
VALID_FILE = "../valid.json"
TEST_FILE = "../test.json"

# Training Hyperparameters
PER_DEVICE_TRAIN_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
WARMUP_STEPS = 100      # 5-10% of total steps was suggested
NUM_TRAIN_EPOCHS = 3
MAX_STEPS = 0
LEARNING_RATE = 1e-4
FP16 = not is_bfloat16_supported()
BF16 = is_bfloat16_supported()
LOGGING_STEPS = 10
OPTIM = "adamw_8bit"
WEIGHT_DECAY = 0.01
LR_SCHEDULER_TYPE = "linear"
SEED = 3407
REPORT_TO = "none"  # Use this for WandB etc
EVAL_STEPS = 10

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ALPACA_PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
### LOAD THE BASE MODEL ###

from unsloth import FastLanguageModel

model_name = BASE_MODEL_PATH if BASE else FINE_TUNED_MODEL_PATH

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = LOAD_IN_4BIT # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = LORA_R, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = LORA_ALPHA,
    lora_dropout = LORA_DROPOUT, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = load_in_4bit,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

model.eval()

==((====))==  Unsloth 2025.3.8: Fast Qwen2 patching. Transformers: 4.46.2.
   \\   /|    NVIDIA RTX 6000 Ada Generation. Num GPUs = 1. Max memory: 47.467 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth 2025.3.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
Unsloth: Already have LoRA adapters! We shall skip this step.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 3584, padding_idx=151665)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3584, out_features=3584, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=3584, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(

## Data Prep

In [4]:
### read the Alpaca JSON file ###

import json
import pandas as pd
from datasets import Dataset

# Function to load JSON file into a Pandas DataFrame
def load_json_to_df(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)  # Assumes JSON contains a list of dictionary entries
    return pd.DataFrame(data)

# Load datasets as DataFrames
test_df = load_json_to_df(TEST_FILE)

# Convert DataFrames into Hugging Face datasets
test_dataset = Dataset.from_pandas(test_df)

# Display first few rows of Train DataFrame
test_df.head()

Unnamed: 0,instruction,input,output
0,Write a header comment for this x86 assembly c...,\tSECTION .text\n\tglobal main\nmain:\n\tpush\...,; This is the funniest snippet so far. It illu...
1,Write a header comment for this x86 assembly c...,section .text\n\n global _start\n\n_start:\...,"/*\n * $Id: gets-linux.c,v 1.3 2004/06/02 12:2..."
2,Write a header comment for this x86 assembly c...,_start:\n push 0x66\n pop eax\n ...,/*\n* Title: Shell Reverse TCP Shellcode -...
3,Write a header comment for this x86 assembly c...,\t\n\t\n\tSECTION .text\n\tglobal main\nmain:\...,; This snippet copies rcx bytes of the 3-bytes...
4,Write a header comment for this x86 assembly c...,\tSECTION .text\n\tglobal main\nmain:\n\t\n\tn...,; This snippet sets rax to 1 for all its initi...


In [5]:
### Load the alpaca format dataset into the correct format ###

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = ALPACA_PROMPT.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)

    return { "text" : texts, }
pass

test_dataset = Dataset.from_pandas(test_df)

test_dataset = test_dataset.map(formatting_prompts_func, batched = True)


Map:   0%|          | 0/1199 [00:00<?, ? examples/s]

## Metric Calculation

### Cross Entropy Loss

In [6]:
### TRAINER FOR FT MODEL ###

from trl import SFTTrainer
from transformers import TrainingArguments

def generate_trainer(model, tokenizer, train_dataset, valid_dataset):

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = train_dataset,  # train dataset
        eval_dataset=valid_dataset,     # Validation data
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False,
        args = TrainingArguments(
            per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            warmup_steps=WARMUP_STEPS,
            num_train_epochs=NUM_TRAIN_EPOCHS, 
            max_steps=MAX_STEPS,
            learning_rate=LEARNING_RATE,
            fp16=FP16,
            bf16=BF16,
            logging_steps=LOGGING_STEPS,
            evaluation_strategy="steps",  
            eval_steps=EVAL_STEPS,
            save_strategy="epoch",
            optim=OPTIM,
            weight_decay=WEIGHT_DECAY,
            lr_scheduler_type=LR_SCHEDULER_TYPE,
            seed=SEED,
            output_dir=METRICS_DIR,
            report_to=REPORT_TO,
        ),
    )

    return trainer


In [7]:
### CALCULATE CROSS ENTROPY LOSS ###

def calculate_cross_entropy_loss(trainer, tokenizer, test_dataset):

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", max_length=2048, truncation=True)

    tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=16)

    test_results = trainer.evaluate(tokenized_test_dataset)

    cross_entropy_loss = test_results['eval_loss']

    print(f"Test Loss: {test_results['eval_loss']:.16f}")
    return cross_entropy_loss
    

In [None]:
trainer = generate_trainer(model, tokenizer, test_dataset, test_dataset)

# cross_entropy_loss = og_calculate_cross_entropy_loss(tokenizer, trainer, test_dataset)
cross_entropy_loss = calculate_cross_entropy_loss(trainer, tokenizer, test_dataset)


### Semantic Embedding Cosine Similarity

In [14]:
### FUNCTIONS FOR CALCULATING COSINE SIMILARITY ### 

import torch
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import re


# Function to move model to CUDA
def move_model(model, device):
    return model.to(device)


# Function to move model to CPU (offload from GPU)
def offload_model(model):
    return model.to("cpu")


# Function to extract only the response part of the model output
def extract_response(text):
    match = re.search(RESPONSE_REGEX, text, re.DOTALL)
    return match.group(1).strip() if match else ""


# Function to generate responses and then offload the model
def generate_responses(model, tokenizer, test_dataset, device, batch_size=8, max_length=512):
    """Generates responses, then moves model off CUDA to free memory."""
    
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    print(f"{start_gpu_memory} GB of memory reserved.")

    # Move model to CUDA
    model = move_model(model, device)
    FastLanguageModel.for_inference(model)  # Enable optimized inference
    
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    print(f"{start_gpu_memory} GB of memory reserved.")

    test_texts = [
        ALPACA_PROMPT.format(sample["instruction"], sample["input"], "")
        for sample in test_dataset
    ]

    dataloader = DataLoader(test_texts, batch_size=batch_size, shuffle=False)
    generated_responses = []

    print("============= GENERATING RESPONSES =============")
    count = 0

    for batch_inputs in dataloader:
        print("Num Completed: " + str(count))

        input_ids = tokenizer(batch_inputs, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to("cuda")
        # input_ids = {k: v.to(device) for k, v in input_ids.items()}  # Move to GPU

        with torch.no_grad():

            # try settign either of these to false: include_prompt_in_result or return_full_text
            output_ids = model.generate(input_ids["input_ids"], 
                                        max_new_tokens=max_length, 
                                        use_cache=True, 
                                        eos_token_id=tokenizer.eos_token_id,
                                        attention_mask=input_ids["attention_mask"]
                                    )
            batch_outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            for output in batch_outputs:
                print(output)
                response_text = extract_response(output)
                print(response_text)
                generated_responses.append(response_text)
                print("---------------------------------------------")

        count += batch_size
        start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
        print(f"{start_gpu_memory} GB of memory reserved.")

    return generated_responses


# Function to get batch embeddings
def get_batch_embeddings(texts, tokenizer, model, device, max_length=512):
    """Encodes a batch of texts into embeddings."""
    
    model.eval()

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to GPU

    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs['sentence_embeddings']  # Extract sentence embeddings
    return embeddings.mean(dim=1).cpu().numpy()  # Move embeddings back to CPU and convert to numpy


# Function to compute cosine similarity
def compute_cosine_similarity(generated_texts, reference_texts, embedding_tokenizer, embedding_model, device, batch_size=8, max_length=512):
    """Computes cosine similarity, then moves embedding model off CUDA after use."""

    dataloader = DataLoader(list(zip(generated_texts, reference_texts)), batch_size=batch_size, shuffle=False)
    cosine_similarities = []

    for batch_generated, batch_references in dataloader:

        generated_embeddings = get_batch_embeddings(batch_generated, embedding_tokenizer, embedding_model, device, max_length)
        reference_embeddings = get_batch_embeddings(batch_references, embedding_tokenizer, embedding_model, device, max_length)

        batch_similarities = torch.nn.functional.cosine_similarity(
            torch.tensor(generated_embeddings), torch.tensor(reference_embeddings)
        ).cpu().tolist()

        cosine_similarities.extend(batch_similarities)

    average_similarity = sum(cosine_similarities) / len(cosine_similarities)
    return average_similarity, cosine_similarities

In [10]:
### FUNCTIONS TO MAP RESPONSES AND SIMILARITIES TO DATASET ENTRIES ###

import json
from datasets import Dataset

def map_responses_and_save(test_dataset, generated_responses, output_file):
    """
    Maps generated responses to the corresponding entries in the test dataset and saves to a JSON file.

    Args:
        test_dataset (Dataset): The test dataset in Alpaca format.
        generated_responses (list): List of generated responses from the model.
        output_file (str): Path to the output JSON file.
    """
    if len(test_dataset) != len(generated_responses):
        raise ValueError("Mismatch between dataset size and number of generated responses.")

    updated_data = []
    for entry, response in zip(test_dataset, generated_responses):
        entry["generated_response"] = response  # Add generated response to entry
        updated_data.append(entry)

    # Convert all float32 values to regular floats for JSON serialization
    for entry in updated_data:
        for key, value in entry.items():
            if isinstance(value, float):
                entry[key] = float(value)

    # Save to JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, indent=4, ensure_ascii=False)

    print(f"Updated dataset saved to {output_file}")

    
def map_similarities_and_save(input_file, cosine_similarities):
    """
    Reads a JSON object from a file, adds cosine similarity scores, and writes it back to the same file.

    Args:
        input_file (str): Path to the input JSON file containing the dataset entries.
        cosine_similarities (list): List of cosine similarity scores corresponding to the entries.
    """
    # Read the existing data from the JSON file
    with open(input_file, 'r', encoding='utf-8') as file:
        updated_data = json.load(file)

    # Check if the length of data matches the length of cosine similarities
    if len(updated_data) != len(cosine_similarities):
        raise ValueError("Mismatch between dataset size and number of cosine similarities.")

    # Add cosine similarity to each entry
    for entry, similarity in zip(updated_data, cosine_similarities):
        entry["cosine_similarity"] = float(similarity)  # Add cosine similarity to entry

    # Save the updated dataset back to the JSON file
    with open(input_file, "w", encoding="utf-8") as file:
        json.dump(updated_data, file, indent=4, ensure_ascii=False)

    print(f"Updated dataset saved to {input_file}")

In [11]:
### FUNCTION TO EXTRACT GENERATED RESPONSES FROM JSON FILE ###

import json

def extract_generated_responses(json_file):
    """
    Extracts the generated responses from a JSON file.

    Args:
        json_file (str): Path to the JSON file containing the dataset entries.

    Returns:
        list: A list of generated responses.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)  # Load the JSON data

    generated_responses = []
    reference_texts = []
    for entry in data:
        # Append the generated response to the list
        generated_responses.append(entry.get("generated_response"))
        reference_texts.append(entry.get("output"))


    return generated_responses, reference_texts


In [12]:
### FUNCTIONS TO CATEGORIZE SIMILARITY BY DATASET QUESTION ###

import json
import os
from collections import defaultdict

def categorize_and_save_entries(input_file, output_dir):
    """
    Categorizes dataset entries by their prompts and saves them into separate JSON files
    using a predefined mapping of instructions to file names.
    """

    instruction_to_name = {
        "Write a header comment for this x86 assembly code snippet:": "header_comment",
        "Comment the x86 assembly code snippet by generating a structured json object where the keys are the integer line numbers (starting at 1) and the values are the string comments. For example: {1: \"comment for line 1\", 2: \"comment for line 2\", 3: \"comment for line 3\"}": "inline_comments",
        "Describe the intent of the given snippet of assembly code": "intent",
        "An x86 Assembly code snippet has been partially masked. Complete the code by filling in the lines labeled '# <MASKED>'.": "masked",
        "Answer the following question about the x86 Assembly Language:": "question"
    }

    # Load the existing data from the JSON file
    with open(input_file, 'r', encoding='utf-8') as file:
        entries = json.load(file)

    # Dictionary to hold lists of entries for each prompt
    categorized_entries = defaultdict(list)

    # Categorize entries based on their prompts
    for entry in entries:
        prompt = entry.get("instruction")
        if prompt in instruction_to_name:
            file_key = instruction_to_name[prompt]
            categorized_entries[file_key].append(entry)

    # Save categorized entries into separate files
    for file_key, entries in categorized_entries.items():
        output_file = os.path.join(output_dir, f"{file_key}.json" )
        
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(entries, file, indent=4, ensure_ascii=False)

        print(f"Saved {len(entries)} entries to {output_file}")


def calculate_average_cosine_similarities(combined_average_cimilarity, output_directory, output_file):
    """
    Calculates the average cosine similarity for each JSON file in the specified directory
    and writes the results to a new JSON file.
    """
    
    results = {}

    results["average"] = combined_average_cimilarity

    # Iterate through each file in the output directory
    for filename in os.listdir(output_directory):
        if filename.endswith(".json"):  # Process only JSON files
            file_path = os.path.join(output_directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                entries = json.load(file)  # Load the entries from the file

            # Extract cosine similarities and calculate the average
            cosine_similarities = [entry.get("cosine_similarity") for entry in entries if "cosine_similarity" in entry]
            
            if cosine_similarities:  # Check if there are similarities to calculate
                average_similarity = sum(cosine_similarities) / len(cosine_similarities)
                results[filename] = average_similarity

    # Write the results to the output JSON file
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(results, file, indent=4, ensure_ascii=False)

    print(f"Average cosine similarities written to {output_file}")

In [None]:
### GENERATE RESPONSES ###

generated_responses = generate_responses(model, tokenizer, test_dataset, "cuda", batch_size=16)

output_file = f"./summary_stats/metrics/cosine_similarities/{MODEL_LABEL}_{MODE}_test_dataset_with_responses.json"
map_responses_and_save(test_dataset, generated_responses, output_file)  

In [None]:
### CLEAN UP TORCH MEMORY ###

import torch
import gc

# Delete the model from memory
# del SELECTED_TOKENIZER  
# del SELECTED_MODEL
del model
del tokenizer
# del embedding_model
# del embedding_tokenizer
gc.collect()  # Run garbage collection
torch.cuda.empty_cache() 
torch.cuda.ipc_collect()  

torch.cuda.reset_max_memory_allocated()

In [17]:
### LOAD EMBEDDING MODEL ###

from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

embedding_model = AutoModel.from_pretrained(
    EMBEDDING_MODEL_PATH,
    trust_remote_code=True,
    quantization_config=quantization_config
)

embedding_tokenizer = AutoTokenizer.from_pretrained(
    EMBEDDING_MODEL_PATH, 
    trust_remote_code=True,
)


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [18]:
### CURRENT MEMORY STATS ###

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA RTX 6000 Ada Generation. Max memory = 47.467 GB.
22.748 GB of memory reserved.


In [None]:
### CALCULATE SIMILARITY OVER WHOLE DATASET ###

reference_texts = [sample['output'] for sample in test_dataset]

# need to format the generated_responses

average_similarity, cosine_similarities = compute_cosine_similarity(
    generated_responses, reference_texts, embedding_tokenizer, embedding_model, "cuda"
)

print(average_similarity)
print(cosine_similarities)

input_file = f"./summary_stats/metrics/cosine_similarities/{MODEL_LABEL}_{MODE}_test_dataset_with_responses.json"
output_dir = f"./summary_stats/metrics/cosine_similarities/{MODEL_LABEL}-{MODE}"

map_similarities_and_save(input_file, cosine_similarities)

categorize_and_save_entries(input_file, output_dir)

calculate_average_cosine_similarities(average_similarity, output_dir, f"./summary_stats/metrics/{MODEL_LABEL}_{MODE}_similarity_metrics.json")

  self.gen = func(*args, **kwds)


0.4354910424990491
[0.50244140625, 0.353515625, 0.57177734375, 0.45458984375, 0.45166015625, 0.44873046875, 0.1990966796875, 0.53173828125, 0.2435302734375, 0.416748046875, 0.2193603515625, 0.5048828125, 0.391357421875, 0.57275390625, 0.76611328125, 0.414794921875, 0.357421875, 0.435302734375, 0.4619140625, 0.1732177734375, 0.466064453125, 0.303466796875, 0.44384765625, 0.431640625, 0.358154296875, 0.53662109375, 0.422119140625, 0.373291015625, 0.3701171875, 0.2119140625, 0.130126953125, 0.1685791015625, 0.0528564453125, 0.1285400390625, 0.1353759765625, 0.0928955078125, 0.10235595703125, 0.0673828125, 0.13720703125, 0.135986328125, 0.1436767578125, 0.373779296875, 0.451416015625, 0.428955078125, 0.229736328125, 0.238037109375, 0.426513671875, 0.1553955078125, 0.474609375, 0.10888671875, 0.368408203125, 0.4091796875, 0.350341796875, 0.4169921875, 0.1575927734375, 0.460693359375, 0.30908203125, 0.319580078125, 0.423828125, 0.2298583984375, 0.374755859375, 0.1378173828125, 0.431884765625

## Generation Testing

In [39]:
### TEST WITH A SANDBOX EXAMPLE ###

MODEL = model
TOKENIZER = tokenizer


prompt = '''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Write a header comment for this x86 assembly code snippet:

### Input:
{}

### Response:
{}'''
inputs = tokenizer(
[
    prompt.format(
      '''
        section .data
            message db "Hello, World!", 0xA
            msg_len equ $ - message

        section .text
            global _start

        _start:
            mov rax, 1
            mov rdi, 1
            mov rsi, message
            mov rdx, msg_len
            syscall

            mov rax, 60
            xor rdi, rdi
            syscall

      ''',
      ""

    )
], return_tensors = "pt").to("cuda")

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

outputs = MODEL.generate(**inputs, max_new_tokens = 300, use_cache = True)
output_text = TOKENIZER.batch_decode(outputs, skip_special_tokens=True)[0]

print(output_text)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Write a header comment for this x86 assembly code snippet:

### Input:

        section .data
            message db "Hello, World!", 0xA
            msg_len equ $ - message

        section .text
            global _start

        _start:
            mov rax, 1
            mov rdi, 1
            mov rsi, message
            mov rdx, msg_len
            syscall

            mov rax, 60
            xor rdi, rdi
            syscall

      

### Response:
; This snippet prints "Hello, World!" to the console and then exits.
; It uses the write syscall to print the message and the exit syscall
; to terminate the program.

