# Finetune `meta-llama/Meta-Llama-3-8B-Instruct` on an EC2 instance using `Unsloth`
---

Unsloth makes finetuning large language models like Llama-3, Mistral, Phi-4 and Gemma 2x faster, use 70% less memory, and with no degradation in accuracy!

**Note**: ***This notebook is run on a `g6e.12xlarge` instance. Follow the prerequisite steps [here](README.md)***

In this example, we will be fine tuning the llama3 8b instruct model. There are several 4bit pre quantized models that `unsloth` provides that are not gated. This supports 4x faster downloading with no OOMs. In this case, we will be using the standard `meta-llama/Meta-Llama-3-8B-Instruct` model from hugging face. 

In [1]:
import os
import logging
import globals as g
from dotenv import load_dotenv
from unsloth import to_sharegpt
from datasets import load_dataset
from unsloth import FastLanguageModel
from unsloth import standardize_sharegpt
from ec2_metrics import EC2MetricsCallback

# Create a logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Remove existing handlers
logger.handlers.clear()

# Add a simple handler
handler = logging.StreamHandler()
formatter = logging.Formatter('[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0]
  spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:]
  front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0)
  source = re.sub("([^\.])nn\.", r"\1torch.nn.", source)
  "self.rotary_emb = .+?\)", function,
  "self.rotary_emb = .+?\)", function,
  from .autonotebook import tqdm as notebook_tqdm
  left = re.match("[\s\n]{4,}", leftover).span()[1]
  .replace("*", "\*").replace("^", "\^")\
  .replace("*", "\*").replace("^", "\^")\
  .replace("-", "\-").replace("_", "\_")\
  .replace("-", "\-").replace("_", "\_")\
  .replace(":", "\:").replace("+", "\+")\
  .replace(":", "\:").replace("+", "\+")\
  .replace(".", "\.").replace(",", "\,")\
  .replace(".", "\.").replace(",", "\,")\
  .replace("(", "\(").replace(")", "\)")\
  .replace("(", "\(").replace(")", "\)")\
  .replace("[", "\[").replace("]", "\]")\
  .replace("[", "\[").replace("]", "\]")\
  r"for ([^\s]{1,}) in "

🦥 Unsloth Zoo will now patch everything to make training faster!


  f"def {function_name}\(.*?\).*?\:\n",
  gb_found = re.match("([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE)
  mb_found = re.match("([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE)
  f"   \\\   /|    [0] Installing llama.cpp might take 3 minutes.\n"\
  f"O^O/ \_/ \\    [1] Converting HF to GGUF 16bits might take 3 minutes.\n"\
  f"\        /    [2] Converting GGUF 16bits to {quantization_method} might take 10 minutes each.\n"\
  "def __init__\(.*?\).*?\:\n",
2025-03-18 21:52:46,996	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
  f"   \\\   /|    GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
  f"O^O/ \_/ \\    Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
  f"\        /    Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers =

In [7]:
# Load environment variables from .env file
import getpass
load_dotenv()
if not os.getenv("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Enter your HuggingFace token: ")
hf_token = os.getenv("HF_TOKEN")

if not os.getenv("HF_MODEL_ID"):
    hf_model_id  = input("Enter the model id to use for fine-tuning (e.g. meta-llama/Llama-3.1-8B-Instruct): ")
else:
    hf_model_id = os.getenv("HF_MODEL_ID")
logger.info(f"hf_model_id={hf_model_id}")


[2025-03-18 21:54:53,230] p148384 {2478216038.py:12} INFO - hf_model_id=meta-llama/Llama-3.1-8B-Instruct


In [None]:
import os
import huggingface_hub
from huggingface_hub import HfApi

# Set Hugging Face token
hf_token = "My token that I removed for security"
os.environ["HF_TOKEN"] = hf_token

api = HfApi()

# Check if token is valid
try:
    user_info = huggingface_hub.whoami(token=hf_token)
    print(f"HF Token is valid! Logged in as: {user_info['name']}")
except huggingface_hub.utils._errors.HfHubHTTPError:
    raise ValueError("❌ Invalid Hugging Face token! Double-check your token.")

# Check model access
model_id = "meta-llama/Llama-3.2-1B-Instruct"

try:
    api.model_info(model_id, token=hf_token)
    print(f"You have access to the model: {model_id}")
except Exception as e:
    print(f"You do NOT have access to the model: {model_id}")
    print("Error:", e)

✅ HF Token is valid! Logged in as: keyvanip
✅ You have access to the model: meta-llama/Llama-3.2-1B-Instruct


In [9]:
max_seq_length = 100 # Choose any! We auto support RoPE Scaling internally!
dtype = "bfloat16" # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

DATASET_OF_INTEREST: str = 'PolyAI/banking77'

BANKING_PROMPT: str = """Classify the following customer query into one of the 77 banking categories.

### Customer Query:
{}

### Predicted Category:
{}"""

In [12]:
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = hf_model_id,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        token = hf_token,
        device_map="auto",  # ✅ Automatically uses GPU if available
        use_gradient_checkpointing=True,  # Reduce memory usage during training
        gpu_memory_utilization=0.1,  # Lower GPU memory usage
        trust_remote_code=True
    )
except Exception as e:
    logger.error(f"Error occurred while loading the model: {e}")
    raise

Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.045 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


[2025-03-18 21:56:32,915] p148384 {modeling.py:957} INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


In [13]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.2.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Data Prep

We now use the Alpaca dataset from vicgalle, which is a version of 52K of the original Alpaca dataset generated from GPT4. You can replace this code section with your own data prep.

In [45]:
dataset = load_dataset("PolyAI/banking77", split="train", trust_remote_code=True)
logger.info(f"Columns in the dataset: {dataset.column_names}")

[2025-03-18 22:23:42,867] p148384 {4047342736.py:2} INFO - Columns in the dataset: ['text', 'label']


In [46]:
dataset = dataset.map(lambda example: {
    "input": example["text"],       # User's banking query
    "output": str(example["label"]) # Convert category label to string
})

In [47]:
from pprint import pprint
pprint(dataset[:3])

{'input': ['I am still waiting on my card?',
           "What can I do if my card still hasn't arrived after 2 weeks?",
           'I have been waiting over a week. Is the card still coming?'],
 'label': [11, 11, 11],
 'output': ['11', '11', '11'],
 'text': ['I am still waiting on my card?',
          "What can I do if my card still hasn't arrived after 2 weeks?",
          'I have been waiting over a week. Is the card still coming?']}


In [48]:
print(dataset.column_names)

['text', 'label', 'input', 'output']


In [49]:
import pyarrow

print("Dataset Columns:", dataset.column_names)
print("\nSchema:")
print(dataset._data)  # Print raw dataset schema

Dataset Columns: ['text', 'label', 'input', 'output']

Schema:
MemoryMappedTable
text: string
label: int64
input: string
output: string
----
text: [["I am still waiting on my card?","What can I do if my card still hasn't arrived after 2 weeks?","I have been waiting over a week. Is the card still coming?","Can I track my card while it is in the process of delivery?","How do I know if I will get my card, or if it is lost?",...,"I would like to know which fiat currencies are recognized by your organization.","Can you change my currency to EUR?","Are you okay with exchanges to EUR?","Can I exchange my money for EUR?","Can I have money of different country of origin."],["What are the currency types that I can keep money in.","I need assistance with understanding which fiat currencies that you support.","Could you tell me the fiat currencies that you work with?","Which fiat currencies are supported?","Which flat currencies do you support for holding and exchange?",...,"Why hasn't my top up g

In [50]:
chat_template = """Classify the customer query into one of the 77 banking categories.

### Customer Query:
{INPUT}

### Predicted Category:
{OUTPUT}"""


In [51]:
# Step 1: Create a new dataset without "conversations"
columns_to_keep = [col for col in dataset.column_names if col != "conversations"]  # Drop all "conversations"
dataset = dataset.select_columns(columns_to_keep)  # Rebuild dataset

# Step 2: Convert dataset into a structured chat format
dataset = dataset.map(lambda example: {
    "conversations": [
        {"role": "user", "content": example["text"]},  # User's query
        {"role": "assistant", "content": str(example["label"])}  # Predicted category
    ]
}, remove_columns=["text", "label", "input", "output"])  # Remove extra fields

# Step 3: Apply chat template
dataset = apply_chat_template(
    dataset,
    tokenizer=tokenizer,
    chat_template=chat_template,
)

# Step 4: Print dataset structure for final verification
print("Fixed Dataset Columns:", dataset.column_names)
pprint(dataset[:3])  # Print sample

Map: 100%|██████████| 10003/10003 [00:00<00:00, 27937.17 examples/s]
Unsloth: We automatically added an EOS token to stop endless generations.
Map: 100%|██████████| 10003/10003 [00:00<00:00, 14192.37 examples/s]

Fixed Dataset Columns: ['conversations', 'text']
{'conversations': [[{'content': 'I am still waiting on my card?',
                     'role': 'user'},
                    {'content': '11', 'role': 'assistant'}],
                   [{'content': "What can I do if my card still hasn't arrived "
                                'after 2 weeks?',
                     'role': 'user'},
                    {'content': '11', 'role': 'assistant'}],
                   [{'content': 'I have been waiting over a week. Is the card '
                                'still coming?',
                     'role': 'user'},
                    {'content': '11', 'role': 'assistant'}]],
 'text': ['<|begin_of_text|>Classify the customer query into one of the 77 '
          'banking categories.\n'
          '\n'
          '### Customer Query:\n'
          'I am still waiting on my card?\n'
          '\n'
          '### Predicted Category:\n'
          '11<|eot_id|>',
          '<|begin_of_text|>Classify the cu




In [52]:
%%time
# train the model
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 600,
        # num_train_epochs = 1, # For longer training runs!
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
    callbacks=[EC2MetricsCallback],
)

Converting train dataset to ChatML (num_proc=2): 100%|██████████| 10003/10003 [00:00<00:00, 24052.85 examples/s]
Applying chat template to train dataset (num_proc=2): 100%|██████████| 10003/10003 [00:01<00:00, 7412.49 examples/s]
Tokenizing train dataset (num_proc=2): 100%|██████████| 10003/10003 [00:01<00:00, 5849.48 examples/s]
Tokenizing train dataset (num_proc=2): 100%|██████████| 10003/10003 [00:00<00:00, 19418.76 examples/s]


CPU times: user 833 ms, sys: 394 ms, total: 1.23 s
Wall time: 4.72 s


In [None]:
%%time
# this will initiate the training process and also log the EC2 utilization metrics, such as the GPU
# utilization, CPU utilization, etc.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,003 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 600
 "-____-"     Number of trainable parameters = 41,943,040
[2025-03-18 22:28:14,076] p148384 {ec2_metrics.py:184} INFO - Training started. Initiating EC2 metrics collection.
[2025-03-18 22:28:14,076] p148384 {ec2_metrics.py:170} INFO - Writing header: ['timestamp', 'cpu_percent_mean', 'memory_percent_mean', 'memory_used_mean', 'gpu_utilization_mean', 'gpu_memory_used_mean', 'gpu_memory_free_mean', 'gpu_memory_total_mean']
[2025-03-18 22:28:14,077] p148384 {ec2_metrics.py:41} INFO - Starting collection
[2025-03-18 22:28:14,338] p148384 {ec2_metrics.py:143} INFO - Starting daemon collector to run in background


Step,Training Loss
1,3.8218
2,4.1342
3,4.0936
4,3.9428
5,3.6009
6,3.0507
7,2.664
8,2.022
9,1.654
10,1.5804


[2025-03-18 22:48:20,985] p148384 {ec2_metrics.py:191} INFO - Training ended. Stopping EC2 metrics collection.
[2025-03-18 22:48:20,985] p148384 {ec2_metrics.py:33} INFO - Stopped collection


CPU times: user 14min 42s, sys: 5min 58s, total: 20min 41s
Wall time: 20min 8s


[2025-03-18 22:48:24,388] p148384 {ec2_metrics.py:33} INFO - Stopped collection


### Log the trainer stats
---

In this step, we log some of the trainer stats, such as the number of global steps it took to get to a specific training loss, the train runtime, samples per second, steps per second, etc.

In [54]:
# Format the training stats in a readable way
output_text = f"""Training Statistics:
Global Steps: {trainer_stats.global_step}
Training Loss: {trainer_stats.training_loss:.4f}

Metrics:
- Train Runtime: {trainer_stats.metrics['train_runtime']:.3f} seconds
- Training Samples/Second: {trainer_stats.metrics['train_samples_per_second']:.3f}
- Training Steps/Second: {trainer_stats.metrics['train_steps_per_second']:.3f}
- Total FLOPS: {trainer_stats.metrics['total_flos']:.2e}
- Final Train Loss: {trainer_stats.metrics['train_loss']:.4f}
"""

# Save to a text file
with open(os.path.join(g.RESULTS_DIR, g.TRAINING_STATS), 'w') as f:
    f.write(output_text)

In [55]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
messages = [
    {"role": "user", "content": "I see a charge on my credit card statement but I paid on time, why?"},
    {"role": "user", "content": "Do you have a branch in Timbuktu?"},
    {"role": "user", "content": "I lost my card and my replacement card has not arrived."},
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 128, pad_token_id = tokenizer.eos_token_id)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


11<|eot_id|>


In [None]:
FastLanguageModel.for_inference(model)  # Enable optimized inference

messages = [
    {"role": "user", "content": "I see a charge on my credit card statement but I paid on time, why?"},
    {"role": "user", "content": "Do you have a branch in Timbuktu?"},
    {"role": "user", "content": "I lost my card and my replacement card has not arrived."},
]

# Initialize list to store results
predictions = []

# Process each query separately to prevent decoding issues
for msg in messages:
    input_ids = tokenizer.apply_chat_template(
        [msg],  # Process one input at a time
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Generate model response
    outputs = model.generate(input_ids, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id)

    # Decode the prediction
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Extract only the category (number)
    category = pred_text.split("### Predicted Category:")[-1].strip() if "### Predicted Category:" in pred_text else pred_text
    predictions.append(category if category else "<No Response>")

# Save results
with open("problem1_task1.txt", "w") as f:
    for i, msg in enumerate(messages):
        f.write(f"input: {msg['content']}\n")
        f.write(f"category: {predictions[i]}\n\n")

print(" output saved in `problem1_task1.txt`.")

Fixed output saved in `problem1_task1.txt`.


In [16]:
# save the model
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')