In [1]:
import os
import logging
import globals as g
from dotenv import load_dotenv
from unsloth import to_sharegpt
from datasets import load_dataset
from unsloth import FastLanguageModel
from unsloth import standardize_sharegpt
from ec2_metrics import EC2MetricsCallback

# Create a logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Remove existing handlers
logger.handlers.clear()

# Add a simple handler
handler = logging.StreamHandler()
formatter = logging.Formatter('[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


2025-03-17 10:37:19,784	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [None]:
# Load environment variables from .env file
import getpass
load_dotenv()
# if not os.getenv("HF_TOKEN"):
#     os.environ["HF_TOKEN"] = getpass.getpass("Enter your HuggingFace token: ")
# hf_token = os.getenv("HF_TOKEN")
os.environ["HF_TOKEN"] = ""
hf_token = ""
# ""

# if not os.getenv("HF_MODEL_ID"):
#     hf_model_id  = input("Enter the model id to use for fine-tuning (e.g. meta-llama/Llama-3.1-8B-Instruct): ")
# else:
#     hf_model_id = os.getenv("HF_MODEL_ID")
# logger.info(f"hf_model_id={hf_model_id}")
os.environ["HF_MODEL_ID"] = "meta-llama/Llama-3.2-1B-Instruct"
hf_model_id = "meta-llama/Llama-3.2-1B-Instruct"

# "meta-llama/Llama-3.2-1B-Instruct"

In [3]:
from datasets import load_dataset
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

DATASET_OF_INTEREST: str = 'mteb/banking77'

ALPACA_PROMPT: str = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [4]:
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = hf_model_id,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        token = hf_token # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
except Exception as e:
    logger.error(f"Error occurred while loading the model: {e}")
    raise

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.045 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.2.15 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [6]:
dataset = load_dataset(DATASET_OF_INTEREST, split="train", trust_remote_code=True)
logger.info(f"Columns in the dataset: {dataset.column_names}")
dataset = dataset.map(lambda x: {**x, "instructions": ""})
dataset

[2025-03-17 10:37:30,901] p15964 {1272966976.py:2} INFO - Columns in the dataset: ['text', 'label', 'label_text']



Map:   0%|                                                                                                                                                             | 0/10003 [00:00<?, ? examples/s]


Map:  26%|█████████████████████████████████████▌                                                                                                         | 2630/10003 [00:00<00:00, 26130.75 examples/s]


Map:  54%|████████████████████████████████████████████████████████████████████████████▊                                                                  | 5369/10003 [00:00<00:00, 26858.94 examples/s]


Map:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 8524/10003 [00:00<00:00, 23608.30 examples/s]


Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 23310.17 examples/s]




Dataset({
    features: ['text', 'label', 'label_text', 'instructions'],
    num_rows: 10003
})

In [7]:
dataset = to_sharegpt(
    dataset,
    merged_prompt="{text}[[\nYour input is:\n{instructions}]]",
    output_column_name="label",
    conversation_extension=3,
)
# Use the standardize_sharegpt function to just make the dataset in a correct format for finetuning
dataset = standardize_sharegpt(dataset)
from pprint import pprint
pprint(dataset[:3])


Merging columns:   0%|                                                                                                                                                 | 0/10003 [00:00<?, ? examples/s]


Merging columns: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 179246.21 examples/s]





Converting to ShareGPT:   0%|                                                                                                                                          | 0/10003 [00:00<?, ? examples/s]


Converting to ShareGPT: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 95828.04 examples/s]


Converting to ShareGPT: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 93916.13 examples/s]





Flattening the indices:   0%|                                                                                                                                          | 0/10003 [00:00<?, ? examples/s]


Flattening the indices: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 801169.09 examples/s]





Flattening the indices:   0%|                                                                                                                                          | 0/10003 [00:00<?, ? examples/s]


Flattening the indices:  30%|█████████████████████████████████████▏                                                                                      | 3000/10003 [00:00<00:00, 15977.29 examples/s]


Flattening the indices:  50%|█████████████████████████████████████████████████████████████▉                                                              | 5000/10003 [00:00<00:00, 13135.92 examples/s]


Flattening the indices:  70%|██████████████████████████████████████████████████████████████████████████████████████▊                                     | 7000/10003 [00:00<00:00, 12222.97 examples/s]


Flattening the indices:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 9000/10003 [00:00<00:00, 11798.43 examples/s]


Flattening the indices: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 11949.23 examples/s]





Flattening the indices:   0%|                                                                                                                                          | 0/10003 [00:00<?, ? examples/s]


Flattening the indices:  30%|█████████████████████████████████████▏                                                                                      | 3000/10003 [00:00<00:00, 16522.31 examples/s]


Flattening the indices:  50%|█████████████████████████████████████████████████████████████▉                                                              | 5000/10003 [00:00<00:00, 13567.62 examples/s]


Flattening the indices:  70%|██████████████████████████████████████████████████████████████████████████████████████▊                                     | 7000/10003 [00:00<00:00, 12611.95 examples/s]


Flattening the indices:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 9000/10003 [00:00<00:00, 12133.39 examples/s]


Flattening the indices: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 12250.87 examples/s]





Extending conversations:   0%|                                                                                                                                         | 0/10003 [00:00<?, ? examples/s]


Extending conversations:  40%|█████████████████████████████████████████████████▏                                                                         | 4000/10003 [00:00<00:00, 29410.18 examples/s]


Extending conversations:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 8000/10003 [00:00<00:00, 29750.49 examples/s]


Extending conversations: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 26622.99 examples/s]





Standardizing format:   0%|                                                                                                                                            | 0/10003 [00:00<?, ? examples/s]


Standardizing format:  40%|██████████████████████████████████████████████████▍                                                                           | 4000/10003 [00:00<00:00, 32183.54 examples/s]


Standardizing format:  80%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 8000/10003 [00:00<00:00, 32976.43 examples/s]


Standardizing format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 27417.44 examples/s]

{'conversations': [[{'content': 'I am still waiting on my card?',
                     'role': 'user'},
                    {'content': '11', 'role': 'assistant'},
                    {'content': 'How can I convert currencies?',
                     'role': 'user'},
                    {'content': '33', 'role': 'assistant'},
                    {'content': 'Will be Apple Watch be able to let me top up?',
                     'role': 'user'},
                    {'content': '2', 'role': 'assistant'}],
                   [{'content': "What can I do if my card still hasn't arrived "
                                'after 2 weeks?',
                     'role': 'user'},
                    {'content': '11', 'role': 'assistant'},
                    {'content': 'Hello. Can you help figure out why the recent '
                                "transfer from my UK bank account isn't "
                                'showing up?',
                     'role': 'user'},
                    {'con




In [8]:
chat_template = """Below are some instructions that describe some tasks. Write responses that appropriately complete each request.

### Instruction:
{INPUT}

### Response:
{OUTPUT}"""

from unsloth import apply_chat_template

dataset = apply_chat_template(
    dataset,
    tokenizer=tokenizer,
    chat_template=chat_template
)


Unsloth: We automatically added an EOS token to stop endless generations.



Map:   0%|                                                                                                                                                             | 0/10003 [00:00<?, ? examples/s]


Map:  20%|████████████████████████████▌                                                                                                                  | 2000/10003 [00:00<00:00, 18709.79 examples/s]


Map:  50%|███████████████████████████████████████████████████████████████████████▍                                                                       | 5000/10003 [00:00<00:00, 19537.76 examples/s]


Map:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 8000/10003 [00:00<00:00, 19729.91 examples/s]


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 10000/10003 [00:00<00:00, 19685.95 examples/s]


Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 15331.95 examples/s]




In [9]:
dataset[0]['conversations']

[{'content': 'I am still waiting on my card?', 'role': 'user'},
 {'content': '11', 'role': 'assistant'},
 {'content': 'How can I convert currencies?', 'role': 'user'},
 {'content': '33', 'role': 'assistant'},
 {'content': 'Will be Apple Watch be able to let me top up?', 'role': 'user'},
 {'content': '2', 'role': 'assistant'}]

In [10]:
%%time
# train the model
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        max_steps = 600,
        num_train_epochs = 1, # For longer training runs!
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
    callbacks=[EC2MetricsCallback],
)


Converting train dataset to ChatML (num_proc=2):   0%|                                                                                                                 | 0/10003 [00:00<?, ? examples/s]


Converting train dataset to ChatML (num_proc=2):  20%|███████████████████▋                                                                               | 1983/10003 [00:00<00:00, 13451.09 examples/s]


Converting train dataset to ChatML (num_proc=2):  48%|███████████████████████████████████████████████▊                                                   | 4828/10003 [00:00<00:00, 20211.29 examples/s]


Converting train dataset to ChatML (num_proc=2):  75%|██████████████████████████████████████████████████████████████████████████▋                        | 7548/10003 [00:00<00:00, 23038.04 examples/s]


Converting train dataset to ChatML (num_proc=2): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 19844.91 examples/s]


Converting train dataset to ChatML (num_proc=2): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 17160.92 examples/s]





Applying chat template to train dataset (num_proc=2):   0%|                                                                                                            | 0/10003 [00:00<?, ? examples/s]


Applying chat template to train dataset (num_proc=2):   4%|████▎                                                                                            | 446/10003 [00:00<00:15, 609.63 examples/s]


Applying chat template to train dataset (num_proc=2):  18%|████████████████▊                                                                              | 1773/10003 [00:00<00:03, 2601.57 examples/s]


Applying chat template to train dataset (num_proc=2):  38%|████████████████████████████████████▏                                                          | 3811/10003 [00:00<00:01, 5564.67 examples/s]


Applying chat template to train dataset (num_proc=2):  54%|███████████████████████████████████████████████████▌                                           | 5431/10003 [00:01<00:00, 7670.09 examples/s]


Applying chat template to train dataset (num_proc=2):  69%|█████████████████████████████████████████████████████████████████▉                             | 6937/10003 [00:01<00:00, 9303.49 examples/s]


Applying chat template to train dataset (num_proc=2):  91%|█████████████████████████████████████████████████████████████████████████████████████▍        | 9090/10003 [00:01<00:00, 11699.68 examples/s]


Applying chat template to train dataset (num_proc=2): 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:01<00:00, 6786.25 examples/s]





Tokenizing train dataset (num_proc=2):   0%|                                                                                                                           | 0/10003 [00:00<?, ? examples/s]


Tokenizing train dataset (num_proc=2):   2%|██▍                                                                                                             | 216/10003 [00:00<00:35, 278.11 examples/s]


Tokenizing train dataset (num_proc=2):   9%|█████████▌                                                                                                     | 863/10003 [00:00<00:07, 1222.18 examples/s]


Tokenizing train dataset (num_proc=2):  16%|█████████████████▌                                                                                            | 1599/10003 [00:00<00:03, 2287.62 examples/s]


Tokenizing train dataset (num_proc=2):  24%|██████████████████████████▋                                                                                   | 2424/10003 [00:01<00:02, 3488.47 examples/s]


Tokenizing train dataset (num_proc=2):  32%|███████████████████████████████████▌                                                                          | 3238/10003 [00:01<00:01, 4526.88 examples/s]


Tokenizing train dataset (num_proc=2):  40%|████████████████████████████████████████████▌                                                                 | 4051/10003 [00:01<00:01, 5394.42 examples/s]


Tokenizing train dataset (num_proc=2):  49%|██████████████████████████████████████████████████████▍                                                       | 4949/10003 [00:01<00:00, 6212.19 examples/s]


Tokenizing train dataset (num_proc=2):  58%|███████████████████████████████████████████████████████████████▌                                              | 5781/10003 [00:01<00:00, 6762.89 examples/s]


Tokenizing train dataset (num_proc=2):  66%|████████████████████████████████████████████████████████████████████████▊                                     | 6625/10003 [00:01<00:00, 7223.03 examples/s]


Tokenizing train dataset (num_proc=2):  75%|██████████████████████████████████████████████████████████████████████████████████▉                           | 7542/10003 [00:01<00:00, 7578.83 examples/s]


Tokenizing train dataset (num_proc=2):  85%|█████████████████████████████████████████████████████████████████████████████████████████████                 | 8459/10003 [00:01<00:00, 8022.28 examples/s]


Tokenizing train dataset (num_proc=2):  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 9519/10003 [00:01<00:00, 7822.05 examples/s]


Tokenizing train dataset (num_proc=2): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:02<00:00, 4675.34 examples/s]





Tokenizing train dataset (num_proc=2):   0%|                                                                                                                           | 0/10003 [00:00<?, ? examples/s]


Tokenizing train dataset (num_proc=2):  12%|█████████████▎                                                                                                | 1212/10003 [00:00<00:00, 9193.89 examples/s]


Tokenizing train dataset (num_proc=2):  25%|███████████████████████████▌                                                                                 | 2534/10003 [00:00<00:00, 11204.35 examples/s]


Tokenizing train dataset (num_proc=2):  45%|█████████████████████████████████████████████████▌                                                           | 4547/10003 [00:00<00:00, 14195.56 examples/s]


Tokenizing train dataset (num_proc=2):  66%|███████████████████████████████████████████████████████████████████████▍                                     | 6557/10003 [00:00<00:00, 15675.37 examples/s]


Tokenizing train dataset (num_proc=2):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▎               | 8568/10003 [00:00<00:00, 16723.80 examples/s]


Tokenizing train dataset (num_proc=2): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10003/10003 [00:00<00:00, 13853.26 examples/s]

CPU times: user 841 ms, sys: 347 ms, total: 1.19 s
Wall time: 5.85 s





In [11]:
%%time
# this will initiate the training process and also log the EC2 utilization metrics, such as the GPU
# utilization, CPU utilization, etc.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,003 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 600
 "-____-"     Number of trainable parameters = 11,272,192


[2025-03-17 10:37:41,735] p15964 {ec2_metrics.py:184} INFO - Training started. Initiating EC2 metrics collection.


[2025-03-17 10:37:41,736] p15964 {ec2_metrics.py:170} INFO - Writing header: ['timestamp', 'cpu_percent_mean', 'memory_percent_mean', 'memory_used_mean', 'gpu_utilization_mean', 'gpu_memory_used_mean', 'gpu_memory_free_mean', 'gpu_memory_total_mean']


[2025-03-17 10:37:41,737] p15964 {ec2_metrics.py:41} INFO - Starting collection


[2025-03-17 10:37:42,018] p15964 {ec2_metrics.py:143} INFO - Starting daemon collector to run in background


Step,Training Loss
1,3.5523
2,3.7647
3,3.7465
4,3.6506
5,3.5389
6,3.1787
7,2.9846
8,2.7375
9,2.4555
10,2.3794


[2025-03-17 10:49:02,505] p15964 {ec2_metrics.py:191} INFO - Training ended. Stopping EC2 metrics collection.


[2025-03-17 10:49:02,506] p15964 {ec2_metrics.py:33} INFO - Stopped collection


CPU times: user 11min 19s, sys: 11.6 s, total: 11min 30s
Wall time: 11min 21s


In [12]:

output_text = f"""Training Statistics:
Global Steps: {trainer_stats.global_step}
Training Loss: {trainer_stats.training_loss:.4f}

Metrics:
- Train Runtime: {trainer_stats.metrics['train_runtime']:.3f} seconds
- Training Samples/Second: {trainer_stats.metrics['train_samples_per_second']:.3f}
- Training Steps/Second: {trainer_stats.metrics['train_steps_per_second']:.3f}
- Total FLOPS: {trainer_stats.metrics['total_flos']:.2e}
- Final Train Loss: {trainer_stats.metrics['train_loss']:.4f}
"""

# Save to a text file
with open(os.path.join(g.RESULTS_DIR, g.TRAINING_STATS), 'w') as f:
    f.write(output_text)

In [13]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
messages = [                    # Change below!
    {"role": "user", "content": "I see a charge on my credit card statement but I paid on time, why?"},
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
model = model.to("cuda")
_ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 128, pad_token_id = tokenizer.eos_token_id)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


28<|eot_id|>


In [14]:
question_list = ['I see a charge on my credit card statement but I paid on time, why?',
'Do you have a branch in Timbuktu?',
'I lost my card and my replacement card has not arrived.']

output_file = "problem1_task1.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for i in question_list:
        messages = [                    
            {"role": "user", "content": i},
        ]
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        # Move model to CUDA
        model = model.to("cuda")

        # Generate output
        output_ids = model.generate(input_ids, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id)
        response_text = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)

        # Prepare output text
        output_str = f"input: {i}\ncategory: {response_text}\n\n"

        # Print and save to file
        print(output_str)
        f.write(output_str)

input: I see a charge on my credit card statement but I paid on time, why?
category: 15


input: Do you have a branch in Timbuktu?
category: 76




input: I lost my card and my replacement card has not arrived.
category: 11


