In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# NOTE(review): no later cell in view reads from or writes to /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Installing Dependencies**

In [None]:
%%capture
# Installs Unsloth straight from its GitHub repository, then Xformers, TRL, PEFT,
# Accelerate and bitsandbytes without resolving their dependencies (--no-deps).
# %%capture silences pip's output for the whole cell.
# NOTE(review): nothing is pinned except the xformers upper bound, and the model-loading
# banner recorded further down reports "Xformers = None" — confirm this xformers range
# still matches the runtime's Torch build.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes
# !pip install trl
# !pip install -U bitsandbytes

## **Loading the Model**

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally
dtype = None # None for auto detection. Float16 for Tesla T4.
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the Phi-3 mini 4k instruct checkpoint and its tokenizer through Unsloth.
# max_seq_length is reused by the trainer cell, and all three settings above are
# reused when the pushed adapters are reloaded later in the notebook.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.8.9: Fast Mistral patching. Transformers: 4.55.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## **Add LoRA Adapters**

In [None]:
# Attach LoRA adapters to the loaded model; `model` is rebound to whatever
# get_peft_model returns, so later cells train and run the adapter-equipped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number >  0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj","k_proj","v_proj", "o_proj",
                      "gate_proj", "up_proj","down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 30, # same value as `seed` in the TrainingArguments cell
    use_rslora = False,
    loftq_config = None # None: no LoftQ config is supplied
)

## **Data Preparation**

Always remember to add the **EOS_TOKEN** to avoid infinite generation by the model.

In [None]:
# Alpaca-style template with three positional `{}` slots, filled in order:
# instruction, input (extra context; may be an empty string) and response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token # EOS must be added; format_prompt appends it to every training text
def format_prompt(examples):
  """Render a batch of Alpaca records into single training strings.

  Args:
    examples: Batch mapping with parallel 'instruction', 'input' and 'output'
      sequences (this function is applied with `dataset.map(..., batched=True)`).

  Returns:
    A dict with one key, "text": a list holding, for each record,
    `alpaca_prompt` filled with (instruction, input, output) followed by
    `EOS_TOKEN`.
  """
  records = zip(examples['instruction'], examples['input'], examples['output'])
  # The middle field is named `context` so the builtin `input` is not shadowed.
  texts = [
    alpaca_prompt.format(instruction, context, output) + EOS_TOKEN
    for instruction, context, output in records
  ]
  return {"text": texts, }

In [None]:
from datasets import load_dataset
# Load the train split of the "yahma/alpaca-cleaned" dataset.
dataset = load_dataset("yahma/alpaca-cleaned", split="train")

# Before formatting: display the dataset summary (features and row count)
dataset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 51760
})

In [None]:
# After formatting: format_prompt is applied batch-wise and contributes a "text" column;
# `dataset` is rebound to the mapped result.
dataset = dataset.map(format_prompt, batched=True)
dataset

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

Dataset({
    features: ['output', 'input', 'instruction', 'text'],
    num_rows: 51760
})

In [None]:
# Show the first formatted example. Indexing the row first fetches one record
# instead of going through the whole "text" column to pick a single item.
print(dataset[0]['text'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Give three tips for staying healthy.

### Input:


### Response:
1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.<|end

In [None]:
# Show example 22 — notice the EOS token ("<|endoftext|>") at the end of the text.
# Indexing the row first avoids going through the whole "text" column for one item.
print(dataset[22]['text'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Based on the information provided, rewrite the sentence by changing its tense from past to future.

### Input:
She played the piano beautifully for hours and then stopped as it was midnight.

### Response:
She will play the piano beautifully for hours and then stop as it will be midnight.<|endoftext|>


## **Setting up Weights and Biases for Logging**

In [None]:
import wandb

In [None]:
# Authenticate with Weights & Biases; the trainer is configured to report to it.
wandb.login()



True

In [None]:
import os

# Weights & Biases settings, passed through environment variables:
#   WANDB_PROJECT   - project the run is logged under
#   WANDB_LOG_MODEL - upload trained model checkpoints to W&B; "true" throws an
#                     error here, the value must be 'checkpoint' or 'end'
#   WANDB_WATCH     - turned off so logging is faster
os.environ.update({
    "WANDB_PROJECT": "Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth",
    "WANDB_LOG_MODEL": "checkpoint",
    "WANDB_WATCH": "false",
})

In [None]:
from transformers import TrainingArguments

# Mixed precision: bf16 when the GPU reports support for it, fp16 otherwise.
bf16_available = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # --- run identity and outputs ---
    run_name="Fine_Tune_Phi_3_mini_4k_instruct_model-unsloth",
    output_dir="outputs",
    report_to=["wandb"],  # reporting to the Weights and Biases project
    seed=30,
    # --- batch size (both lowered to 1 to save memory) ---
    per_device_train_batch_size=1,  # reduced from 2
    gradient_accumulation_steps=1,  # reduced from 4 to 2 to 1
    # --- schedule (step-based; num_train_epochs is the epoch-based alternative) ---
    max_steps=100,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    # --- optimizer ---
    optim="adamw_8bit",
    weight_decay=0.01,
    # --- precision ---
    fp16=not bf16_available,
    bf16=bf16_available,
    # --- logging and checkpointing ---
    logging_steps=5,
    save_strategy="steps",
    save_steps=5,
    # eval_strategy="steps" is left out: it needs an eval_dataset to be used.
)

In [None]:
from trl import SFTTrainer
# Build the supervised fine-tuning trainer over the dataset's "text" column, reusing the
# model, tokenizer, max_seq_length and training_args defined in earlier cells.
# NOTE(review): newer TRL releases appear to rename or relocate dataset_text_field,
# max_seq_length and tokenizer — confirm these keyword names against the installed TRL.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
# Baseline snapshot taken before training. Byte counts are converted to GiB
# (1024 ** 3 bytes) and rounded to 3 decimals; max_memory and start_gpu_memory
# are read again by the final-stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / (1024 ** 3), 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / (1024 ** 3), 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = Tesla T4. Max memory = 14.741 GB.
11.752 GB of memory reserved.


In [None]:
# Run fine-tuning; the returned object's `.metrics` is read by the cells that follow.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 29,884,416 of 3,850,963,968 (0.78% trained)


Step,Training Loss
5,3.2773
10,3.9343
15,2.2025
20,2.1203
25,2.1337
30,1.5683
35,0.559
40,0.4556
45,0.0112
50,0.3061


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-5)... Done. 4.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-10)... Done. 4.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-15)... Done. 2.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-20)... Done. 0.8s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.8s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-30)... Done. 0.7s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-35)... Done. 0.9s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-40)... Done. 1.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-45)... Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.9s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-55)... Done. 0.8s
[34m[1mwandb[0m: Ad

In [None]:
# Display the metrics recorded for the training run (runtime, throughput, loss).
trainer_stats.metrics

{'train_runtime': 210.709,
 'train_samples_per_second': 0.475,
 'train_steps_per_second': 0.475,
 'total_flos': 499895663892480.0,
 'train_loss': 0.8565672340616584}

In [None]:
#@title Show final memory and time stats
# Peak reserved GPU memory in GiB, and how much of it was added on top of the
# baseline stored in start_gpu_memory; percentages are relative to max_memory.
# NOTE(review): the recorded output shows a 0.0 GB training delta, which suggests the
# baseline was captured after peak usage had already been reached — confirm the
# cells ran in order on a fresh kernel.
used_memory = round(torch.cuda.max_memory_reserved() / (1024 ** 3), 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
print(
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

210.709 seconds used for training.
3.51 minutes used for training.
Peak reserved memory = 11.752 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 79.723 %.
Peak reserved memory for training % of max memory = 0.0 %.


## **Inference**

In [None]:
# `model` here is the LoRA model trained in the cells above (not the untouched base
# model); switch it to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Alpaca prompt with the response slot left empty for the model to complete.
prompt = alpaca_prompt.format(
    "List all metals in Africa?",  # instruction
    "",  # input
    "",  # response (generated by the model)
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=200, use_cache=True)
# Decoded without skip_special_tokens, so special tokens such as EOS stay visible.
decoded = tokenizer.batch_decode(outputs)
print(decoded[0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
List all metals in Africa?

### Input:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


###


You can also use a `TextStreamer` to stream the output, so you see the generation token by token instead of waiting for the whole response!

In [None]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model)

# Same instruction as before, this time with a non-empty input field.
prompt = alpaca_prompt.format(
    "List all metals in Africa?",  # instruction
    "Gold, Silver, Bronze,",  # input
    "",  # response (generated by the model)
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# The streamer prints text as it is generated; the returned token ids are discarded.
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=200)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
List all metals in Africa?

### Input:
Gold, Silver, Bronze,

### Response:
Gold, Silver, Bronze,<|endoftext|>


In [None]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model)

# Summarisation-style prompt; the response slot stays empty for the model to fill.
prompt = alpaca_prompt.format(
    "Give a brief summary about the universe",  # instruction
    "The universe is verse and big",  # input
    "",  # response (generated by the model)
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# The streamer prints text as it is generated; the returned token ids are discarded.
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=100)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Give a brief summary about the universe

### Input:
The universe is verse and big

### Response:
The universe is verse and big.<|endoftext|>


**A little tweaking**

In [None]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model)

# Shortened instruction and an input without the trailing comma.
prompt = alpaca_prompt.format(
    "List all metals",  # instruction
    "Gold, Silver, Bronze",  # input
    "",  # response (generated by the model)
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Streams prompt and completion as-is. To stream only the completion, build the
# streamer as TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True).
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=200)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
List all metals

### Input:
Gold, Silver, Bronze

### Response:
Gold, Silver, Bronze<|endoftext|>


## **Saving, Loading Finetuned models**

You can save the model locally and/or push it to the Hugging Face Hub.

In [None]:
# import os
# import sys

# google_colab = "google.colab" in sys.modules and not os.environ.get("VERTEX_PRODUCT")

# if google_colab:
#     # Use secret if running in Google Colab
#     from google.colab import userdata
#     os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
# else:
#     # Store Hugging Face data under `/content` if running in Colab Enterprise
#     if os.environ.get("VERTEX_PRODUCT") == "COLAB_ENTERPRISE":
#         os.environ["HF_HOME"] = "/content/hf"
#     # Authenticate with Hugging Face
#     from huggingface_hub import get_token
#     if get_token() is None:
#         from huggingface_hub import notebook_login
#         notebook_login()

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Read the Hugging Face token from Colab's user secrets and log in only when the
# value is non-empty; hf_token is reused by the push and reload cells.
hf_token = userdata.get('HF_TOKEN')
if not hf_token:
    print("Token is not set. Please save the token first.")
else:
    login(hf_token)
    print("Successfully logged in to Hugging Face!")

Successfully logged in to Hugging Face!


In [None]:
# Local saving (disabled):
#   model.save_pretrained("Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth")
#   tokenizer.save_pretrained("Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth")
#
# Pushing to Hugging Face: first create the model card on Hugging Face, copy the
# repo name and paste it below; after that this cell can be run. The model and
# the tokenizer are pushed to the same repository.
lora_repo_id = "DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model"
model.push_to_hub(lora_repo_id, token=hf_token)
tokenizer.push_to_hub(lora_repo_id, token=hf_token)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...pdsewk2xe/adapter_model.safetensors:   0%|          | 29.3kB /  120MB            

Saved model to https://huggingface.co/DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmp8c21wcmj/tokenizer.model      : 100%|##########|  500kB /  500kB            

No files have been modified since last commit. Skipping to prevent empty commit.


To load the LoRA adapters we just saved for inference, keep the `if True:` guard in the next cell (set it to `False` to skip reloading):

In [None]:
# Reload the pushed LoRA adapter repository for inference (toggle with the `if`).
# `model` and `tokenizer` are rebound, replacing the in-memory objects used for training;
# max_seq_length, dtype, load_in_4bit and hf_token come from earlier cells.
if True:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        token = hf_token
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference


==((====))==  Unsloth 2025.8.9: Fast Mistral patching. Transformers: 4.55.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/120M [00:00<?, ?B/s]

In [None]:
# Requires alpaca_prompt from the Data Preparation section to be defined in this session.
# The input sentence matches training example 22 printed earlier.
prompt = alpaca_prompt.format(
    "Based on the information provided, rewrite the sentence by changing its tense from past to future.?",  # instruction
    "She played the piano beautifully for hours and then stopped as it was midnight.",  # input
    "",  # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

decoded = tokenizer.batch_decode(outputs)
print(decoded[0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Based on the information provided, rewrite the sentence by changing its tense from past to future.?

### Input:
She played the piano beautifully for hours and then stopped as it was midnight.

### Response:
She will play the piano beautifully for hours and then stop as it is midnight.<|endoftext|>


In [None]:
# Requires alpaca_prompt from the Data Preparation section to be defined in this session.
prompt = alpaca_prompt.format(
    "List all metals",  # instruction
    "Gold, Silver, Bronze,",  # input
    "",  # response (generated by the model)
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

decoded = tokenizer.batch_decode(outputs)
print(decoded[0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
List all metals

### Input:
Gold, Silver, Bronze,

### Response:
Gold, Silver, Bronze, Copper, Iron, Aluminum, Zinc, Lead, Mercury, Tin, Nickel, Titanium, Vanadium, Chromium, Cobalt, Molybdenum, Niobium, Rhenium, Ruthenium


In [None]:
# Requires alpaca_prompt from the Data Preparation section to be defined in this session.
# Instruction-only prompt: both the input and the response slots are left empty.
prompt = alpaca_prompt.format(
    "What is a famous tall tower in Paris?",  # instruction
    "",  # input
    "",  # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

decoded = tokenizer.batch_decode(outputs)
print(decoded[0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is a famous tall tower in Paris?

### Input:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:


### Response:




The model keeps repeating `### Response:` instead of answering — most likely a sign of overfitting.

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [None]:
# Saving to 8bit (no quantization_method is passed; the recorded run converted to q8_0)
# NOTE(review): the recorded run of this cell stopped during GGUF conversion with a
# protobuf "Descriptors cannot be created directly" TypeError. The error text lists two
# workarounds (protobuf 3.20.x or lower, or PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python) —
# confirm one of them in this runtime before relying on this upload.
# if False: model.save_pretrained_gguf("Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-modelgguf",tokenizer)
if True: model.push_to_hub_gguf("DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model-gguf", tokenizer, token=hf_token)


# # Save to 16bit GGUF
# if False: model.save_pretrained_gguf("Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-modelgguf", tokenizer, quantization_method = "f16")
# if False: model.push_to_hub_gguf("DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model-gguf", tokenizer, quantization_method = "f16", token=hf_token)

# # Save to q4_k_m GGUF
# if False: model.save_pretrained_gguf("Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-modelgguf", tokenizer, quantization_method = "q4_k_m")
# if False: model.push_to_hub_gguf("DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model-gguf", tokenizer, quantization_method = "q4_k_m", token=hf_token)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.3G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.18 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:01<00:00, 18.16it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model-gguf/pytorch_model-00001-of-00002.bin...
Unsloth: Saving DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model-gguf/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting mistral model. Can use fast conversion = True.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model-gguf into q8_0 GGUF format.
The output location will be /content/DannyAI/Fine-Tune-Phi-3-mini-4k-instruct-model-unsloth-lora-model-gguf/unsloth.Q8_0.gguf
This might take 3 minutes...


TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [None]:
# # Downgrade protobuf to a compatible version
# !pip install protobuf==3.20.3

[Microsoft-phi-cook-book-fine-tuning-github](https://github.com/microsoft/PhiCookBook/blob/main/code/03.Finetuning/Phi-3-finetune-qlora-python.ipynb)