### Installation

In [None]:
%%capture
# Install Unsloth and vLLM for this kernel; the cell magic above keeps the
# installer output out of the notebook.
import os
# Colab is detected by looking for "COLAB_" in the joined names of the
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Elsewhere use [[pip install unsloth vllm]]
    # `--no-deps` installs the two packages without pulling in their dependencies.
    !pip install --no-deps unsloth vllm

In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
import os
# Colab is detected by looking for "COLAB_" in the joined names of the
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not running in Colab: regular install.
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Elsewhere use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm
    # Skip restarting message in Colab
    # Every already-imported module whose name contains "PIL" or "google" is
    # removed from sys.modules (the key list is snapshotted first so the dict
    # is not mutated while being iterated).
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    # Installed without dependencies; xformers and trl are pinned to exact versions.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    # Fetch vLLM's common requirements from the main branch, strip the
    # transformers / numpy / xformers entries, and install what is left.
    # NOTE(review): the response status is not checked and no timeout is set,
    # so a failed download would be written to the requirements file as-is.
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        # Removes everything from the package name through the end of that line.
        # NOTE(review): the pattern is not anchored to the start of a line, so a
        # name appearing mid-line would truncate that line -- confirm this is fine
        # for the current contents of common.txt.
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

### Unsloth

`FastModel` supports loading nearly any model now! This includes Vision and Text models!

In [None]:
from unsloth import FastModel
import torch

# Loader settings are collected in one mapping so they can be inspected or
# tweaked without touching the call itself.
load_options = dict(
    model_name="unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    max_seq_length=512,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
)

# Returns the model together with its matching tokenizer.
model, tokenizer = FastModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-03 08:58:16 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-03 08:58:16 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.4.7: Fast Gemma3 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update a small number of parameters!

In [None]:
# Which parts of the network receive adapters.
lora_targets = dict(
    finetune_vision_layers=False,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
)

# Adapter hyperparameters; `random_state` is fixed for repeatable initialisation.
lora_hyperparams = dict(
    r=8,
    lora_alpha=8,
    lora_dropout=0,
    bias="none",
    random_state=42,
)

# Wrap the loaded model; `model` now refers to the adapter-augmented version.
model = FastModel.get_peft_model(model, **lora_targets, **lora_hyperparams)

Unsloth: Making `model.base_model.model.model` require gradients


<a name="Data"></a>
### Data Prep


In [None]:
import pandas as pd
from datasets import Dataset

# Each row of the text file holds two fields separated by the literal marker
# '<--->'. pandas treats a multi-character separator as a regular expression,
# which requires the Python parsing engine.
df = pd.read_csv('dataset_light.txt', sep='<--->', engine='python')

# Fail early with a readable message: the prompt-formatting step indexes the
# batch by exactly these two column names.
assert {'ORIGINAL', 'TRANSLATED'} <= set(df.columns), (
    f"dataset_light.txt must provide ORIGINAL and TRANSLATED columns, got {list(df.columns)}"
)

print(df.shape)
dataset = Dataset.from_pandas(df)

(1786, 2)


In [None]:
# Instruction template: the first slot receives the source passage, the second
# the reference translation (or an empty string at inference time).
prompt = """Translate the text I will give you in PIECE to modern and simpler english, easily understandable also for non-native speakers,
keeping the meaning intact. If the text is already modern and simple, return it as is. Do not add any introduction, extra text or
explanation, just answer with the translation.

### PIECE:
{}

### TRANSLATION:
{}"""

# Appended to every training example so the text ends with the tokenizer's
# end-of-sequence marker.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(data):
    """Render one batch of rows into training strings.

    `data` is a batch indexed by column name; the 'ORIGINAL' and 'TRANSLATED'
    columns are paired element-wise, filled into `prompt`, and suffixed with
    `EOS_TOKEN`. Returns a dict with a single "text" key holding the list of
    rendered strings.
    """
    rendered = [
        prompt.format(piece, translation) + EOS_TOKEN
        for piece, translation in zip(data['ORIGINAL'], data['TRANSLATED'])
    ]
    return {"text" : rendered}

# Batched mapping adds the "text" column to the dataset.
dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/1786 [00:00<?, ? examples/s]

In [None]:
# Baseline generation on a single sample passage.
sample_piece = "And therefore is wing'd Cupid painted blind."
# Longer alternative sample:
# "Love looks not with the eyes, but with the mind; And therefore is wing'd Cupid painted blind. Nor hath love's mind of any judgment taste; Wings and no eyes figure unheedy haste: And therefore is love said to be a child, Because in choice he is so oft beguil'd."

# The translation slot is left empty so the model has to produce it.
prompt_batch = [prompt.format(sample_piece, "")]
inputs = tokenizer(prompt_batch, return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=1024, use_cache=True)
# Last expression of the cell: the decoded text is displayed as the cell output.
tokenizer.batch_decode(outputs)

["<bos>Translate the text I will give you in PIECE to modern and simpler english, easily understandable also for non-native speakers,\nkeeping the meaning intact. If the text is already modern and simple, return it as is. Do not add any introduction, extra text or\nexplanation, just answer with the translation.\n\n### PIECE:\nAnd therefore is wing'd Cupid painted blind.\n\n### TRANSLATION:\nBecause of that, Cupid's wings are painted in the dark.<end_of_turn>"]

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We train for one full epoch (`num_train_epochs=1`, which is 223 optimizer steps on this dataset); for a quicker trial run you can instead cap training with `max_steps=60`.

In [None]:
from trl import SFTTrainer, SFTConfig

# Shuffle with a fixed seed so the example order -- and therefore the loss
# curve -- is the same on every fresh run. Without a seed, `shuffle()` draws
# new entropy each time. 42 matches the seed passed to SFTConfig below.
dataset = dataset.shuffle(seed=42)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=None,  # no evaluation split is used in this notebook
    args=SFTConfig(
        dataset_text_field="text",  # column produced by the prompt-formatting step
        # 2 examples per device x 4 accumulation steps = 8 examples per optimizer step
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=1,
        learning_rate=2e-5,
        logging_steps=1,  # log the training loss at every step
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=42,
        report_to="none",  # do not send metrics to any external tracker
        dataset_num_proc=1,
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"]:   0%|          | 0/1786 [00:00<?, ? examples/s]

Let's train the model! To resume a training run, call `trainer.train(resume_from_checkpoint = True)`.

In [None]:
# Run the fine-tuning loop; whatever `trainer.train()` returns is kept in
# `trainer_stats` for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,786 | Num Epochs = 1 | Total steps = 223
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 6,522,880/1,000,000,000 (0.65% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,4.2371
2,4.3638
3,4.2025
4,4.3311
5,4.177
6,4.2801
7,4.1328
8,4.118
9,4.0654
10,3.9958


<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`

In [None]:
# Generate a translation for a single sample passage with the current model.
sample_piece = "And therefore is wing'd Cupid painted blind."
# Longer alternative sample:
# "Love looks not with the eyes, but with the mind; And therefore is wing'd Cupid painted blind. Nor hath love's mind of any judgment taste; Wings and no eyes figure unheedy haste: And therefore is love said to be a child, Because in choice he is so oft beguil'd."

# The translation slot is left empty so the model has to produce it.
prompt_batch = [prompt.format(sample_piece, "")]
inputs = tokenizer(prompt_batch, return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=1024, use_cache=True)
# Last expression of the cell: the decoded text is displayed as the cell output.
tokenizer.batch_decode(outputs)

["<bos>Translate the text I will give you in PIECE to modern and simpler english, easily understandable also for non-native speakers,\nkeeping the meaning intact. If the text is already modern and simple, return it as is. Do not add any introduction, extra text or\nexplanation, just answer with the translation.\n\n### PIECE:\nAnd therefore is wing'd Cupid painted blind.\n\n### TRANSLATION:\nSo, Cupid has been painted with blind eyes.<end_of_turn>"]

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
import getpass
import os

# Hub repository that receives both the LoRA adapter weights and the tokenizer.
HUB_REPO_ID = 'davide710/pf_gemma_1b_v3'

# Never store the access token in the notebook itself: a shared or committed
# notebook would leak it. Read it from the HF_TOKEN environment variable and,
# if that is not set, ask for it interactively without echoing the input.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

model.push_to_hub(HUB_REPO_ID, token=hf_token)
tokenizer.push_to_hub(HUB_REPO_ID, token=hf_token)

README.md:   0%|          | 0.00/610 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/26.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/davide710/pf_gemma_1b_v3


  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]