<a href="https://colab.research.google.com/github/goelnikhils-lgtm/languagemodels/blob/main/Finetuningusingunsloth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers}
!pip install trl peft accelerate bitsandbytes triton


In [None]:
from unsloth import FastLanguageModel
import torch

# Options for loading the pre-quantized Llama 3.1 8B Instruct checkpoint.
base_model_options = {
    "model_name": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "max_seq_length": 1024,
    "dtype": None,  # None lets Unsloth pick a dtype supported by the hardware
}

# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.4: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Load the training split of the dataset from the Hugging Face Hub.
from datasets import load_dataset

# NOTE(review): the dataset id is copied as-is; verify the spelling on the Hub.
DATASET_ID = "ai-makerspace-space/acronyms_and_intialalisms_translated"
dataset = load_dataset(DATASET_ID, split="train")

# Show the dataset size and both text fields of one sample row.
print(f"Dataset Size : {len(dataset)}")
sample_row = dataset[1]
print(sample_row["acronym_sentence"])
print(sample_row["english_translation"])


In [None]:
#create prompt
def create_prompt_with_template(example , return_response = True):
  """Render one dataset row as a Llama 3.1 chat-formatted prompt.

  The prompt has a system turn with the task instruction, a user turn with
  the English sentence, and an opened assistant turn.

  Args:
    example: mapping with an 'english_translation' key. It must also hold
      an 'acronym_sentence' key when ``return_response`` is True.
    return_response: when True (training), append the target 'text speak'
      sentence followed by ``<|end_of_text|>``. When False (inference),
      stop right after the assistant header so the model writes the reply.

  Returns:
    A dict with a single 'text' key holding the prompt string.

  Raises:
    KeyError: if a required key is missing from ``example``.
  """
  prompt_template="<|begin_of_text|>"
  prompt_template+="<|start_header_id|>system<|end_header_id|>\n\n"
  # Fixed: the end-of-turn token was written as '<|eot_id>' (missing '|'),
  # which is plain text rather than the special token.
  prompt_template+="You are provided with an English sentence , and are expected to translate it into a 'text' speak sentence<|eot_id|>"
  # Fixed: the sentence to translate now sits in its own user turn instead
  # of being appended directly after the system turn.
  prompt_template+="<|start_header_id|>user<|end_header_id|>\n\n"
  # Fixed: the role name was misspelled as 'assitant'.
  prompt_template+=f"Sentence:{example['english_translation']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
  if return_response:
    prompt_template+=f"\n{example['acronym_sentence']}<|end_of_text|>"
  return {"text":prompt_template}

In [None]:
# Preview the formatted prompt for one dataset row.
example_prompt = create_prompt_with_template(dataset[1])
example_prompt["text"]

In [None]:
# Apply the prompt builder to every row; each row gains a "text" field.
dataset = dataset.map(function=create_prompt_with_template)

In [None]:
# Attach LoRA adapters (PEFT) to the loaded model.
# Projection layers that receive adapters: attention (q/k/v/o) and MLP.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)
# TODO: compare this with regular fine-tuning done without Unsloth.

Unsloth 2025.9.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Configure the training run.
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

training_args = TrainingArguments(
    # 2 sequences per device x 4 accumulation steps = 8 per optimizer step.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # Fixed: the keyword is `num_train_epochs`; the misspelled
    # `num_train_epoch` makes TrainingArguments raise a TypeError.
    num_train_epochs = 2,
    learning_rate = 2e-4,
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 1,
    optim = "paged_adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 42,
    output_dir = "llama3_1_8b_instruct_ft",
    )

In [None]:
# Build the supervised fine-tuning trainer over the "text" column.
# NOTE(review): dataset_text_field / max_seq_length / packing / tokenizer are
# passed straight to SFTTrainer; newer TRL releases expect them on SFTConfig
# (and `processing_class` instead of `tokenizer`). Presumably Unsloth's
# patched trainer still accepts this form - confirm against the installed TRL.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 1024,
    tokenizer = tokenizer,
    args = training_args,
    packing = True,
)

# Fixed: the trainer was created but never run, so the inference cells
# below would have used an untrained adapter.
trainer_stats = trainer.train()

In [None]:
# Switch the model into Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# Build a prompt without the reference answer so the model has to write it.
prompt = create_prompt_with_template(dataset[1], return_response=False)["text"]

# NOTE(review): the prompt already starts with <|begin_of_text|>; the
# tokenizer may prepend a second BOS token - consider add_special_tokens=False
# after confirming the tokenizer's behavior.
tokenize_options = dict(
    return_tensors="pt",
    padding=True,
    max_length=1024,
    truncation=True,
)
inputs = tokenizer([prompt], **tokenize_options).to("cuda")

generation_options = dict(max_new_tokens=1024, use_cache=True)
outputs = model.generate(**inputs, **generation_options)

# Decode the first (only) sequence, prompt included.
tokenizer.batch_decode(outputs)[0]

In [None]:
# Try a sentence that the model has not been trained on.
example = {
    "english_translation" : "Nobody ever figures out what life is all about, and it doesn't matter. Explore the world. Nearly everything is really interesting if you go into it deeply enough"
}
# Build the prompt without a reference answer so the model has to write it.
prompt = create_prompt_with_template(example, return_response=False)["text"]
inputs = tokenizer(
    [prompt],
    # Fixed: the keyword is `return_tensors`; with the misspelled
    # `return_tensor` the tokenizer returns Python lists, which cannot be
    # moved to the GPU or passed to `model.generate`.
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens = 1024,
    use_cache = True,
)
# Decode the first (only) sequence, prompt included.
tokenizer.batch_decode(outputs)[0]

#export the fine tuned model on unsloth to do online inference using vLLM or llama.cpp
