In [1]:
# hides the output of the cell.
%%capture
# installs the unsloth library from GitHub. We will use this for fine-tuning
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# installs a version of the xformers package older than version 0.0.26 and trl, peft, accelerate and bitsandbytes.
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Maximum sequence length; passed to the model loader here and to the trainer later.
dtype = None # Data type for the model; None lets the loader auto-detect it.
load_in_4bit = True # Load the model with 4-bit quantization to reduce memory usage.


# Load the "unsloth/llama-3-8b-bnb-4bit" checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)



config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


PEFT (Parameter-Efficient Fine-Tuning) adapts a pre-trained language model to a specific task or dataset by training only a small number of additional parameters, instead of updating all of the model's weights.

LoRA is a parameter-efficient fine-tuning method: instead of fine-tuning all the weights in a weight matrix of the pre-trained large language model, it fine-tunes two much smaller matrices whose product approximates the update to that larger matrix.


In [3]:
model = FastLanguageModel.get_peft_model(
    model, # The base model returned by FastLanguageModel.from_pretrained
    r = 16, # LoRA rank: the inner dimension of the two low-rank adapter matrices

    # Modules to attach LoRA adapters to: the attention projections (query, key, value, output) and the MLP projections (gate, up, down)
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],

    lora_alpha = 16, # LoRA alpha hyperparameter (scaling factor applied to the low-rank update)
    lora_dropout = 0, # LoRA dropout rate, 0 for optimized performance
    bias = "none",    # Bias mode for the LoRA layers, "none" for optimized performance


    use_gradient_checkpointing = "unsloth", # Enables gradient checkpointing for memory efficiency. True or "unsloth" for very long context
    random_state = 3407, # Random seed for reproducibility
    use_rslora = False,  # Disables rank-stabilized LoRA
    loftq_config = None, # Disables LoftQ quantization
)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


We use the `yahma/alpaca-cleaned` dataset, a cleaned and filtered version of the original 52K-example Alpaca dataset.

In [4]:
# Alpaca-style prompt template with three positional slots, filled in order: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
# The EOS (end-of-sequence) token is read from the tokenizer; it is appended to every formatted training text below.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Build the training text for a batch of dataset examples.

    Args:
        examples: A batch from the dataset with "instruction", "input" and
            "output" columns, each holding one entry per example.

    Returns:
        A dict with a single "text" key holding one string per example: the
        `alpaca_prompt` template filled with the example's instruction, input
        and output, followed by `EOS_TOKEN`.
    """
    # `input_text` is used instead of `input` so the builtin is not shadowed.
    texts = [
        # The EOS token marks the end of each training text.
        alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
        for instruction, input_text, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]
    return { "text" : texts, }

from datasets import load_dataset
# Load the "train" split of the yahma/alpaca-cleaned dataset.
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
# Apply formatting_prompts_func to the dataset in batches; it returns the formatted prompts under the "text" key.
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 35 steps to speed things up, but for a full run you can set `num_train_epochs=1` and remove the `max_steps` argument (when `max_steps` is given, it overrides `num_train_epochs`). We also support TRL's `DPOTrainer`!

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments


# TRL (Transformer Reinforcement Learning) is Hugging Face's library for training language models.
# Its SFTTrainer is used here to run supervised fine-tuning on the LoRA-wrapped model.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.

    # Training arguments are created.
    args = TrainingArguments(
        per_device_train_batch_size = 2, # Number of samples per training batch on each device.
        gradient_accumulation_steps = 4, # Number of batches whose gradients are accumulated before each optimizer update.
        warmup_steps = 5, # Number of initial steps over which the learning rate is gradually increased.
        max_steps = 35, # Maximum number of training steps; overrides num_train_epochs. YOU CAN CHANGE IT
        learning_rate = 2e-4, # Learning rate
        fp16 = not torch.cuda.is_bf16_supported(), # Use FP16 (half-precision floating point) when the GPU does not support BF16.
        bf16 = torch.cuda.is_bf16_supported(), # Use BF16 (Brain Floating Point 16) when the GPU supports it.
        logging_steps = 1, # Logging interval in steps: training statistics (e.g. the loss value) are logged every step.
        optim = "adamw_8bit", # Specify the optimizer to use.
        weight_decay = 0.01, # The weight decay value is specified.
        lr_scheduler_type = "linear", # The learning-rate scheduler type is specified.
        seed = 3407, # Seed
        output_dir = "outputs", # The output directory is specified.
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


TRAIN

In [7]:
# Run the fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 35
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.8194
2,2.2928
3,1.6911
4,1.9463
5,1.6431
6,1.6018
7,1.1929
8,1.2569
9,1.1082
10,1.1673


In [35]:
# Optimizes the model for native inference and makes inference 2x faster.
FastLanguageModel.for_inference(model)
# Input text is prepared using the tokenizer.
# A one-element list of text based on the alpaca_prompt format is tokenized.
inputs = tokenizer(
[
    # The prompt is created by filling the predefined template.
    alpaca_prompt.format(
        "What is an atom? Can you give me example?", # instruction
        "", # input - Optional. Can be left blank
        "", # output - Leave this blank for generation
    )
], return_tensors = "pt").to("cuda")

# The model produces output based on the input (at most 64 new tokens).
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# The outputs produced by the model are converted to a list of texts using the tokenizer.
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is an atom? Can you give me example?\n\n### Input:\n\n\n### Response:\nAn atom is the smallest particle of an element that retains the properties of that element. An example of an atom is hydrogen, which has one proton, one electron, and no neutrons.<|end_of_text|>']

TextStreamer for continuous inference

In [36]:
# alpaca_prompt is reused from the cell where it was defined above.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is an atom? Can you give me example?", # instruction
         "",# input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
# The streamer is passed to generate() so the text is printed while it is being generated;
# the return value of generate() is therefore discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is an atom? Can you give me example?

### Input:


### Response:
An atom is the smallest particle of an element that retains the properties of that element. An example of an atom is hydrogen, which is the smallest particle of the element hydrogen and has the properties of hydrogen, such as being a gas at room temperature.<|end_of_text|>


In [None]:
# Install Gradio and import it; it is used below to build a small web UI around the model.
!pip install gradio
import gradio as gr

In [37]:
def generate_response(instruction):
  """Generate the model's answer to a single user instruction.

  The instruction is placed in the `alpaca_prompt` template with empty input
  and response slots, and the model generates up to 64 new tokens from it.

  Args:
    instruction: The question or instruction entered by the user.

  Returns:
    The generated response text, with special tokens removed and surrounding
    whitespace stripped.
  """
  formatted_prompt = alpaca_prompt.format(instruction, "", "")
  inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

  # The model is enabled to produce output based on the given input. YOU CAN CHANGE MAX TOKEN
  outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

  # generate() returns the prompt tokens followed by the newly generated tokens.
  # Only the new tokens are decoded, instead of splitting the decoded text on the
  # "### Response:" marker: a split returns the wrong span whenever the user's
  # instruction itself contains that marker.
  prompt_length = inputs["input_ids"].shape[1]
  generated_tokens = outputs[0][prompt_length:]
  return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


# Gradio Interface
# Gradio interface: one input text box and one output text box, wired to generate_response.
interface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="Enter a Question:")
    ],
    outputs=[
        gr.Textbox(label="Answer:")
    ],
    title="Question-Answering",
    description="A chat-based question answering model built with Gradio",
)

# NOTE(review): this Textbox is created but not passed to the Interface above and is not
# used anywhere else in the visible notebook — confirm whether it can be removed.
next_instruction_input = gr.Textbox(label="Enter a new question or instruction to continue:")

# Start Gradio
interface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://1c73754352e15898e3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [16]:
model.save_pretrained("lora_model") # Local saving into the "lora_model" directory
tokenizer.save_pretrained("lora_model") # The tokenizer files are saved into the same directory


# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving



('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

Now, if you want to load the LoRA adapters we just saved for inference, change `False` to `True` in the cell below:

In [39]:
# Change False to True to reload the model and tokenizer from the saved "lora_model" directory;
# otherwise the model and tokenizer already in memory are used.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")


outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# Keep only the text that follows the response marker of the prompt template.
response = response.split("### Response:\n")[1].strip()
# A cell only displays the value of its last expression; an assignment shows nothing,
# so the answer is put on the last line to make it visible.
response

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
