In [1]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install "unsloth[colab_ampere] @ git+https://github.com/unslothai/unsloth.git"
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"
pass

!pip install "git+https://github.com/huggingface/transformers.git" # Native 4bit loading works!

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096
dtype = None
load_in_4bit = True

In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/tinyllama-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
==((====))==  Unsloth: Fast Llama patching release 2024.1
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \     Pytorch: 2.1.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.22.post7. FA = False.
 "-____-"     Apache 2 free license: http://github.com/unslothai/unsloth
Unsloth: unsloth/tinyllama-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!
You passed `quantization_config` to `from_pretrained` but the model you're loa

In [6]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/tinyllama-bnb-4bit", # "unsloth/tinyllama" for 16bit loading
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    use_gradient_checkpointing = False, # With Unsloth, we can turn this off!
    random_state = 3407,
    max_seq_length = max_seq_length,
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--tinyllama-bnb-4bit/snapshots/fc56510003ea9d49362400b8a362345150802c31/config.json
Model config LlamaConfig {
  "_name_or_path": "unsloth/tinyllama-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_

In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=False,
    random_state=3407,
    max_seq_length=max_seq_length
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.1 patched 22 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [7]:
#@title Alpaca dataset preparation code
alpaca_prompt = """Respond in JSON


### Instruction:
{}

### Input:
{}

### Response:
{}
"""

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output)
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
#import your custom dataset
dataset = load_dataset("isaiahbjork/instruct-function-calling", split = "train[:1%]")
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers.utils import logging
logging.set_verbosity_info()

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = True, # Packs short sequences together to save time!
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 1, #10 was fine for me to get the model to respond with JSON but adjust this as needed
        learning_rate = 2e-5,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
trainer_stats = trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--unsloth--tinyllama/snapshots/b391f31ac8766558fa3adea224a200f0472fcba5/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--unsloth--tinyllama/snapshots/b391f31ac8766558fa3adea224a200f0472fcba5/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--unsloth--tinyllama/snapshots/b391f31ac8766558fa3adea224a200f0472fcba5/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--unsloth--tinyllama/snapshots/b391f31ac8766558fa3adea224a

Step,Training Loss
1,2.203
2,2.1539
3,2.2021
4,2.0804
5,2.0315
6,1.9966
7,2.0083
8,1.9914




Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
# Test the output of the model
inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is the famous tower in France called?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

In [10]:
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "What is the famous tower in France called?",
            "",
            ""
        )
    ]*1, return_tensors="pt"
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
tokenizer.batch_decode(outputs)

['<s> Respond in JSON\n\n\n### Instruction:\nWhat is the famous tower in France called?\n\n### Input:\n\n\n### Response:\n\n## Instruction\n## Inst: What is the tower in France called?\n## Input\n## Response\n##:\n## Inst: What the tower in called France?\n##\n## Inst: What is the tower called?\n## Response\n##: France\n##: What is the tower called?\n## Inst: What is the tower called?\n##: France: What the tower?\n\n\n##: What the tower?\n: France\n: What the?\n: France\n: What?\n: France\n: What?\n: France\n: What?\n: France\n: What?\n']

In [11]:
#import huggingface_hub
#huggingface_hub.login(token='')
#model.push_to_hub("yourname/lora_model") # Online saving
model.save_pretrained("lora_model") # Local saving

In [12]:
# Load the LoRA Adapter & Model Together to start inference
from peft import PeftModel
model = PeftModel.from_pretrained(model, "lora_model")
inputs = tokenizer(
[
    alpaca_prompt.format(
        "{'message': '', 'function_call': None, 'args':[]}", #instruction
        "What's 2+2?", # input
        "", # output
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

["<s> Respond in JSON\n\n\n### Instruction:\n{'message': '', 'function_call': None, 'args':[]}\n\n### Input:\nWhat's 2+2?\n\n### Response:\n\n{'message': '22',functioncall:None,args:[]}\n## Input: 2+2\n## Response\n{'message':2,function:None,args:[]}\n## Input 2+\nResponse\n{'message':2,function:None,args:}\n## \n# Input 2+2\n# Response\n{'message':2,function:None,args:}\n##\n# 2+2\n Response\n{'message':2,function:,args:}\n## 22\n Response\n# 2\n#\n# 2\n"]

In [13]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/tinyllama-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
from peft import PeftModel
model = PeftModel.from_pretrained(model, "isaiahbjork/tinyllama-function-calling-v0.2-adapter")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--tinyllama-bnb-4bit/snapshots/fc56510003ea9d49362400b8a362345150802c31/config.json
Model config LlamaConfig {
  "_name_or_path": "unsloth/tinyllama-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_

ValueError: 
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
                        `device_map` to `from_pretrained`. Check
                        https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                        for more details.
                        

v0.1 isaiahbjork/tinyllama-function-calling-v0.1

In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "{'message': '', 'function_call': None, 'args':[]}",
        "Whats 2+2?", # input
        "", # output
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

V0.2 isaiahbjork/tinyllama-function-calling-v0.2-adapter

In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "{'message': '', 'function_call': ''} You are a helpful assistant with access to the following functions. Use them if required -\n{\n    \"name\": \"perform_calculator_operation\",\n    \"description\": \"Perform a calculator operation\",\n    \"parameters\": {\n        \"type\": \"object\",\n        \"properties\": {\n            \"operand1\": {\n                \"type\": \"number\",\n                \"description\": \"The first operand\"\n            },\n            \"operand2\": {\n                \"type\": \"number\",\n                \"description\": \"The second operand\"\n            },\n            \"operator\": {\n                \"type\": \"string\",\n                \"description\": \"The operator (+, -, *, /)\"\n            }\n        },\n        \"required\": [\n            \"operand1\",\n            \"operand2\",\n            \"operator\"\n        ]\n    }\n}\n",
        "Whats 2+2?", # input
        "", # output
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)