In [19]:
import torch; torch.version.cuda

'12.1'

In [20]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

max_seq_length = 2048

# 4bit pre quantized models we support - 4x faster downloading!
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
]

In [21]:
# Load Llama model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/tinyllama-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.


In [22]:

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    max_seq_length = max_seq_length,
)

In [27]:
#@title Alpaca dataset preparation code
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples):
    instructions = examples["system_prompt"]
    inputs       = examples["question"]
    outputs      = examples["response"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output)
        texts.append(text)
    return { "text" : texts, }
pass


In [25]:
dataset = load_dataset("Open-Orca/OpenOrca", split = "train")

Downloading readme: 100%|██████████| 12.0k/12.0k [00:00<00:00, 43.2MB/s]
Downloading data: 100%|██████████| 1.01G/1.01G [01:30<00:00, 11.1MB/s]
Downloading data: 100%|██████████| 3.09G/3.09G [06:26<00:00, 8.00MB/s]
Generating train split: 4233923 examples [00:15, 269935.25 examples/s]
Map:   0%|          | 0/4233923 [00:00<?, ? examples/s]


KeyError: 'reponse'

In [28]:
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map: 100%|██████████| 4233923/4233923 [00:57<00:00, 73116.38 examples/s] 


In [30]:
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    packing=False,
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 100,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
        seed = 3407,
    ),
)

Map:  60%|██████    | 2561000/4233923 [10:09<06:34, 4239.63 examples/s]

In [17]:
trainer.train()

  2%|▏         | 1/60 [00:01<01:24,  1.43s/it]

{'loss': 1.1693, 'learning_rate': 5e-06, 'epoch': 0.0}


  3%|▎         | 2/60 [00:02<01:17,  1.34s/it]

{'loss': 1.2413, 'learning_rate': 1e-05, 'epoch': 0.0}


  5%|▌         | 3/60 [00:04<01:27,  1.53s/it]

{'loss': 1.1137, 'learning_rate': 1.5e-05, 'epoch': 0.0}


  7%|▋         | 4/60 [00:05<01:21,  1.46s/it]

{'loss': 1.0858, 'learning_rate': 2e-05, 'epoch': 0.0}


  8%|▊         | 5/60 [00:07<01:18,  1.42s/it]

{'loss': 1.128, 'learning_rate': 2.5e-05, 'epoch': 0.0}


 10%|█         | 6/60 [00:08<01:12,  1.34s/it]

{'loss': 1.188, 'learning_rate': 3e-05, 'epoch': 0.0}


 12%|█▏        | 7/60 [00:09<01:14,  1.41s/it]

{'loss': 1.2689, 'learning_rate': 3.5e-05, 'epoch': 0.0}


 13%|█▎        | 8/60 [00:11<01:10,  1.36s/it]

{'loss': 1.1552, 'learning_rate': 4e-05, 'epoch': 0.0}


 15%|█▌        | 9/60 [00:12<01:08,  1.35s/it]

{'loss': 1.2095, 'learning_rate': 4.5e-05, 'epoch': 0.0}


 17%|█▋        | 10/60 [00:13<01:05,  1.31s/it]

{'loss': 1.1787, 'learning_rate': 5e-05, 'epoch': 0.0}


 18%|█▊        | 11/60 [00:15<01:04,  1.31s/it]

{'loss': 1.0598, 'learning_rate': 4.9e-05, 'epoch': 0.0}


 20%|██        | 12/60 [00:16<01:02,  1.29s/it]

{'loss': 1.093, 'learning_rate': 4.8e-05, 'epoch': 0.0}


 22%|██▏       | 13/60 [00:17<01:06,  1.41s/it]

{'loss': 1.1062, 'learning_rate': 4.7e-05, 'epoch': 0.0}


 23%|██▎       | 14/60 [00:19<01:03,  1.38s/it]

{'loss': 1.2354, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.0}


 25%|██▌       | 15/60 [00:20<00:55,  1.22s/it]

{'loss': 0.9621, 'learning_rate': 4.5e-05, 'epoch': 0.0}


 27%|██▋       | 16/60 [00:21<00:52,  1.20s/it]

{'loss': 1.1109, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.0}


 28%|██▊       | 17/60 [00:22<00:54,  1.26s/it]

{'loss': 1.2944, 'learning_rate': 4.3e-05, 'epoch': 0.0}


 30%|███       | 18/60 [00:24<00:57,  1.38s/it]

{'loss': 1.0582, 'learning_rate': 4.2e-05, 'epoch': 0.0}


 32%|███▏      | 19/60 [00:25<00:51,  1.26s/it]

{'loss': 0.727, 'learning_rate': 4.1e-05, 'epoch': 0.0}


 33%|███▎      | 20/60 [00:26<00:49,  1.23s/it]

{'loss': 0.8982, 'learning_rate': 4e-05, 'epoch': 0.0}


 35%|███▌      | 21/60 [00:27<00:50,  1.29s/it]

{'loss': 1.0957, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.0}


 37%|███▋      | 22/60 [00:29<00:49,  1.29s/it]

{'loss': 1.2537, 'learning_rate': 3.8e-05, 'epoch': 0.0}


 38%|███▊      | 23/60 [00:30<00:52,  1.42s/it]

{'loss': 1.0592, 'learning_rate': 3.7e-05, 'epoch': 0.0}


 40%|████      | 24/60 [00:32<00:55,  1.54s/it]

{'loss': 0.9886, 'learning_rate': 3.6e-05, 'epoch': 0.0}


 42%|████▏     | 25/60 [00:34<00:54,  1.56s/it]

{'loss': 1.1791, 'learning_rate': 3.5e-05, 'epoch': 0.0}


 43%|████▎     | 26/60 [00:35<00:49,  1.47s/it]

{'loss': 0.9073, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.0}


 45%|████▌     | 27/60 [00:36<00:44,  1.35s/it]

{'loss': 1.1269, 'learning_rate': 3.3e-05, 'epoch': 0.0}


 47%|████▋     | 28/60 [00:37<00:42,  1.31s/it]

{'loss': 1.0687, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.0}


 48%|████▊     | 29/60 [00:39<00:43,  1.40s/it]

{'loss': 1.1524, 'learning_rate': 3.1e-05, 'epoch': 0.0}


 50%|█████     | 30/60 [00:40<00:37,  1.27s/it]

{'loss': 0.9314, 'learning_rate': 3e-05, 'epoch': 0.0}


 52%|█████▏    | 31/60 [00:41<00:36,  1.27s/it]

{'loss': 1.0393, 'learning_rate': 2.9e-05, 'epoch': 0.0}


 53%|█████▎    | 32/60 [00:43<00:41,  1.48s/it]

{'loss': 1.0027, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.0}


 55%|█████▌    | 33/60 [00:45<00:42,  1.56s/it]

{'loss': 1.1395, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.01}


 57%|█████▋    | 34/60 [00:46<00:38,  1.49s/it]

{'loss': 1.0968, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.01}


 58%|█████▊    | 35/60 [00:47<00:33,  1.32s/it]

{'loss': 0.9439, 'learning_rate': 2.5e-05, 'epoch': 0.01}


 60%|██████    | 36/60 [00:49<00:31,  1.31s/it]

{'loss': 0.9658, 'learning_rate': 2.4e-05, 'epoch': 0.01}


 62%|██████▏   | 37/60 [00:50<00:30,  1.33s/it]

{'loss': 1.0508, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.01}


 63%|██████▎   | 38/60 [00:51<00:30,  1.38s/it]

{'loss': 1.0368, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.01}


 65%|██████▌   | 39/60 [00:53<00:29,  1.42s/it]

{'loss': 1.1474, 'learning_rate': 2.1e-05, 'epoch': 0.01}


 67%|██████▋   | 40/60 [00:54<00:28,  1.44s/it]

{'loss': 1.1822, 'learning_rate': 2e-05, 'epoch': 0.01}


 68%|██████▊   | 41/60 [00:56<00:27,  1.47s/it]

{'loss': 1.0093, 'learning_rate': 1.9e-05, 'epoch': 0.01}


 70%|███████   | 42/60 [00:57<00:23,  1.33s/it]

{'loss': 0.8058, 'learning_rate': 1.8e-05, 'epoch': 0.01}


 72%|███████▏  | 43/60 [00:58<00:20,  1.23s/it]

{'loss': 0.9184, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.01}


 73%|███████▎  | 44/60 [00:59<00:21,  1.32s/it]

{'loss': 0.9221, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.01}


 75%|███████▌  | 45/60 [01:01<00:20,  1.39s/it]

{'loss': 1.2516, 'learning_rate': 1.5e-05, 'epoch': 0.01}


 77%|███████▋  | 46/60 [01:02<00:19,  1.39s/it]

{'loss': 0.9179, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.01}


 78%|███████▊  | 47/60 [01:04<00:20,  1.57s/it]

{'loss': 1.0067, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.01}


 80%|████████  | 48/60 [01:06<00:18,  1.53s/it]

{'loss': 1.0593, 'learning_rate': 1.2e-05, 'epoch': 0.01}


 82%|████████▏ | 49/60 [01:07<00:15,  1.42s/it]

{'loss': 0.9847, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.01}


 83%|████████▎ | 50/60 [01:08<00:13,  1.33s/it]

{'loss': 1.238, 'learning_rate': 1e-05, 'epoch': 0.01}


 85%|████████▌ | 51/60 [01:10<00:12,  1.38s/it]

{'loss': 1.0335, 'learning_rate': 9e-06, 'epoch': 0.01}


 87%|████████▋ | 52/60 [01:11<00:10,  1.31s/it]

{'loss': 0.9683, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}


 88%|████████▊ | 53/60 [01:12<00:09,  1.38s/it]

{'loss': 1.151, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}


 90%|█████████ | 54/60 [01:13<00:07,  1.30s/it]

{'loss': 1.1957, 'learning_rate': 6e-06, 'epoch': 0.01}


 92%|█████████▏| 55/60 [01:15<00:06,  1.30s/it]

{'loss': 1.0638, 'learning_rate': 5e-06, 'epoch': 0.01}


 93%|█████████▎| 56/60 [01:16<00:05,  1.29s/it]

{'loss': 1.177, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}


 95%|█████████▌| 57/60 [01:17<00:03,  1.17s/it]

{'loss': 1.004, 'learning_rate': 3e-06, 'epoch': 0.01}


 97%|█████████▋| 58/60 [01:19<00:02,  1.44s/it]

{'loss': 1.1946, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}


 98%|█████████▊| 59/60 [01:20<00:01,  1.33s/it]

{'loss': 1.0384, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


100%|██████████| 60/60 [01:22<00:00,  1.37s/it]

{'loss': 1.1878, 'learning_rate': 0.0, 'epoch': 0.01}
{'train_runtime': 82.0261, 'train_samples_per_second': 5.852, 'train_steps_per_second': 0.731, 'train_loss': 1.0796640833218893, 'epoch': 0.01}





TrainOutput(global_step=60, training_loss=1.0796640833218893, metrics={'train_runtime': 82.0261, 'train_samples_per_second': 5.852, 'train_steps_per_second': 0.731, 'train_loss': 1.0796640833218893, 'epoch': 0.01})

In [18]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\nThe fibonnaci sequence is a sequence of numbers that is continued from the first number to the second number. The first number in the sequence is 1, and the second number is 1. The next number in the sequence is 1, and the next number is 2. The next number in the sequence is 3, and the next number is 5. The next number in the sequence is 8, and the next number is 8. The next number in the sequence is 16, and the next number is 16. The next number in the sequence is 32, and the next']