In [19]:
import torch; torch.version.cuda

'12.1'

In [20]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

max_seq_length = 2048

# 4bit pre quantized models we support - 4x faster downloading!
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
]

In [21]:
# Load Llama model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/tinyllama-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.


In [22]:

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    max_seq_length = max_seq_length,
)

In [27]:
#@title Alpaca dataset preparation code
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples):
    instructions = examples["system_prompt"]
    inputs       = examples["question"]
    outputs      = examples["response"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output)
        texts.append(text)
    return { "text" : texts, }
pass


In [25]:
dataset = load_dataset("Open-Orca/OpenOrca", split = "train")

Downloading readme: 100%|██████████| 12.0k/12.0k [00:00<00:00, 43.2MB/s]
Downloading data: 100%|██████████| 1.01G/1.01G [01:30<00:00, 11.1MB/s]
Downloading data: 100%|██████████| 3.09G/3.09G [06:26<00:00, 8.00MB/s]
Generating train split: 4233923 examples [00:15, 269935.25 examples/s]
Map:   0%|          | 0/4233923 [00:00<?, ? examples/s]


KeyError: 'reponse'

In [28]:
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map: 100%|██████████| 4233923/4233923 [00:57<00:00, 73116.38 examples/s] 


In [30]:
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    packing=False,
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 100,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
        seed = 3407,
    ),
)

Map: 100%|██████████| 4233923/4233923 [16:48<00:00, 4199.11 examples/s]


In [31]:
trainer.train()

  1%|          | 1/100 [00:02<03:21,  2.04s/it]

{'loss': 2.1798, 'learning_rate': 5e-06, 'epoch': 0.0}


  2%|▏         | 2/100 [00:05<04:55,  3.01s/it]

{'loss': 2.0042, 'learning_rate': 1e-05, 'epoch': 0.0}


  3%|▎         | 3/100 [00:07<03:52,  2.40s/it]

{'loss': 1.998, 'learning_rate': 1.5e-05, 'epoch': 0.0}


  4%|▍         | 4/100 [00:09<03:34,  2.24s/it]

{'loss': 1.8388, 'learning_rate': 2e-05, 'epoch': 0.0}


  5%|▌         | 5/100 [00:11<03:23,  2.14s/it]

{'loss': 2.2891, 'learning_rate': 2.5e-05, 'epoch': 0.0}


  6%|▌         | 6/100 [00:13<03:13,  2.06s/it]

{'loss': 2.2881, 'learning_rate': 3e-05, 'epoch': 0.0}


  7%|▋         | 7/100 [00:15<03:10,  2.05s/it]

{'loss': 2.0472, 'learning_rate': 3.5e-05, 'epoch': 0.0}


  8%|▊         | 8/100 [00:17<03:04,  2.00s/it]

{'loss': 2.2733, 'learning_rate': 4e-05, 'epoch': 0.0}


  9%|▉         | 9/100 [00:21<03:58,  2.62s/it]

{'loss': 2.0454, 'learning_rate': 4.5e-05, 'epoch': 0.0}


 10%|█         | 10/100 [00:23<03:41,  2.46s/it]

{'loss': 2.1145, 'learning_rate': 5e-05, 'epoch': 0.0}


 11%|█         | 11/100 [00:25<03:33,  2.40s/it]

{'loss': 1.9207, 'learning_rate': 4.9444444444444446e-05, 'epoch': 0.0}


 12%|█▏        | 12/100 [00:27<03:22,  2.30s/it]

{'loss': 2.1894, 'learning_rate': 4.888888888888889e-05, 'epoch': 0.0}


 13%|█▎        | 13/100 [00:29<03:02,  2.10s/it]

{'loss': 2.1851, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.0}


 14%|█▍        | 14/100 [00:31<02:59,  2.08s/it]

{'loss': 2.3936, 'learning_rate': 4.7777777777777784e-05, 'epoch': 0.0}


 15%|█▌        | 15/100 [00:33<02:57,  2.09s/it]

{'loss': 2.094, 'learning_rate': 4.722222222222222e-05, 'epoch': 0.0}


 16%|█▌        | 16/100 [00:35<02:48,  2.01s/it]

{'loss': 2.3885, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.0}


 17%|█▋        | 17/100 [00:37<03:05,  2.23s/it]

{'loss': 2.0142, 'learning_rate': 4.6111111111111115e-05, 'epoch': 0.0}


 18%|█▊        | 18/100 [00:40<03:08,  2.30s/it]

{'loss': 1.7899, 'learning_rate': 4.555555555555556e-05, 'epoch': 0.0}


 19%|█▉        | 19/100 [00:42<03:08,  2.33s/it]

{'loss': 2.1488, 'learning_rate': 4.5e-05, 'epoch': 0.0}


 20%|██        | 20/100 [00:44<02:49,  2.12s/it]

{'loss': 2.262, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.0}


 21%|██        | 21/100 [00:48<03:22,  2.56s/it]

{'loss': 1.647, 'learning_rate': 4.388888888888889e-05, 'epoch': 0.0}


 22%|██▏       | 22/100 [00:50<03:11,  2.45s/it]

{'loss': 1.927, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.0}


 23%|██▎       | 23/100 [00:53<03:17,  2.56s/it]

{'loss': 2.052, 'learning_rate': 4.277777777777778e-05, 'epoch': 0.0}


 24%|██▍       | 24/100 [00:54<02:50,  2.24s/it]

{'loss': 1.7798, 'learning_rate': 4.222222222222222e-05, 'epoch': 0.0}


 25%|██▌       | 25/100 [00:57<03:07,  2.50s/it]

{'loss': 2.0273, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.0}


 26%|██▌       | 26/100 [00:59<02:58,  2.41s/it]

{'loss': 2.1456, 'learning_rate': 4.111111111111111e-05, 'epoch': 0.0}


 27%|██▋       | 27/100 [01:01<02:39,  2.18s/it]

{'loss': 1.9947, 'learning_rate': 4.055555555555556e-05, 'epoch': 0.0}


 28%|██▊       | 28/100 [01:04<03:04,  2.56s/it]

{'loss': 1.9869, 'learning_rate': 4e-05, 'epoch': 0.0}


 29%|██▉       | 29/100 [01:06<02:39,  2.24s/it]

{'loss': 1.7683, 'learning_rate': 3.944444444444445e-05, 'epoch': 0.0}


 30%|███       | 30/100 [01:08<02:43,  2.33s/it]

{'loss': 1.8185, 'learning_rate': 3.888888888888889e-05, 'epoch': 0.0}


 31%|███       | 31/100 [01:11<02:40,  2.33s/it]

{'loss': 1.8003, 'learning_rate': 3.8333333333333334e-05, 'epoch': 0.0}


 32%|███▏      | 32/100 [01:13<02:34,  2.27s/it]

{'loss': 1.9191, 'learning_rate': 3.777777777777778e-05, 'epoch': 0.0}


 33%|███▎      | 33/100 [01:15<02:24,  2.16s/it]

{'loss': 1.8527, 'learning_rate': 3.722222222222222e-05, 'epoch': 0.0}


 34%|███▍      | 34/100 [01:16<02:07,  1.94s/it]

{'loss': 1.7663, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.0}


 35%|███▌      | 35/100 [01:18<02:04,  1.92s/it]

{'loss': 2.0634, 'learning_rate': 3.611111111111111e-05, 'epoch': 0.0}


 36%|███▌      | 36/100 [01:20<01:59,  1.87s/it]

{'loss': 1.5847, 'learning_rate': 3.555555555555556e-05, 'epoch': 0.0}


 37%|███▋      | 37/100 [01:23<02:26,  2.32s/it]

{'loss': 1.851, 'learning_rate': 3.5e-05, 'epoch': 0.0}


 38%|███▊      | 38/100 [01:25<02:17,  2.22s/it]

{'loss': 1.8796, 'learning_rate': 3.444444444444445e-05, 'epoch': 0.0}


 39%|███▉      | 39/100 [01:27<02:13,  2.19s/it]

{'loss': 1.6589, 'learning_rate': 3.388888888888889e-05, 'epoch': 0.0}


 40%|████      | 40/100 [01:29<01:55,  1.93s/it]

{'loss': 1.6911, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.0}


 41%|████      | 41/100 [01:31<02:00,  2.05s/it]

{'loss': 1.7962, 'learning_rate': 3.277777777777778e-05, 'epoch': 0.0}


 42%|████▏     | 42/100 [01:32<01:43,  1.78s/it]

{'loss': 1.8281, 'learning_rate': 3.222222222222223e-05, 'epoch': 0.0}


 43%|████▎     | 43/100 [01:35<01:51,  1.96s/it]

{'loss': 1.7929, 'learning_rate': 3.1666666666666666e-05, 'epoch': 0.0}


 44%|████▍     | 44/100 [01:38<02:10,  2.34s/it]

{'loss': 1.6175, 'learning_rate': 3.111111111111111e-05, 'epoch': 0.0}


 45%|████▌     | 45/100 [01:41<02:16,  2.47s/it]

{'loss': 1.6841, 'learning_rate': 3.055555555555556e-05, 'epoch': 0.0}


 46%|████▌     | 46/100 [01:43<02:13,  2.48s/it]

{'loss': 1.6396, 'learning_rate': 3e-05, 'epoch': 0.0}


 47%|████▋     | 47/100 [01:45<02:00,  2.27s/it]

{'loss': 1.7481, 'learning_rate': 2.9444444444444448e-05, 'epoch': 0.0}


 48%|████▊     | 48/100 [01:47<01:59,  2.29s/it]

{'loss': 1.4772, 'learning_rate': 2.8888888888888888e-05, 'epoch': 0.0}


 49%|████▉     | 49/100 [01:52<02:30,  2.95s/it]

{'loss': 1.9175, 'learning_rate': 2.8333333333333335e-05, 'epoch': 0.0}


 50%|█████     | 50/100 [01:57<02:58,  3.57s/it]

{'loss': 1.6492, 'learning_rate': 2.777777777777778e-05, 'epoch': 0.0}


 51%|█████     | 51/100 [01:59<02:32,  3.11s/it]

{'loss': 1.3782, 'learning_rate': 2.7222222222222223e-05, 'epoch': 0.0}


 52%|█████▏    | 52/100 [02:01<02:12,  2.76s/it]

{'loss': 1.5024, 'learning_rate': 2.6666666666666667e-05, 'epoch': 0.0}


 53%|█████▎    | 53/100 [02:03<02:07,  2.71s/it]

{'loss': 1.7541, 'learning_rate': 2.6111111111111114e-05, 'epoch': 0.0}


 54%|█████▍    | 54/100 [02:05<01:57,  2.56s/it]

{'loss': 1.6315, 'learning_rate': 2.5555555555555554e-05, 'epoch': 0.0}


 55%|█████▌    | 55/100 [02:09<02:01,  2.71s/it]

{'loss': 1.7557, 'learning_rate': 2.5e-05, 'epoch': 0.0}


 56%|█████▌    | 56/100 [02:11<01:55,  2.63s/it]

{'loss': 1.8621, 'learning_rate': 2.4444444444444445e-05, 'epoch': 0.0}


 57%|█████▋    | 57/100 [02:13<01:47,  2.49s/it]

{'loss': 1.649, 'learning_rate': 2.3888888888888892e-05, 'epoch': 0.0}


 58%|█████▊    | 58/100 [02:15<01:39,  2.37s/it]

{'loss': 1.6261, 'learning_rate': 2.3333333333333336e-05, 'epoch': 0.0}


 59%|█████▉    | 59/100 [02:17<01:28,  2.17s/it]

{'loss': 1.4399, 'learning_rate': 2.277777777777778e-05, 'epoch': 0.0}


 60%|██████    | 60/100 [02:19<01:28,  2.21s/it]

{'loss': 1.8804, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.0}


 61%|██████    | 61/100 [02:21<01:19,  2.05s/it]

{'loss': 1.7054, 'learning_rate': 2.1666666666666667e-05, 'epoch': 0.0}


 62%|██████▏   | 62/100 [02:24<01:27,  2.31s/it]

{'loss': 1.6423, 'learning_rate': 2.111111111111111e-05, 'epoch': 0.0}


 63%|██████▎   | 63/100 [02:27<01:34,  2.56s/it]

{'loss': 1.9028, 'learning_rate': 2.0555555555555555e-05, 'epoch': 0.0}


 64%|██████▍   | 64/100 [02:29<01:23,  2.33s/it]

{'loss': 1.6697, 'learning_rate': 2e-05, 'epoch': 0.0}


 65%|██████▌   | 65/100 [02:32<01:26,  2.48s/it]

{'loss': 1.7782, 'learning_rate': 1.9444444444444445e-05, 'epoch': 0.0}


 66%|██████▌   | 66/100 [02:34<01:19,  2.34s/it]

{'loss': 1.6069, 'learning_rate': 1.888888888888889e-05, 'epoch': 0.0}


 67%|██████▋   | 67/100 [02:36<01:15,  2.28s/it]

{'loss': 1.8148, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.0}


 68%|██████▊   | 68/100 [02:39<01:17,  2.43s/it]

{'loss': 1.6006, 'learning_rate': 1.777777777777778e-05, 'epoch': 0.0}


 69%|██████▉   | 69/100 [02:41<01:11,  2.32s/it]

{'loss': 1.8387, 'learning_rate': 1.7222222222222224e-05, 'epoch': 0.0}


 70%|███████   | 70/100 [02:43<01:08,  2.28s/it]

{'loss': 1.5337, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.0}


 71%|███████   | 71/100 [02:45<01:05,  2.25s/it]

{'loss': 1.4261, 'learning_rate': 1.6111111111111115e-05, 'epoch': 0.0}


 72%|███████▏  | 72/100 [02:47<01:01,  2.18s/it]

{'loss': 1.6119, 'learning_rate': 1.5555555555555555e-05, 'epoch': 0.0}


 73%|███████▎  | 73/100 [02:49<00:57,  2.13s/it]

{'loss': 1.6739, 'learning_rate': 1.5e-05, 'epoch': 0.0}


 74%|███████▍  | 74/100 [02:51<00:53,  2.07s/it]

{'loss': 1.792, 'learning_rate': 1.4444444444444444e-05, 'epoch': 0.0}


 75%|███████▌  | 75/100 [02:53<00:55,  2.22s/it]

{'loss': 1.6029, 'learning_rate': 1.388888888888889e-05, 'epoch': 0.0}


 76%|███████▌  | 76/100 [02:56<00:54,  2.26s/it]

{'loss': 1.9354, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.0}


 77%|███████▋  | 77/100 [02:58<00:52,  2.28s/it]

{'loss': 1.4473, 'learning_rate': 1.2777777777777777e-05, 'epoch': 0.0}


 78%|███████▊  | 78/100 [03:00<00:48,  2.18s/it]

{'loss': 1.4519, 'learning_rate': 1.2222222222222222e-05, 'epoch': 0.0}


 79%|███████▉  | 79/100 [03:02<00:43,  2.05s/it]

{'loss': 1.593, 'learning_rate': 1.1666666666666668e-05, 'epoch': 0.0}


 80%|████████  | 80/100 [03:05<00:48,  2.41s/it]

{'loss': 1.4471, 'learning_rate': 1.1111111111111112e-05, 'epoch': 0.0}


 81%|████████  | 81/100 [03:07<00:44,  2.36s/it]

{'loss': 1.6627, 'learning_rate': 1.0555555555555555e-05, 'epoch': 0.0}


 82%|████████▏ | 82/100 [03:10<00:42,  2.35s/it]

{'loss': 1.3654, 'learning_rate': 1e-05, 'epoch': 0.0}


 83%|████████▎ | 83/100 [03:13<00:46,  2.75s/it]

{'loss': 1.6282, 'learning_rate': 9.444444444444445e-06, 'epoch': 0.0}


 84%|████████▍ | 84/100 [03:16<00:41,  2.59s/it]

{'loss': 1.5347, 'learning_rate': 8.88888888888889e-06, 'epoch': 0.0}


 85%|████████▌ | 85/100 [03:17<00:34,  2.33s/it]

{'loss': 1.7884, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.0}


 86%|████████▌ | 86/100 [03:19<00:30,  2.21s/it]

{'loss': 1.6951, 'learning_rate': 7.777777777777777e-06, 'epoch': 0.0}


 87%|████████▋ | 87/100 [03:21<00:28,  2.21s/it]

{'loss': 1.7436, 'learning_rate': 7.222222222222222e-06, 'epoch': 0.0}


 88%|████████▊ | 88/100 [03:24<00:29,  2.47s/it]

{'loss': 1.7466, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.0}


 89%|████████▉ | 89/100 [03:27<00:28,  2.58s/it]

{'loss': 1.5631, 'learning_rate': 6.111111111111111e-06, 'epoch': 0.0}


 90%|█████████ | 90/100 [03:29<00:23,  2.31s/it]

{'loss': 1.5599, 'learning_rate': 5.555555555555556e-06, 'epoch': 0.0}


 91%|█████████ | 91/100 [03:32<00:22,  2.47s/it]

{'loss': 1.7553, 'learning_rate': 5e-06, 'epoch': 0.0}


 92%|█████████▏| 92/100 [03:34<00:18,  2.29s/it]

{'loss': 1.601, 'learning_rate': 4.444444444444445e-06, 'epoch': 0.0}


 93%|█████████▎| 93/100 [03:36<00:16,  2.41s/it]

{'loss': 1.9067, 'learning_rate': 3.888888888888889e-06, 'epoch': 0.0}


 94%|█████████▍| 94/100 [03:38<00:13,  2.22s/it]

{'loss': 1.5364, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.0}


 95%|█████████▌| 95/100 [03:40<00:10,  2.04s/it]

{'loss': 1.4926, 'learning_rate': 2.777777777777778e-06, 'epoch': 0.0}


 96%|█████████▌| 96/100 [03:44<00:10,  2.56s/it]

{'loss': 1.613, 'learning_rate': 2.2222222222222225e-06, 'epoch': 0.0}


 97%|█████████▋| 97/100 [03:46<00:07,  2.43s/it]

{'loss': 1.7404, 'learning_rate': 1.6666666666666667e-06, 'epoch': 0.0}


 98%|█████████▊| 98/100 [03:49<00:05,  2.73s/it]

{'loss': 1.5646, 'learning_rate': 1.1111111111111112e-06, 'epoch': 0.0}


 99%|█████████▉| 99/100 [03:52<00:02,  2.63s/it]

{'loss': 1.5708, 'learning_rate': 5.555555555555556e-07, 'epoch': 0.0}


100%|██████████| 100/100 [03:53<00:00,  2.34s/it]

{'loss': 1.5269, 'learning_rate': 0.0, 'epoch': 0.0}
{'train_runtime': 233.9709, 'train_samples_per_second': 3.419, 'train_steps_per_second': 0.427, 'train_loss': 1.789785120487213, 'epoch': 0.0}





TrainOutput(global_step=100, training_loss=1.789785120487213, metrics={'train_runtime': 233.9709, 'train_samples_per_second': 3.419, 'train_steps_per_second': 0.427, 'train_loss': 1.789785120487213, 'epoch': 0.0})

In [32]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\nThe fibonnaci sequence is:\n1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025']

In [33]:
model.save_pretrained("tinyllama-1b-openorca") # Local saving

In [36]:
model.push_to_hub("dylan9n/tinyllama-1b-openorca") # Online saving