### Setup

In [1]:
%%capture
# Install the released Unsloth package first (presumably this also pulls in its dependencies - TODO confirm).
!pip install unsloth
# Then swap it for the latest nightly build taken straight from the GitHub repository.
# NOTE(review): nothing here is version-pinned, so a re-run may install different versions
# than the ones the outputs below were produced with (the banner reports Unsloth 2024.11.10).
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
from unsloth import FastLanguageModel
import torch
# --- Loading options ---------------------------------------------------------
# Context length used for training and inference. Any value may be chosen;
# the original notebook notes that RoPE scaling is handled internally.
max_seq_length = 2048
# None lets the library auto-detect; float16 suits Tesla T4 / V100, bfloat16 suits Ampere+.
dtype = None
# 4-bit quantization reduces memory usage; set to False to turn it off.
load_in_4bit = True

# Gather the loader arguments in one mapping, then unpack them into the call.
pretrained_kwargs = dict(
    # Any other checkpoint can be used instead, e.g. teknium/OpenHermes-2.5-Mistral-7B
    model_name = "unsloth/mistral-7b-v0.3",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # only needed for gated models like meta-llama/Llama-2-7b-hf
)
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Mistral patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [3]:
# Modules that receive LoRA adapters, grouped by role.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]
# The embedding matrix and the output head are added for continual pretraining.
embedding_modules = ["embed_tokens", "lm_head"]

# NOTE: this rebinds `model` to its adapter-wrapped version, so the cell is
# meant to be run once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 128,  # LoRA rank; any number > 0 works, suggested values are 8, 16, 32, 64, 128
    target_modules = attention_projections + mlp_projections + embedding_modules,
    lora_alpha = 32,
    lora_dropout = 0,  # any value is supported, but 0 is the optimized path
    bias = "none",     # any value is supported, but "none" is the optimized path
    # "unsloth" checkpointing uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context.
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,    # rank-stabilized LoRA
    loftq_config = None,  # LoftQ is supported as well; disabled here
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.11.10 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


### Data Prep

In [8]:
# Each Wikipedia record provides a title and an article text.
# English reference template. It is not referenced anywhere else in the visible
# notebook; it only documents what the German template below was translated from
# (translation done with https://translate.google.com).
_wikipedia_prompt = """Wikipedia Article
### Title: {}

### Article:
{}"""
# German translation of the template above; this is the one the formatting function fills in.
# The two `{}` placeholders take the title and the article text, in that order.
wikipedia_prompt = """Wikipedia-Artikel
### Titel: {}

### Artikel:
{}"""

# End-of-sequence token of the loaded tokenizer; it is appended to every training example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of Wikipedia records into training strings.

    Args:
        examples: Batched mapping with parallel "title" and "text" sequences.

    Returns:
        A dict whose "text" entry holds one filled-in ``wikipedia_prompt`` per
        record, each terminated with ``EOS_TOKEN``.
    """
    # The EOS token must be appended, otherwise generation will go on forever!
    rendered = [
        wikipedia_prompt.format(article_title, article_body) + EOS_TOKEN
        for article_title, article_body in zip(examples["title"], examples["text"])
    ]
    return {"text": rendered}


In [10]:
from datasets import load_dataset

# Load the German Wikipedia dataset (full "train" split of the 20231101.de dump).
wikipedia_full = load_dataset("wikimedia/wikipedia", "20231101.de", split="train")

# Keep a random 1% of the articles to make training faster.
# The split is seeded (same seed as the LoRA / trainer settings in this notebook)
# so that Restart & Run All selects the same articles every time.
wikipedia_sample = wikipedia_full.train_test_split(train_size=0.01, seed=3407)["train"]

# Apply the formatting function to prepare the data: the "text" column is
# replaced by the filled-in prompt template plus the EOS token.
# Stage-specific names are used above so that re-running this cell always starts
# from the full dataset instead of re-sampling an already reduced `dataset`.
dataset = wikipedia_sample.map(formatting_prompts_func, batched=True)


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/20 [00:00<?, ?files/s]

train-00000-of-00020.parquet:   0%|          | 0.00/781M [00:00<?, ?B/s]

train-00001-of-00020.parquet:   0%|          | 0.00/449M [00:00<?, ?B/s]

train-00002-of-00020.parquet:   0%|          | 0.00/369M [00:00<?, ?B/s]

train-00003-of-00020.parquet:   0%|          | 0.00/293M [00:00<?, ?B/s]

train-00004-of-00020.parquet:   0%|          | 0.00/296M [00:00<?, ?B/s]

train-00005-of-00020.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00006-of-00020.parquet:   0%|          | 0.00/271M [00:00<?, ?B/s]

train-00007-of-00020.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00008-of-00020.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

train-00009-of-00020.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00010-of-00020.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00011-of-00020.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

train-00012-of-00020.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

train-00013-of-00020.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00014-of-00020.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

train-00015-of-00020.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

train-00016-of-00020.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00017-of-00020.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00018-of-00020.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00019-of-00020.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2845308 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

Map:   0%|          | 0/28453 [00:00<?, ? examples/s]

### Continued Pretraining


In [14]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Probe bfloat16 support once; exactly one of fp16 / bf16 ends up enabled.
bf16_available = is_bfloat16_supported()

# Hyperparameters for the continued-pretraining run on the Wikipedia text.
pretraining_args = UnslothTrainingArguments(
    # 2 sequences per device x 8 accumulation steps
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 8,

    # Short demo run. For longer runs use warmup_ratio and num_train_epochs instead:
    #   warmup_ratio = 0.1,
    #   num_train_epochs = 1,
    max_steps = 30,
    warmup_steps = 5,

    # The embedding matrices get a 2 to 10x smaller learning rate than the rest.
    learning_rate = 5e-5,
    embedding_learning_rate = 1e-5,

    fp16 = not bf16_available,
    bf16 = bf16_available,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

# Trainer that consumes the formatted "text" column of the Wikipedia sample.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    args = pretraining_args,
)

Map (num_proc=2):   0%|          | 0/28453 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [15]:
# Record the GPU baseline before any training happens. The final
# "memory and time stats" cell subtracts `start_gpu_memory` from the peak and
# divides by `max_memory`; without these two assignments that cell fails with a
# NameError on a fresh kernel.
gpu_stats = torch.cuda.get_device_properties(torch.cuda.current_device())
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Run the continued-pretraining stage on the Wikipedia sample.
trainer_stats = trainer.train()

Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 28,453 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 30
 "-____-"     Number of trainable parameters = 603,979,776


Step,Training Loss
1,1.2116
2,1.1448
3,1.2782
4,1.4184
5,1.2679
6,1.4371
7,1.2997
8,1.3131
9,1.4113
10,1.2815


### Instruction Finetuning

In [18]:
from datasets import load_dataset

# Load the German version of the Alpaca-GPT4 dataset ("train" split).
# Each record carries a `conversations` list of {'from', 'value'} turns plus an
# `id`, as the sample printed in the next cell shows.
alpaca_dataset = load_dataset("FreedomIntelligence/alpaca-gpt4-deutsch", split="train")


README.md:   0%|          | 0.00/152 [00:00<?, ?B/s]

alpaca-gpt4-deutsch.json:   0%|          | 0.00/53.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49969 [00:00<?, ? examples/s]

In [19]:
# Inspect one raw record to see the schema the formatting function has to handle.
print(alpaca_dataset[0])

{'conversations': [{'from': 'human', 'value': 'Schlagen Sie einen Slogan für eine Recycling-Kampagne vor.\n'}, {'from': 'gpt', 'value': '1. "Reduziere, wiederverwende, recycel: Zusammen für eine grünere Zukunft."\n2. "Recycle heute für eine bessere Zukunft."\n3. "Mache aus deinem Müll einen Schatz - Recycle!"\n4. "Recycle für den Lebenszyklus."\n5. "Ressourcen sparen, mehr recyceln."'}], 'id': '23712'}


In [20]:
# English reference template. It is not referenced anywhere else in the visible
# notebook; it only documents what the German template below was translated from.
_alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""
# German translation of the template above; this is the one used for training and inference.
# The two `{}` placeholders take the instruction and the response, in that order.
alpaca_prompt = """Nachfolgend finden Sie eine Anweisung, die eine Aufgabe beschreibt. Schreiben Sie eine Antwort, die die Anfrage angemessen erfüllt.

### Anweisung:
{}

### Antwort:
{}"""

# End-of-sequence token of the loaded tokenizer; it is appended to every training example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(conversations):
    """Render a batch of two-turn conversations into Alpaca-style training strings.

    Args:
        conversations: Batched mapping whose "conversations" entry is a sequence
            of turn lists; the first turn's "value" is used as the instruction
            and the second turn's "value" as the response.

    Returns:
        A dict whose "text" entry holds one filled-in ``alpaca_prompt`` per
        conversation, each terminated with ``EOS_TOKEN``.
    """
    # The EOS token must be appended, otherwise generation will go on forever!
    rendered = [
        alpaca_prompt.format(turns[0]["value"], turns[1]["value"]) + EOS_TOKEN
        for turns in conversations["conversations"]
    ]
    return {"text": rendered}

# Apply formatting to the German Alpaca dataset: adds a "text" column holding the
# filled-in prompt for every record.
# NOTE(review): `alpaca_dataset` is rebound to the mapped result, and
# `formatting_prompts_func` here must be the Alpaca variant defined in this cell
# (an earlier cell defines a Wikipedia function under the same name).
alpaca_dataset = alpaca_dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/49969 [00:00<?, ? examples/s]

We again employ `UnslothTrainer` and do instruction finetuning!

In [21]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Probe bfloat16 support once; exactly one of fp16 / bf16 ends up enabled.
finetune_bf16 = is_bfloat16_supported()

# Hyperparameters for the instruction-finetuning run on the German Alpaca data.
finetune_args = UnslothTrainingArguments(
    # 2 sequences per device x 8 accumulation steps
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 8,

    # Short demo run. For longer runs use num_train_epochs and warmup_ratio instead:
    #   warmup_ratio = 0.1,
    #   num_train_epochs = 1,
    max_steps = 10,
    warmup_steps = 4,

    # The embedding matrices get a 2 to 10x smaller learning rate than the rest.
    learning_rate = 5e-5,
    embedding_learning_rate = 1e-5,

    fp16 = not finetune_bf16,
    bf16 = finetune_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.00,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

# Trainer that consumes the formatted "text" column of the Alpaca dataset.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,
    args = finetune_args,
)

Map (num_proc=8):   0%|          | 0/49969 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [22]:
# Run the instruction-finetuning stage. `trainer_stats` is rebound here, so the
# runtime reported by the stats cell below (`metrics['train_runtime']`) refers to
# this run only, not to the earlier continued-pretraining run.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 49,969 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 10
 "-____-"     Number of trainable parameters = 603,979,776


Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.


Step,Training Loss
1,1.3741
2,1.4349
3,1.1103
4,1.0213
5,0.987
6,0.9088


Step,Training Loss
1,1.3741
2,1.4349
3,1.1103
4,1.0213
5,0.987
6,0.9088
7,1.0211
8,0.9939
9,0.9614
10,0.9301


In [23]:
#@title Show final memory and time stats
# Peak GPU memory reserved by PyTorch so far, in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)

# Total device memory in GB. It is computed here so that the cell does not depend
# on a `max_memory` variable that no visible cell of this notebook defines.
max_memory = round(
    torch.cuda.get_device_properties(torch.cuda.current_device()).total_memory
    / 1024 / 1024 / 1024,
    3,
)

# Reserved memory before training started. It can only be known if it was
# recorded (as `start_gpu_memory`) before `trainer.train()` ran; look it up
# defensively instead of failing with a NameError on a fresh kernel.
baseline_memory = globals().get("start_gpu_memory")

used_percentage = round(used_memory / max_memory * 100, 3)
train_runtime = trainer_stats.metrics['train_runtime']

print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
if baseline_memory is not None:
    # Memory attributable to training = peak minus the pre-training baseline.
    used_memory_for_lora = round(used_memory - baseline_memory, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
if baseline_memory is not None:
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
else:
    print("Peak reserved memory for training is unavailable: "
          "`start_gpu_memory` was not recorded before training.")

332.9514 seconds used for training.
5.55 minutes used for training.
Peak reserved memory = 13.236 GB.
Peak reserved memory for training = 6.869 GB.
Peak reserved memory % of max memory = 89.748 %.
Peak reserved memory for training % of max memory = 46.576 %.


### Inference

In [24]:
# German Alpaca template used to phrase the request (same text as used for finetuning).
alpaca_prompt = """Nachfolgend finden Sie eine Anweisung, die eine Aufgabe beschreibt. Schreiben Sie eine Antwort, die die Anfrage angemessen erfüllt.

### Anweisung:
{}

### Antwort:
{}"""

# Switch the model to Unsloth's native 2x faster inference mode.
FastLanguageModel.for_inference(model)

# Fill the template: German instruction first, empty response slot for the model to complete.
fibonacci_prompt = alpaca_prompt.format(
    "Setzen Sie die Fibonacci-Folge fort: 1, 1, 2, 3, 5, 8,",
    "",
)

# Tokenize the single prompt as a batch of one and move it to the GPU.
inputs = tokenizer([fibonacci_prompt], return_tensors="pt").to("cuda")

# Generate at most 64 new tokens.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

# Decode the generated ids (prompt included) and print them.
decoded_outputs = tokenizer.batch_decode(outputs)
print(decoded_outputs)


['<s> Nachfolgend finden Sie eine Anweisung, die eine Aufgabe beschreibt. Schreiben Sie eine Antwort, die die Anfrage angemessen erfüllt.\n\n### Anweisung:\nSetzen Sie die Fibonacci-Folge fort: 1, 1, 2, 3, 5, 8,\n\n### Antwort:\nDie Fibonacci-Folge setzt sich fort wie folgt: 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233']


In [25]:
from transformers import TextStreamer

# German Alpaca template used to phrase the request (same text as used for finetuning).
alpaca_prompt = """Nachfolgend finden Sie eine Anweisung, die eine Aufgabe beschreibt. Schreiben Sie eine Antwort, die die Anfrage angemessen erfüllt.

### Anweisung:
{}

### Antwort:
{}"""

# Switch the model to Unsloth's native 2x faster inference mode.
FastLanguageModel.for_inference(model)

# Fill the template: German instruction first, empty response slot for the model to complete.
music_prompt = alpaca_prompt.format(
    "Wie ist die Musik in Deutschland?",
    "",
)

# Tokenize the single prompt as a batch of one and move it to the GPU.
inputs = tokenizer([music_prompt], return_tensors="pt").to("cuda")

# Stream tokens to the cell output while they are generated.
text_streamer = TextStreamer(tokenizer)

# Generate at most 128 new tokens; the return value is not needed because the
# streamer already prints the text.
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)


<s> Nachfolgend finden Sie eine Anweisung, die eine Aufgabe beschreibt. Schreiben Sie eine Antwort, die die Anfrage angemessen erfüllt.

### Anweisung:
Wie ist die Musik in Deutschland?

### Antwort:
Die Musik in Deutschland ist sehr vielfältig und repräsentiert eine Vielzahl von Stilen und Genres. Es gibt eine Vielzahl von Musikern und Bands, die in Deutschland aktiv sind, und sie reichen von klassischer Musik bis hin zu Pop, Rock, Hip-Hop und Elektronischer Musik.

Einer der bekanntesten deutschen Musiker ist David Hasselhoff, der in den 1980er Jahren mit seiner Pop-Musik weltweit bekannt wurde. Ein weiterer bekannter deutscher Musiker ist Ramm


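Translating the generated answer with https://translate.google.com/ gives the following (the text stops mid-word because generation was capped at `max_new_tokens=128`):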
```
Music in Germany is very diverse and represents a variety of styles and genres. There are a variety of musicians and bands active in Germany, and they range from classical music to pop, rock, hip-hop and electronic music.

One of the most famous German musicians is David Hasselhoff, who became known worldwide with his pop music in the 1980s. Another well-known German musician is Ramm
```

### Saving, loading finetuned models

In [26]:
# Save the model and tokenizer into the local "lora_model" directory.
# NOTE(review): since `model` is the adapter-wrapped model, this presumably stores
# only the LoRA adapter weights rather than the full base model - confirm.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# Online saving to the Hugging Face Hub (disabled). Supply the token at runtime
# rather than hardcoding it in the notebook.
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

In [31]:
# Disabled alternative: reload the saved adapters with plain PEFT / Transformers
# instead of Unsloth. Change `False` to `True` to run it.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,  # reuses the flag set in the model-loading cell
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

In [32]:
# Every call below is disabled with `if False:`; change the one you need to `if True:`.
# `token = ""` is a placeholder - supply your Hub token at runtime, never commit it.

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion

In [33]:
# Every call below is disabled with `if False:`; change the one you need to `if True:`.
# `token = ""` is a placeholder - supply your Hub token at runtime, never commit it.

# Save to 8bit Q8_0 (no quantization_method is passed to these two calls)
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")
# Push a q5_k_m GGUF to the Hub (there is no matching local-save line for this method)
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q5_k_m", token = "")