In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-2b", # using gemma-2b for faster training
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.9.post4 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


### Data Prep

In [None]:
# Wikipedia prompt in German
wikipedia_prompt = """Wikipedia-Artikel
### Titel: {}

### Artikel:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Muss EOS_TOKEN hinzufügen
def formatting_prompts_func(examples):
    titles = examples["title"]
    texts  = examples["text"]
    outputs = []
    for title, text in zip(titles, texts):
        # Muss EOS_TOKEN hinzufügen, sonst läuft die Generierung endlos weiter!
        text = wikipedia_prompt.format(title, text) + EOS_TOKEN
        outputs.append(text)
    return { "text" : outputs, }
pass


In [None]:
from datasets import load_dataset

# Load the German Wikipedia dataset
dataset = load_dataset("wikimedia/wikipedia", "20231101.de", split="train")

# We select 1% of the data to make training faster!
dataset = dataset.train_test_split(train_size=0.01)["train"]

# Apply the formatting function
dataset = dataset.map(formatting_prompts_func, batched=True)


README.md:   0%|          | 0.00/131k [00:00<?, ?B/s]

train-00000-of-00013.parquet:   0%|          | 0.00/688M [00:00<?, ?B/s]

train-00001-of-00013.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

train-00002-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00003-of-00013.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00004-of-00013.parquet:   0%|          | 0.00/168M [00:00<?, ?B/s]

train-00005-of-00013.parquet:   0%|          | 0.00/178M [00:00<?, ?B/s]

train-00006-of-00013.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00007-of-00013.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00008-of-00013.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00009-of-00013.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

train-00010-of-00013.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

train-00011-of-00013.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

train-00012-of-00013.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1841155 [00:00<?, ? examples/s]

Map:   0%|          | 0/18411 [00:00<?, ? examples/s]

### Continued Pretraining

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,
        max_steps = 120,
        warmup_steps = 10,
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/18411 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


### Show current memory stats

In [None]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
8.604 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 18,411 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 1,345,781,760


Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.


AUTOTUNE bmm(16x648x256, 16x256x648)
  bmm 0.0614 ms 100.0%
  triton_bmm_10 0.0860 ms 71.4%
  triton_bmm_6 0.0891 ms 69.0%
  triton_bmm_13 0.0891 ms 69.0%
  triton_bmm_9 0.0911 ms 67.4%
  triton_bmm_5 0.0932 ms 65.9%
  triton_bmm_14 0.0973 ms 63.2%
  triton_bmm_15 0.1198 ms 51.3%
  triton_bmm_2 0.1265 ms 48.6%
  triton_bmm_3 0.1280 ms 48.0%
SingleProcess AUTOTUNE benchmarking takes 2.3546 seconds and 0.0117 seconds precompiling
AUTOTUNE bmm(16x648x648, 16x648x256)
  bmm 0.0410 ms 100.0%
  triton_bmm_33 0.0983 ms 41.7%
  triton_bmm_29 0.1116 ms 36.7%
  triton_bmm_37 0.1167 ms 35.1%
  triton_bmm_32 0.1188 ms 34.5%
  triton_bmm_24 0.1219 ms 33.6%
  triton_bmm_34 0.1270 ms 32.3%
  triton_bmm_22 0.1311 ms 31.2%
  triton_bmm_25 0.1393 ms 29.4%
  triton_bmm_28 0.1485 ms 27.6%
SingleProcess AUTOTUNE benchmarking takes 2.3292 seconds and 0.0018 seconds precompiling
AUTOTUNE bmm(16x648x648, 16x648x256)
  bmm 0.0410 ms 100.0%
  triton_bmm_81 0.1280 ms 32.0%
  triton_bmm_78 0.1331 ms 30.8%
  trito

Step,Training Loss
1,2.1612
2,2.1664
3,1.9944
4,2.2268
5,1.8851
6,1.8479
7,1.9022
8,2.0712
9,2.1104
10,2.1157


AUTOTUNE bmm(16x256x256, 16x256x256)
  bmm 0.0184 ms 100.0%
  triton_bmm_158 0.0215 ms 85.7%
  triton_bmm_162 0.0236 ms 78.3%
  triton_bmm_157 0.0246 ms 75.0%
  triton_bmm_161 0.0246 ms 75.0%
  triton_bmm_165 0.0246 ms 75.0%
  triton_bmm_167 0.0256 ms 72.0%
  triton_bmm_166 0.0266 ms 69.2%
  triton_bmm_154 0.0276 ms 66.7%
  triton_bmm_155 0.0276 ms 66.7%
SingleProcess AUTOTUNE benchmarking takes 2.1826 seconds and 0.0106 seconds precompiling
AUTOTUNE bmm(16x256x256, 16x256x256)
  bmm 0.0164 ms 100.0%
  triton_bmm_185 0.0266 ms 61.5%
  triton_bmm_177 0.0276 ms 59.3%
  triton_bmm_181 0.0276 ms 59.3%
  triton_bmm_176 0.0287 ms 57.1%
  triton_bmm_184 0.0297 ms 55.2%
  triton_bmm_189 0.0297 ms 55.2%
  triton_bmm_174 0.0307 ms 53.3%
  triton_bmm_186 0.0307 ms 53.3%
  triton_bmm_173 0.0328 ms 50.0%
SingleProcess AUTOTUNE benchmarking takes 2.1682 seconds and 0.0018 seconds precompiling
AUTOTUNE bmm(16x256x256, 16x256x256)
  bmm 0.0164 ms 100.0%
  triton_bmm_233 0.0297 ms 55.2%
  triton_bmm_22

### Instruction Finetuning

We now use the [Alpaca in GPT4 Dataset](https://huggingface.co/datasets/FreedomIntelligence/alpaca-gpt4-korean) but translated in Korean!

Go to [vicgalle/alpaca-gpt4](https://huggingface.co/datasets/vicgalle/alpaca-gpt4) for the original GPT4 dataset for Alpaca or [MultilingualSIFT project](https://github.com/FreedomIntelligence/MultilingualSIFT) for other translations of the Alpaca dataset.

In [None]:
from datasets import load_dataset
alpaca_dataset = load_dataset("FreedomIntelligence/alpaca-gpt4-deutsch", split = "train")

README.md:   0%|          | 0.00/124 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


alpaca-gpt4-italian.json:   0%|          | 0.00/51.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49969 [00:00<?, ? examples/s]

We print 1 example:

In [None]:
print(alpaca_dataset[0])

{'conversations': [{'from': 'human', 'value': 'Suggerisci uno slogan per una campagna di riciclaggio.\n'}, {'from': 'gpt', 'value': '1. "Riduci, riutilizza, ricicla: Insieme per un futuro più verde."\n2. "Ricicla oggi, per un domani migliore."\n3. "Trasforma la tua spazzatura in tesoro - Ricicla!"\n4. "Ricicla per il ciclo della vita."\n5. "Risparmia risorse, ricicla di più."'}], 'id': '23712'}


We again use https://translate.google.com/ to translate the Alpaca format into Korean

We again employ `UnslothTrainer` and do instruction finetuning!

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use num_train_epochs and warmup_ratio for longer runs!
        max_steps = 120,
        warmup_steps = 10,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=8):   0%|          | 0/49969 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 49,969 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 1,345,781,760


Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.


Step,Training Loss
1,2.2106
2,2.1517
3,1.8209
4,1.6858
5,1.5251
6,1.335
7,1.453
8,1.4304
9,1.4109
10,1.336


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

Remember to use https://translate.google.com/!

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(

        "Was ist ein Sonnensystem?", # instruction
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<bos>Di seguito è riportata un'istruzione che descrive un compito. Scrivi una risposta che completi adeguatamente la richiesta.

### Istruzione:
Com'è la musica latina?

### Risposta:
La musica latina è una vasta categoria che comprende una varietà di stili musicali che hanno origine in varie parti del mondo latino. Alcuni dei principali stili musicali latini includono la salsa, il merengue, il bachata, il cumbia, il reggaeton e il hip-hop latino. Questi stili sono spesso caratterizzati da ritmi vivaci, melodie accattivanti e una forte base ritmica. La musica latina è spesso associata a una forte componente danzabile, con molti stili che richiedono una certa conoscenza della danza.<eos>
