In [None]:
#!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
#!pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes transformers datasets

In [None]:
#!pip install huggingface-hub

In [None]:
#from huggingface_hub import notebook_login
#notebook_login()

In [10]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [1]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-1.1-7b-it-bnb-4bit",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True
)

    PyTorch 2.3.0+cu121 with CUDA 1201 (you have 2.2.0+cu121)
    Python  3.10.14 (you have 3.10.14)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


==((====))==  Unsloth: Fast Gemma patching release 2024.4
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.394 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = 'none',
    use_gradient_checkpointing = "unsloth",
    use_rslora = False,
    loftq_config = None
)

Unsloth 2024.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [3]:
BOS_TOKEN = tokenizer.bos_token
EOS_TOKEN = tokenizer.eos_token # do not forget this part!
def formatting_prompts_func(examples):
    question = examples["question"]
    answer       = examples["answer"]
    source      = examples["source"]
    texts = []
    for question, answer, source in zip(question, answer, source):
        prompt = """As a Minecraft expert, provide a detailed and comprehensive answer to the following question. Ensure your answer:
        1. Is thorough and in-depth, covering all relevant aspects of Minecraft related to the question.
        2. Focuses on the main ideas and essential information related to the question, eliminating any extraneous details.
        3. Relies strictly on knowledge about Minecraft, without including external information.
        4. Is formatted in a clear and easily understandable paragraph.
        
        By following these guidelines, produce a response that encapsulates the essence of the Minecraft query in a clear, detailed, and reader-friendly manner. Optimize the output as markdown file."""
        text = f"{BOS_TOKEN} Prompt: {prompt}\n Question: {question}\n Source: {source}\n Answer: {answer} {EOS_TOKEN}"
        texts.append(text)
    return { "text" : texts, }
pass

In [4]:
from datasets import load_dataset

dataset = load_dataset("naklecha/minecraft-question-answer-700k", split="train")

In [5]:
dataset = dataset.map(formatting_prompts_func, batched = True)
dataset

Map:   0%|          | 0/694814 [00:00<?, ? examples/s]

Dataset({
    features: ['answer', 'question', 'source', 'text'],
    num_rows: 694814
})

In [6]:
text_dataset = dataset.remove_columns(["answer", "question", "source"])

In [7]:
print(text_dataset['text'][0])

<bos> Prompt: As a Minecraft expert, provide a detailed and comprehensive answer to the following question. Ensure your answer:
        1. Is thorough and in-depth, covering all relevant aspects of Minecraft related to the question.
        2. Focuses on the main ideas and essential information related to the question, eliminating any extraneous details.
        3. Relies strictly on knowledge about Minecraft, without including external information.
        4. Is formatted in a clear and easily understandable paragraph.
        
        By following these guidelines, produce a response that encapsulates the essence of the Minecraft query in a clear, detailed, and reader-friendly manner. Optimize the output as markdown file.
 Question: What is the first statistic to decrease when a player performs energy-intensive actions in Minecraft?
 Source: https://minecraft.wiki/w/Food#Nourishment_value
 Answer: Saturation is the first statistic to decrease when a player performs energy-intensive a

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = text_dataset,
    dataset_text_field = "text",
    max_seq_length = 7680,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 100,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/694814 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [11]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-PCIE-40GB. Max memory = 39.394 GB.
5.861 GB of memory reserved.


In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 694,814 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 50,003,968


Step,Training Loss
1,7.1875
2,7.5312
3,7.2188
4,6.0
5,5.1562
6,4.2812
7,4.0312
8,3.75
9,2.8906
10,2.5


In [13]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

148.6209 seconds used for training.
2.48 minutes used for training.
Peak reserved memory = 7.205 GB.
Peak reserved memory for training = 1.344 GB.
Peak reserved memory % of max memory = 18.29 %.
Peak reserved memory for training % of max memory = 3.412 %.


In [23]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    "What is the primary use of redstone in Minecraft?",
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<bos>What is the primary use of redstone in Minecraft?
A. To create a redstone circuit
B. To power a redstone lamp
C. To create a redstone torch
D. To power a redstone repeater<eos>


In [24]:
model.push_to_hub("emre570/gemma1.1-7b-it-minecraft")

README.md:   0%|          | 0.00/584 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/200M [00:00<?, ?B/s]

Saved model to https://huggingface.co/emre570/gemma1.1-7b-it-minecraft
