In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments
from transformers import TextStreamer
import torch
import gc
import wandb  #Remove if you arent interested in the analytics
import os
from datasets import load_dataset

In [None]:
# Start a Weights & Biases run in the "Simpler" project. The trainer further
# down is created with report_to="wandb", so this is the project it logs under.
wandb.init(project="Simpler")

In [None]:
# Credentials. Prefer values exported in the environment so that real keys are
# never typed into (and committed with) the notebook; the placeholder strings
# are only a fallback that marks where a key is still missing.
HUGGING_FACE_HUB_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN", '<INSERT HUGGING FACE API KEY>')
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", '<INSERT WANDB API KEY>')

In [None]:
# Identifier of the base checkpoint handed to FastLanguageModel.from_pretrained.
MODEL_NAME = "unsloth/gemma-2-2b-it-bnb-4bit"

In [9]:
# Relative path of the JSON Lines file that holds the pretraining text.
filepath = "../../Step-1-Data-Processing/pretraining/split-pretrain.jsonl"


### Load Model

In [10]:
from unsloth import FastLanguageModel
import torch

# Loading options. max_seq_length is reused later when the trainer is built.
max_seq_length = 550  # Any value works; RoPE scaling is handled internally.
dtype = None          # None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
load_in_4bit = True   # 4-bit quantization lowers memory usage; may be set to False.

# Load the base checkpoint named by MODEL_NAME together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit = load_in_4bit,
    dtype = dtype,
    max_seq_length = max_seq_length,
    model_name = MODEL_NAME,  # Any model id works, e.g. teknium/OpenHermes-2.5-Mistral-7B
    # token = "hf_...",  # only needed for gated models such as meta-llama/Llama-2-7b-hf
)

### Add LoRA adapters via PEFT (this allows updating only 1 to 10% of the parameters)

In [None]:
# Attach LoRA adapters to the loaded model; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter size and scaling
    r = 128,             # LoRA rank: any number > 0 (suggested 8, 16, 32, 64, 128)
    lora_alpha = 32,
    use_rslora = True,   # rank-stabilized LoRA
    loftq_config = None, # LoftQ is available but not used here
    # Settings for which Unsloth has an optimized path
    lora_dropout = 0,    # any value is supported, 0 is the optimized one
    bias = "none",       # any value is supported, "none" is the optimized one
    # Modules that receive adapters
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        # The next two are added for continual pretraining.
        "embed_tokens",
        "lm_head",
    ],
    # "unsloth" checkpointing uses 30% less VRAM and fits 2x larger batch sizes;
    # pass True or "unsloth" for very long context.
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

### Make sure to always append EOS_TOKEN to each example, or the model will ramble

In [None]:
# End-of-sequence marker; formatting_prompts_func appends it to every example.
EOS_TOKEN = tokenizer.eos_token

# Load the JSON Lines corpus as a single "train" split.
# The path variable defined near the top of the notebook is `filepath`; the
# earlier spelling `file_path` was never defined and raised NameError.
dataset = load_dataset('json', data_files=filepath, split="train")

def formatting_prompts_func(examples):
    """Append the EOS token to every entry of a batch's "text" column.

    Args:
        examples: Batch mapping whose "text" key holds an iterable of strings.

    Returns:
        A dict with a single "text" key mapping to the list of strings, each
        one suffixed with the module-level EOS_TOKEN.
    """
    return {"text": [sample + EOS_TOKEN for sample in examples["text"]]}

In [11]:
# Apply the EOS-appending formatter to the dataset in batches.
formatted_dataset = dataset.map(
    function=formatting_prompts_func,
    batched=True,
)

### Train the model
Feel free to experiment with the hyperparameters of both the LoRA adapters and the trainer arguments, such as the learning rate, epochs or steps, and batch size or gradient accumulation steps (memory intensive).

In [None]:
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,

    # Data: train on the "text" column of the EOS-terminated dataset.
    train_dataset = formatted_dataset,
    dataset_text_field = "text",
    dataset_num_proc = 2,
    max_seq_length = max_seq_length,

    args = UnslothTrainingArguments(
        # Output, logging and reproducibility
        output_dir = "outputs",
        report_to = "wandb",
        logging_steps = 1,
        seed = 3407,

        # Batching
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,
        gradient_checkpointing = True,

        # Schedule: warmup_ratio + num_train_epochs suit longer runs.
        # (Short-run alternative: max_steps = 120, warmup_steps = 10.)
        num_train_epochs = 5,
        warmup_ratio = 0.2,
        lr_scheduler_type = "linear",

        # Optimizer. Pick a 2 to 10x smaller learning rate for the embedding matrices.
        optim = "adamw_8bit",
        weight_decay = 0.01,
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        # Mixed precision: bf16 when the hardware supports it, otherwise fp16.
        bf16 = is_bfloat16_supported(),
        fp16 = not is_bfloat16_supported(),
    ),
)

In [None]:
# Train the model with the trainer configured above; whatever train() returns
# is kept in trainer_stats.
trainer_stats = trainer.train()

Check how much GPU memory was used.

In [None]:
#@title Show current memory stats
# Report the name and total memory of CUDA device 0 alongside the value of
# torch.cuda.max_memory_reserved(), both converted from bytes to GB.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

### Run inference

In [None]:
# Switch the model to Unsloth's native faster inference mode.
FastLanguageModel.for_inference(model)

# The prompt is the bare question; no extra formatting is applied to it.
question = "What is John Carter's Sandbox Strategy?"
formatted_input = question

# Streamer that decodes with the tokenizer while generation is running.
text_streamer = TextStreamer(tokenizer)

# Tokenize a one-element batch as PyTorch tensors and move it to the GPU.
inputs = tokenizer([formatted_input], return_tensors="pt").to("cuda")

# Generate at most 128 new tokens; the returned ids are discarded because the
# streamer already handles the output.
_ = model.generate(max_new_tokens=128, streamer=text_streamer, **inputs)

### Get ready to save the model -- set your own Hugging Face username and model name

In [None]:
# Naming used when saving: the model's own name and the Hugging Face account
# it belongs to.
model_name = "simpler-gemma-2-2b"
username = "davidbzyk"

Save the LoRA adapters

In [None]:
# Write the adapter-wrapped model and the tokenizer to the local "lora_model"
# directory. The commented-out lines below are the Hub equivalents; fill in a
# repository name and token before enabling them.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

Save to float16 -- needed for quantization when llama.cpp fails.
Use `f"{model_name}"` (local directory) or `f"{username}/{model_name}"` (Hugging Face Hub repository) as the target name.

In [None]:
# Merged / adapter exports. Every call is disabled; change `if False` to
# `if True` on the line you want to run.
# Local saves write to the directory named by `model_name`; Hub pushes go to
# f"{username}/{model_name}" and authenticate with HUGGING_FACE_HUB_TOKEN
# (previously the placeholder targets "model" / "hf/model" and an empty token).

# Merge to 16bit
if False: model.save_pretrained_merged(model_name, tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged(f"{username}/{model_name}", tokenizer, save_method = "merged_16bit", token = HUGGING_FACE_HUB_TOKEN)

# Merge to 4bit
if False: model.save_pretrained_merged(model_name, tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged(f"{username}/{model_name}", tokenizer, save_method = "merged_4bit", token = HUGGING_FACE_HUB_TOKEN)

# Just LoRA adapters
if False: model.save_pretrained_merged(model_name, tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged(f"{username}/{model_name}", tokenizer, save_method = "lora", token = HUGGING_FACE_HUB_TOKEN)

In [None]:
# GGUF exports. Every call is disabled; change `if False` to `if True` on the
# line you want to run.
# Local saves write to the directory named by `model_name`; Hub pushes go to
# f"{username}/{model_name}" and authenticate with HUGGING_FACE_HUB_TOKEN
# (previously the placeholder targets "model" / "hf/model" and an empty token).

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf(model_name, tokenizer,)
if False: model.push_to_hub_gguf(f"{username}/{model_name}", tokenizer, token = HUGGING_FACE_HUB_TOKEN)

# Save to 16bit GGUF
if False: model.save_pretrained_gguf(model_name, tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf(f"{username}/{model_name}", tokenizer, quantization_method = "f16", token = HUGGING_FACE_HUB_TOKEN)

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf(model_name, tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf(f"{username}/{model_name}", tokenizer, quantization_method = "q4_k_m", token = HUGGING_FACE_HUB_TOKEN)

# Push q5_k_m GGUF (Hub only; there is no local-save counterpart here)
if False: model.push_to_hub_gguf(f"{username}/{model_name}", tokenizer, quantization_method = "q5_k_m", token = HUGGING_FACE_HUB_TOKEN)