In [1]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments
from transformers import TextStreamer
import torch
import gc
import wandb  #Remove if you arent interested in the analytics
import os
from datasets import load_dataset
from dotenv import load_dotenv
load_dotenv()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


True

In [2]:
HUGGING_FACE_HUB_TOKEN=os.getenv("HUGGING_FACE_HUB_TOKEN")
WANDB_API_KEY=os.getenv("WANDB_API_KEY")
WANDB_NOTEBOK_NAME=os.getenv("WANDB_NOTEBOK_NAME")

In [3]:
wandb.init(project="Simpler")

[34m[1mwandb[0m: Currently logged in as: [33mdelraycapitalmanagement[0m ([33mdavidbzyk[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
##USE THIS FOR FIRST TIME TRAINING OR PICK ANOTHER SMALL MODEL FROM https://huggingface.co/unsloth
#MODEL_NAME="unsloth/gemma-2-2b-it-bnb-4bit"

In [5]:
MODEL_NAME='unsloth/gemma-2-2b-it-bnb-4bit'

In [6]:
file_path = '../../Step-1-Data-Processing/pretraining/split-pretrain.jsonl'


### Load Model

In [7]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 550 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.668 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


### Load Lora adapter via peft ( this allows for updating only 1 to 10% of parameters)

In [8]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",

                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


### Make sure to always add EOS_TOKEN or model will ramble

In [9]:
EOS_TOKEN = tokenizer.eos_token
# Load the dataset
dataset = load_dataset('json', data_files=file_path, split="train")

def formatting_prompts_func(examples):
    texts = examples["text"]
    outputs = []
    for text in texts:
        formatted_text = text + EOS_TOKEN
        outputs.append(formatted_text)
    return {"text": outputs}

In [10]:
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

### Train the model
feel free to play around with hyper parameters on both the lora adapters as well as the trainer arguments such as learning rate, epochs or steps, batch size or accumulation steps (memory intensive)

In [11]:
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use warmup_ratio and num_train_epochs for longer runs!
        #max_steps = 120,
        #warmup_steps = 10,
        warmup_ratio = 0.2,
        num_train_epochs = 15,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        gradient_checkpointing=True,
        seed = 3407,
        output_dir = "outputs",       
        report_to="wandb",
    ),
)

In [12]:
# Train the model
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 184 | Num Epochs = 15
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 165
 "-____-"     Number of trainable parameters = 1,345,781,760


Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.




  0%|          | 0/165 [00:00<?, ?it/s]

{'loss': 3.1468, 'grad_norm': 7.354604244232178, 'learning_rate': 1.5151515151515152e-06, 'epoch': 0.09}
{'loss': 3.3101, 'grad_norm': 7.639742374420166, 'learning_rate': 3.0303030303030305e-06, 'epoch': 0.17}
{'loss': 3.1517, 'grad_norm': 6.832216739654541, 'learning_rate': 4.5454545454545455e-06, 'epoch': 0.26}
{'loss': 3.2621, 'grad_norm': 7.00938606262207, 'learning_rate': 6.060606060606061e-06, 'epoch': 0.35}
{'loss': 3.0344, 'grad_norm': 5.7711944580078125, 'learning_rate': 7.5757575757575764e-06, 'epoch': 0.43}
{'loss': 2.793, 'grad_norm': 4.695980548858643, 'learning_rate': 9.090909090909091e-06, 'epoch': 0.52}
{'loss': 2.9096, 'grad_norm': 3.711344003677368, 'learning_rate': 1.0606060606060607e-05, 'epoch': 0.61}
{'loss': 2.88, 'grad_norm': 3.3542118072509766, 'learning_rate': 1.2121212121212122e-05, 'epoch': 0.7}
{'loss': 2.8675, 'grad_norm': 3.893371820449829, 'learning_rate': 1.3636363636363637e-05, 'epoch': 0.78}
{'loss': 2.5508, 'grad_norm': 3.101470708847046, 'learning_r

Check how much memory was used..

In [13]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.668 GB.
20.855 GB of memory reserved.


### Run inference

In [14]:
# Enable native faster inference
FastLanguageModel.for_inference(model)

# Define the question
question = "What is John Carter's Sandbox Strategy?"

# Format the input
formatted_input = question

# Tokenize the input
inputs = tokenizer(
    [formatted_input],
    return_tensors="pt"
).to("cuda")

# Initialize the text streamer
text_streamer = TextStreamer(tokenizer)

# Generate the output using the model
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<bos>What is John Carter's Sandbox 

Strategy?

By: 
John Carter

Available on: 
ThinkorSwim, TradingView

This product is designed for: 
Options, Futures, Stocks

Trading in any market successfully requires a trading plan. The best trading plans are designed to help traders to identify high-probability setups while protecting their profits. In this class, John Carter, Director of Options and Futures, will share his “Sandbox Strategy” which he’s used to reliably day trade the Newhouse 3 (NH3) index within a pre-defined “sandbox.”

Each trading day begins with an interactive session where John and NH3 traders come together


### Get ready to save model -- input your own huggingface or local variables
- Save a base model to keep training and quantized formats for testing inference locally

In [19]:
username = "davidbzyk"
base_model_name = "simpler-gemma-2-2b"
base_repo=f"{username}/{base_model_name}"



In [20]:
model.push_to_hub(base_model_name, token=HUGGING_FACE_HUB_TOKEN)
tokenizer.push_to_hub(base_model_name, token=HUGGING_FACE_HUB_TOKEN)

adapter_model.safetensors:   0%|          | 0.00/5.38G [00:00<?, ?B/s]

Saved model to https://huggingface.co/simpler-gemma-2-2b


tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [21]:
multi_models_name = "simpler-gemma-2-2b-multi"
multi_repo=f"{username}/{multi_models_name}"

In [22]:
if True:
    model.push_to_hub_gguf(
        multi_models_name, # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token=HUGGING_FACE_HUB_TOKEN, # Get a token at https://huggingface.co/settings/tokens
    )

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 88.52 out of 125.61 RAM for saving.


100%|██████████| 26/26 [00:00<00:00, 115.72it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting gemma2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at simpler-gemma-2-2b-multi into bf16 GGUF format.
The output location will be ./simpler-gemma-2-2b-multi/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: simpler-gemma-2-2b-multi
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,                 torch.bfloat16 --> BF16,

unsloth.BF16.gguf:   0%|          | 0.00/5.24G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/davidbzyk/simpler-gemma-2-2b-multi
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q4_K_M.gguf:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/davidbzyk/simpler-gemma-2-2b-multi
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q8_0.gguf:   0%|          | 0.00/2.78G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/davidbzyk/simpler-gemma-2-2b-multi
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q5_K_M.gguf:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Saved GGUF to https://huggingface.co/davidbzyk/simpler-gemma-2-2b-multi


lora adapters

In [None]:
#model.save_pretrained("lora_model") # Local saving
#tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

if you want to save individually core models

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q5_k_m", token = "")