In [1]:
import json
import os
import wandb
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback, TrainerState, TrainerControl, EarlyStoppingCallback
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from datasets import Dataset
from dotenv import load_dotenv
load_dotenv()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


True

In [2]:
HUGGING_FACE_HUB_TOKEN=os.getenv("HUGGING_FACE_HUB_TOKEN")
WANDB_API_KEY=os.getenv("WANDB_API_KEY")

In [3]:
# Initialize wandb
wandb.init(project="simpler")



[34m[1mwandb[0m: Currently logged in as: [33mdelraycapitalmanagement[0m ([33mdavidbzyk[0m). Use [1m`wandb login --relogin`[0m to force relogin


Remember to pull your own repo/model you saved in previous step.
```python
##USE THIS FOR FIRST TIME TRAINING OR PICK ANOTHER SMALL MODEL FROM https://huggingface.co/unsloth
#MODEL_NAME="unsloth/gemma-2-2b-it-bnb-4bit"
```

In [4]:
MODEL_NAME = 'davidbzyk/simpler-gemma-2-2b'

In [5]:
TRAINING_DATA_PATH ='../../Step-1-Data-Processing/finetuning/all-qa-final.jsonl'

In [6]:
model, tokenizer = FastLanguageModel.from_pretrained(    
    model_name = MODEL_NAME,
    
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.668 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


adapter_model.safetensors:   0%|          | 0.00/5.38G [00:00<?, ?B/s]

Unsloth 2024.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [7]:
EOS_TOKEN = tokenizer.eos_token

In [8]:
def combine_texts(question, answer):
    return {
        "text": f"###{question}@@@{answer}{EOS_TOKEN}",
    }

def load_data_from_jsonl(file_path):
    questions = []
    answers = []

    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                if line.strip():  # Skip empty lines
                    try:
                        entry = json.loads(line)
                        questions.append(entry['question'])
                        answers.append(entry['answer'])
                        print(f"Successfully parsed line {line_number}")
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON on line {line_number}: {e}")
                        print(f"Problematic line: {line}")
                    except KeyError as e:
                        print(f"Missing key in JSON entry on line {line_number}: {e}")
    except Exception as e:
        raise Exception(f"Unexpected error while reading the file: {e}")
    
    return questions, answers

In [9]:
# Load data from JSONL file
try:
    questions, answers = load_data_from_jsonl(TRAINING_DATA_PATH)
    print(f"Total parsed entries: {len(questions)}")
except Exception as e:
    print(f"Failed to load data: {e}")
    questions, answers = [], []

Successfully parsed line 1
Successfully parsed line 2
Successfully parsed line 3
Successfully parsed line 4
Successfully parsed line 5
Successfully parsed line 6
Successfully parsed line 7
Successfully parsed line 8
Successfully parsed line 9
Successfully parsed line 10
Successfully parsed line 11
Successfully parsed line 12
Successfully parsed line 13
Successfully parsed line 14
Successfully parsed line 15
Successfully parsed line 16
Successfully parsed line 17
Successfully parsed line 18
Successfully parsed line 19
Successfully parsed line 20
Successfully parsed line 21
Successfully parsed line 22
Successfully parsed line 23
Successfully parsed line 24
Successfully parsed line 25
Successfully parsed line 26
Successfully parsed line 27
Successfully parsed line 28
Successfully parsed line 29
Successfully parsed line 30
Successfully parsed line 31
Successfully parsed line 32
Successfully parsed line 33
Successfully parsed line 34
Successfully parsed line 35
Successfully parsed line 36
S

In [10]:
# Split the data into training and evaluation sets
train_size = int(0.9 * len(questions))
train_questions = questions[:train_size]
train_answers = answers[:train_size]
eval_questions = questions[train_size:]
eval_answers = answers[train_size:]

In [11]:
# Prepare the fine-tuning training dataset
if train_questions and train_answers:
    combined_texts_train = [combine_texts(question, answer) for question, answer in zip(train_questions, train_answers)]
    combined_texts_eval = [combine_texts(question, answer) for question, answer in zip(eval_questions, eval_answers)]

    # Create the fine-tuning datasets
    train_dataset = Dataset.from_dict({"text": [ct["text"] for ct in combined_texts_train]})
    eval_dataset = Dataset.from_dict({"text": [ct["text"] for ct in combined_texts_eval]})

    # Display example training record
    if len(train_dataset) > 0:
        print("Example training record:\n")
        print(train_dataset[0]['text'])
    else:
        print("The fine-tuning training dataset is empty.")
    
    if len(eval_dataset) > 0:
        print("Example evaluation record:\n")
        print(eval_dataset[0]['text'])
    else:
        print("The fine-tuning evaluation dataset is empty.")
else:
    print("Failed to create the fine-tuning datasets.")

Example training record:

###According to Simpler Trading, What asset classes is John Carter's Sandbox Strategy traded in?@@@John Carter's Sandbox Strategy is traded in Options<eos>
Example evaluation record:

###According to Simpler Trading, What functionalities does Raghee Horner’s HPMR Indicator offer?@@@The Raghee Horner’s HPMR Indicator - The Hourly Price Movement Range (HPMR) indicator is designed to forecast volatility-based support and resistance in a 24-hour timeframe. 

This indicator allows Raghee to measure time and volatility during special economic events or highly volatile hours in the day, such as the opening bell, midday, and the close. 

The HPMR indicator gives Raghee a next-level view of market direction.<eos>


In [12]:
max_qna = 0
max_q = 0
max_a = 0
for question, answer in zip(questions, answers):
    q_tokens = tokenizer.encode_plus(question, add_special_tokens=False, max_length=None)["input_ids"]
    a_tokens = tokenizer.encode_plus(answer, add_special_tokens=False, max_length=None)["input_ids"]
    qna_tokens = tokenizer.encode_plus(combine_texts(question, answer)["text"], add_special_tokens=False, max_length=None)["input_ids"]

    max_q = max(max_q, len(q_tokens))
    max_a = max(max_a, len(a_tokens))
    max_qna = max(max_qna, len(qna_tokens))

buffer = 10
max_seq_length = max_qna + buffer

table_title = "Training Data Token Counts"
print(f"\n{table_title:-^70}")
print(f"{'Measure':<14}{'Question':<14}{'Answer':<14}{'Combined':<14}")

print(f"{'Maximums':<14}{max_q:<14}{max_a:<14}{max_qna:<14}")
print(f"{'Max Seq Len':<14}{'':<14}{'':<14}{max_seq_length:<14}\n")

print(f"Set max_seq_length in FastLanguageModel to {max_seq_length} to handle the maximum number of tokens required by the input training data (Combined Maximum + Buffer).")



----------------------Training Data Token Counts----------------------
Measure       Question      Answer        Combined      
Maximums      33            475           499           
Max Seq Len                               509           

Set max_seq_length in FastLanguageModel to 509 to handle the maximum number of tokens required by the input training data (Combined Maximum + Buffer).


In [13]:
dtype = None 
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(    
    model_name = MODEL_NAME,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # add a Hugging Face access token if using a private or gated model
) 

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.668 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [14]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",

                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Already have LoRA adapters! We shall skip this step.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


In [15]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.668 GB.
16.17 GB of memory reserved.


clear memory if needed.. if not needed.. no worries.. skip

In [16]:
import gc
def clear_cuda_memory():
    torch.cuda.empty_cache()
    gc.collect()
    print("CUDA memory cleared.")

# Call the function to clear memory
clear_cuda_memory()

CUDA memory cleared.


In [17]:
# Logging configuration to wandb
config = {
    "learning_rate": 2e-5,    
    "batch_size": 2,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 20,  # Increased to allow early stopping to take effect
    "warmup_steps": 100,
    "max_seq_length": max_seq_length,  # To be calculated later
}

In [18]:
wandb.config.update(config)

In [19]:
training_args = TrainingArguments(
    per_device_train_batch_size=config['batch_size'],
    gradient_accumulation_steps=config['gradient_accumulation_steps'],
    warmup_steps=config['warmup_steps'],
    num_train_epochs=config['num_train_epochs'],
    learning_rate=config['learning_rate'],
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=5,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="wandb",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,  # Limit to 3 checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # Use evaluation loss to determine the best model
    greater_is_better=False,  # Lower loss is better
)



setup callbacks for WANDB logging

In [20]:
class WandbCallback(TrainerCallback):
    def on_log(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        if state.log_history:
            wandb.log(state.log_history[-1])

    def on_train_begin(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        wandb.init(project="your_project_name", config=args)

    def on_train_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        wandb.finish()

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        if state.log_history:
            wandb.log(state.log_history[-1])

    def on_save(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        wandb.log({"global_step": state.global_step, "saving_checkpoint": True})

    def on_epoch_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        if state.log_history:
            wandb.log(state.log_history[-1])

In [21]:
# Initialize trainer with the enhanced WandbCallback
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Add evaluation dataset for monitoring
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
    callbacks=[WandbCallback(), EarlyStoppingCallback(early_stopping_patience=4)],  # Add early stopping callback
)

Map (num_proc=2):   0%|          | 0/1598 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/178 [00:00<?, ? examples/s]

In [22]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,598 | Num Epochs = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 3,980
 "-____-"     Number of trainable parameters = 1,345,781,760


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112175899971691, max=1.0…

  0%|          | 0/3980 [00:00<?, ?it/s]

{'loss': 4.6367, 'grad_norm': 65.17147064208984, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 4.1585, 'grad_norm': 47.89892578125, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.05}
{'loss': 3.0793, 'grad_norm': 40.855445861816406, 'learning_rate': 3e-06, 'epoch': 0.08}
{'loss': 2.6244, 'grad_norm': 27.010238647460938, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.1}
{'loss': 1.8499, 'grad_norm': 26.952211380004883, 'learning_rate': 5e-06, 'epoch': 0.13}
{'loss': 1.2679, 'grad_norm': 20.45762062072754, 'learning_rate': 6e-06, 'epoch': 0.15}
{'loss': 0.8423, 'grad_norm': 13.93791675567627, 'learning_rate': 7e-06, 'epoch': 0.18}
{'loss': 0.613, 'grad_norm': 7.822998523712158, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.2}
{'loss': 0.6006, 'grad_norm': 13.41559886932373, 'learning_rate': 9e-06, 'epoch': 0.23}
{'loss': 0.498, 'grad_norm': 17.745641708374023, 'learning_rate': 1e-05, 'epoch': 0.25}
{'loss': 0.5708, 'grad_norm': 13.377673149108887, 'learni

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 1.383622169494629, 'eval_runtime': 3.0496, 'eval_samples_per_second': 58.368, 'eval_steps_per_second': 7.542, 'epoch': 1.0}
{'loss': 0.2434, 'grad_norm': 3.4169299602508545, 'learning_rate': 1.9484536082474227e-05, 'epoch': 1.0}
{'loss': 0.216, 'grad_norm': 6.130731582641602, 'learning_rate': 1.945876288659794e-05, 'epoch': 1.03}
{'loss': 0.1959, 'grad_norm': 6.7535552978515625, 'learning_rate': 1.943298969072165e-05, 'epoch': 1.05}
{'loss': 0.2002, 'grad_norm': 4.5460686683654785, 'learning_rate': 1.9407216494845362e-05, 'epoch': 1.08}
{'loss': 0.1855, 'grad_norm': 4.6880784034729, 'learning_rate': 1.9381443298969072e-05, 'epoch': 1.1}
{'loss': 0.1916, 'grad_norm': 4.584428787231445, 'learning_rate': 1.9355670103092786e-05, 'epoch': 1.13}
{'loss': 0.209, 'grad_norm': 3.383728265762329, 'learning_rate': 1.9329896907216497e-05, 'epoch': 1.15}
{'loss': 0.2333, 'grad_norm': 5.4529523849487305, 'learning_rate': 1.9304123711340207e-05, 'epoch': 1.18}
{'loss': 0.1984, 'grad_nor

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 1.7207645177841187, 'eval_runtime': 3.4883, 'eval_samples_per_second': 51.028, 'eval_steps_per_second': 6.593, 'epoch': 2.0}
{'loss': 0.1692, 'grad_norm': 2.5331082344055176, 'learning_rate': 1.8453608247422682e-05, 'epoch': 2.0}
{'loss': 0.1656, 'grad_norm': 3.2688934803009033, 'learning_rate': 1.8427835051546392e-05, 'epoch': 2.03}
{'loss': 0.1575, 'grad_norm': 2.992992401123047, 'learning_rate': 1.8402061855670103e-05, 'epoch': 2.05}
{'loss': 0.1863, 'grad_norm': 2.554945230484009, 'learning_rate': 1.8376288659793817e-05, 'epoch': 2.08}
{'loss': 0.1822, 'grad_norm': 1.718431830406189, 'learning_rate': 1.8350515463917527e-05, 'epoch': 2.1}
{'loss': 0.155, 'grad_norm': 2.428297281265259, 'learning_rate': 1.8324742268041237e-05, 'epoch': 2.13}
{'loss': 0.138, 'grad_norm': 2.4470460414886475, 'learning_rate': 1.8298969072164948e-05, 'epoch': 2.15}
{'loss': 0.1797, 'grad_norm': 2.008721351623535, 'learning_rate': 1.8273195876288662e-05, 'epoch': 2.18}
{'loss': 0.1765, 'grad

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 1.9425874948501587, 'eval_runtime': 3.3503, 'eval_samples_per_second': 53.13, 'eval_steps_per_second': 6.865, 'epoch': 3.0}
{'loss': 0.1465, 'grad_norm': 1.9999428987503052, 'learning_rate': 1.7422680412371137e-05, 'epoch': 3.0}
{'loss': 0.1565, 'grad_norm': 3.188136577606201, 'learning_rate': 1.7396907216494847e-05, 'epoch': 3.03}
{'loss': 0.1265, 'grad_norm': 2.0676491260528564, 'learning_rate': 1.7371134020618557e-05, 'epoch': 3.05}
{'loss': 0.1295, 'grad_norm': 1.6413792371749878, 'learning_rate': 1.734536082474227e-05, 'epoch': 3.08}
{'loss': 0.1632, 'grad_norm': 1.7946549654006958, 'learning_rate': 1.731958762886598e-05, 'epoch': 3.1}
{'loss': 0.1488, 'grad_norm': 2.109586238861084, 'learning_rate': 1.7293814432989692e-05, 'epoch': 3.13}
{'loss': 0.1318, 'grad_norm': 3.126887321472168, 'learning_rate': 1.7268041237113406e-05, 'epoch': 3.15}
{'loss': 0.1423, 'grad_norm': 3.2133989334106445, 'learning_rate': 1.7242268041237116e-05, 'epoch': 3.18}
{'loss': 0.1405, 'gra

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 2.1255507469177246, 'eval_runtime': 3.4799, 'eval_samples_per_second': 51.152, 'eval_steps_per_second': 6.609, 'epoch': 4.0}
{'loss': 0.1408, 'grad_norm': 0.6535236835479736, 'learning_rate': 1.6391752577319588e-05, 'epoch': 4.01}
{'loss': 0.1584, 'grad_norm': 4.755343437194824, 'learning_rate': 1.6365979381443298e-05, 'epoch': 4.03}
{'loss': 0.1324, 'grad_norm': 1.3328018188476562, 'learning_rate': 1.6340206185567012e-05, 'epoch': 4.06}
{'loss': 0.1712, 'grad_norm': 0.5289347171783447, 'learning_rate': 1.6314432989690722e-05, 'epoch': 4.08}
{'loss': 0.147, 'grad_norm': 1.132416844367981, 'learning_rate': 1.6288659793814433e-05, 'epoch': 4.11}
{'loss': 0.1474, 'grad_norm': 1.321629285812378, 'learning_rate': 1.6262886597938147e-05, 'epoch': 4.13}
{'loss': 0.1362, 'grad_norm': 1.2519906759262085, 'learning_rate': 1.6237113402061857e-05, 'epoch': 4.16}
{'loss': 0.1418, 'grad_norm': 0.7034764289855957, 'learning_rate': 1.6211340206185568e-05, 'epoch': 4.18}
{'loss': 0.1298, 

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 2.1527621746063232, 'eval_runtime': 3.4811, 'eval_samples_per_second': 51.134, 'eval_steps_per_second': 6.607, 'epoch': 5.0}
{'train_runtime': 1015.3287, 'train_samples_per_second': 31.477, 'train_steps_per_second': 3.92, 'train_loss': 0.27967734984262194, 'epoch': 5.0}


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval/loss,▁▄▆██
eval/runtime,▁█▆██
eval/samples_per_second,█▁▃▁▁
eval/steps_per_second,█▁▃▁▁
eval_loss,▁▁▄▄▆▆████
eval_runtime,▁▁██▆▆████
eval_samples_per_second,██▁▁▃▃▁▁▁▁
eval_steps_per_second,██▁▁▃▃▁▁▁▁
global_step,▁▃▅▆█

0,1
epoch,4.99625
eval/loss,2.15276
eval/runtime,3.4811
eval/samples_per_second,51.134
eval/steps_per_second,6.607
eval_loss,2.15276
eval_runtime,3.4811
eval_samples_per_second,51.134
eval_steps_per_second,6.607
global_step,998


In [23]:
model.push_to_hub(MODEL_NAME,token=HUGGING_FACE_HUB_TOKEN)
tokenizer.push_to_hub(MODEL_NAME,token=HUGGING_FACE_HUB_TOKEN)

adapter_model.safetensors:   0%|          | 0.00/5.38G [00:00<?, ?B/s]

Saved model to https://huggingface.co/davidbzyk/simpler-gemma-2-2b


In [24]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.668 GB.
19.348 GB of memory reserved.


In [25]:
from transformers import TextStreamer
# Enable native faster inference
FastLanguageModel.for_inference(model)

# Define the question
question = "Who is Raghee Horner and what is her trading style?"

# Format the input
formatted_input = question

# Tokenize the input
inputs = tokenizer(
    [formatted_input],
    return_tensors="pt"
).to("cuda")

# Initialize the text streamer
text_streamer = TextStreamer(tokenizer)

# Generate the output using the model
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<bos>Who is Raghee Horner and what is her trading style?

Raghee Horner is the brains behind Raghee Horner’s DPMR Indicator. She’s a 30+ year veteran of the trading industry and her methods have helped countless traders over the years. Her background includes 15+ years of experience in top trading rooms and/or as a trusted money manager.

Raghee’s a big deal in the trading world. She’s the brains behind Raghee Horner’s DPMR Indicator. Her methods have helped countless traders over the years. Her passion is to help others achieve their full trading potential, one trader at a time.

When she’s not trading


### Replace the below with how you want to save the model to huggingface/locally  p.s. locally you will fill the hard drive real quick.

In [26]:
username = "davidbzyk"
model_name = "simpler-gemma-2-2b"


model.push_to_hub(model_name, token=HUGGING_FACE_HUB_TOKEN)
tokenizer.push_to_hub(model_name, token=HUGGING_FACE_HUB_TOKEN)

No files have been modified since last commit. Skipping to prevent empty commit.


Saved model to https://huggingface.co/simpler-gemma-2-2b


In [27]:
multi_model = 'simpler-gemma-2-2b-multi'
if True:
    model.push_to_hub_gguf(
        multi_model, # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token=HUGGING_FACE_HUB_TOKEN, # Get a token at https://huggingface.co/settings/tokens
    )

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


make: Entering directory '/home/dave/Desktop/simpler/simpler-prod-qa/Step-2-Training/step2b-finetune/llama.cpp'
I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /home/dave/miniconda3/envs/unsloth_env/include  -I/home/dave/miniconda3/envs/unsloth_env/targets/x86_64-linux/include  -L/home/dave/miniconda3/envs/unsloth_env/targets/x86_64-linux/lib -L/home/dave/miniconda3/envs/unsloth_env/targets/x86_64-linux/lib/stubs -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -march

100%|██████████| 26/26 [00:00<00:00, 99.30it/s] 


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting gemma2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at simpler-gemma-2-2b-multi into bf16 GGUF format.
The output location will be ./simpler-gemma-2-2b-multi/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: simpler-gemma-2-2b-multi
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,                 torch.bfloat16 --> BF16,

unsloth.BF16.gguf:   0%|          | 0.00/5.24G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/davidbzyk/simpler-gemma-2-2b-multi
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q4_K_M.gguf:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/davidbzyk/simpler-gemma-2-2b-multi
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q8_0.gguf:   0%|          | 0.00/2.78G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/davidbzyk/simpler-gemma-2-2b-multi
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q5_K_M.gguf:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Saved GGUF to https://huggingface.co/davidbzyk/simpler-gemma-2-2b-multi
