In [None]:
import json
import os
import wandb
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback, TrainerState, TrainerControl, EarlyStoppingCallback
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from datasets import Dataset

In [None]:
# Prefer credentials supplied through environment variables so real keys never
# have to be typed into (and saved with) the notebook. When a variable is not
# set, the original placeholder is kept and must be replaced by hand.
HUGGING_FACE_HUB_TOKEN = os.environ.get('HUGGING_FACE_HUB_TOKEN', '<INSERT HUGGING FACE API KEY>')
WANDB_API_KEY = os.environ.get('WANDB_API_KEY', '<INSERT WANDB API KEY>')

In [None]:
# Initialize wandb
# WANDB_API_KEY was previously defined but never used. Log in with it when it
# has actually been filled in; while it is still the "<INSERT ...>" placeholder,
# skip the explicit login and let wandb.init handle authentication itself.
if WANDB_API_KEY and not WANDB_API_KEY.startswith('<INSERT'):
    wandb.login(key=WANDB_API_KEY)
wandb.init(project="simpler")

Remember to pull your own repo/model that you saved in the previous step.

In [None]:
# Hugging Face Hub repo id of the model to load. The same id is reused later in
# this notebook as the destination of model.push_to_hub / tokenizer.push_to_hub.
MODEL_NAME = 'davidbzyk/simpler-gemma-2-2b'

In [None]:
# Relative path to the JSONL training file. load_data_from_jsonl reads it line
# by line, taking the 'question' and 'answer' keys of each JSON object.
TRAINING_DATA_PATH ='../../Step-1-Data-Processing/finetuning/all-qa-final.jsonl'

In [None]:
# Load the model and tokenizer with only the model name specified. In the cells
# that follow, just `tokenizer` is used (EOS token, token counting); both names
# are rebound when the model is loaded again with max_seq_length set.
# NOTE(review): this loads the whole model only to obtain a tokenizer - the
# imported-but-unused AutoTokenizer might be sufficient here; confirm before changing.
model, tokenizer = FastLanguageModel.from_pretrained(    
    model_name = MODEL_NAME,
    
)

In [None]:
# End-of-sequence token of the loaded tokenizer; combine_texts appends it to
# every training example.
EOS_TOKEN = tokenizer.eos_token

In [None]:
def combine_texts(question, answer):
    """Build one training record from a question/answer pair.

    The record text is "###" + question + "@@@" + answer, followed by the
    module-level EOS_TOKEN.

    Returns:
        dict: a single-key mapping ``{"text": <formatted string>}``.
    """
    formatted = f"###{question}@@@{answer}{EOS_TOKEN}"
    return dict(text=formatted)

def load_data_from_jsonl(file_path):
    """Read question/answer pairs from a JSON Lines file.

    Blank lines are skipped. A line that is not valid JSON, or whose object
    lacks a 'question' or 'answer' key, is reported with print() and skipped
    as a whole, so the two returned lists always stay index-aligned.

    Args:
        file_path: Path of the UTF-8 encoded .jsonl file.

    Returns:
        tuple[list, list]: (questions, answers); element i of each list comes
        from the same input line.

    Raises:
        FileNotFoundError: if file_path is not an existing file.
        Exception: wrapping any other error raised while reading the file
            (the original error is chained as __cause__).
    """
    questions = []
    answers = []

    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                if not line.strip():  # Skip empty lines
                    continue
                try:
                    entry = json.loads(line)
                    # Read BOTH fields before storing either one: appending the
                    # question first would leave an orphaned question when
                    # 'answer' is missing and shift every later pair.
                    question = entry['question']
                    answer = entry['answer']
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON on line {line_number}: {e}")
                    print(f"Problematic line: {line}")
                    continue
                except KeyError as e:
                    print(f"Missing key in JSON entry on line {line_number}: {e}")
                    continue
                questions.append(question)
                answers.append(answer)
                print(f"Successfully parsed line {line_number}")
    except Exception as e:
        # Keep the original exception type/message for callers, but chain the
        # cause so the real traceback is not lost.
        raise Exception(f"Unexpected error while reading the file: {e}") from e

    return questions, answers

In [None]:
# Load data from JSONL file
# Start from empty lists so both names exist even when loading fails; a failure
# is reported and the notebook continues with no data.
questions, answers = [], []
try:
    questions, answers = load_data_from_jsonl(TRAINING_DATA_PATH)
except Exception as e:
    print(f"Failed to load data: {e}")
else:
    print(f"Total parsed entries: {len(questions)}")

In [None]:
# Split the data into training and evaluation sets
# The first 90% of the pairs (in file order) go to training, the remainder to
# evaluation; questions and answers are cut at the same index.
train_size = int(0.9 * len(questions))
train_questions, eval_questions = questions[:train_size], questions[train_size:]
train_answers, eval_answers = answers[:train_size], answers[train_size:]

In [None]:
# Prepare the fine-tuning training dataset
def _to_text_dataset(records):
    """Wrap the "text" field of each combined record in a one-column Dataset."""
    return Dataset.from_dict({"text": [record["text"] for record in records]})


def _show_first_record(dataset, header, empty_message):
    """Print the first record's text under `header`, or `empty_message` when there is none."""
    if len(dataset) > 0:
        print(header)
        print(dataset[0]['text'])
        return
    print(empty_message)


if not (train_questions and train_answers):
    print("Failed to create the fine-tuning datasets.")
else:
    combined_texts_train = [combine_texts(q, a) for q, a in zip(train_questions, train_answers)]
    combined_texts_eval = [combine_texts(q, a) for q, a in zip(eval_questions, eval_answers)]

    # Create the fine-tuning datasets
    train_dataset = _to_text_dataset(combined_texts_train)
    eval_dataset = _to_text_dataset(combined_texts_eval)

    # Display one example from each split
    _show_first_record(
        train_dataset,
        "Example training record:\n",
        "The fine-tuning training dataset is empty.",
    )
    _show_first_record(
        eval_dataset,
        "Example evaluation record:\n",
        "The fine-tuning evaluation dataset is empty.",
    )

In [None]:
def _token_count(text):
    """Number of token ids the tokenizer produces for `text` (no special tokens added)."""
    encoded = tokenizer.encode_plus(text, add_special_tokens=False, max_length=None)
    return len(encoded["input_ids"])


# Longest question, answer and combined training text, measured in tokens.
# Only aligned (question, answer) pairs are measured; every maximum is 0 when
# there are no pairs.
qa_pairs = list(zip(questions, answers))
max_q = max((_token_count(q) for q, _ in qa_pairs), default=0)
max_a = max((_token_count(a) for _, a in qa_pairs), default=0)
max_qna = max((_token_count(combine_texts(q, a)["text"]) for q, a in qa_pairs), default=0)

# Head-room added on top of the longest combined example
buffer = 10
max_seq_length = max_qna + buffer

table_title = "Training Data Token Counts"
print(f"\n{table_title:-^70}")
print(f"{'Measure':<14}{'Question':<14}{'Answer':<14}{'Combined':<14}")

print(f"{'Maximums':<14}{max_q:<14}{max_a:<14}{max_qna:<14}")
print(f"{'Max Seq Len':<14}{'':<14}{'':<14}{max_seq_length:<14}\n")

print(f"Set max_seq_length in FastLanguageModel to {max_seq_length} to handle the maximum number of tokens required by the input training data (Combined Maximum + Buffer).")


In [None]:
# Load the model and tokenizer again, this time with the sequence length that
# was computed from the training data and with 4-bit loading enabled.
dtype = None  # no explicit dtype is requested
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # add a Hugging Face access token if using a private or gated model
)

In [None]:
# Wrap the loaded model with LoRA adapters via Unsloth's get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    r=128,  # Any number > 0; suggested values: 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "embed_tokens", "lm_head",  # Added for continual pretraining
    ],
    lora_alpha=32,
    lora_dropout=0,  # Any value is supported, but 0 is optimized
    bias="none",  # Any value is supported, but "none" is optimized
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=True,  # Rank-stabilized LoRA
    loftq_config=None,  # LoftQ left disabled
)

In [None]:
#@title Show current memory stats
def _bytes_to_gib(n_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(n_bytes / 1024 / 1024 / 1024, 3)


# Properties of GPU 0 and the peak memory reserved by the CUDA allocator so far
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gib(gpu_stats.total_memory)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

Clear CUDA memory if needed. If it is not needed, no worries: just skip the next cell.

In [None]:
import gc
def clear_cuda_memory():
    """Release unused GPU memory held by this process.

    Runs the Python garbage collector first so that tensors which are only
    kept alive by reference cycles are actually freed, and only then asks
    PyTorch to return its cached CUDA blocks. Doing it in the opposite order
    would leave the memory freed by the collector sitting in the cache.
    """
    gc.collect()
    torch.cuda.empty_cache()
    print("CUDA memory cleared.")

# Call the function to clear memory
clear_cuda_memory()

In [None]:
# Logging configuration to wandb
# The same dict also supplies the batch size, gradient accumulation, warmup,
# epoch count and learning rate passed to TrainingArguments below.
config = {
    "learning_rate": 2e-5,    
    "batch_size": 2,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 20,  # Increased to allow early stopping to take effect
    "warmup_steps": 100,
    "max_seq_length": max_seq_length,  # Computed earlier: longest combined example in tokens + buffer
}

In [None]:
# Record the hyperparameters from `config` on the wandb run's config.
wandb.config.update(config)

In [None]:
# bf16 is used when the GPU reports support for it; otherwise fp16 is used
bf16_supported = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    report_to="wandb",
    # Values shared with the wandb `config` dict
    learning_rate=config['learning_rate'],
    num_train_epochs=config['num_train_epochs'],
    warmup_steps=config['warmup_steps'],
    per_device_train_batch_size=config['batch_size'],
    gradient_accumulation_steps=config['gradient_accumulation_steps'],
    # Optimizer and schedule
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # Mixed precision: exactly one of the two flags is enabled
    bf16=bf16_supported,
    fp16=not bf16_supported,
    # Logging, evaluation and checkpointing
    logging_steps=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,  # Limit to 3 checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # Evaluation loss decides which model is best
    greater_is_better=False,  # Lower loss is better
)

Set up the callbacks for W&B (Weights & Biases) logging.

In [None]:
class WandbCallback(TrainerCallback):
    """Trainer callback that forwards the trainer's log history to wandb.

    Every entry of ``state.log_history`` is sent to ``wandb.log`` exactly once,
    regardless of which hook (log, evaluate, epoch end) notices it first.
    Checkpoint saves are logged as a separate marker, and the wandb run is
    finished when training ends.

    NOTE(review): TrainingArguments in this notebook also sets
    report_to="wandb", which presumably enables the Trainer's built-in W&B
    reporting as well - confirm that both are wanted.
    """

    def __init__(self):
        # Number of state.log_history entries already sent to wandb
        self._num_logged = 0

    def _log_new_entries(self, state: TrainerState):
        """Send to wandb the log-history entries that have not been sent yet."""
        history = state.log_history
        for entry in history[self._num_logged:]:
            wandb.log(entry)
        self._num_logged = len(history)

    def on_log(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        self._log_new_entries(state)

    def on_train_begin(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        # Entries already present when training starts are not ours to log.
        self._num_logged = len(state.log_history)
        # Reuse the run started earlier in the notebook. Only create one when
        # none is active, and use the notebook's own project name rather than
        # a placeholder, so an existing run is never replaced.
        if wandb.run is None:
            wandb.init(project="simpler", config=args)

    def on_train_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        wandb.finish()

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        # No-op when on_log has already forwarded the evaluation metrics.
        self._log_new_entries(state)

    def on_save(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        wandb.log({"global_step": state.global_step, "saving_checkpoint": True})

    def on_epoch_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        # No-op when the latest entry was already forwarded by on_log.
        self._log_new_entries(state)

In [None]:
# Callbacks handed to the trainer: the custom wandb logger defined above, plus
# early stopping with a patience of 4.
training_callbacks = [
    WandbCallback(),
    EarlyStoppingCallback(early_stopping_patience=4),
]

# Initialize the trainer on the "text" column of the prepared datasets
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Evaluation dataset for monitoring
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,
    callbacks=training_callbacks,
)

In [None]:
# Run training; the value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

In [None]:
# Upload the trained model and its tokenizer to the Hugging Face Hub repo MODEL_NAME.
# NOTE(review): MODEL_NAME is also the repo the model was loaded from, so this
# presumably updates that same repo - confirm this is intended.
model.push_to_hub(MODEL_NAME,token=HUGGING_FACE_HUB_TOKEN)
tokenizer.push_to_hub(MODEL_NAME,token=HUGGING_FACE_HUB_TOKEN)

In [None]:
#@title Show current memory stats
# Same report as before training. Note that start_gpu_memory is overwritten
# here with the peak reserved memory measured at this point.
bytes_per_gib = 1024 * 1024 * 1024
gpu_stats = torch.cuda.get_device_properties(0)
peak_reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(peak_reserved_bytes / bytes_per_gib, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
from transformers import TextStreamer
# Enable native faster inference
FastLanguageModel.for_inference(model)

# Define the question
question = "Who is Raghee Horner?"

# Format the input with the same "###<question>@@@" template that combine_texts
# used for the training examples, so the model is prompted to continue with the
# answer. Previously the raw question was sent without the template.
formatted_input = f"###{question}@@@"

# Tokenize the input
inputs = tokenizer(
    [formatted_input],
    return_tensors="pt"
).to("cuda")

# Initialize the text streamer (prints tokens as they are generated)
text_streamer = TextStreamer(tokenizer)

# Generate the output using the model
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

### Replace the cells below with however you want to save the model (to Hugging Face or locally). P.S. Saving locally will fill up your hard drive very quickly.

In [None]:
# Hugging Face account name and base repo name; the export cell below combines
# them into "<username>/<model_name>-<suffix>" repo ids.
username = "davidbzyk"
model_name = "simpler-gemma-2-2b"

In [None]:
# Export the trained model to the Hugging Face Hub.
# Get a token at https://huggingface.co/settings/tokens and change `username`
# to your own account before running this cell.
# To save locally instead of pushing, use e.g.:
#   model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")

# Merged 16-bit weights. "merged_16bit" is a save method of push_to_hub_merged,
# not a GGUF quantization method, so it cannot be passed to push_to_hub_gguf.
model.push_to_hub_merged(
    f"{username}/{model_name}-f16",
    tokenizer,
    save_method="merged_16bit",
    token=HUGGING_FACE_HUB_TOKEN,
)

# GGUF exports: one Hub repo per quantization, named "<model_name>-<quantization>"
gguf_quantization_methods = ("f16", "q4_k_m", "q5_k_m", "q8_0", "q5_0", "q5_1")
for quantization_method in gguf_quantization_methods:
    model.push_to_hub_gguf(
        f"{username}/{model_name}-{quantization_method}",
        tokenizer,
        quantization_method=quantization_method,
        token=HUGGING_FACE_HUB_TOKEN,
    )

# Save to multiple GGUF options - much faster if you want multiple!
# (all quantizations then go to a single repo)
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "", # Get a token at https://huggingface.co/settings/tokens
    )