Install these packages, plus any others that turn out to be missing.
Use a Python 3.10.11 kernel if possible.

In [None]:
!pip install torch transformers datasets peft accelerate bitsandbytes xformers huggingface-cli NumPy

Log in to your Hugging Face account

In [None]:
hf authen login

Importing packages needed

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, DataCollatorForSeq2Seq, TrainerCallback
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import torch
import os

Setting your configs and loading the dataset.
Insert the path of your .jsonl file and the directory where you want your model to be saved.

Example:
DATA_PATH = r"K:\ml_datasets\messages\prompt_response.jsonl"
OUTPUT_DIR = r"K:\ml_datasets\messages\llama3_8b_corrbot_qlora_3"

In [None]:
# === CONFIG ===
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"  # Can change if you're using a different model
# Path to your .jsonl file. The r"..." prefix keeps Windows backslashes literal
# and is harmless on macOS/Linux, so it can stay regardless of platform.
DATA_PATH = r""
OUTPUT_DIR = r""  # Directory where checkpoints, logs, and the final model are written
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 8
MAX_LENGTH = 490  # Set this to the token count of your longest conversation (Token_counter.py reports it)
EPOCHS = 3

# Fail fast with a clear message if the placeholders above were left empty,
# rather than hitting an obscure loader error or a failed save after training.
if not DATA_PATH:
    raise ValueError("DATA_PATH is empty: set it to the path of your .jsonl dataset.")
if not OUTPUT_DIR:
    raise ValueError("OUTPUT_DIR is empty: set it to the directory where the model should be saved.")

# === LOAD AND SPLIT DATASET ===
raw_dataset = load_dataset("json", data_files=DATA_PATH)["train"]
# Hold out 10% of the rows as a validation split; the fixed seed keeps the split reproducible.
dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)


Setting up the tokenizer and the tokenization function

I used DataCollatorForSeq2Seq because it works well for input-output sequence pairs. Besides padding the inputs, it pads the labels with -100 during training, a value the loss function ignores. Without this, the padded positions would count toward the loss and make it inaccurate, so they are effectively filtered out of the loss computation.

In [None]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
# NOTE: the name shadows the built-in format(); it is kept so existing references keep working.
def format(example):
    """Tokenize one dataset row and attach labels for causal-LM training.

    Args:
        example: A dataset row; only its ``"raw_prompt"`` text is tokenized.

    Returns:
        The tokenizer output with an added ``"labels"`` entry that is a copy
        of ``"input_ids"``.
    """
    # NOTE(review): "raw_response" is never tokenized here — confirm that
    # "raw_prompt" already contains the full conversation to learn from.
    # truncation=True on its own only caps at the tokenizer's model maximum,
    # so MAX_LENGTH must be passed explicitly for the configured limit to apply.
    tokens = tokenizer(example["raw_prompt"], truncation=True, max_length=MAX_LENGTH)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Split and tokenize datasets
train_dataset = dataset["train"]
val_dataset = dataset["test"]

# Drop every original column after tokenizing, whatever the dataset's schema is,
# so that only the tokenizer outputs and labels are handed to the data collator.
tokenized_train = train_dataset.map(format, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(format, remove_columns=val_dataset.column_names)

# Batch collator: pads inputs and labels per batch and returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,  # padded length is rounded up to a multiple of 8
    return_tensors="pt",
)


Loading the Model and applying LoRA

In [None]:
# === LOAD MODEL ===
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the quantized base model, then prepare it for k-bit training.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=bnb_config,
    )
)
model.gradient_checkpointing_enable()

# === LoRA CONFIGURATION ===
# Modules that receive LoRA adapters.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the prepared base model with the LoRA adapters.
model = get_peft_model(model, lora_config)

Defining training arguments and a callback to check your GPU status while training

In [None]:
# Training setup
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=EPOCHS,
    learning_rate=2e-4,
    bf16=True,
    optim="paged_adamw_8bit",
    save_total_limit=3,
    save_strategy="steps",  # For smaller runs you can change this to "epoch", or even "no"
    save_steps=1000,  # Only meaningful with save_strategy="steps"; can be deleted otherwise
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    report_to="none",
    remove_unused_columns=False,
    logging_steps=100,
    save_safetensors=True,
    # Resuming is controlled solely by trainer.train(resume_from_checkpoint=...).
    # The TrainingArguments field of the same name is not read by Trainer, so it
    # is deliberately left unset here to avoid a misleading, contradictory value.
    load_best_model_at_end=False,
)

class MemoryClearCallback(TrainerCallback):
    """Trainer callback that reports and releases cached CUDA memory.

    At the end of every training step it calls ``torch.cuda.empty_cache()``.
    Every ``summary_every`` global steps it also prints
    ``torch.cuda.memory_summary()`` so GPU usage can be watched during training.

    Args:
        summary_every: Number of global steps between memory-summary printouts.
            Defaults to 50.

    Raises:
        ValueError: If ``summary_every`` is not a positive number.
    """

    def __init__(self, summary_every=50):
        super().__init__()
        if summary_every <= 0:
            raise ValueError("summary_every must be a positive integer")
        self.summary_every = summary_every

    def on_step_end(self, args, state, control, **kwargs):
        """Print a periodic memory summary, then release cached CUDA memory."""
        # Nothing to report or free without a CUDA device; skipping also avoids
        # an error from memory_summary() on CPU-only machines.
        if not torch.cuda.is_available():
            return
        if state.global_step % self.summary_every == 0:
            print(torch.cuda.memory_summary())
        # Runs on every step, not only on summary steps.
        torch.cuda.empty_cache()


Setting up Trainer and Fine-tuning the model

If you have everything installed, it should work; if not, install whatever is still necessary to run the model.

Keep `resume_from_checkpoint=False` in the `trainer.train(...)` call unless your training crashed
and you want to resume from wherever your checkpoints are. In that case you can set it to True.

In [None]:
# === TRAINER SETUP ===
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[MemoryClearCallback()],
)

# Fresh run: no checkpoint is resumed.
trainer.train(resume_from_checkpoint=False)

# Write the trained model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)
print(f"Model and tokenizer saved to {OUTPUT_DIR}")