<a href="https://colab.research.google.com/github/deepkapha/LLM-fine-tuning/blob/main/Qlora_with_dataset_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## It is recommended to use A-100 with GPU RAM of 40GB or more to run this nb

In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.3 MB/s[0m eta [36m0:

In [3]:
import os
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

# Path to the JSON file
json_file_path = '/content/qlora_formatted_data.json'

model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load the JSON data
def load_json_dataset(json_file_path):
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    return Dataset.from_list(data)

data = load_json_dataset(json_file_path)


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    token = hf_token
)
model.config.use_cache = False

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token = hf_token)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(example):
    instruction = str(example['Instruction'])
    input_data = json.dumps(example['Input'])  # Ensure Input is serialized to a JSON string
    response = str(example['Response'])
    prompt = instruction + "\nInput: " + input_data + "\nResponse: " + response

    # Tokenize while respecting max sequence length
    tokens = tokenizer(
        prompt,
        truncation=True,  # Truncate sequences longer than max_length
        padding="max_length",  # Pad sequences to max_length
        max_length=512,  # Ensure length does not exceed model's max input size
        return_tensors="pt"  # Return PyTorch tensors for consistency
    )
    return {
        "input_ids": tokens["input_ids"][0],  # Extract the token IDs
        "attention_mask": tokens["attention_mask"][0],  # Extract attention masks
        "labels": tokens["input_ids"][0]  # For causal language modeling, input IDs are the labels
    }


print("Tokenizing data...")
tokenized_data = data.map(tokenize_function)




# Split data into train and evaluation sets
data = tokenized_data.train_test_split(test_size=0.2)
train_data = data['train']
eval_data = data['test']

# Load the model
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto", llm_int8_enable_fp32_cpu_offload=True)


# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Task type for causal language modeling
    inference_mode=False,
    r=16,  # LoRA rank
    lora_alpha=32,  # Alpha scaling
    lora_dropout=0.05  # Dropout rate
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Define training arguments
output_dir = "qlora_model"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=20,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=500,
    report_to="tensorboard"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer
)

# Train the model
print("Training the model...")
trainer.train()

# Save the LoRA fine-tuned model
print("Saving the fine-tuned model...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

import math

# Function to calculate perplexity
def calculate_perplexity(eval_results):
    loss = eval_results["eval_loss"]  # Access the evaluation loss directly from the dictionary
    perplexity = math.exp(loss)
    return perplexity

# Updated evaluation function
def evaluate_model(trainer, eval_data):
    print("Evaluating the model...")
    eval_results = trainer.evaluate(eval_data)  # Returns a dictionary
    print(f"Evaluation Results: {eval_results}")  # Log the raw results for debugging
    perplexity = calculate_perplexity(eval_results)
    print(f"Perplexity: {perplexity}")
    return perplexity

# Evaluate the model
evaluate_model(trainer, eval_data)

print(f"Model fine-tuned and saved to {output_dir}.")


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Tokenizing data...


Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Loading model...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training the model...


Step,Training Loss,Validation Loss


Saving the fine-tuned model...
Evaluating the model...


Evaluation Results: {'eval_loss': 0.3637350797653198, 'eval_runtime': 0.3359, 'eval_samples_per_second': 14.886, 'eval_steps_per_second': 2.977, 'epoch': 20.0}
Perplexity: 1.4386930247741292
Model fine-tuned and saved to qlora_model.


Perplexity (1.42):
A metric often used for language models to measure how well the model predicts a sample.
It is calculated as
$e^{evalloss}$

Interpretation:
Lower perplexity indicates that the model is more confident in its predictions.
A perplexity of 1.42 means that on average, the model is about 2.28 times uncertain about its predictions.