In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
import os
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

# Path to the JSON file
json_file_path = '/content/qlora_formatted_data.json'

model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load the JSON data
def load_json_dataset(json_file_path):
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    return Dataset.from_list(data)

data = load_json_dataset(json_file_path)


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    token = "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"
)
model.config.use_cache = False

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token = "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")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(example):
    instruction = str(example['Instruction'])
    input_data = json.dumps(example['Input'])  # Ensure Input is serialized to a JSON string
    response = str(example['Response'])
    prompt = instruction + "\nInput: " + input_data + "\nResponse: " + response

    # Tokenize while respecting max sequence length
    tokens = tokenizer(
        prompt,
        truncation=True,  # Truncate sequences longer than max_length
        padding="max_length",  # Pad sequences to max_length
        max_length=512,  # Ensure length does not exceed model's max input size
        return_tensors="pt"  # Return PyTorch tensors for consistency
    )
    return {
        "input_ids": tokens["input_ids"][0],  # Extract the token IDs
        "attention_mask": tokens["attention_mask"][0],  # Extract attention masks
        "labels": tokens["input_ids"][0]  # For causal language modeling, input IDs are the labels
    }


print("Tokenizing data...")
tokenized_data = data.map(tokenize_function)




# Split data into train and evaluation sets
data = tokenized_data.train_test_split(test_size=0.2)
train_data = data['train']
eval_data = data['test']

# Load the model
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto")

# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Task type for causal language modeling
    inference_mode=False,
    r=16,  # LoRA rank
    lora_alpha=32,  # Alpha scaling
    lora_dropout=0.05  # Dropout rate
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Define training arguments
output_dir = "qlora_model"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=20,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=500,
    report_to="tensorboard"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer
)

# Train the model
print("Training the model...")
trainer.train()

# Save the LoRA fine-tuned model
print("Saving the fine-tuned model...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

import math

# Function to calculate perplexity
def calculate_perplexity(eval_results):
    loss = eval_results["eval_loss"]  # Access the evaluation loss directly from the dictionary
    perplexity = math.exp(loss)
    return perplexity

# Updated evaluation function
def evaluate_model(trainer, eval_data):
    print("Evaluating the model...")
    eval_results = trainer.evaluate(eval_data)  # Returns a dictionary
    print(f"Evaluation Results: {eval_results}")  # Log the raw results for debugging
    perplexity = calculate_perplexity(eval_results)
    print(f"Perplexity: {perplexity}")
    return perplexity

# Evaluate the model
evaluate_model(trainer, eval_data)

print(f"Model fine-tuned and saved to {output_dir}.")


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading tokenizer...
Tokenizing data...


Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  trainer = Trainer(


Training the model...


Step,Training Loss,Validation Loss



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.


Saving the fine-tuned model...



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.


Evaluating the model...


Evaluation Results: {'eval_loss': 0.3493233323097229, 'eval_runtime': 0.3531, 'eval_samples_per_second': 14.159, 'eval_steps_per_second': 2.832, 'epoch': 20.0}
Perplexity: 1.4181076362402678
Model fine-tuned and saved to qlora_model.


Perplexity (1.42):
A metric often used for language models to measure how well the model predicts a sample.
It is calculated as
$e^{evalloss}$

Interpretation:
Lower perplexity indicates that the model is more confident in its predictions.
A perplexity of 1.42 means that on average, the model is about 2.28 times uncertain about its predictions.