In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
# from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
from sentence_transformers import SentenceTransformer, util
import chromadb
from chromadb.config import Settings
import numpy as np
from typing import List, Dict

# Compute device: "cuda" when torch can see a GPU, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hugging Face Hub identifiers.
# NOTE(review): ADAPTER is not referenced by any of the visible cells.
BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
ADAPTER = "meta-llama/Llama-3.2-3B-Instruct-QLORA_INT4_EO8"

# Local directories: adapter weights (save/load) and the persistent Chroma store.
PEFT_DIR = "./qlora_adapter"
VECTORSTORE_DIR = "./chroma_db"

# Sentence-embedding model, intended for the vector store and for evaluation.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Open (or create) the on-disk Chroma database and fetch the collection,
# creating it on first use.
from chromadb import PersistentClient

client = PersistentClient(path=VECTORSTORE_DIR)
collection = client.get_or_create_collection(name="style_memory")



In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel

# Switch read by load_model_and_tokenizer_for_qlora(): True loads the adapter
# saved under PEFT_DIR, False builds a fresh LoRA adapter for training.
LOAD_MODEL = False

# Hub id of the base causal LM and the checkpoint directory with adapter weights.
BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
PEFT_DIR = "./qlora_adapter/checkpoint-18/"

# "cuda" when torch can see a GPU, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def load_model_and_tokenizer_for_qlora():
    """Load the 4-bit quantized base model and tokenizer with a LoRA adapter attached.

    Behaviour is driven by the module-level settings ``BASE_MODEL``,
    ``PEFT_DIR`` and ``LOAD_MODEL``:

    * ``LOAD_MODEL`` truthy: the adapter stored under ``PEFT_DIR`` is loaded
      on top of the quantized base model.
    * ``LOAD_MODEL`` falsy: the base model is prepared for k-bit training and
      wrapped with a new LoRA adapter (r=8, alpha=32) on ``q_proj``/``v_proj``.

    The base model is loaded with NF4 4-bit quantization, double quantization
    and float16 compute, and placed on devices by ``device_map="auto"``.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model
        and ``tokenizer`` has its pad token set to the EOS token.
    """
    # Tokenizer; the EOS token doubles as the pad token.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token

    # 4-bit quantization config for QLoRA
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load model in 4-bit for QLoRA; device_map="auto" decides weight placement.
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto"
    )

    if LOAD_MODEL:
        print(f"Loading existing LoRA adapter from {PEFT_DIR}")
        # NOTE(review): PeftModel.from_pretrained defaults to is_trainable=False,
        # so the adapter loaded here is frozen. Pass is_trainable=True if this
        # branch is meant to resume fine-tuning rather than run inference.
        model = PeftModel.from_pretrained(model, PEFT_DIR)
    else:
        print("Preparing Model for Training")
        # Prepare for k-bit training (adds norm casting, disables gradients on frozen parts, etc.)
        model = prepare_model_for_kbit_training(model)

        # LoRA adapter configuration (adapt r, alpha, target_modules as needed)
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        # Wrap the model with PEFT
        model = get_peft_model(model, lora_config)

    # Deliberately no model.to(device): device_map="auto" has already placed the
    # quantized weights, and moving a bitsandbytes / accelerate-dispatched model
    # afterwards is at best redundant and can undo that placement.

    model.print_trainable_parameters()
    return model, tokenizer

model, tokenizer = load_model_and_tokenizer_for_qlora()

In [None]:
from datasets import Dataset
import json
from glob import glob

data_dir = "../data/prompt_response/"

# glob returns matches in arbitrary order; sorting makes "the first file"
# the same file on every run and every machine.
data_file_l = sorted(glob(data_dir + "*.json"))
if not data_file_l:
    # Fail with a clear message instead of an IndexError on data_file_l[0].
    raise FileNotFoundError(f"No .json files found in {data_dir!r}")

# Only the first matching file is loaded.
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)

# assumes the JSON is a mapping of column name -> list of values that
# includes a "text" column (read by tokenize_function) — TODO confirm
dataset = Dataset.from_dict(data)

def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch to a fixed length of 128.

    Uses the module-level ``tokenizer``; longer inputs are truncated and
    shorter ones are padded up to 128 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    batch_texts = examples["text"]
    return tokenizer(batch_texts, **encode_options)

# Tokenize the whole dataset, one batch of rows per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

def group_texts(examples):
    """Add a ``labels`` column to a tokenized batch for causal-LM training.

    Labels are a copy of ``input_ids`` with padding positions replaced by
    -100 so they are ignored in the loss.

    Padding is identified from ``attention_mask`` (0 = padding) rather than by
    comparing token ids with ``tokenizer.pad_token_id``: the pad token is set
    to the EOS token in this notebook, so an id comparison would also mask
    genuine EOS tokens that are part of the text. If the batch has no
    ``attention_mask`` column, the id comparison is used as a fallback.

    Args:
        examples: batch mapping with an ``"input_ids"`` list of token-id lists
            and, normally, a parallel ``"attention_mask"`` list.

    Returns:
        The same ``examples`` mapping with ``"labels"`` added.
    """
    input_id_rows = examples["input_ids"]

    if "attention_mask" in examples:
        labels = [
            [token if keep else -100 for token, keep in zip(row, mask)]
            for row, mask in zip(input_id_rows, examples["attention_mask"])
        ]
    else:
        # Fallback: no mask available, so rely on the pad token id.
        pad_id = tokenizer.pad_token_id
        labels = [
            [-100 if token == pad_id else token for token in row]
            for row in input_id_rows
        ]

    examples["labels"] = labels
    return examples

# Attach the labels column to every row.
tokenized_dataset = tokenized_dataset.map(group_texts, batched=True)

# Hold out 10% of the rows for evaluation. The seed is fixed so the split
# is the same after every kernel restart.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split['train']
eval_dataset = split['test']

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./qlora_adapter",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Without an explicit strategy the default is "no", which leaves
    # eval_steps and eval_dataset unused and never runs evaluation.
    # NOTE(review): older transformers releases name this argument
    # `evaluation_strategy` — confirm against the installed version.
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    num_train_epochs=3,
    save_total_limit=2,
    # Enable bf16 only when a CUDA device exists and reports bf16 support;
    # the is_available() guard skips the capability query on CPU-only machines.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    gradient_checkpointing=True,
    report_to="none",
)

# Trainer over the PEFT-wrapped model; checkpoints are written to output_dir.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()