In [7]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from sentence_transformers import SentenceTransformer, util
import chromadb
from chromadb.config import Settings
import numpy as np
from typing import List, Dict

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Base model and tokenizer (QLoRA-ready model)
BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
ADAPTER = "meta-llama/Llama-3.2-3B-Instruct-QLORA_INT4_EO8"
PEFT_DIR = "./qlora_adapter/checkpoint-18/"
VECTORSTORE_DIR = "./chroma_db"

# Sentence embedding model for vector store and evaluation
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Setup Chroma client and collection
from chromadb import PersistentClient

client = PersistentClient(path=VECTORSTORE_DIR)
collection = client.get_or_create_collection(name="style_memory")


LOAD_MODEL = True


from datasets import Dataset
import json
from glob import glob


def load_model_and_tokenizer_for_qlora():
    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token
    # 4-bit quantization config for QLoRA
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load model in 4-bit for QLoRA
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto"
    )

    # If adapter weights already exist, load them
    if LOAD_MODEL:
        print(f"Loading existing LoRA adapter from {PEFT_DIR}")
        # Load the fine-tuned QLoRA adapter weights with PeftModel.from_pretrained
        model = PeftModel.from_pretrained(model, PEFT_DIR)
    else:
        print("Preparing Model for Training")
        # Prepare for k-bit training (adds norm casting, disables gradients on frozen parts, etc.)
        model = prepare_model_for_kbit_training(model)

        # LoRA adapter configuration (adapt r, alpha, target_modules as needed)
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        # Wrap the model with PEFT
        model = get_peft_model(model, lora_config)

    model.to(device)

    model.print_trainable_parameters()
    return model, tokenizer

model, tokenizer = load_model_and_tokenizer_for_qlora()

data_dir = "../data/prompt_response/"
data_file_l = glob(data_dir + "*.json")
data_file_l[0]
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)
dataset = Dataset.from_dict(data)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

def group_texts(examples):
    # labels are input_ids with padding tokens masked as -100 to ignore in loss
    labels = []
    for input_ids in examples["input_ids"]:
        label = input_ids.copy()
        # Mask padding tokens
        label = [-100 if token == tokenizer.pad_token_id else token for token in label]
        labels.append(label)
    examples["labels"] = labels
    return examples

tokenized_dataset = tokenized_dataset.map(group_texts, batched=True)

split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split['train']
eval_dataset = split['test']

# from transformers import Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./qlora_adapter",
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     eval_steps=100,
#     save_steps=100,
#     logging_steps=10,
#     learning_rate=2e-4,
#     num_train_epochs=3,
#     save_total_limit=2,
#     bf16=True if torch.cuda.is_bf16_supported() else False,
#     gradient_checkpointing=False,
#     report_to="none",
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
# )

# trainer.train()


def generate_with_context(user_input: str, top_k: int = 10, max_new_tokens: int = 50) -> str:
    # Embed query and retrieve from vector DB
    query_vec = embedder.encode(user_input).tolist()
    results = collection.query(query_embeddings=[query_vec], n_results=top_k)
    print(f"results: {results}")

    docs = results.get("documents", [[]])[0]
    if not docs:
        context = "No relevant context found."
    else:
        # Optionally truncate context length for model input token limits
        # Here we join and limit length (e.g., first 1000 chars)
        context = "\n".join(docs)
        context = context[:1000]

    # Improved prompt with explicit instruction and clear delimiters
    prompt = f"""You are the person in the contextual statements. Use the context to answer the question briefly and only once.

Context:
{context}

Q: {user_input}
A:"""

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(device)

    # Generate answer
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Set True to enable sampling
            # temperature=0.7,  # Uncomment if do_sample=True
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract text after last "A:" in case prompt or output has multiple
    answer = decoded.split("A:")[-1].strip()
    return answer


Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.45s/it]


Loading existing LoRA adapter from ./qlora_adapter/checkpoint-18/
trainable params: 0 || all params: 3,215,043,584 || trainable%: 0.0000


Map: 100%|██████████| 46/46 [00:00<00:00, 2004.82 examples/s]
Map: 100%|██████████| 46/46 [00:00<00:00, 2432.46 examples/s]


In [10]:
generate_with_context(user_input=input())

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


results: {'ids': [['doc_28', 'doc_32', 'doc_4', 'doc_18', 'doc_17', 'doc_45', 'doc_34', 'doc_19', 'doc_15', 'doc_14']], 'embeddings': None, 'documents': [['I love YOU!!!', 'I love you too sweetie.... Thanks for your help!', 'Love you both ..... I am fine', 'I love you Evan Woods!', 'Ditto!!!!', 'You are a good person', 'Thanks for bringing me to work tonight! Sweet dreams handsome!!! Love u bunches!!!!', 'Good night', "I'm bringing breakfast home!", "If it's going to b this cold, I want snow!!!!"]], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None, None, None, None, None, None, None, None, None]], 'distances': [[0.6161439418792725, 1.017336368560791, 1.1762645244598389, 1.238104224205017, 1.344528317451477, 1.3923487663269043, 1.42818284034729, 1.4650744199752808, 1.4800461530685425, 1.5293219089508057]]}


'I love you, sweetie!'