# Fine-tuning LLMs on personal data

In [1]:
from glob import glob
import json

In [2]:
data_dir = "../data/prompt_response/"
data_file_l = glob(data_dir + "*.json")
data_file_l[0]
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)

In [3]:
data['text']

['Where r u?',
 'Where r u?',
 "How much was lowes? I'll reimburse you if I can....",
 "Ms Melissa just passed!!!! I bet she's bitching God out for taking her away from her family, about now!",
 'Love you both ..... I am fine',
 "It's ok now ... everything is ok now ... she's no longer struggling and no longer in pain ....",
 "It's ok now ... everything is ok now ... she's no longer struggling and no longer in pain ....",
 'Me too Evan ... me too',
 'Me too Evan ... me too',
 'Yes .... Travis is a mess right now .... Those things will b determined over the next couple of days ....',
 "Of course honey .... I'm helping Travis with some arrangements & I'll let you know when I'm leaving here.... It shouldn't be much longer....",
 'Evan call me plz',
 'Thanks for sharing!!! It looks like you to are great pals!!!!',
 "I also saw Lindsay little belly!!!! Love it! Lindsay you look beautiful! I just can't wait to see your little girl!!!",
 "If it's going to b this cold, I want snow!!!!",
 "I'm 

## [Tutorial Link ](https://learn.deeplearning.ai/courses/finetuning-large-language-models/lesson/vl60i/training-process)

### Data preparation


In [None]:
import pandas as pd
import datasets
from pprint import pprint
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

In [None]:
text = "Hi, how are you?"

In [None]:
encoded_text = tokenizer(text)["input_ids"]

In [None]:
decoded_text = tokenizer.decode(encoded_text)

In [None]:
decoded_text

In [None]:
# Tokenize multiple texts at once
encoded_texts = tokenizer(data['text'])

In [None]:
print("Encoded several texts: ", encoded_texts)

### Padding and truncation

In [None]:
tokenizer.pad_token = tokenizer.eos_token
encoded_texts_longest = tokenizer(data['text'], max_length=3, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])

In [None]:
encoded_texts_truncation = tokenizer(data['text'], max_length=3, truncation=True)
print("Using truncation: ", encoded_texts_truncation["input_ids"])

In [None]:
tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(data["text"], max_length=3, truncation=True)
print("Uing left-side truncation: ", encoded_texts_truncation_left["input_ids"])


In [None]:
encoded_texts_both = tokenizer(data["text"], max_length=3, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])

### Generate Question Answer Pairs

In [None]:
from transformers import pipeline

# Load the FLAN-T5 generator
generator = pipeline("text2text-generation", model="google/flan-t5-base", max_length=256, do_sample=True, top_p=0.95)

# Your source text
raw_text = data["text"][0].strip()

# Better Prompt Engineering
question_prompt = f"""Given the following passage, generate a detailed, insightful, and specific question that tests comprehension:

Passage:
\"\"\"{raw_text}\"\"\"

Question:"""

# Generate a question
question_output = generator(question_prompt, max_length=100, num_return_sequences=1)[0]
question = question_output["generated_text"].strip()

# Better Answer Prompt
answer_prompt = f"""Given the following passage and question, provide an accurate and complete answer strictly based on the passage content.

Passage:
\"\"\"{raw_text}\"\"\"

Question:
{question}

Answer:"""

# Generate an answer
answer_output = generator(answer_prompt, max_length=150, num_return_sequences=1)[0]
answer = answer_output["generated_text"].strip()

# Print results
print("Generated Question:")
print(question)
print("\nGenerated Answer:")
print(answer)


In [None]:
data["text"][-1]

### Prepare instruction dataset

In [1]:
import pandas as pd

# filename = "lamini_docs.jsonl"
filename = data_file_l[0]
instruction_dataset_df = pd.read_json(filename, lines=False)
examples = instruction_dataset_df.to_dict()


NameError: name 'data_file_l' is not defined

In [2]:

if "question" in examples and "answer" in examples:
  text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
  text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
  text = examples["input"][0] + examples["output"][0]
else:
  text = examples["text"][0]

prompt_template = """### Question:
{question}

### Answer:"""

num_examples = len(examples["question"])
finetuning_dataset = []
for i in range(num_examples):
  question = examples["question"][i]
  answer = examples["answer"][i]
  text_with_prompt_template = prompt_template.format(question=question)
  finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})

from pprint import pprint
print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

NameError: name 'examples' is not defined

### Creating a HuggingFace Dataset

In [4]:
from datasets import Dataset
import json
from glob import glob

data_dir = "../data/prompt_response/"
data_file_l = glob(data_dir + "*.json")
data_file_l[0]
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)
dataset = Dataset.from_dict(data)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
data['text']

['Where r u?',
 'Where r u?',
 "How much was lowes? I'll reimburse you if I can....",
 "Ms Melissa just passed!!!! I bet she's bitching God out for taking her away from her family, about now!",
 'Love you both ..... I am fine',
 "It's ok now ... everything is ok now ... she's no longer struggling and no longer in pain ....",
 "It's ok now ... everything is ok now ... she's no longer struggling and no longer in pain ....",
 'Me too Evan ... me too',
 'Me too Evan ... me too',
 'Yes .... Travis is a mess right now .... Those things will b determined over the next couple of days ....',
 "Of course honey .... I'm helping Travis with some arrangements & I'll let you know when I'm leaving here.... It shouldn't be much longer....",
 'Evan call me plz',
 'Thanks for sharing!!! It looks like you to are great pals!!!!',
 "I also saw Lindsay little belly!!!! Love it! Lindsay you look beautiful! I just can't wait to see your little girl!!!",
 "If it's going to b this cold, I want snow!!!!",
 "I'm 

In [None]:

# dataset = dataset.train_test_split(test_size=0.05) # optional

In [6]:
# Using meta-llama/Meta-Llama-3-8B
from transformers import AutoTokenizer
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
def tokenize(example):
        return tokenizer(example(['text'], truncation=True, padding='max_length', max_length=512))



OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3-8B.
403 Client Error. (Request ID: Root=1-688d0462-431291ac2c5e1593766caeaf;4a098857-3c75-49b0-a5e8-4cf1b737be79)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Meta-Llama-3-8B to ask for access.

In [None]:
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)

### Full Example Pipeline: 
Vector Store, QLoRA Weights, HRLF, Custom Style Classifier 

              ┌────────────────────┐
              │ Base LLM (LLaMA 3) │
              └────────┬───────────┘
                       │
         ┌─────────────▼────────────┐
         │  QLoRA Adapter Loader    │ ←─ Avatar ("Mom", "Friend", etc.)
         └─────────────┬────────────┘
                       │
         ┌─────────────▼─────────────┐
         │ LangChain Memory Manager  │
         └─────────────┬─────────────┘
                       │
              ┌────────▼─────────┐
              │  Prompt Builder  │ ←─ Style + Episodic + Semantic memory
              └────────┬─────────┘
                       │
              ┌────────▼──────────┐
              │     LLM Output    │
              └───────────────────┘


In [3]:
from datasets import Dataset
import json
from glob import glob

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load Ground Truth Dataset
data_dir = "../data/prompt_response/"
data_file_l = glob(data_dir + "*.json")
data_file_l[0]
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)

In [5]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings

embedding_model = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed your dataset
documents = [
    {"page_content": s, "metadata": {"source": f"statement_{i}"}}
    for i, s in enumerate(data['text'])
]


  embedding_model = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
2025-08-01 20:37:11.363990: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-01 20:37:11.562923: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754095031.630251  180201 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754095031.648796  180201 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754095031.791187  180201 computa

In [6]:
documents[0]["page_content"]

'Where r u?'

In [7]:
# Create vector store
vectorstore = Chroma.from_documents(documents, embedding=embedding_model, persist_directory="./chroma_db")

AttributeError: 'dict' object has no attribute 'page_content'

### Using QLoRA with LangChain


In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from peft import get_peft_model, LoraConfig, TaskType
import torch

from glob import glob
import json

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Enable 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",  # or torch.float32 if you have more memory
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # Choose from "nf4" or "fp4"
)

# Load tokenizer and quantized model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Example message
messages = [
    {"role": "user", "content": "Who are you?"},
]

# Tokenize with chat template (for LLaMA-3-style formatting)
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    tokenize=False,
)

inputs = tokenizer(inputs, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate a response
outputs = model.generate(**inputs, max_new_tokens=40)

# Decode and print the generated text
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/2 [00:03<?, ?it/s]


KeyboardInterrupt: 

### Instructions for QLoRA Fine-tuning, Vector Stores, & Custom Style Classifiers https://chatgpt.com/share/688ae919-ee40-8011-ab40-2b52a0c3db06

In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
# from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
from sentence_transformers import SentenceTransformer, util
import chromadb
from chromadb.config import Settings
import numpy as np
from typing import List, Dict

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Base model and tokenizer (QLoRA-ready model)
BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
ADAPTER = "meta-llama/Llama-3.2-3B-Instruct-QLORA_INT4_EO8"

PEFT_DIR = "./qlora_adapter"  # Directory to save/load adapter weights
VECTORSTORE_DIR = "./chroma_db"

# Sentence embedding model for vector store and evaluation
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Setup Chroma client and collection
from chromadb import PersistentClient

client = PersistentClient(path=VECTORSTORE_DIR)
collection = client.get_or_create_collection(name="style_memory")



  from .autonotebook import tqdm as notebook_tqdm
2025-08-02 15:47:26.359792: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-02 15:47:26.523365: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754164046.583779  214874 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754164046.601112  214874 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754164046.735594  214874 computation_placer.cc:177] computation placer already r

In [16]:
del model, tokenizer

In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel

BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
PEFT_DIR = "./qlora_adapter/checkpoint-18/"
device = "cuda" if torch.cuda.is_available() else "cpu"
LOAD_MODEL = True

def load_model_and_tokenizer_for_qlora():
    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token
    # 4-bit quantization config for QLoRA
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load model in 4-bit for QLoRA
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto"
    )

    # If adapter weights already exist, load them
    if LOAD_MODEL:
        print(f"Loading existing LoRA adapter from {PEFT_DIR}")
        # Load the fine-tuned QLoRA adapter weights with PeftModel.from_pretrained
        model = PeftModel.from_pretrained(model, PEFT_DIR)
    else:
        print("Preparing Model for Training")
        # Prepare for k-bit training (adds norm casting, disables gradients on frozen parts, etc.)
        model = prepare_model_for_kbit_training(model)

        # LoRA adapter configuration (adapt r, alpha, target_modules as needed)
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        # Wrap the model with PEFT
        model = get_peft_model(model, lora_config)

    model.to(device)

    model.print_trainable_parameters()
    return model, tokenizer

model, tokenizer = load_model_and_tokenizer_for_qlora()

Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.00s/it]


Loading existing LoRA adapter from ./qlora_adapter/checkpoint-18/
trainable params: 0 || all params: 3,215,043,584 || trainable%: 0.0000


### Training QLoRA

In [3]:
from datasets import Dataset
import json
from glob import glob

data_dir = "../data/prompt_response/"
data_file_l = glob(data_dir + "*.json")
data_file_l[0]
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)
dataset = Dataset.from_dict(data)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

def group_texts(examples):
    # labels are input_ids with padding tokens masked as -100 to ignore in loss
    labels = []
    for input_ids in examples["input_ids"]:
        label = input_ids.copy()
        # Mask padding tokens
        label = [-100 if token == tokenizer.pad_token_id else token for token in label]
        labels.append(label)
    examples["labels"] = labels
    return examples

tokenized_dataset = tokenized_dataset.map(group_texts, batched=True)

split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split['train']
eval_dataset = split['test']

Map: 100%|██████████| 46/46 [00:00<00:00, 1722.63 examples/s]
Map: 100%|██████████| 46/46 [00:00<00:00, 2353.96 examples/s]


In [4]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./qlora_adapter",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_steps=100,
    save_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    num_train_epochs=3,
    save_total_limit=2,
    bf16=True if torch.cuda.is_bf16_supported() else False,
    gradient_checkpointing=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [5]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

### Testing the qlora adapter manually

In [None]:
# # Input prompt
# input_text = "Q:  \nA:"
# inputs = tokenizer(input_text, return_tensors="pt", return_attention_mask=True).to(device)

# # Make sure model is in evaluation mode
# model.eval()

# # Disable gradient computation (important for inference speed and memory)
# with torch.no_grad():
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=50,
#         do_sample=False,  # Deterministic output
#         pad_token_id=tokenizer.eos_token_id,
#         eos_token_id=tokenizer.eos_token_id  # Stop at EOS
#     )

# # Decode generated token IDs into text
# generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# # Extract answer portion after 'A:'
# if "A:" in generated_text:
#     answer = generated_text.split("A:", 1)[1].strip()
# else:
#     answer = generated_text.strip()

# print("Generated answer:\n", answer)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated answer:
 B:  
C:  
D:  
E:  
F:  
G:  
H:  
I:  
J:  
K:  
L:  
M:  
N:  
O:  
P:  
Q:  
R


In [23]:
from glob import glob
import json
data_dir = "../data/prompt_response/"
data_file_l = glob(data_dir + "*.json")
data_file_l[0]
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)

In [24]:
embeddings = embedder.encode(data['text']).tolist()
ids = [f"doc_{i}" for i in range(len(data['text']))]

collection.add(documents=data['text'], embeddings=embeddings, ids=ids)

  return forward_call(*args, **kwargs)


In [7]:
def generate_with_context(user_input: str, top_k: int = 10, max_new_tokens: int = 50) -> str:
    # Embed query and retrieve from vector DB
    query_vec = embedder.encode(user_input).tolist()
    results = collection.query(query_embeddings=[query_vec], n_results=top_k)
    print(f"results: {results}")

    docs = results.get("documents", [[]])[0]
    if not docs:
        context = "No relevant context found."
    else:
        # Optionally truncate context length for model input token limits
        # Here we join and limit length (e.g., first 1000 chars)
        context = "\n".join(docs)
        context = context[:1000]

    # Improved prompt with explicit instruction and clear delimiters
    prompt = f"""You are the person in the contextual statements. Use the context to answer the question briefly and only once.

Context:
{context}

Q: {user_input}
A:"""

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(device)

    # Generate answer
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Set True to enable sampling
            # temperature=0.7,  # Uncomment if do_sample=True
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract text after last "A:" in case prompt or output has multiple
    answer = decoded.split("A:")[-1].strip()
    return answer

In [8]:
generate_with_context(user_input="I love you, Mom")

  return forward_call(*args, **kwargs)
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


results: {'ids': [['doc_28', 'doc_32', 'doc_4', 'doc_45', 'doc_18', 'doc_19', 'doc_21', 'doc_34', 'doc_17', 'doc_14']], 'embeddings': None, 'documents': [['I love YOU!!!', 'I love you too sweetie.... Thanks for your help!', 'Love you both ..... I am fine', 'You are a good person', 'I love you Evan Woods!', 'Good night', 'My Bubby', 'Thanks for bringing me to work tonight! Sweet dreams handsome!!! Love u bunches!!!!', 'Ditto!!!!', "If it's going to b this cold, I want snow!!!!"]], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None, None, None, None, None, None, None, None, None]], 'distances': [[0.7543279528617859, 1.1245026588439941, 1.166839838027954, 1.3305450677871704, 1.3633513450622559, 1.3723539113998413, 1.5090606212615967, 1.5399177074432373, 1.5411328077316284, 1.577136754989624]]}


'I love you Evan'