# Fine-tuning LLMs on personal data

In [32]:
from glob import glob
import json

In [33]:
data_dir = "../data/prompt_response/"
data_file_l = glob(data_dir + "*.json")
data_file_l[0]
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)

In [34]:
data['text']

['Where r u?',
 'Where r u?',
 "How much was lowes? I'll reimburse you if I can....",
 "Ms Melissa just passed!!!! I bet she's bitching God out for taking her away from her family, about now!",
 'Love you both ..... I am fine',
 "It's ok now ... everything is ok now ... she's no longer struggling and no longer in pain ....",
 "It's ok now ... everything is ok now ... she's no longer struggling and no longer in pain ....",
 'Me too Evan ... me too',
 'Me too Evan ... me too',
 'Yes .... Travis is a mess right now .... Those things will b determined over the next couple of days ....',
 "Of course honey .... I'm helping Travis with some arrangements & I'll let you know when I'm leaving here.... It shouldn't be much longer....",
 'Evan call me plz',
 'Thanks for sharing!!! It looks like you to are great pals!!!!',
 "I also saw Lindsay little belly!!!! Love it! Lindsay you look beautiful! I just can't wait to see your little girl!!!",
 "If it's going to b this cold, I want snow!!!!",
 "I'm 

## [Tutorial Link ](https://learn.deeplearning.ai/courses/finetuning-large-language-models/lesson/vl60i/training-process)

### Data preparation


In [11]:
import pandas as pd
import datasets
from pprint import pprint
from transformers import AutoTokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
text = "Hi, how are you?"

In [14]:
encoded_text = tokenizer(text)["input_ids"]

In [15]:
decoded_text = tokenizer.decode(encoded_text)

In [16]:
decoded_text

'Hi, how are you?'

In [17]:
# Tokenize multiple texts at once
encoded_texts = tokenizer(data['text'])

In [18]:
print("Encoded several texts: ", encoded_texts)

Encoded several texts:  {'input_ids': [[7161, 391, 1484, 32], [7161, 391, 1484, 32], [2347, 1199, 369, 1698, 265, 32, 309, 1833, 27533, 368, 604, 309, 476, 2391], [12822, 38994, 816, 4817, 18963, 309, 701, 703, 434, 2372, 7695, 2656, 562, 323, 3192, 617, 1977, 432, 617, 2021, 13, 670, 1024, 2], [23337, 368, 1097, 10712, 1051, 309, 717, 4030], [1147, 434, 8718, 1024, 3346, 3253, 310, 8718, 1024, 3346, 703, 434, 642, 3356, 15586, 285, 642, 3356, 275, 3075, 22833], [1147, 434, 8718, 1024, 3346, 3253, 310, 8718, 1024, 3346, 703, 434, 642, 3356, 15586, 285, 642, 3356, 275, 3075, 22833], [5072, 1512, 37144, 3346, 479, 1512], [5072, 1512, 37144, 3346, 479, 1512], [4374, 22833, 35382, 310, 247, 4840, 987, 1024, 22833, 9240, 1841, 588, 270, 3413, 689, 253, 1735, 4564, 273, 1897, 22833], [4527, 2282, 14795, 22833, 309, 1353, 9073, 35382, 342, 690, 16669, 708, 309, 1833, 1339, 368, 871, 672, 309, 1353, 6108, 1060, 2391, 733, 10095, 626, 320, 1199, 3356, 2391], [38, 6148, 1067, 479, 499, 91], [806

### Padding and truncation

In [36]:
tokenizer.pad_token = tokenizer.eos_token
encoded_texts_longest = tokenizer(data['text'], max_length=3, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])

Using padding:  [[7161, 391, 1484, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [7161, 391, 1484, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2347, 1199, 369, 1698, 265, 32, 309, 1833, 27533, 368, 604, 309, 476, 2391, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [12822, 38994, 816, 4817, 18963, 309, 701, 703, 434, 2372, 7695, 2656, 562, 323, 3192, 617, 1977, 432, 617, 2021, 13, 670, 1024, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [23337, 368, 1097, 10712, 1051, 309, 717, 4030, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1147, 434, 8718, 1024, 3346, 3253, 310, 8718, 1024, 3346, 703, 434, 642, 3356, 15586, 285, 642, 3356, 275, 3075, 22833, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1147, 434, 8718, 1024, 



In [37]:
encoded_texts_truncation = tokenizer(data['text'], max_length=3, truncation=True)
print("Using truncation: ", encoded_texts_truncation["input_ids"])

Using truncation:  [[7161, 391, 1484], [7161, 391, 1484], [2347, 1199, 369], [12822, 38994, 816], [23337, 368, 1097], [1147, 434, 8718], [1147, 434, 8718], [5072, 1512, 37144], [5072, 1512, 37144], [4374, 22833, 35382], [4527, 2282, 14795], [38, 6148, 1067], [8061, 323, 9628], [42, 671, 3047], [2042, 352, 434], [42, 1353, 9745], [1147, 434, 1512], [37, 35570, 18963], [42, 2389, 368], [8620, 2360], [30488, 2391, 5178], [3220, 378, 538], [4943, 2, 2263], [6723, 368, 22238], [42, 1353, 387], [7161, 651, 368], [6300, 247, 1652], [4045, 368, 1158], [42, 2389, 8702], [8262, 29517, 2], [42, 452, 281], [42, 1353, 13590], [42, 2389, 368], [2598, 2080, 594], [8061, 323, 9745], [20127, 22833, 309], [2374, 619, 1039], [2302, 3346, 1339], [4497, 309, 1353], [42, 689, 21567], [42, 1353, 1060], [8398, 368, 37144], [8398, 368, 37144], [1276, 25561, 1359], [42, 1353, 1469], [1394, 403, 247]]


In [38]:
tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(data["text"], max_length=3, truncation=True)
print("Uing left-side truncation: ", encoded_texts_truncation_left["input_ids"])


Uing left-side truncation:  [[391, 1484, 32], [391, 1484, 32], [309, 476, 2391], [670, 1024, 2], [309, 717, 4030], [275, 3075, 22833], [275, 3075, 22833], [3346, 479, 1512], [3346, 479, 1512], [273, 1897, 22833], [1199, 3356, 2391], [479, 499, 91], [1270, 39839, 18963], [1652, 3226, 15844], [971, 8762, 18963], [12561, 1728, 2], [714, 360, 2], [37, 35570, 18963], [37144, 26496, 2], [8620, 2360], [285, 6858, 1795], [378, 538, 1615], [2218, 49042, 221], [368, 22238, 25561], [15844, 49042, 114], [427, 8662, 33186], [572, 1143, 22833], [387, 898, 312], [2389, 8702, 15844], [479, 1728, 2391], [871, 1024, 22833], [281, 253, 1113], [634, 1361, 2], [594, 1175, 2], [34258, 2706, 18963], [2568, 22833, 26070], [2374, 619, 1039], [7004, 745, 15844], [23483, 3063, 2], [530, 6110, 32], [42, 1353, 1060], [368, 37144, 15844], [368, 37144, 15844], [25561, 1359, 25561], [5057, 10141, 562], [247, 1175, 1436]]


In [39]:
encoded_texts_both = tokenizer(data["text"], max_length=3, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])

Using both padding and truncation:  [[391, 1484, 32], [391, 1484, 32], [309, 476, 2391], [670, 1024, 2], [309, 717, 4030], [275, 3075, 22833], [275, 3075, 22833], [3346, 479, 1512], [3346, 479, 1512], [273, 1897, 22833], [1199, 3356, 2391], [479, 499, 91], [1270, 39839, 18963], [1652, 3226, 15844], [971, 8762, 18963], [12561, 1728, 2], [714, 360, 2], [37, 35570, 18963], [37144, 26496, 2], [8620, 2360, 0], [285, 6858, 1795], [378, 538, 1615], [2218, 49042, 221], [368, 22238, 25561], [15844, 49042, 114], [427, 8662, 33186], [572, 1143, 22833], [387, 898, 312], [2389, 8702, 15844], [479, 1728, 2391], [871, 1024, 22833], [281, 253, 1113], [634, 1361, 2], [594, 1175, 2], [34258, 2706, 18963], [2568, 22833, 26070], [2374, 619, 1039], [7004, 745, 15844], [23483, 3063, 2], [530, 6110, 32], [42, 1353, 1060], [368, 37144, 15844], [368, 37144, 15844], [25561, 1359, 25561], [5057, 10141, 562], [247, 1175, 1436]]


### Generate Question Answer Pairs

In [50]:
from transformers import pipeline

# Load the FLAN-T5 generator
generator = pipeline("text2text-generation", model="google/flan-t5-base", max_length=256, do_sample=True, top_p=0.95)

# Your source text
raw_text = data["text"][0].strip()

# Better Prompt Engineering
question_prompt = f"""Given the following passage, generate a detailed, insightful, and specific question that tests comprehension:

Passage:
\"\"\"{raw_text}\"\"\"

Question:"""

# Generate a question
question_output = generator(question_prompt, max_length=100, num_return_sequences=1)[0]
question = question_output["generated_text"].strip()

# Better Answer Prompt
answer_prompt = f"""Given the following passage and question, provide an accurate and complete answer strictly based on the passage content.

Passage:
\"\"\"{raw_text}\"\"\"

Question:
{question}

Answer:"""

# Generate an answer
answer_output = generator(answer_prompt, max_length=150, num_return_sequences=1)[0]
answer = answer_output["generated_text"].strip()

# Print results
print("Generated Question:")
print(question)
print("\nGenerated Answer:")
print(answer)




Generated Question:
Which is the title of the passage?

Generated Answer:
Where r U?


In [107]:
data["text"][-1]

'You are a good person'

### Prepare instruction dataset

In [42]:
import pandas as pd

# filename = "lamini_docs.jsonl"
filename = data_file_l[0]
instruction_dataset_df = pd.read_json(filename, lines=False)
examples = instruction_dataset_df.to_dict()


In [None]:

if "question" in examples and "answer" in examples:
  text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
  text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
  text = examples["input"][0] + examples["output"][0]
else:
  text = examples["text"][0]

prompt_template = """### Question:
{question}

### Answer:"""

num_examples = len(examples["question"])
finetuning_dataset = []
for i in range(num_examples):
  question = examples["question"][i]
  answer = examples["answer"][i]
  text_with_prompt_template = prompt_template.format(question=question)
  finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})

from pprint import pprint
print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

SyntaxError: incomplete input (706917710.py, line 10)

### Creating a HuggingFace Dataset

In [29]:
from datasets import Dataset
import json
from glob import glob

data_dir = "../data/prompt_response/"
data_file_l = glob(data_dir + "*.json")
data_file_l[0]
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)

In [30]:
dataset = Dataset.from_dict(data)
# dataset = dataset.train_test_split(test_size=0.05) # optional

In [None]:
# Using meta-llama/Meta-Llama-3-8B
from transformers import AutoTokenizer
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
def tokenize(example):
        return tokenizer(example(['text'], truncation=True, padding='max_length', max_length=512))





OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3-8B.
403 Client Error. (Request ID: Root=1-688ac855-1afcbdb465e81f182a474df3;2118dbad-8d8a-4c7a-9438-f4d3d38c2465)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Meta-Llama-3-8B to ask for access.

In [None]:
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)

### Full Example Pipeline: 
Vector Store, QLoRA Weights, HRLF, Custom Style Classifier 

              ┌────────────────────┐
              │ Base LLM (LLaMA 3) │
              └────────┬───────────┘
                       │
         ┌─────────────▼────────────┐
         │  QLoRA Adapter Loader    │ ←─ Avatar ("Mom", "Friend", etc.)
         └─────────────┬────────────┘
                       │
         ┌─────────────▼─────────────┐
         │ LangChain Memory Manager  │
         └─────────────┬─────────────┘
                       │
              ┌────────▼─────────┐
              │  Prompt Builder  │ ←─ Style + Episodic + Semantic memory
              └────────┬─────────┘
                       │
              ┌────────▼──────────┐
              │     LLM Output    │
              └───────────────────┘


In [35]:
from datasets import Dataset
import json
from glob import glob

In [37]:
# Load Ground Truth Dataset
data_dir = "../data/prompt_response/"
data_file_l = glob(data_dir + "*.json")
data_file_l[0]
with open(data_file_l[0], 'rb') as f:
    data = json.load(f)

In [41]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings

embedding_model = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed your dataset
documents = [
    {"page_content": s, "metadata": {"source": f"statement_{i}"}}
    for i, s in enumerate(data['text'])
]


In [45]:
documents[0]["page_content"]

'Where r u?'

In [46]:
# Create vector store
vectorstore = Chroma.from_documents(documents, embedding=embedding_model, persist_directory="./chroma_db")

AttributeError: 'dict' object has no attribute 'page_content'

### Using QLoRA with LangChain


In [49]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from peft import get_peft_model, LoraConfig, TaskType
import torch

model =  AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct-QLORA_INT4_EO8")




OSError: meta-llama/Llama-3.2-3B-Instruct-QLORA_INT4_EO8 does not appear to have a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack.

### Instructions for QLoRA Fine-tuning, Vector Stores, & Custom Style Classifiers https://chatgpt.com/share/688ae919-ee40-8011-ab40-2b52a0c3db06

In [1]:
model_id = "meta-llama/Llama-3.2-3B-Instruct-QLORA_INT4_EO8"

tokenizer = AutoTokenizer.from_pretrained(model_id)

NameError: name 'AutoTokenizer' is not defined