# Notebook Setup

In [None]:
!pip install onnxruntime >> NULL
!pip install -U torch >> NULL
!pip install -U sentence_transformers >> NULL
!pip install -q -U einops tiktoken accelerate peft bitsandbytes transformers >> NULL

print("Completed setup")

# LLM Inference

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

# Speciy model alias for HF
alias = "NousResearch/Llama-2-7b-chat-hf"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(alias, trust_remote_code=True)

# Quantization Config
quant_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=False,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# # Load Model
model = AutoModelForCausalLM.from_pretrained(
    alias,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype="auto",
    quantization_config=quant_config
)

In [None]:
from string import Template

prompt_template = Template(
    """
    <s>[INST] <<SYS>>
    You are a helpful chatbot.
    <</SYS>>
    Answer the provided question. Be concise and clear in your response.
    $input
    [/INST]
    """
)

input = "Mary has 10 apples. She give 3 to John and 1 to Bob. She threw away 1 more apple. How many apples does Mary have left?"

prompt = prompt_template.substitute({"input": input})

encoded_prompt = tokenizer(prompt, return_tensors="pt")
# encoded_prompt = {k: v.to("cuda") for k,v in encoded_prompt.items()}

output = model.generate(**encoded_prompt, max_new_tokens=150)
# print(output)

In [None]:
model

## COT style prompting

In [None]:
cot_template = Template(
  """
  <s>[INST] <<SYS>>
  You are a helpful chatbot.
  $input
  <</SYS>>
  Answer the provided question. Let's think step by step. Plesse provide an step-by-step explanation and then answer the question.
  Be concise and clear in your response.
  [/INST]
  """
)

input = "Mary has 10 apples. She give 3 to John and 1 to Bob. She threw away 1 more apple. How many apples does Mary have left?"

prompt = cot_template.substitute({"input": input})
encoded_prompt = tokenizer(prompt, return_tensors="pt")

output = model.generate(**encoded_prompt, max_new_tokens=150)
print(tokenizer.decode(output[0], skip_special_tokens=True))

# RAG Pipeline

## Semantic Similarity Basics

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")

s1 = "Cats are super cool."
s2 = "Cats are awesome."
s3 = "I like felines."
s4 = "Centipedes are terrifying."

s1_embed = embedder.encode(s1).reshape(1,-1)
s2_embed = embedder.encode(s2).reshape(1, -1)
s3_embed = embedder.encode(s3).reshape(1, -1)
s4_embed = embedder.encode(s4).reshape(1, -1)


print(f"Cosine similarity between s1 and s: {cosine_similarity(s1_embed, s2_embed)}")
print(f"Cosine similarity between s1 and s3: {cosine_similarity(s1_embed, s3_embed)}")
print(f"Cosine similarity between s1 and s4: {cosine_similarity(s1_embed, s4_embed)}")

# Semantic similarity for retrieval
query = "What is super cool?"
query_embed = embedder.encode(query).reshape(1, -1)

print(f"s1 relevance for query: {cosine_similarity(query_embed, s1_embed)}")
print(f"s2 relevance for query: {cosine_similarity(query_embed, s2_embed)}")
print(f"s3 relevance for query: {cosine_similarity(query_embed, s3_embed)}")
print(f"s4 relevance for query: {cosine_similarity(query_embed, s4_embed)}")

## Simple Dense Passage Retrieval

In [None]:
import numpy as np
documents = [
    "The giraffe has 5 spots and 100 stripes.",
    "The giraffe has blue eyes.",
    "Giraffes have 4 legs.",
    "The cat has 6 spots and 200 stripes.",
    "The cat as green eyes.",
    "Cats have 4 legs and a tail.",
    "Penguins have no spots and no stripes.",
    "Penguins have 2 legs.",
    "The penguin has emerald eyes"
]


embedder = SentenceTransformer("all-MiniLM-L6-v2")

query = "Which animal has blue eyes?"
# 1. Compute similarity between query and doc embeddings
query_embed = embedder.encode(query)

# 2. Sort documents based on cosine similarity score, order list from
# most similar to least
idx = embedder.encode(documents)

# 3. Retrieve top n documents based on sorted document id
sim_scores = cosine_similarity(query_embed.reshape(1, -1), idx)
sorted_doc_ids = np.argsort(sim_scores)[0][::-1]

# 4. Get documents
results = [documents[i] for i in sorted_doc_ids][0]


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List

class SimpleVectorDB:

  def __init__(self, documents: List, embedder_alias: str):
    self.documents = documents

    # Build Index
    # a. define the embedder
    self.embedder = SentenceTransformer(embedder_alias)

    # b. Embed the documents
    self.kb = self.embedder.encode(documents)


  def fetch_knowledge(self, query: str, n_results: int = 1) -> List[str]:
    """
    Given a user query, retrieve the most relevant document from KB. Retrieval
    should be based on the document which is the most semantic relevant to the
    query as measured by cosine similarity. Return the top n_results.
    """

    # 1. Embed the query
    query_embed = self.embedder.encode(query).reshape(1,-1)

    # 2. Compute similarity between query and docs embeddings
    sim_scores = cosine_similarity(query_embed, self.kb)

    # 3. Sort documents based on cosine similarity score, order list from
    # most similar to least
    sorted_doc_ids = np.argsort(sim_scores)[0][::-1]

    # 4. Retrieve top n documents based on sorted document id
    final_docs = [ self.documents[i] for i in sorted_doc_ids ][:n_results]

    return final_docs



In [None]:
# Evaluate Retrieval
documents = [
    "The giraffe has 5 spots and 100 stripes.",
    "The giraffe has blue eyes.",
    "Giraffes have 4 legs.",
    "The cat has 6 spots and 200 stripes.",
    "The cat as green eyes.",
    "Cats have 4 legs and a tail.",
    "Penguins have no spots and no stripes.",
    "Penguins have 2 legs.",
    "The penguin has emerald eyes"
]

# Intialize kb
kb = SimpleVectorDB(documents, "all-MiniLM-L6-v2")

# Sample queries
q1 = "Which animal has blue eyes?"
expected_document = "The giraffe has blue eyes."

print(q1)
print("Top documents: ", kb.fetch_knowledge(q1))
print(f"Check: {kb.fetch_knowledge(q1)[0] == expected_document}")


q2 = "Which animals have atleast 2 legs?"
expected_documents = ['Penguins have 2 legs.', 'Giraffes have 4 legs.', 'Cats have 4 legs and a tail.']
print(q2)
print("Top documents: ", kb.fetch_knowledge(q2, 3))
print(f"Check: { len(set(kb.fetch_knowledge(q2, 3)).intersection(expected_documents)) == 3 }")


# Retrieval + Generation

In [None]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
# model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto")
# prompt_template = Template(
# """
# Answer the provided question below using the provided context.
# Context: $context
# Question: $question
# """
# )


In [None]:
from string import Template

prompt_template = Template(
"""
<s>[INST] <<SYS>>
You are a helpful chatbot. 
Only answer question about animals. If the questions is not about animals, politely respond I can't answer that.
<</SYS>>
Answer the provided question using the provided context only. Be concise and clear in your response.
[/INST]

Context:
$context

Question:
$question
"""
)

# 1. Retrieve the top documents
q1 = "How many tires does a ford mustang have?"
docs =  kb.fetch_knowledge(q1,5)

# 2. Construct prompt with in-context information
#docs = "\n".join(docs)

docs = ""
prompt = prompt_template.substitute({"context": docs, "question": q1})
print('----------')

# 3. Prompt model
encoded_input = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**encoded_input, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# QLORA Finetuning

##  Prepare data

In [25]:
import pandas as pd 
from string import Template 
from datasets import Dataset
from string import Template 

data = pd.read_csv("data/copa-cleaned.csv")

instruction_template = Template(
"""
# Instructions
Answer the multiple-choice question below. Provide the answer text as the output. 

Example:
Ryan dropped a hammer on his foot. What was the effect?
a) he broke his toe 
b) Ryan scrached his head. 
Answer: 
he broke his toe

Input
$question
Answer:
"""
)


# Update the dataset input to match the instruction prompt template 
data["prompt_input"] = data["input"].apply(
    lambda x: instruction_template.substitute({"question" : x})
)

print(data.iloc[0]["prompt_input"])


# Instructions
Answer the multiple-choice question below. Provide the answer text as the output. 

Example:
Ryan dropped a hammer on his foot. What was the effect?
a) he broke his toe 
b) Ryan scrached his head. 
Answer: 
he broke his toe

Input
the couple was happy to see each other.
what was the effect?
(a) they rested
(b) they kissed
Answer:



In [26]:
# convert pandas dataframe to a dataset object
data = Dataset.from_pandas(data)

# Extract datasets by split
train = data.filter(lambda x: x["split"] == "train")
val = data.filter(lambda x: x["split"] == "val")
test = data.filter(lambda x: x["split"] == "test")

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

# Prepare Quantized Model for QLoRA Training.
1. Load quantized model
2. Convert to LoRA for PEFT training

In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, set_seed


# Speciy model alias for HF
alias = "google/flan-t5-xl"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(alias, trust_remote_code=True)

#Quantization Config
quant_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=False,
   
   # note for non-ampehere gpus (e.g. T4, V100, RTX) use float16 or leave empty
   bnb_4bit_compute_dtype=torch.bfloat16   
)

# Load Model
model = AutoModelForSeq2SeqLM.from_pretrained(
    alias,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype="auto",
    quantization_config=quant_config
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, 
    inference_mode=False, 
    target_modules=["q", "k", "v"],
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.5
)

model = get_peft_model(model, peft_config)

In [15]:
# Note that the trainable parameters are signifacntly smaller. We only training 24% of the model!
model.print_trainable_parameters()

trainable params: 7,077,888 || all params: 2,856,835,072 || trainable%: 0.2478


# Inference and Training
We prepare the models for inference and training in batch setting. First we configure a dataloader that can handle batched inputs.

In [16]:
from dataclasses import dataclass
from transformers import AutoTokenizer, BatchEncoding
from ast import literal_eval


# The collator is responsible for ensuring the generated batches have a fixed dimension as the 
#input will be tensor. 

@dataclass
class SimpleCollator:
    tokenizer: AutoTokenizer
    config: dict 
    
    def __call__(self, examples: list) -> dict:
        batch = BatchEncoding(
            {
                k: [examples[i][k] for i in range(len(examples))]
                for k, v in examples[0].items()
            }
        )

        encoded_inputs = self.tokenizer(
            batch[self.config["input_column"]], 
            max_length = 120, 
            padding=True, 
            truncation=True,
            return_tensors="pt"
        )

        encoded_targets = self.tokenizer(
            batch[self.config["output_column"]], max_length = 120, padding=True, truncation=True,
            return_tensors="pt"
        )
        encoded_inputs["labels"] = encoded_targets["input_ids"]

        return encoded_inputs

collator = SimpleCollator(tokenizer, {"input_column": "prompt_input", "output_column": "output"})

In [17]:
from torch.utils.data import DataLoader

# Prepare Dataloaders
train_dl = DataLoader(
    train, 
    batch_size=4,
    pin_memory=True,
    shuffle=False,
    collate_fn=collator
)

val_dl = DataLoader(
    val,
    batch_size=16,
    pin_memory=True,
    shuffle=True,
    collate_fn=collator
)

test_dl = DataLoader(
    test, 
    batch_size=16,
    pin_memory=True,
    shuffle=False,
    collate_fn=collator
)

## Zero-shot baseline
Let's first do a zero-shot inference pass for baseline reading

In [18]:
import tqdm.notebook as tqdm

all_preds = []
for batch in tqdm.tqdm(test_dl, total = len(test_dl)):
    
    preds = model.generate(**batch, max_new_tokens=25)
    outputs = tokenizer.batch_decode(preds, skip_special_tokens=True)
    all_preds.extend(outputs)

# Note the FlAN T5 model ignores our instruction format and procuces the letters for prediction
all_preds[:15]

  0%|          | 0/32 [00:00<?, ?it/s]



['(a)',
 '(a)',
 '(b)',
 '(a)',
 '(a)',
 '(b) she jumped rope',
 '(a)',
 '(a)',
 '(a)',
 '(b)',
 '(a)',
 '(b)',
 '(a)',
 '(a)',
 '(a)']

In [27]:
test_df= data.filter(lambda x: x["split"] == "test").to_pandas()
test_df["baseline_preds"] = all_preds

# Let's manually score each example
baseline_is_correct = []
for i, row in test_df.iterrows():
    
    if "a)" in row["baseline_preds"]:
        pred = 1
    else:
        pred = 2

    baseline_is_correct.append(pred == row["answer"])

test_df["baseline_is_correct"] = baseline_is_correct

print(f"Baseline accuracy: {test_df['baseline_is_correct'].mean()}")

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

## Model Finetuning
Note that the baseline accuracy is very high. Finetuning may actually make things worse!

For finetuning we ues the Pytorch lightning library which makes build finetuning loops very easy.

In [20]:
import lightning as pl
from torch.optim import AdamW 

class PeftCALMT5(pl.LightningModule):

    def __init__(self, model_alias: str, tokenizer_alias: str):

        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_alias)

        self.peft_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM, 
            inference_mode=False, 
            target_modules=["q", "k", "v"],
            r=8, 
            lora_alpha=32, 
            lora_dropout=0.5
        )

        model = AutoModelForSeq2SeqLM.from_pretrained(model_alias)
        self.model = get_peft_model(model, self.peft_config)
        

    def training_step(self, batch, batch_idx): 
        outputs = self.model.forward(**batch, return_dict=True)
        loss = outputs["loss"]  
        
        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=True)     
        return loss
    
    def validation_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        loss = outputs["loss"]  
        
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True) 
        
    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=5e-4)
        return optimizer


model = PeftCALMT5(model_alias="google/flan-t5-xl", tokenizer_alias="google/flan-t5-xl")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
trainer = pl.Trainer(
  max_epochs=1,
  devices=1, 
  accelerator="gpu",
  accumulate_grad_batches=3 # Note we accumlate batches to effective form larger training batches 
)

trainer.fit(model, train_dl, val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                  | Params
------------------------------------------------
0 | model | PeftModelForSeq2SeqLM | 2.9 B 
------------------------------------------------
7.1 M     Trainable params
2.8 B     Non-trainable params
2.9 B     Total params
11,427.340Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [23]:
ft_preds = []
for batch in tqdm.tqdm(test_dl, total = len(test_dl)):
    preds = model.model.generate(**batch, max_new_tokens=25)
    outputs = tokenizer.batch_decode(preds, skip_special_tokens=True)
    ft_preds.extend(outputs)
print(ft_preds[:10])

  0%|          | 0/32 [00:00<?, ?it/s]

['it was fragile', 'i retrieved a ticket stub', 'the termites ate through the wood in the house', 'the patrol agent checked their passports', 'it was a holiday', 'she jumped rope', 'more people entered the line', 'the baby drooled on her bib', 'the audience clapped along to the music', 'the girl brought the teacher an apple']


In [32]:
test_df["lora_pred"] = ft_preds
test_df["lora_is_correct"] = test_df.apply(lambda x: int(x['lora_pred'].lower().strip() == x["output"].lower().strip()), axis=1)
print(f"LoRA FT accuracy: {test_df['lora_is_correct'].mean()}")

LoRA FT accuracy: 0.952
