# Notebook Setup

In [50]:
!pip install onnxruntime >> NULL
!pip install -U torch >> NULL
!pip install -U sentence_transformers >> NULL
!pip install -q -U einops tiktoken accelerate peft bitsandbytes transformers >> NULL

print("Completed setup")

Completed setup


# LLM Inference

In [51]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

# Specify the Hugging Face Hub id of the chat model to load.
alias = "NousResearch/Llama-2-7b-chat-hf"

# Load the tokenizer that matches the model checkpoint.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# checkpoint to run locally -- confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(alias, trust_remote_code=True)

# Optional 4-bit quantization config (currently disabled). To enable it,
# uncomment this block and the `quantization_config` argument below.
# quant_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    bnb_4bit_quant_type="nf4",
#    bnb_4bit_use_double_quant=True,
#    #bnb_4bit_compute_dtype=torch.bfloat16
# )

# Load the model weights. No device or device map is passed here, so placement
# is left to the library default -- TODO confirm the target device before
# running generation on large inputs.
model = AutoModelForCausalLM.from_pretrained(
    alias,
    trust_remote_code=True,
    torch_dtype="auto",
    #quantization_config=quant_config
)

In [None]:
from string import Template

# Llama-2 chat format: the system prompt lives between <<SYS>> and <</SYS>>,
# and the user message follows it inside the same [INST] ... [/INST] span.
# The BOS token (<s>) is deliberately not written here because the tokenizer
# is expected to prepend it on its own; writing it again would duplicate it.
prompt_template = Template(
    "[INST] <<SYS>>\n"
    "You are a helpful chatbot.\n"
    "Answer the provided question. Be concise and clear in your response.\n"
    "<</SYS>>\n\n"
    "$question [/INST]"
)

# Named `question` rather than `input` so the builtin `input()` is not shadowed.
question = "Mary has 10 apples. She give 3 to John and 1 to Bob. She throws away 1 more apple. How many apples does Mary have left?"

prompt = prompt_template.substitute({"question": question})

# Keep the input tensors on the same device as the model weights.
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)

# `output` holds raw token ids; the next cell decodes them into text.
output = model.generate(**encoded_prompt, max_new_tokens=150)
print(output)

In [None]:
# Convert the generated token ids of the first (only) sequence back to text,
# leaving out special tokens.
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

## Chain-of-Thought (CoT) Prompting

In [None]:
# Chain-of-thought variant of the Llama-2 chat prompt. The system prompt
# (between <<SYS>> and <</SYS>>) carries the step-by-step instruction, and the
# user question follows it inside the same [INST] ... [/INST] span.
# The BOS token (<s>) is deliberately not written here because the tokenizer
# is expected to prepend it on its own; writing it again would duplicate it.
cot_template = Template(
    "[INST] <<SYS>>\n"
    "You are a helpful chatbot.\n"
    "Answer the provided question. Let's think step-by-step. "
    "Provide your reasoning steps and then answer the question.\n"
    "Be concise and clear in your response.\n"
    "<</SYS>>\n\n"
    "$question [/INST]"
)

# Named `question` rather than `input` so the builtin `input()` is not shadowed.
question = "Mary has 10 apples. She give 3 to John and 1 to Bob. She throws away 1 more apple. How many apples does Mary have left?"

prompt = cot_template.substitute({"question": question})

# Keep the input tensors on the same device as the model weights.
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(**encoded_prompt, max_new_tokens=150)
print(tokenizer.decode(output[0], skip_special_tokens=True))

# RAG Pipeline

## Semantic Similarity Basics

In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sentence-embedding model used for every similarity computation in this cell.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

s1 = "Cats are super cool."
s2 = "Cats are awesome."
s3 = "I like felines."
s4 = "Centipedes are terrifying."

# Each embedding is reshaped to a single row (1, dim) because
# cosine_similarity works on 2-D arrays of row vectors.
s1_embed = embedder.encode(s1).reshape(1, -1)
s2_embed = embedder.encode(s2).reshape(1, -1)
s3_embed = embedder.encode(s3).reshape(1, -1)
s4_embed = embedder.encode(s4).reshape(1, -1)

# Sentence-to-sentence similarity: compare s1 against each other sentence.
for label, other_embed in (("s2", s2_embed), ("s3", s3_embed), ("s4", s4_embed)):
    print(f"Cosine similarity between s1 and {label}: {cosine_similarity(s1_embed, other_embed)}")

# Semantic similarity for retrieval: score every sentence against a query.
query = "What is super cool?"
query_embed = embedder.encode(query).reshape(1, -1)

for label, sentence_embed in (
    ("s1", s1_embed),
    ("s2", s2_embed),
    ("s3", s3_embed),
    ("s4", s4_embed),
):
    print(f"{label} relevance for query: {cosine_similarity(query_embed, sentence_embed)}")

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


Cosine similarity between s1 and s: [[0.8596046]]
Cosine similarity between s1 and s3: [[0.5692848]]
Cosine similarity between s1 and s4: [[0.3089841]]
s1 relevance for query: [[0.5085266]]
s2 relevance for query: [[0.26878813]]
s3 relevance for query: [[0.17692326]]
s4 relevance for query: [[0.14233847]]


## Simple Dense Passage Retrieval

In [11]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List

class SimpleVectorDB:
  """
  Minimal in-memory dense retriever.

  Documents are embedded once at construction time; queries are embedded on
  demand and ranked against the stored document embeddings by cosine
  similarity.
  """

  def __init__(self, documents: List[str], embedder_alias: str):
    """
    Build the index.

    Args:
      documents: Texts that make up the knowledge base.
      embedder_alias: Model name passed to SentenceTransformer.
    """
    # Copy the input so that later mutation of the caller's list cannot
    # desynchronise document positions from the rows of the embedding matrix.
    self.documents = list(documents)

    # Build Index
    # a. Define the embedder
    self.embedder = SentenceTransformer(embedder_alias)

    # b. Embed the documents (one row per document). An empty corpus has
    # nothing to embed; fetch_knowledge short-circuits in that case.
    self.kb = self.embedder.encode(self.documents) if self.documents else None


  def fetch_knowledge(self, query: str, n_results: int = 1) -> List[str]:
    """
    Retrieve the documents most semantically relevant to a query.

    Relevance is the cosine similarity between the query embedding and each
    document embedding.

    Args:
      query: The user query.
      n_results: Maximum number of documents to return.

    Returns:
      Up to n_results documents, ordered from most to least similar. Empty
      when n_results is 0 or the knowledge base holds no documents.

    Raises:
      ValueError: If n_results is negative.
    """
    if n_results < 0:
      raise ValueError(f"n_results must be non-negative, got {n_results}")
    if n_results == 0 or not self.documents:
      return []

    # 1. Embed the query as a single row vector
    query_embed = self.embedder.encode(query).reshape(1, -1)

    # 2. Compute similarity between the query and every document embedding;
    # the result has one row (for the single query), so take row 0.
    sim_scores = cosine_similarity(query_embed, self.kb)[0]

    # 3. Rank document ids from most to least similar and keep the top n.
    # argsort is ascending, hence the reversal.
    top_doc_ids = np.argsort(sim_scores)[::-1][:n_results]

    # 4. Map the ranked ids back to the document texts
    return [self.documents[i] for i in top_doc_ids]



In [None]:
# Evaluate Retrieval

# Toy knowledge base: a few facts about three animals.
documents = [
    "The giraffe has 5 spots and 100 stripes.",
    "The giraffe has blue eyes.",
    "Giraffes have 4 legs.",
    "The cat has 6 spots and 200 stripes.",
    "The cat has green eyes.",
    "Cats have 4 legs and a tail.",
    "Penguins have no spots and no stripes.",
    "Penguins have 2 legs.",
    "The penguin has emerald eyes"
]

# Initialize the knowledge base
kb = SimpleVectorDB(documents, "all-MiniLM-L6-v2")

# Query 1: the single best match should be the giraffe/blue-eyes fact.
q1 = "Which animal has blue eyes?"
expected_document = "The giraffe has blue eyes."

# Retrieve once and reuse the result for both the display and the check.
q1_docs = kb.fetch_knowledge(q1)

print(q1)
print("Top documents: ", q1_docs)
print(f"Check: {q1_docs[0] == expected_document}")


# Query 2: the top three matches should be exactly the three "legs" facts,
# in any order.
q2 = "Which animals have at least 2 legs?"
expected_documents = ['Penguins have 2 legs.', 'Giraffes have 4 legs.', 'Cats have 4 legs and a tail.']

q2_docs = kb.fetch_knowledge(q2, 3)

print(q2)
print("Top documents: ", q2_docs)
print(f"Check: {len(set(q2_docs).intersection(expected_documents)) == len(expected_documents)}")


# Retrieval + Generation

In [16]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load FLAN-T5 for the generation step. This rebinds `tokenizer` and `model`,
# so cells below this point use the T5 pair.
t5_alias = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(t5_alias)
model = T5ForConditionalGeneration.from_pretrained(t5_alias, device_map="auto")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from string import Template

# 1. Retrieve the top documents
q1 = "Which animal has blue eyes?"
docs = kb.fetch_knowledge(q1, 3)

# 2. Construct prompt with in-context information
prompt_template = Template(
"""
Answer the provided question below using the provided context.
Context: $context
Question: $question
"""
)

# 3. Build the context string by joining the documents with a \n separator
context = "\n".join(docs)
prompt = prompt_template.substitute({"context": context, "question": q1})

print("prompt")
print(prompt)

print('----------')
# 4. Prompt model
# The model was loaded with device_map="auto", so it may not live on the CPU;
# move the input tensors to the model's device before generating.
encoded_input = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**encoded_input, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))