In [None]:
# Perform Google Colab installs (if running in Google Colab)
# The presence of the COLAB_GPU environment variable is used as the signal that
# this notebook is executing inside Colab; when it is absent nothing is installed.
# NOTE(review): later cells also import spacy, pandas and ydata_profiling, which
# are not installed here - confirm they are available in the target runtime.
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference

[INFO] Running in Google Colab, installing requirements.


# ***This one needs cleaning pipeline integration***

In [None]:
# Download PDF file
import os
import requests

# Location of the PDF this notebook processes.
# NOTE(review): the download code below is commented out and its URL names a
# different book, so this file is presumably uploaded to /content by hand - confirm.
pdf_path = "/content/Counseling-Children.pdf"

# # Download PDF if it doesn't already exist
# if not os.path.exists(pdf_path):
#   print("File doesn't exist, downloading...")

#   # The URL of the PDF you want to download
#   url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

#   # The local filename to save the downloaded file
#   filename = pdf_path

#   # Send a GET request to the URL
#   response = requests.get(url)

#   # Check if the request was successful
#   if response.status_code == 200:
#       # Open a file in binary write mode and save the content to it
#       with open(filename, "wb") as file:
#           file.write(response.content)
#       print(f"The file has been downloaded and saved as {filename}")
#   else:
#       print(f"Failed to download the file. Status code: {response.status_code}")
# else:
#   print(f"File {pdf_path} exists.")

In [None]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm

def text_formatter(text: str) -> str:
    """Flatten a block of extracted text onto a single line.

    Every newline character becomes one space, then leading and trailing
    whitespace is removed. Runs of spaces inside the text are left as they are.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space
    # (note: the right treatment may differ per document, best to experiment)
    single_line = " ".join(text.split("\n"))

    # Other potential text formatting steps can go here
    return single_line.strip()

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str, page_offset: int = 26) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.
        page_offset (int): Value subtracted from each zero-based PDF page index to
            produce the reported page number. Defaults to 26, the offset this
            notebook has always applied.

    Returns:
        list[dict]: One dictionary per page, in document order, with the keys
        "page_number" (zero-based index minus page_offset), "page_char_count",
        "page_word_count" (pieces obtained by splitting on single spaces),
        "page_sentence_count_raw" (pieces obtained by splitting on ". "),
        "page_token_count" (character count / 4) and "text" (the formatted page text).
        Because of how str.split works, a page with no text still reports a word
        count and a raw sentence count of 1.
    """
    pages_and_texts = []
    # Open as a context manager so the file is closed even if extraction fails part-way
    with fitz.open(pdf_path) as doc:
        # Wrap the document itself (not the enumerate object) so tqdm knows the page count
        # and can draw a real progress bar instead of a bare iteration counter
        for page_number, page in enumerate(tqdm(doc, total=len(doc))):
            text = page.get_text()  # get plain text encoded as UTF-8
            text = text_formatter(text)
            pages_and_texts.append({"page_number": page_number - page_offset,  # shift the PDF index to the reported page number
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts

# Extract every page of the PDF into a list of per-page records, then preview the first two
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -26,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -25,
  'page_char_count': 675,
  'page_word_count': 102,
  'page_sentence_count_raw': 8,
  'page_token_count': 168.75,
  'text': 'Australia • Brazil • Mexico • Singapore • United Kingdom • United States Counseling  Children Ninth EDITION Donna A. Henderson Wake Forest University Charles L. Thompson, late The University of Tennessee, Knoxville Copyright 2016 Cengage Learning. All Rights Reserved. May not be copied, scanned, or duplicated, in whole or in part. Due to electronic rights, some third party content may be suppressed from the eBook and/or eChapter(s). Editorial review has deemed that any suppressed content does not materially affect the overall learning experience. Cengage Learning reserves the right to remove additional content at any time if subsequent rights restrictions require it.'}]

In [None]:
import random

# Spot-check three randomly chosen page records (no seed is set, so the pages differ on every run)
random.sample(pages_and_texts, k=3)

[{'page_number': 664,
  'page_char_count': 4016,
  'page_word_count': 600,
  'page_sentence_count_raw': 30,
  'page_token_count': 1004.0,
  'text': 'Counseling Children with Special Concerns\x08 667 •  Sexual bullying •  Cyber bullying According to Olweus, young people bully because they have strong needs for  power and dominance, they enjoy causing injury and suffering, and they are re- warded thorough material or psychological rewards. The Olweus Bullying Prevention Program is designed to address bullying is- sues at the school, in the classroom, and with individuals, and it has had impressive  outcomes (Finn, 2014). The U.S. Department of Health and Human Services has the  Stop Bullying Now Web site (http://stopbullyingnow.hrsa.gov/adults/default.aspx)  with resources for students and adults who are concerned about bullying behav- iors and mental health information links at http://mentalhealth.samhsa.gov/15plus  /aboutbullying.asp. Bullying behaviors should be addressed by knowledge

In [None]:
import pandas as pd

# Load the per-page records into a DataFrame (one row per page) and preview the first rows
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-26,0,1,1,0.0,
1,-25,675,102,8,168.75,Australia • Brazil • Mexico • Singapore • Unit...
2,-24,1219,191,12,304.75,This is an electronic version of the print tex...
3,-23,2521,383,15,630.25,"© 2016, 2011 Cengage Learning ALL RIGHTS RESER..."
4,-22,1029,168,14,257.25,"Counseling Children, Ninth Edition, is dedicat..."


In [None]:
# Get summary statistics of the numeric per-page columns, rounded to 2 decimal places
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,754.0,754.0,754.0,754.0,754.0
mean,350.5,3455.75,552.54,37.67,863.94
std,217.81,778.73,128.41,31.05,194.68
min,-26.0,0.0,1.0,1.0,0.0
25%,162.25,3223.0,520.25,24.0,805.75
50%,350.5,3659.5,585.0,30.0,914.88
75%,538.75,3957.75,627.0,35.0,989.44
max,727.0,4962.0,822.0,157.0,1240.5


In [None]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

# Create the English pipeline object used for sentence splitting in the cells below
nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
# Sanity check: the two-sentence example must be split into exactly two sentences
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [None]:
# Split every page's text into sentences and store them on the page record
for item in tqdm(pages_and_texts):
    # Convert each spaCy sentence to a plain string as it is produced
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Record how many sentences were found on this page
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/754 [00:00<?, ?it/s]

In [None]:
# Inspect one random page record (each record now also carries "sentences" and "page_sentence_count_spacy")
random.sample(pages_and_texts, k=1)

[{'page_number': 396,
  'page_char_count': 3326,
  'page_word_count': 537,
  'page_sentence_count_raw': 20,
  'page_token_count': 831.5,
  'text': 'Rational Emotive Behavior Therapy\x08 399 Summary Ellis (2008) explained that an elegant system of counseling would have these charac- teristics: “a) economy of time and effort, b) rapid symptom reduction, c) \xadeffectiveness  with a large percentage of different kinds of clients, d) depth of \xadsolution of present- ing problems, and e) lastingness of the therapeutic results” (p.\xa0202). He asserted that  REBT matched those criteria. Replying to articles critical of REBT, Ellis (1998, 2003) writes that REBT \xadremains  within the field of science while resting on some evaluative assumptions. For exam- ple, the REBT concept of unconditional humanistic self-acceptance is still valid, even  though it requires an operational definition. The REBT concept of \xadself-acceptance  means that a person is more than a set of behaviors; that is, pe

In [None]:
# Rebuild the DataFrame from the updated records so the statistics include page_sentence_count_spacy
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,754.0,754.0,754.0,754.0,754.0,754.0
mean,350.5,3455.75,552.54,37.67,863.94,34.48
std,217.81,778.73,128.41,31.05,194.68,18.82
min,-26.0,0.0,1.0,1.0,0.0,0.0
25%,162.25,3223.0,520.25,24.0,805.75,26.0
50%,350.5,3659.5,585.0,30.0,914.88,32.0
75%,538.75,3957.75,627.0,35.0,989.44,38.0
max,727.0,4962.0,822.0,157.0,1240.5,114.0


In [None]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function that splits a list into consecutive sublists of a desired size
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits input_list into consecutive sublists of at most slice_size items.

    Order is preserved and only the last sublist may be shorter than slice_size.
    For example, a list of 17 sentences with slice_size=10 is split into two
    sublists holding 10 and 7 sentences respectively. An empty input_list gives
    an empty list.

    Raises:
        ValueError: If slice_size is smaller than 1. Without this check a
            negative size would silently return an empty list and drop every item.
    """
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Group each page's sentences into chunks and note how many chunks the page produced
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(item["sentences"], num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/754 [00:00<?, ?it/s]

In [None]:
# Sample one page record to inspect its sentence chunks (note: a page with <=10 sentences produces a single chunk)
random.sample(pages_and_texts, k=1)

[{'page_number': 455,
  'page_char_count': 3223,
  'page_word_count': 529,
  'page_sentence_count_raw': 28,
  'page_token_count': 805.75,
  'text': '458\t Chapter 14 \t COUNSELOR:\t I’m glad. Any negative strokes? \t CHRISTOPHER:\t No. \t COUNSELOR:\t Well, I told you your hands and face were dirty and I didn’t like you coming around like that,  right? You think you’ve got the idea about how positive and negative strokes work? \t CHRISTOPHER:\t Yes. \t COUNSELOR:\t How would you use stroking? \t CHRISTOPHER:\t Well, whenever I thought somebody did a good job on something, I could tell them. \t COUNSELOR:\t You know, there’s such a thing as giving strokes that are not asked for, strokes that you just  offer freely. Can you give an example of one of those, maybe? \t CHRISTOPHER:\t Just saying something nice when they don’t even really need it … well, they do need it. Just  saying it, but just saying it even if they haven’t done anything. \t COUNSELOR:\t How about a more specific example?

In [None]:
# Create a DataFrame to get stats (the records now also include num_chunks per page)
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,754.0,754.0,754.0,754.0,754.0,754.0,754.0
mean,350.5,3455.75,552.54,37.67,863.94,34.48,3.91
std,217.81,778.73,128.41,31.05,194.68,18.82,1.91
min,-26.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,162.25,3223.0,520.25,24.0,805.75,26.0,3.0
50%,350.5,3659.5,585.0,30.0,914.88,32.0,4.0
75%,538.75,3957.75,627.0,35.0,989.44,38.0,4.0
max,727.0,4962.0,822.0,157.0,1240.5,114.0,12.0


In [None]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string).
        # The sentences are joined with a space so every boundary is separated, whatever punctuation ends the
        # sentence. Gluing them together and re-inserting a space after ".<capital>" also split abbreviations
        # (e.g. "U.S.A." became "U. S. A.") and missed sentences ending in "?" or "!".
        joined_sentence_chunk = " ".join(sentence_chunk)
        # Collapse any run of two or more spaces into one (a single str.replace pass leaves longer runs behind)
        joined_sentence_chunk = re.sub(r" {2,}", " ", joined_sentence_chunk).strip()
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/754 [00:00<?, ?it/s]

2949

In [None]:
# View one random chunk record (page number, joined chunk text and its size statistics)
random.sample(pages_and_chunks, k=1)

[{'page_number': 236,
  'sentence_chunk': 'New York: Julian Press. Perls, L. (1992). Concepts and misconceptions of Gestalt therapy. Journal of Humanistic \xadPsychology, 32, 50–56. Polster, W., & Polster, M. (1973). Gestalt therapy integrated. New York: Brunner/Mazel. Saner, R. (1989). Culture bias of gestalt therapy: Made-in-U. S. A. Gestalt Journal, 12, 57–71. Seligman, L., & Reichenberg, L. W. (2014).',
  'chunk_char_count': 362,
  'chunk_word_count': 53,
  'chunk_token_count': 90.5}]

In [None]:
# Get stats about our chunks
# Note: df is rebound here to hold one row per chunk (in earlier cells it held one row per page)
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,2949.0,2949.0,2949.0,2949.0
mean,356.99,875.43,133.93,218.86
std,206.36,547.38,86.54,136.85
min,-25.0,15.0,1.0,3.75
25%,182.0,440.0,65.0,110.0
50%,359.0,792.0,125.0,198.0
75%,536.0,1214.0,189.0,303.5
max,727.0,4650.0,756.0,1162.5


Hmm looks like some of our chunks have quite a low token count.

How about we check for samples with less than 30 tokens (about the length of a sentence) and see if they are worth keeping?

# Pandas profiling of the data frame


In [None]:
# Show random chunks with at most 30 tokens in length
min_token_length = 30
short_chunks = df[df["chunk_token_count"] <= min_token_length]

# DataFrame.sample raises ValueError when asked for more rows than exist (without replacement),
# so cap the sample size at the number of short chunks actually available
for chunk_index, chunk_row in short_chunks.sample(n=min(5, len(short_chunks))).iterrows():
    print(f'Chunk token count: {chunk_row["chunk_token_count"]} | Text: {chunk_row["sentence_chunk"]}')

ValueError: Cannot take a larger sample than population when 'replace=False'

Looks like many of these are headers and footers of different pages.

They don't seem to offer too much information.

Let's filter our DataFrame/list of dictionaries to only include chunks with over 30 tokens in length.

In [None]:
# Keep only the chunks with more than min_token_length estimated tokens, converted back to a
# list of plain dicts (one per chunk), and preview the first three
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:3]

[{'page_number': -25,
  'sentence_chunk': 'Australia • Brazil • Mexico • Singapore • United Kingdom • United States Counseling Children Ninth EDITION Donna A. Henderson Wake Forest University Charles L. Thompson, late The University of Tennessee, Knoxville Copyright 2016 Cengage Learning. All Rights Reserved. May not be copied, scanned, or duplicated, in whole or in part. Due to electronic rights, some third party content may be suppressed from the eBook and/or eChapter(s). Editorial review has deemed that any suppressed content does not materially affect the overall learning experience. Cengage Learning reserves the right to remove additional content at any time if subsequent rights restrictions require it.',
  'chunk_char_count': 674,
  'chunk_word_count': 101,
  'chunk_token_count': 168.5},
 {'page_number': -24,
  'sentence_chunk': 'This is an electronic version of the print textbook. Due to electronic rights restrictions, some third party content may be suppressed. Editorial review

In [None]:
# Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer
# Load the embedding model; the device is hard-coded to "cuda" in this cell
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
# NOTE: `embeddings` holds only the vectors of these four demo sentences, not the document chunk embeddings
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07981113e-02  3.03164795e-02 -2.01218147e-02  6.86483756e-02
 -2.55255271e-02 -8.47689621e-03 -2.07084100e-04 -6.32377341e-02
  2.81606186e-02 -3.33352946e-02  3.02635301e-02  5.30720539e-02
 -5.03526032e-02  2.62288153e-02  3.33314314e-02 -4.51578870e-02
  3.63044329e-02 -1.37113058e-03 -1.20171346e-02  1.14946300e-02
  5.04510924e-02  4.70857024e-02  2.11912952e-02  5.14607318e-02
 -2.03746632e-02 -3.58889513e-02 -6.67892222e-04 -2.94393133e-02
  4.95858938e-02 -1.05639584e-02 -1.52014289e-02 -1.31752621e-03
  4.48197350e-02  1.56022953e-02  8.60380283e-07 -1.21392391e-03
 -2.37978548e-02 -9.09427938e-04  7.34480796e-03 -2.53931968e-03
  5.23369759e-02 -4.68043573e-02  1.66214537e-02  4.71578874e-02
 -4.15599234e-02  9.01929627e-04  3.60278897e-02  3.42214443e-02
  9.68227461e-02  5.94828576e-02 -1.64984558e-02 -3.51249650e-02
  5.92514267e-03 -7.08006672e-04 -2.4103

Nice! We've now got a way to numerically represent each of our chunks.

Our embedding has a shape of `(768,)` meaning it's a vector of 768 numbers which represent our text in high-dimensional space, too many for a human to comprehend but machines love high-dimensional space.

> **Note:** No matter the size of the text input to our `all-mpnet-base-v2` model, it will be turned into an embedding size of `(768,)`. This value is fixed. So whether a sentence is 1 token long or 1000 tokens long, it will be truncated/padded to the model's maximum input length of 384 tokens (anything beyond that is dropped) and then turned into an embedding vector of size `(768,)`. Of course, other embedding models may have different input/output shapes.

How about we add an embedding field to each of our chunk items?

Let's start by trying to create embeddings on the CPU, we'll time it with the `%%time` magic to see how long it takes.

In [None]:
%%time

# Uncomment to see how long it takes to create embeddings on CPU
# # Make sure the model is on the CPU
# embedding_model.to("cpu")

# # Embed each chunk one by one
# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


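Note that the CPU loop above is commented out, so the timing shown is for an empty cell; uncomment it to measure a real CPU run. Embedding on the CPU works, but it would take a *really* long time if we had a larger dataset.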

Now let's see how long it takes to create the embeddings with a GPU.

In [None]:
%%time

# Send the model to the GPU
embedding_model.to("cuda") # requires a GPU installed, for reference

# Create embeddings one by one on the GPU
# Each chunk dict is updated in place with an "embedding" key holding that chunk's vector
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/2947 [00:00<?, ?it/s]

CPU times: user 58 s, sys: 493 ms, total: 58.5 s
Wall time: 1min 5s


In [None]:
# Turn text chunks into a single list of strings, in the same order as pages_and_chunks_over_min_token_len
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [None]:
%%time

# Embed all texts in batches
# Row i of the result is the embedding of text_chunks[i]
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

CPU times: user 44.2 s, sys: 81.1 ms, total: 44.3 s
Wall time: 44.7 s


tensor([[ 0.0223, -0.0335, -0.0247,  ..., -0.0076,  0.0265, -0.0098],
        [-0.0103, -0.1128, -0.0336,  ...,  0.0183,  0.0271, -0.0448],
        [ 0.0132,  0.0179, -0.0230,  ..., -0.0051, -0.0293, -0.0203],
        ...,
        [ 0.0395,  0.0080, -0.0169,  ...,  0.0288,  0.0131, -0.0082],
        [ 0.0907, -0.0500, -0.0123,  ...,  0.0110,  0.0452, -0.0335],
        [ 0.0754, -0.0012, -0.0292,  ...,  0.0251,  0.0490, -0.0133]],
       device='cuda:0')

In [None]:
# Collect the filtered chunk records (which now include each chunk's "embedding") into a DataFrame
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)

In [None]:
# prompt: using this dataframe text_chunks_and_embeddings_df
# make a pandas profiling report and other reports too using other libraries

from ydata_profiling import ProfileReport

# Profile the filtered chunks that were actually embedded, rather than whatever `df` last held.
# The "embedding" column stores one vector per row, which tabular profilers cannot summarise
# meaningfully, so it is left out of the reports (errors="ignore" keeps this safe if it is absent).
report_df = text_chunks_and_embeddings_df.drop(columns=["embedding"], errors="ignore")

profile = ProfileReport(report_df, title="Pandas Profiling Report")
profile.to_widgets()

# Other reporting libraries can be used similarly
# Example using sweetviz (you'll need to install it: !pip install sweetviz)

try:
    import sweetviz as sv

    my_report = sv.analyze(report_df)
    my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"
except ImportError:
    print("sweetviz not installed. Please install it using !pip install sweetviz")

# Example using dtale (you'll need to install it: !pip install dtale)
try:
    import dtale
    d = dtale.show(report_df)
    print(f"dtale report available at: {d._url}")
except ImportError:
    print("dtale not installed. Please install it using !pip install dtale")



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

sweetviz not installed. Please install it using !pip install sweetviz
dtale not installed. Please install it using !pip install dtale


In [None]:
# Save the chunk records (text, statistics and embeddings) to a CSV file in the working directory

embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

And we can make sure it imports nicely by loading it.

In [None]:
# Import saved file and view
# NOTE(review): CSV stores each embedding as text, so the loaded "embedding" column presumably
# holds strings that must be parsed back into arrays before numeric use - confirm
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-25,Australia • Brazil • Mexico • Singapore • Unit...,674,101,168.5,[ 2.22830456e-02 -3.35137136e-02 -2.46837623e-...
1,-24,This is an electronic version of the print tex...,971,148,242.75,[-1.02757579e-02 -1.12849019e-01 -3.36313397e-...
2,-24,Editorial review has deemed that any suppresse...,238,34,59.5,[ 1.32418536e-02 1.79285351e-02 -2.29817890e-...
3,-23,"© 2016, 2011 Cengage Learning ALL RIGHTS RESER...",2481,357,620.25,[ 4.26830575e-02 -5.16567603e-02 -2.94548571e-...
4,-22,"Counseling Children, Ninth Edition, is dedicat...",902,146,225.5,[ 3.70728709e-02 -2.96163838e-02 -8.58205277e-...


In [None]:
# import torch

# def dot_product(vector1, vector2):
#     return torch.dot(vector1, vector2)

# def cosine_similarity(vector1, vector2):
#     dot_product = torch.dot(vector1, vector2)

#     # Get Euclidean/L2 norm of each vector (removes the magnitude, keeps direction)
#     norm_vector1 = torch.sqrt(torch.sum(vector1**2))
#     norm_vector2 = torch.sqrt(torch.sum(vector2**2))

#     return dot_product / (norm_vector1 * norm_vector2)

# # Example tensors
# vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
# vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
# vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
# vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# # Calculate dot product
# print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
# print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
# print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))

# # Calculate cosine similarity
# print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
# print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
# print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))

ModuleNotFoundError: No module named 'timer'

In [None]:
def retrieve_relevant_resources(query: str,
                                embeddings: "torch.Tensor",
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.

    Parameters:
        query (str): The search text to embed.
        embeddings: The stored embeddings to score the query against, one row per
            item. Anything torch.as_tensor accepts works; it is moved to the query
            embedding's device and dtype before scoring.
        model (SentenceTransformer): Model used to embed the query.
        n_resources_to_return (int): Maximum number of results to return (fewer are
            returned when there are fewer embeddings than this).
        print_time (bool): Whether to print how long the scoring step took.

    Returns:
        tuple: (scores, indices) as returned by torch.topk on the dot-product scores,
        i.e. highest score first; indices are row positions in embeddings.
    """
    # Function-scope imports keep this function usable on a fresh kernel, where no
    # earlier cell brings these names into scope
    from time import perf_counter as timer

    import torch
    from sentence_transformers import util

    # Embed the query
    query_embedding = model.encode(query,
                                   convert_to_tensor=True)

    # The matrix product below needs both operands on the same device with the same dtype
    embeddings = torch.as_tensor(embeddings).to(device=query_embedding.device,
                                                dtype=query_embedding.dtype)

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

    # torch.topk raises if k is larger than the number of scores, so cap it
    scores, indices = torch.topk(input=dot_scores,
                                 k=min(n_resources_to_return, len(dot_scores)))

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: "torch.Tensor",
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Parameters:
        query (str): The search text.
        embeddings: Embeddings to search, one row per chunk.
        pages_and_chunks (list[dict]): Chunk records with "sentence_chunk" and
            "page_number" keys. It must be index-aligned with embeddings (row i of
            embeddings is the embedding of pages_and_chunks[i]), otherwise the
            wrong text is printed for a score.
        n_resources_to_return (int): Maximum number of results to print.

    Returns:
        None. The query, scores, chunk texts and page numbers are printed.
    """
    import textwrap

    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indices
    for score, index in zip(scores, indices):
        # index is a zero-dimensional tensor; convert it to a plain int before indexing the list
        chunk = pages_and_chunks[int(index)]
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        # Wrapped with the standard library because no print_wrapped helper is defined before this cell
        # (80 columns is an assumed width)
        print(textwrap.fill(chunk["sentence_chunk"], width=80))
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {chunk['page_number']}")
        print("\n")

SyntaxError: f-string: empty expression not allowed (<ipython-input-133-f1ced59fc507>, line 20)

In [None]:
query = "symptoms of panic attack"

# Get just the scores and indices of top related results
# Search the document chunk embeddings; `embeddings` only holds the four demo sentence vectors
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=text_chunk_embeddings)
scores, indices

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat2 in method wrapper_CUDA_mm)

In [None]:
# Print out the texts of the top scores
# text_chunk_embeddings was built from pages_and_chunks_over_min_token_len, so that list (not the
# unfiltered pages_and_chunks) must be used to look up the text for each returned index
print_top_results_and_scores(query=query,
                             embeddings=text_chunk_embeddings,
                             pages_and_chunks=pages_and_chunks_over_min_token_len)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat2 in method wrapper_CUDA_mm)

In [None]:
# Get GPU memory
# total_memory is the total memory of device 0 (not the currently free amount);
# it is converted from bytes to GiB (2**30 bytes) and rounded to a whole number
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 15 GB


In [None]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
# Every branch sets both use_quantization_config and model_id, and the final branch is a plain `else`,
# so the two print statements at the end can never hit an undefined name (previously under 5.1 GB, or
# exactly 19 GB, left both variables unset).
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
    # Fall back to the smallest option (2B, quantized) as a best effort
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


### Loading an LLM locally

Alright! The check above picked `gemma-2b-it` for this 15 GB GPU (on a machine with more GPU memory, such as an RTX 4090, it would pick `gemma-7b-it`; change the `model_id` and `use_quantization_config` values to suit your needs)!

There are plenty of examples of how to load the model on the `gemma-7b-it` [Hugging Face model card](https://huggingface.co/google/gemma-7b-it).

Good news is, the Hugging Face [`transformers`](https://huggingface.co/docs/transformers/) library has all the tools we need.

To load our LLM, we're going to need a few things:
1. A quantization config (optional) - This will determine whether or not we load the model in 4-bit precision for lower memory usage. We can create this with the [`transformers.BitsAndBytesConfig`](https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/quantization#transformers.BitsAndBytesConfig) class (requires installing the [`bitsandbytes` library](https://github.com/TimDettmers/bitsandbytes)).
2. A model ID - This is the reference Hugging Face model ID which will determine which tokenizer and model gets used. For example `gemma-7b-it`.
3. A tokenizer - This is what will turn our raw text into tokens ready for the model. We can create it using the [`transformers.AutoTokenizer.from_pretrained`](https://huggingface.co/docs/transformers/v4.38.2/en/model_doc/auto#transformers.AutoTokenizer) method by passing it our model ID.
4. An LLM model - Again, using our model ID we can load a specific LLM model. To do so we can use the [`transformers.AutoModelForCausalLM.from_pretrained`](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM.from_pretrained) method, passing it our model ID as well as various other parameters.

As a bonus, we'll check if [Flash Attention 2](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2) is available using `transformers.utils.is_flash_attn_2_available()`. Flash Attention 2 speeds up the attention mechanism in Transformer architecture models (which is what many modern LLMs are based on, including Gemma). So if it's available and the model is supported (not all models support Flash Attention 2), we'll use it. If it's not available, you can install it by following the instructions on the [GitHub repo](https://github.com/Dao-AILab/flash-attention).

> **Note:** Flash Attention 2 currently works on NVIDIA GPUs with a compute capability score of 8.0+ (Ampere, Ada Lovelace, Hopper architectures). We can check our GPU compute capability score with [`torch.cuda.get_device_capability(0)`](https://pytorch.org/docs/stable/generated/torch.cuda.get_device_capability.html).

> **Note:** To get access to the Gemma models, you will have to [agree to the terms & conditions](https://huggingface.co/google/gemma-7b-it) on the Gemma model page on Hugging Face. You will then have to authorize your local machine via the [Hugging Face CLI/Hugging Face Hub `login()` function](https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication). Once you've done this, you'll be able to download the models. If you're using Google Colab, you can add a [Hugging Face token](https://huggingface.co/docs/hub/en/security-tokens) to the "Secrets" tab.
>
> Downloading an LLM locally can take a fair bit of time depending on your internet connection. Gemma 7B is about a 16GB download and Gemma 2B is about a 6GB download.

Let's do it!

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `Learning` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-aut

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available

# 1. Quantization config for smaller model loading (optional, only applied when
#    `use_quantization_config` is truthy further down).
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# 4-bit weights with float16 compute - use this if you have low GPU memory available.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Bonus: prefer Flash Attention 2 for faster inference, otherwise fall back to "sdpa"
# ("scaled dot product attention").
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention
attn_implementation = (
    "flash_attention_2"
    if is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8
    else "sdpa"
)
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Model choice (this will depend on how much GPU memory you have available).
# `model_id` is expected to have been set in an earlier cell, e.g.:
# model_id = "google/gemma-7b-it"
print(f"[INFO] Using model_id: {model_id}")

# 3. Tokenizer (turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. The model itself
llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,  # datatype to use, we want float16
    quantization_config=quantization_config if use_quantization_config else None,
    low_cpu_mem_usage=False,  # use full memory
    attn_implementation=attn_implementation,  # which attention version to use
)

# Quantization takes care of device setting automatically, so only move the model
# to the GPU ourselves when quantization is not used.
if not use_quantization_config:
    llm_model.to("cuda")

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-2b-it


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Display the loaded model's repr (its nested module tree) as the cell output
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-

In [None]:
def get_model_num_params(model: torch.nn.Module):
    """
    Count the elements held in a PyTorch model's parameters.

    Iterates over `model.parameters()` and adds up `numel()` of each tensor.
    Returns the total as an int (0 for a model without parameters).
    """
    total_elements = 0
    for tensor in model.parameters():
        total_elements += tensor.numel()
    return total_elements

# Total parameter count of the loaded LLM (shown as the cell output)
get_model_num_params(llm_model)

2506172416

In [None]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Report the memory footprint of a PyTorch model's parameters and buffers.

    The size of each tensor is `nelement() * element_size()`; parameters and
    buffers are summed together.

    Returns a dict with:
        "model_mem_bytes": exact size in bytes
        "model_mem_mb":    size in megabytes (1024**2 bytes), rounded to 2 places
        "model_mem_gb":    size in gigabytes (1024**3 bytes), rounded to 2 places

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    def _total_bytes(tensors):
        # Bytes occupied by every tensor in the iterable
        return sum(t.nelement() * t.element_size() for t in tensors)

    size_in_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())

    bytes_per_mb = 1024**2
    bytes_per_gb = 1024**3

    return {"model_mem_bytes": size_in_bytes,
            "model_mem_mb": round(size_in_bytes / bytes_per_mb, 2),
            "model_mem_gb": round(size_in_bytes / bytes_per_gb, 2)}

# Memory taken by the loaded LLM's parameters and buffers (bytes / MB / GB)
get_model_mem_size(llm_model)

{'model_mem_bytes': 5012354048, 'model_mem_mb': 4780.15, 'model_mem_gb': 4.67}

In [None]:
# Raw patient message used for this demo
input_text = "I am feeling like i should suicide beause no one is talking to me parents dont talk and nobody wants to be my friends"
print(f"Input text:\n{input_text}")

# Wrap the patient message in therapist instructions.
# Note: `input_text` is deliberately overwritten, so from here on it holds the
# full instruction text (not just the patient message).
input_text = "".join([
    "You are an empathetic and supportive therapist. Use the knowledge base ",
    "to provide the best possible response based on the patient's input. Ensure the response is ",
    "detailed, actionable, and comforting. Stay professional and act as a therapist and maintain a supportive tone.\n\n",
    "Patient's Input:\n",
    f"{input_text}\n\n",
    "Your Response:",
])

# Single-turn conversation in the format expected by the tokenizer's chat template
dialogue_template = [{"role": "user", "content": input_text}]

# Render the conversation with the chat template, keeping it as raw text
# (not tokenized) and appending the marker that opens the model's turn.
prompt = tokenizer.apply_chat_template(
    conversation=dialogue_template,
    tokenize=False,
    add_generation_prompt=True,
)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
I am feeling like i should suicide beause no one is talking to me parents dont talk and nobody wants to be my friends

Prompt (formatted):
<bos><start_of_turn>user
You are an empathetic and supportive therapist. Use the knowledge base to provide the best possible response based on the patient's input. Ensure the response is detailed, actionable, and comforting. Stay professional and act as a therapist and maintain a supportive tone.

Patient's Input:
I am feeling like i should suicide beause no one is talking to me parents dont talk and nobody wants to be my friends

Your Response:<end_of_turn>
<start_of_turn>model



In [None]:
%%time

# Tokenize the input text (turn it into numbers) and send it to GPU.
# `prompt` came from `apply_chat_template(tokenize=False)` and already starts with "<bos>",
# so special tokens must not be added a second time here - otherwise the model input
# begins with a duplicated leading token.
input_ids = tokenizer(prompt,
                      return_tensors="pt",
                      add_special_tokens=False).to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs based on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

Model input (tokenized):
{'input_ids': tensor([[     2,      2,    106,   1645,    108,   2045,    708,    671, 200059,
            578,  39724,  45605, 235265,   5362,    573,   5567,   3222,    577,
           3658,    573,   1963,   3077,   3590,   3482,    611,    573,   7679,
         235303, 235256,   3772, 235265,  45409,    573,   3590,    603,  11352,
         235269, 134059, 235269,    578,  88227, 235265,  23291,   6790,    578,
           2027,    685,    476,  45605,    578,  10528,    476,  39724,  14533,
         235265,    109,  25390, 235303, 235256,  11438, 235292,    108, 235285,
           1144,   7965,   1154,    496,   1412,  26983,   2298,   1589,    793,
            974,    603,   8161,    577,    682,   6424,   8280,   5063,    578,
          21566,   8507,    577,    614,    970,   4078,    109,   6922,  10567,
         235292,    107,    108,    106,   2516,    108]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# Decode the output tokens to text.
# `outputs[0]` is the first (only) generated sequence; the decoded text still
# contains the prompt and special tokens, which are stripped in a later cell.
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Woah! That looks like a pretty good answer.

But notice how the output contains the prompt text as well?

How about we do a little formatting to replace the prompt in the output text?

> **Note:** `"<bos>"` and `"<eos>"` are special tokens to denote "beginning of sentence" and "end of sentence" respectively.

In [None]:
# Note: at this point `input_text` holds the full therapist instruction + patient message
print(f"Input text: {input_text}\n")
# Remove the prompt and the "<bos>"/"<eos>" markers so only the generated answer is shown
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

In [None]:
# Mental-health-style questions generated with GPT4
# (every entry needs its own trailing comma - adjacent string literals without a
# comma are silently concatenated into a single question)
gpt4_questions = [
    "Why do I feel empty even though my life seems good on the outside? Is there a way to pinpoint what's missing?",
    "I experience mood swings multiple times a day. What could be causing this, and how can I stabilize my emotions?",
    "How do I stop blaming myself for something I now know wasn’t my fault?",
]

# Manually created question list
manual_questions = [
    "I constantly compare myself to others, and it makes me feel inadequate. How can I stop this pattern?",
    "I feel like my partner doesn’t understand my emotions. How can I communicate better without starting an argument?",
    "I feel guilty saying 'no' to others, even when I’m overwhelmed. How can I set boundaries without feeling selfish?",
    "Sometimes I freeze or feel numb when I remember something painful from my past. What is happening to me, and how can I work through it?",
]

# All example queries used by the demo cells that follow
query_list = gpt4_questions + manual_questions

Let's check that `retrieve_relevant_resources()` works on one of these queries.

In [None]:
import random
# Pick one of the example queries at random
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
# Bare tuple as the last expression so the notebook displays both values
scores, indices

In [None]:
# def prompt_formatter(query: str,
#                      context_items: list[dict]) -> str:
#     """
#     Augments query with text-based context from context_items.
#     """
#     # Join context items into one dotted paragraph
#     context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

#     # Create a base prompt with examples to help the model
#     # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
#     # We could also write this in a txt file and import it in if we wanted.
#     base_prompt = """Based on the following context items, please answer the query.
# Give yourself room to think by extracting relevant passages from the context before answering the query.
# Don't return the thinking, only return the answer.
# Make sure your answers are as explanatory as possible.
# Use the following examples as reference for the ideal answer style.
# \nExample 1:
# Query: What are the fat-soluble vitamins?
# Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
# \nExample 2:
# Query: What are the causes of type 2 diabetes?
# Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
# \nExample 3:
# Query: What is the importance of hydration for physical performance?
# Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
# \nNow use the following context items to answer the user query:
# {context}
# \nRelevant passages: <extract relevant passages from the context here>
# User query: {query}
# Answer:"""

#     # Update base prompt with context items and query
#     base_prompt = base_prompt.format(context=context, query=query)

#     # Create prompt template for instruction-tuned model
#     dialogue_template = [
#         {"role": "user",
#         "content": base_prompt}
#     ]

#     # Apply the chat template
#     prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
#                                           tokenize=False,
#                                           add_generation_prompt=True)
#     return prompt


def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """
    Build the chat-formatted prompt for `query`, augmented with retrieved context.

    The text under each item's "sentence_chunk" key is rendered as a "- " bullet
    list and inserted, together with the query, into a therapist-style instruction
    template containing three example answers. The filled template is wrapped as a
    single "user" turn and rendered with the notebook-level `tokenizer`'s chat
    template.

    Args:
        query: The user's question.
        context_items: Retrieved items; each must provide a "sentence_chunk" entry.

    Returns:
        The prompt as raw text (not tokenized), with the generation prompt appended.
    """
    # One "- " bullet per retrieved chunk
    chunk_texts = (entry["sentence_chunk"] for entry in context_items)
    context = "- " + "\n- ".join(chunk_texts)

    # Therapist persona, three worked examples, then slots for context and query
    template = """You are a compassionate and knowledgeable psychological therapist.
Your goal is to provide empathetic, evidence-based, and actionable advice to mental health patients.
Use the following context items to inform your responses, but tailor your tone to be reassuring and supportive.

Here are some examples of how to structure your responses:

\nExample 1:
Query: I feel like I’m always anxious, even when nothing is wrong. What can I do to feel better?
Answer: Feeling anxious without an obvious cause can be overwhelming, but you're not alone in this. Anxiety often stems from both internal and external factors, and managing it involves exploring what works best for you. Techniques such as mindfulness exercises, deep breathing, and journaling can help. Additionally, maintaining a routine, getting enough rest, and practicing self-care can create a sense of stability. If the anxiety persists, seeking guidance from a therapist to explore the underlying triggers might be beneficial. You’re taking a great step by asking for help.

\nExample 2:
Query: How can I stop overthinking everything? It’s affecting my sleep and mood.
Answer: Overthinking can feel like an endless loop, and it’s good that you’re addressing it. One way to manage this is by setting aside 'worry time' during the day, where you allow yourself to focus on concerns for a limited period. Outside of this time, practice redirecting your thoughts through activities like exercise, reading, or hobbies. Techniques like cognitive-behavioral therapy (CBT) are also effective in challenging and reframing negative thought patterns. Sleep hygiene practices, such as avoiding screens before bed and creating a relaxing bedtime routine, can further improve your rest.

\nExample 3:
Query: Why do I always feel tired and unmotivated, even when I try to rest?
Answer: Feeling tired and unmotivated despite resting can be a sign of emotional or physical strain. Stress, anxiety, or depression often affect energy levels and motivation. It's important to focus on balance—regular exercise, a nutritious diet, and consistent sleep patterns can make a difference. Break large tasks into smaller, achievable goals to rebuild a sense of accomplishment. Checking in with a therapist can also help you explore any deeper issues that may be contributing to how you feel. Remember, you don’t have to face this alone.

Now, use the context below to answer the user query empathetically and professionally. Avoid sounding robotic; instead, focus on being kind, understanding, and clear.
\nContext:
{context}

Relevant passages: <extract relevant passages from the context here>
\nUser query: {query}
Answer:"""

    # Fill the {context} and {query} slots
    filled_prompt = template.format(context=context, query=query)

    # Single user turn for the instruction-tuned model
    conversation = [{"role": "user", "content": filled_prompt}]

    # Render with the chat template, keeping the result as text
    return tokenizer.apply_chat_template(conversation=conversation,
                                         tokenize=False,
                                         add_generation_prompt=True)


Looking good! Let's try our function out.

In [None]:
# Choose a random example query
query = random.choice(query_list)
print(f"Query: {query}")

# Retrieve scores and indices of the most relevant resources for the query
scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

# Look up the chunk records the returned indices point at
context_items = [pages_and_chunks[idx] for idx in indices]

# Build the context-augmented prompt and show it
prompt = prompt_formatter(query=query, context_items=context_items)
print(prompt)

In [None]:
%%time

# Tokenize the prompt and send it to GPU.
# The chat template has already written "<bos>" into `prompt`, so the tokenizer must not
# add special tokens again (that would duplicate the leading token in the model input).
input_ids = tokenizer(prompt,
                      return_tensors="pt",
                      add_special_tokens=False).to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

# Show the query and the generated answer with the prompt text removed
print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

In [None]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.

    Relies on notebook-level names: `retrieve_relevant_resources`, `embeddings`,
    `pages_and_chunks`, `prompt_formatter`, `tokenizer` and `llm_model`.

    Args:
        query: The user's question.
        temperature: Sampling temperature passed to `llm_model.generate`
            (sampling is always enabled via `do_sample=True`).
        max_new_tokens: Maximum number of new tokens to generate.
        format_answer_text: If True, remove the prompt, the "<bos>"/"<eos>" markers
            and a boilerplate lead-in sentence from the decoded output.
        return_answer_only: If True, return only the answer text; otherwise also
            return the context items that were used.

    Returns:
        The answer text, or a tuple `(answer_text, context_items)` when
        `return_answer_only` is False.
    """

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    # NOTE: the items are the same objects held in `pages_and_chunks`, so the score
    # is written onto those shared records as well.
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() # return score back to CPU

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt.
    # The chat template applied in `prompt_formatter` already includes the special
    # tokens the model needs, so the tokenizer must not add them a second time
    # (doing so duplicates the leading "<bos>" token in the model input).
    input_ids = tokenizer(prompt,
                          return_tensors="pt",
                          add_special_tokens=False).to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [None]:
# Choose a random example query
query = random.choice(query_list)
print(f"Query: {query}")

# Run the full RAG pipeline and keep the context items alongside the answer
answer, context_items = ask(
    query=query,
    temperature=0.7,
    max_new_tokens=512,
    return_answer_only=False,
)

print("Answer:\n")
print_wrapped(answer)
print("Context items:")
# Last expression of the cell, so the notebook displays the context items
context_items

## Extensions

* May want to improve text extraction with something like Marker - https://github.com/VikParuchuri/marker
* Guide to more advanced PDF extraction - https://towardsdatascience.com/extracting-text-from-pdf-files-with-python-a-comprehensive-guide-9fc4003d517
* See the following prompt engineering resources for more prompting techniques - promptingguide.ai, Brex's Prompt Engineering Guide
* What happens when a query comes through that the textbook has no relevant context for?
* Try another embedding model (e.g. Mixed Bread AI large, `mixedbread-ai/mxbai-embed-large-v1`, see: https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)
* Try another LLM... (e.g. Mistral-Instruct)
* Try different prompts (e.g. see prompting techniques online)
* Our example only focuses on text from a PDF, however, we could extend it to include figures and images
* Evaluate the answers -> could use another LLM to rate our answers (e.g. use GPT-4 as a judge)
* Vector database/index for larger setup (e.g. 100,000+ chunks)
* Libraries/frameworks such as LangChain / LlamaIndex can help do many of the steps for you - so it's worth looking into those next, wanted to recreate a workflow with lower-level tools to show the principles
* Optimizations for speed
    * See Hugging Face docs for recommended speed ups on GPU - https://huggingface.co/docs/transformers/perf_infer_gpu_one
    * Optimum NVIDIA - https://huggingface.co/blog/optimum-nvidia, GitHub: https://github.com/huggingface/optimum-nvidia
    * See NVIDIA TensorRT-LLM - https://github.com/NVIDIA/TensorRT-LLM
    * See GPT-Fast for PyTorch-based optimizations - https://github.com/pytorch-labs/gpt-fast
    * Flash attention 2 (requires Ampere GPUs or newer) - https://github.com/Dao-AILab/flash-attention
* Stream text output so it looks prettier (e.g. each token appears as it gets output from the model)
* Turn the workflow into an app, see Gradio type chatbots for this - https://www.gradio.app/guides/creating-a-chatbot-fast, see local example: https://www.gradio.app/guides/creating-a-chatbot-fast#example-using-a-local-open-source-llm-with-hugging-face