### Retrieval-Augmented Generation (RAG) Pipeline:

#### Importing Essential Libraries:

In [52]:
import os
import re
import time
import torch
import fitz
import random
import textwrap
import requests
import numpy as np
import pandas as pd
from tqdm.auto import tqdm 
from spacy.lang.en import English
from time import perf_counter as time
from timeit import default_timer as timer
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer

#### Downloading Documents:

In [2]:
# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download the PDF only when it is not already on disk, so re-running the
# notebook does not hit the network again.
if not os.path.exists(pdf_path):
  print("File doesn't exist on our PC, Hold on We are Downloading...")

  # The URL of the PDF you want to download
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL.
  # requests applies no timeout unless one is given, so set it explicitly:
  # otherwise a stalled connection would block this cell forever.
  response = requests.get(url, timeout=60)

  # Only write the file when the server answered with HTTP 200
  if response.status_code == 200:
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as: {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} already exists in our System.")

File human-nutrition-text.pdf already exists in our System.


#### Reading Documents and Applying Text Formatting:

-- The fitz library is used for reading the PDF file. -> (!pip install PyMuPDF fitz pypdf)

-- PyMuPDF source code: https://github.com/pymupdf/pymupdf

-- We found PyMuPDF to work better than pypdf for our use case. Note: its licence is AGPL-3.0, so keep that in mind if you want to use any of this code commercially.

In [3]:
def text_formatter(text: str) -> str:
    """Apply light cleanup to raw page text.

    Newlines become single spaces and surrounding whitespace is trimmed.
    The right cleanup differs from document to document, so treat this as
    a starting point and experiment.
    """
    # Swap every newline for a space
    flattened = " ".join(text.split("\n"))

    # Add any further document-specific formatting here if required.
    return flattened.strip()

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str, page_number_offset: int = 41) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.
        page_number_offset (int): Value subtracted from the zero-based page index
            to obtain the reported "page_number". Defaults to 41, the offset used
            for the nutrition textbook in this notebook; pass 0 to keep the raw
            zero-based index.

    Returns:
        list[dict]: One dictionary per page with the keys "page_number"
        (adjusted), "page_char_count", "page_word_count",
        "page_sentence_count_raw", "page_token_count" and "text"
        (the formatted page text).
    """
    pages_and_texts = []
    # Open the document in a `with` block so the file handle is released
    # as soon as all pages have been read.
    with fitz.open(pdf_path) as doc:
        # enumerate() has no length, so give tqdm the page count explicitly;
        # otherwise the progress bar cannot show progress.
        for page_number, page in tqdm(enumerate(doc), total=doc.page_count):
            text = text_formatter(page.get_text())  # plain text of the page, cleaned up
            pages_and_texts.append({"page_number": page_number - page_number_offset,  # adjust the zero-based index
                                    "page_char_count": len(text),    # No. of characters in the page
                                    # NOTE: splitting on a single space counts an empty page as 1 word
                                    "page_word_count": len(text.split(" ")),  # No. of words in the page
                                    "page_sentence_count_raw": len(text.split(". ")), # rough sentence count
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts

# Parse the whole PDF into one record per page
pages_and_texts = open_and_read_pdf(pdf_path)
pages_and_texts[:2]   # preview the first two page records

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

#### Extracting Random Samples from List of Dictionaries:

In [4]:
# Draw three random page records and print them one after another
x = random.sample(pages_and_texts, k=3)
for idx, sample in enumerate(x, start=1):
    print(f"Sample: {idx}")
    print(f"Sample:\n {sample}")
    print("--------------------------------------------------------")

Sample: 1
Sample:
 {'page_number': 979, 'page_char_count': 1715, 'page_word_count': 298, 'page_sentence_count_raw': 11, 'page_token_count': 428.75, 'text': '• Amino Acid Supplements. Certain amino acid supplements,  which are often taken by bodybuilders among others, can  increase the risk of consuming too much protein. An  occasional amino acid drink in the place of a meal is not a  problem. However, problems may arise if you add the  supplement to your existing diet. Most Americans receive two  to three times the amount of protein required on a daily basis  from their existing diets—taking amino acid supplements just  adds to the excess. Also, certain amino acids share the same  transport systems in the absorption process; therefore, a  concentrated excess of one amino acid obtained from a  supplement may increase the probability of decreased  absorption of another amino acid that uses the same transport  system. This could lead to deficiency in the competing amino  acid.  Supplement

In [5]:
# Put the per-page records into a DataFrame (one row per page) and preview it
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [6]:
# Summary statistics of the numeric per-page columns, rounded to 2 decimals
print("Description Stat:")
df.describe().round(2)

Description Stat:


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15
std,348.86,560.44,95.75,6.19,140.11
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.75,134.0,4.0,190.69
50%,562.5,1232.5,215.0,10.0,308.12
75%,864.25,1605.25,271.25,14.0,401.31
max,1166.0,2308.0,429.0,32.0,577.0


#### Preprocessing: Splitting Pages into Sentences: (Group of 10 Sentences)

In [7]:
# Build a blank English pipeline (English is imported from spacy.lang.en above)
nlp = English()

# Register spaCy's sentence-splitting component, see https://spacy.io/api/sentencizer
nlp.add_pipe("sentencizer")

# Sanity-check the splitter on a tiny three-sentence example
doc = nlp("This is a sentence. This another sentence. I like elephants.")
assert sum(1 for _ in doc.sents) == 3

# Display the detected sentences
[sentence for sentence in doc.sents]

[This is a sentence., This another sentence., I like elephants.]

In [8]:
# Example text: inspect the page record stored at list index 600
pages_and_texts[600]

{'page_number': 559,
 'page_char_count': 864,
 'page_word_count': 137,
 'page_sentence_count_raw': 8,
 'page_token_count': 216.0,
 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5.\xa0Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.  \xa0https:/ /ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitami

In [9]:
# Preprocessing: split every page's text into sentences with the spaCy sentencizer
for item in tqdm(pages_and_texts):
    # spaCy yields its own span datatype; store plain strings instead
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Record how many sentences spaCy found on this page
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [10]:
# Random sample view: show one randomly chosen page record
random.sample(pages_and_texts, k=1)

[{'page_number': 1113,
  'page_char_count': 1621,
  'page_word_count': 256,
  'page_sentence_count_raw': 17,
  'page_token_count': 405.25,
  'text': 'not others. The Diabetes Prevention Trial that studied lifestyle and  drug interventions in more than three thousand participants who  were at high risk for Type 2 diabetes found that intensive lifestyle  intervention reduced the chances of getting Type 2 diabetes by 58  percent.11  Gestational Diabetes  During pregnancy some women develop gestational diabetes.  Gestational diabetes is characterized by high blood-glucose levels  and insulin resistance. The exact cause is not known but does  involve the effects of pregnancy hormones on how cells respond  to insulin. Gestational diabetes can cause pregnancy complications  and it is common practice for healthcare practitioners to screen  pregnant women for this metabolic disorder. The disorder normally  ceases when the pregnancy is over, but the National Diabetes  Information Clearing House 

In [11]:
# Statistics: rebuild the DataFrame so it picks up the new
# "page_sentence_count_spacy" column, then summarise the numeric columns
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32
std,348.86,560.44,95.75,6.19,140.11,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


##### Chunking: Chunking is the method of breaking down the large files into more manageable segments/chunks so the LLM applications can get proper context and the retrieval can be easy.

-- Different Types of Chunking:

⮕ Level 1 : Fixed Size Chunking
This is the crudest and simplest method of segmenting the text. It breaks down the text into chunks of a specified number of characters, regardless of their content or structure. The LangChain and LlamaIndex frameworks offer the CharacterTextSplitter and SentenceSplitter (which defaults to splitting on sentences) classes for this chunking technique.

⮕ Level 2: Recursive Chunking
While fixed-size chunking is easier to implement, it doesn’t consider the structure of the text. Recursive chunking offers an alternative.
In this method, we divide the text into smaller chunks in a hierarchical and iterative manner using a set of separators. The LangChain framework offers the RecursiveCharacterTextSplitter class, which splits text using default separators (“\n\n”, “\n”, “ “, ””).

⮕ Level 3 : Document Based Chunking
In this chunking method, we split a document based on its inherent structure. This approach considers the flow and structure of the content but may not be as effective for documents lacking a clear structure.

⮕ Level 4: Semantic Chunking
All three levels above deal with the content and structure of documents and require keeping the chunk size constant. This chunking method aims to extract semantic meaning from embeddings and then assess the semantic relationship between these chunks. The core idea is to keep together chunks that are semantically similar.
LlamaIndex has the SemanticSplitterNodeParser class, which splits a document into chunks using the contextual relationship between chunks.

⮕ Level 5: Agentic Chunking
This chunking strategy explores the possibility of using an LLM to determine how much and what text should be included in a chunk based on the context.

In [12]:
# Chunking our sentences into groups of 10:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function to split a list into consecutive slices of at most `slice_size` items
# e.g. [20] -> [10, 10] or [25] -> [10, 10, 5]
def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    """Split a list into consecutive sub-lists of at most `slice_size` items.

    Parameters:
        input_list (list[str]): The items to split, kept in their original order.
        slice_size (int): Maximum number of items per sub-list. Must be >= 1.
            Defaults to `num_sentence_chunk_size`.

    Returns:
        list[list[str]]: The slices in order; only the last one may be shorter
        than `slice_size`. An empty input gives an empty list.

    Raises:
        ValueError: If `slice_size` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes explicitly.
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

# Testing the above function:
test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [13]:
# Group each page's sentences into chunks of at most `num_sentence_chunk_size` sentences
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(item["sentences"], num_sentence_chunk_size)
    # Number of chunks this page was split into
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [14]:
# Sample example: one random page record, now including its sentence data
random.sample(pages_and_texts, k=1)

[{'page_number': 632,
  'page_char_count': 639,
  'page_word_count': 120,
  'page_sentence_count_raw': 6,
  'page_token_count': 159.75,
  'text': 'Age Group  RDA (mg/day) UL (mg/day)  Infants (0–6 months)  100*  –  Infants (6–12 months)  275*  –  Children (1–3 years)  460  3,000  Children (4–8 years)  500  3,000  Children (9–13 years)  1,250  4,000  Adolescents (14–18 years)  1,250  4,000  Adults (19–70 years)  700  4,000  Adults (> 70 years)  700  3,000  * denotes Adequate Intake  Micronutrient Information Center: Phosphorus. Oregon State  University, Linus Pauling Institute. http:/ /lpi.oregonstate.edu/mic/ minerals/phosphorus. Updated in July 2013. Accessed October 22,  2017.  Dietary Sources of Phosphorus  Table 10.4 Phosphorus Content of Various Foods  632  |  Phosphorus',
  'sentences': ['Age Group  RDA (mg/day) UL (mg/day)  Infants (0–6 months)  100*  –  Infants (6–12 months)  275*  –  Children (1–3 years)  460  3,000  Children (4–8 years)  500  3,000  Children (9–13 years)  1,2

In [15]:
# View stats: rebuild the DataFrame so the summary also covers "num_chunks"
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32,1.53
std,348.86,560.44,95.75,6.19,140.11,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0,1.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0,1.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


In [16]:
# Flatten the per-page chunk lists so that every chunk becomes its own record
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Glue the chunk's sentences into one paragraph-like string and collapse double spaces
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        # The sentences were joined without a separator: ".A" => ". A" (works for any capital letter)
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        # One record per chunk: source page, text, and a few size statistics
        chunk_dict = {
            "page_number": item["page_number"],
            "sentence_chunk": joined_sentence_chunk,
            "chunk_char_count": len(joined_sentence_chunk),
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            "chunk_token_count": len(joined_sentence_chunk) / 4,  # 1 token = ~4 chars
        }
        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [17]:
# Sample: show one randomly chosen chunk record
random.sample(pages_and_chunks, k=1)

[{'page_number': 489,
  'sentence_chunk': 'Of course many behaviors are reflective of what we have easy access to—a concept we will discuss next. Societal Influences It is without a doubt that the American society affects what and how much we eat. Portion sizes have increased dramatically in the past few decades. For example, a bagel is now more than twice the size it was in the 1960s. Today, American teenagers have access to a massive amount of calorie-dense foods and beverages, which is a large contributor to the recent rapid increase in overweight and obesity in adolescents in this country. Even different cultures within the United States have different eating habits. For instance, Native Hawaiians and Pacific Islanders who have since adopted the western diet, post-colonization consume\xa0 foods high in fat, which is a contributing factor to their higher incidences of overweight and obesity. The fast food industry in America not only supplies Americans with a large proportion of the

In [18]:
# Stats: `df` now holds one row per chunk (previously one row per page)
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.83,112.72,183.71
std,347.79,447.43,71.07,111.86
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [19]:
# Preview the first chunk records
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
3,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25


#### Filter chunks of text for short chunks:

In [22]:
# Show five random chunks that are at most `min_token_length` (30) tokens long
min_token_length = 30
for row_index, row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row["chunk_token_count"]} | Text: {row["sentence_chunk"]}')

Chunk token count: 29.25 | Text: 2. Lacto-vegetarian. This type of vegetarian diet includes dairy products but not eggs. Lifestyles and Nutrition | 27
Chunk token count: 9.75 | Text: Table 3.5 Salt Substitutes Sodium | 185
Chunk token count: 28.75 | Text: American Journal of Clinical Dietary, Behavioral, and Physical Activity Recommendations for Weight Management | 509
Chunk token count: 20.25 | Text: Published 2002. Accessed December 2, 2017. Pacific Based Dietary Guidelines | 761
Chunk token count: 3.75 | Text: 806 | Pregnancy


In [23]:
# Keep only the chunks with more than `min_token_length` tokens,
# converted back into a list of dicts (one per chunk)
pages_and_chunks_over_min_token_len = (
    df.loc[df["chunk_token_count"] > min_token_length]
      .to_dict(orient="records")
)
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [24]:
# Sample example: one random chunk record that passed the minimum-length filter
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 533,
  'sentence_chunk': 'percent and other nonvertebral fractures by 23 percent.4 A reduction in fracture risk was not observed when people took vitamin D supplements at doses of 400 international units. Many other health benefits have been linked to higher intakes of vitamin D, from decreased cardiovascular disease to the prevention of infection. Furthermore, evidence from laboratory studies conducted in cells, tissues, and animals suggest vitamin D prevents the growth of certain cancers, blocks inflammatory pathways, reverses atherosclerosis, increases insulin secretion, and blocks viral and bacterial infection and many other things. Vitamin D deficiency has been linked to an increased risk for autoimmune diseases. Immune diseases, rheumatoid arthritis, multiple sclerosis, and Type 1 diabetes have been observed in populations with inadequate vitamin D levels. Additionally, vitamin D deficiency is linked to an increased incidence of hypertension. Until the results co

### Embedding: 
- An embedding is a way to represent data, such as words or images, as numerical vectors in a multi-dimensional space. This representation captures relationships and similarities between different pieces of data, making it easier for algorithms to process and understand them.

In [27]:
# Embedding our text chunks using the Hugging Face sentence-transformers library:

# Downloading and loading the model may take some time the first time.
# Subsequent runs will not require downloading again; the model will be loaded from the cache.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu")    # loaded on the CPU here; downloaded from Hugging Face


In [28]:
# A few example sentences to embed
sentences = [
    "The Sentence Transformer library provides an easy way to create embeddings.",
    "Sentences can be embedded one by one or in a list.",
    "I like horses!",
]

# model.encode() turns the list of sentences into one embedding per sentence
embeddings = embedding_model.encode(sentences)

# Pair each sentence with its embedding
embeddings_dict = {text: vector for text, vector in zip(sentences, embeddings)}

# Print every sentence followed by its embedding and a blank line
for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print()

Sentence: The Sentence Transformer library provides an easy way to create embeddings.
Embedding: [-3.44286300e-02  2.95328330e-02 -2.33643539e-02  5.57257496e-02
 -2.19098590e-02 -6.47060527e-03  1.02848671e-02 -6.57803118e-02
  2.29718108e-02 -2.61121094e-02  3.80420424e-02  5.61403073e-02
 -3.68746668e-02  1.52787855e-02  4.37020473e-02 -5.19723520e-02
  4.89479750e-02  3.58107663e-03 -1.29750716e-02  3.54384189e-03
  4.23262566e-02  3.52606587e-02  2.49402728e-02  2.99176984e-02
 -1.99382678e-02 -2.39752326e-02 -3.33366124e-03 -4.30450439e-02
  5.72014190e-02 -1.32517805e-02 -3.54477949e-02 -1.13936048e-02
  5.55561259e-02  3.61094647e-03  8.88527097e-07  1.14026899e-02
 -3.82230096e-02 -2.43546255e-03  1.51314037e-02 -1.32734815e-04
  5.00659980e-02 -5.50876148e-02  1.73444897e-02  5.00959419e-02
 -3.75959203e-02 -1.04463520e-02  5.08322567e-02  1.24861356e-02
  8.67377147e-02  4.64143082e-02 -2.10690200e-02 -3.90251614e-02
  1.99694349e-03 -1.42345466e-02 -1.86794717e-02  2.826694

In [29]:
embeddings[0].shape  # each sentence is represented by a vector of 768 numbers

(768,)

In [30]:
# encode() also accepts a single string; display the resulting embedding
embedding = embedding_model.encode("My favourite animal is the cow!")
embedding

array([-1.45474076e-02,  7.66726956e-02, -2.85872333e-02, -3.31282988e-02,
        3.65210623e-02,  4.78570424e-02, -7.08108172e-02,  1.62834208e-02,
        1.93443522e-02, -2.80482415e-02, -2.91747209e-02,  5.11309542e-02,
       -3.28720175e-02, -8.98753665e-03, -1.03672855e-02, -3.15488651e-02,
        4.22783680e-02, -9.13287606e-03, -1.94017403e-02,  4.35689427e-02,
       -2.31998023e-02,  4.29883264e-02, -1.72393285e-02, -2.01372374e-02,
       -3.13574187e-02,  8.08164012e-03, -2.06725132e-02, -2.27869693e-02,
        2.44812425e-02,  1.71968229e-02, -6.26672804e-02, -7.54797310e-02,
        3.57421711e-02, -5.46572637e-03,  1.24730320e-06, -7.63199665e-03,
       -3.53221931e-02,  1.91326831e-02,  3.99046019e-02,  2.11734464e-03,
        1.64565891e-02,  9.84057132e-03, -1.80700831e-02,  9.33838170e-03,
        3.23482789e-02,  5.84784821e-02,  4.23187055e-02,  1.62091162e-02,
       -9.14910734e-02,  1.82305351e-02, -5.25728147e-03, -7.81020615e-03,
       -3.47644202e-02, -

In [31]:
# Testing embedding creation on the CPU: it takes more time than on the GPU, so we will work with the GPU:

# %%time
# embedding_model.to("cpu")

# # Embed each chunk one by one
# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

In [32]:
# Embedding creation on the GPU when one is available
# %%time
# Fall back to the CPU instead of failing when PyTorch cannot see a CUDA device.
embedding_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Embed each chunk one by one and store the vector on the chunk's record
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

In [37]:
# Embed the chunks in batches: this takes less time than creating the embeddings one by one.

# %%time
# Re-import the `time` module: the import cell rebinds the name `time` to
# `perf_counter`, so `time.time()` below would fail without this.
import time
start_time = time.time()

# Extract the text chunks from the data (one string per chunk record)
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
print(f"Time taken to extract chunks: {time.time() - start_time} seconds")
# Show one example chunk
print(text_chunks[419])

Time taken to extract chunks: 0.0 seconds
often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity. The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods. Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture

In [38]:
%%time
# Embed all texts in batches: this is faster than embedding the chunks one by one.
# (Each chunk contains at most 10 sentences.)
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32,  # You can experiment to find which batch size leads to the best results.
                                               convert_to_tensor=True)  # return a torch tensor
text_chunk_embeddings

CPU times: total: 23.2 s
Wall time: 1min 44s


tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0')

### Save the Embedding to File:
- You can also save the embeddings in a vector database for efficient retrieval.

In [39]:
# Inspect the chunk record at list index 419 (the chunk printed from text_chunks[419] earlier)
pages_and_chunks_over_min_token_len[419]

{'page_number': 277,
 'sentence_chunk': 'often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity. The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods. Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture 

In [40]:
# Save embeddings to file
# The records carry the "embedding" arrays added by the one-by-one loop above;
# in the CSV each array ends up as its text representation.
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)  # index=False: do not write the row index

In [41]:
# Import the saved file and view the first rows
# (the "embedding" column comes back as text, e.g. "[ 6.74e-02 ...")
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242675e-02 9.02281255e-02 -5.09549351e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156232e-02 5.92139661e-02 -1.66167356e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5,[ 2.79801954e-02 3.39813679e-02 -2.06426606e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,[ 6.82566836e-02 3.81274670e-02 -8.46854784e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264494e-02 -8.49767216e-03 9.57160257e-...


### RAG: Search & Answer: (Semantic Search)

- RAG goal: Retrieve relevant passages based on a query and use those passages to augment an input to an LLM so it can generate an output based on those relevant passages.

- Similarity Search: To Find Similar Passage and Context based on Query. Comparing embeddings is known as similarity search, vector search, semantic search.

In [42]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the saved text chunks and their embeddings
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# The CSV stores each embedding as a string; strip the brackets and parse
# the space-separated numbers back into an np.array
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=" ")
)

# Stack the per-chunk arrays into one float32 torch.tensor on `device`
embeddings = torch.tensor(
    np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0),
    dtype=torch.float32,
).to(device)

# Keep the rows as a list of dicts as well
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")
text_chunks_and_embedding_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.00,"[0.0674242675, 0.0902281255, -0.00509549351, -..."
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.50,"[0.0552156232, 0.0592139661, -0.0166167356, -0..."
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.50,"[0.0279801954, 0.0339813679, -0.0206426606, 0...."
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,"[0.0682566836, 0.038127467, -0.00846854784, -0..."
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.50,"[0.0330264494, -0.00849767216, 0.00957160257, ..."
...,...,...,...,...,...,...
1675,1164,Flashcard Images Note: Most images in the flas...,1305,176,326.25,"[0.0185622647, -0.0164278075, -0.0127045643, -..."
1676,1164,Hazard Analysis Critical Control Points reused...,375,51,93.75,"[0.03347205, -0.0570440702, 0.0151489452, -0.0..."
1677,1165,ShareAlike 11. Organs reused “Pancreas Organ A...,1286,173,321.50,"[0.0770515576, 0.00978558045, -0.0121817328, 0..."
1678,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,410,59,102.50,"[0.103045173, -0.0164701752, 0.00826843735, 0...."


In [43]:
# One row per chunk, one column per embedding dimension
embeddings.shape

torch.Size([1680, 768])

In [44]:
# Create model
from sentence_transformers import util, SentenceTransformer

# The query must be embedded with the same model that produced the stored passage embeddings.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",   # Loaded from the local cache because it was downloaded previously.
                                      device=device)

#### Semantic Search Pipeline:
Let's create a small semantic search pipeline. In essence, we want to search for a query (e.g. "macronutrient functions") and get back relevant passages from our textbook.

We can do so with the following steps:
1. Define a query string.
2. Turn the query string into an embedding.
3. Perform a dot product or cosine similarity function between the text embeddings and the query embedding.
4. Sort the results from 3 in descending order.

Note: to use dot product for comparison, ensure vector sizes are of same shape (e.g. 768) and tensors/vectors are in the same datatype (e.g. both are in torch.float32).

In [46]:
# 1. Define the query
query = "good foods for protein"
print(f"Query: {query}")

# 2. Embed the query
# Note: it's important to embed your query with the same model you used to embed your passages.
# The query is moved to whatever device the passage embeddings live on (instead of a
# hard-coded "cuda"), so this cell also works on CPU-only machines.
query_embedding = embedding_model.encode(query, convert_to_tensor=True).to(embeddings.device)

# 3. Get similarity scores with the dot product (use cosine similarity if outputs of model aren't normalized)
# CUDA kernels are launched asynchronously: without synchronizing, the timer would only
# measure the kernel launch, not the actual computation.
if embeddings.is_cuda:
    torch.cuda.synchronize()
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
if embeddings.is_cuda:
    torch.cuda.synchronize()
end_time = timer()

print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep top 5)
top_results_dot_product = torch.topk(dot_scores, k=5)  # Returns (values, indices) of the 5 highest dot-product scores.
top_results_dot_product

Query: good foods for protein
[INFO] Time taken to get scores on 1680 embeddings: 0.00132 seconds.


torch.return_types.topk(
values=tensor([0.7729, 0.7647, 0.6743, 0.6743, 0.6634], device='cuda:0'),
indices=tensor([611, 616, 615, 620, 617], device='cuda:0'))

In [47]:
# To Match the Extracted Indices Number with The Query:
# These indices are copied by hand from the torch.topk output above (611, 616 and 620 are three
# of the five returned indices); they must be updated if the query or the embeddings change.
print(pages_and_chunks[611]['sentence_chunk'])
print(pages_and_chunks[616]['sentence_chunk'])
print(pages_and_chunks[620]['sentence_chunk'])

Dietary Sources of Protein The protein food group consists of foods made from meat, seafood, poultry, eggs, soy, dry beans, peas, and seeds. According to the Harvard School of Public Health, “animal protein and vegetable protein probably have the same effects on health. It’s the protein package that’s likely to make a difference.”1 1. Protein: The Bottom Line. Harvard School of Public Proteins, Diet, and Personal Choices | 411
Additionally, a person should consume 8 ounces of cooked seafood every week (typically as two 4-ounce servings) to assure they are getting the healthy omega-3 fatty acids that have been linked to a lower risk for heart disease. Another tip is choosing to eat dry beans, peas, or soy products as a main dish. Some of the menu choices include chili with kidney and pinto beans, hummus on pita bread, and black bean enchiladas. You could also enjoy nuts in a variety of ways. You can put them on a salad, in a stir-fry, or use them as a topping for steamed vegetables in p

In [48]:
1000*1680*150 # Embeddings*Average Words

252000000

In [49]:
# Example to test how fast an exhaustive semantic search is on a much larger embedding matrix.
# The random matrix is created directly on the target device and reuses the width of the real
# embeddings instead of a hard-coded 768.
larger_embeddings = torch.randn(100*embeddings.shape[0], embeddings.shape[1], device=device)
print(f"Embeddings shape: {larger_embeddings.shape}")

# Perform dot product across 168,000 embeddings
# CUDA work is asynchronous, so synchronize before and after the call; otherwise the
# measured time only covers launching the kernel.
if larger_embeddings.is_cuda:
    torch.cuda.synchronize()
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
if larger_embeddings.is_cuda:
    torch.cuda.synchronize()
end_time = timer()

print(f"[INFO] Time taken to get scores on {len(larger_embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

Embeddings shape: torch.Size([168000, 768])
[INFO] Time taken to get scores on 168000 embeddings: 0.00068 seconds.


We can see that searching over embeddings is very fast, even when we do an exhaustive search (comparing the query against every single embedding).

But if you had 10M+ embeddings, you likely want to create an index.

An index is like letters in the dictionary.

For example, if you wanted to search "duck" in the dictionary, you'd start at "d" then find words close to "du..." etc.

An index helps to narrow it down.

A popular indexing library for vector search is Faiss, see here: https://github.com/facebookresearch/faiss 

One technique that the library provides is approximate nearest neighbour search (ANN): https://en.wikipedia.org/wiki/Nearest_neighbor_search

Let's make our vector search results pretty. 

In [50]:
# Better Visualization of Results:

def print_wrapped(text, wrap_length=80):
    """Print `text` re-flowed into lines of at most `wrap_length` characters."""
    print(textwrap.fill(text, width=wrap_length))

query = "good foods for protein"
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indices from torch.topk
# (element [0] of the topk result holds the scores, element [1] the matching row indices)
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    print("Text:")
    # idx selects the row dict in pages_and_chunks that belongs to this score
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")    

Query: 'good foods for protein'

Results:
Score: 0.7729
Text:
Dietary Sources of Protein The protein food group consists of foods made from
meat, seafood, poultry, eggs, soy, dry beans, peas, and seeds. According to the
Harvard School of Public Health, “animal protein and vegetable protein probably
have the same effects on health. It’s the protein package that’s likely to make
a difference.”1 1. Protein: The Bottom Line. Harvard School of Public Proteins,
Diet, and Personal Choices | 411
Page number: 411


Score: 0.7647
Text:
Additionally, a person should consume 8 ounces of cooked seafood every week
(typically as two 4-ounce servings) to assure they are getting the healthy
omega-3 fatty acids that have been linked to a lower risk for heart disease.
Another tip is choosing to eat dry beans, peas, or soy products as a main dish.
Some of the menu choices include chili with kidney and pinto beans, hummus on
pita bread, and black bean enchiladas. You could also enjoy nuts in a variety of
w

Note: We could potentially improve the order of these results with a reranking model. A model that has been trained specifically to take search results (e.g. the top 25 semantic results) and rank them in order from most likely top-1 to least likely.

See here for an open-source reranking model: https://huggingface.co/mixedbread-ai/mxbai-rerank-large-v1

To check our results, what if we wanted to automatically surface the page of texts related to our query?

In [53]:
# Load the re-ranking model (the large-sized mxbai reranker) used for re-ranking the results:
model = CrossEncoder("mixedbread-ai/mxbai-rerank-large-v1")

# Example query and documents
query = "Who wrote 'To Kill a Mockingbird'?"
documents = [
    "'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature.",
    "The novel 'Moby-Dick' was written by Herman Melville and first published in 1851. It is considered a masterpiece of American literature and deals with complex themes of obsession, revenge, and the conflict between good and evil.",
    "Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961.",
    "Jane Austen was an English novelist known primarily for her six major novels, which interpret, critique and comment upon the British landed gentry at the end of the 18th century.",
    "The 'Harry Potter' series, which consists of seven fantasy novels written by British author J.K. Rowling, is among the most popular and critically acclaimed books of the modern era.",
    "'The Great Gatsby', a novel written by American author F. Scott Fitzgerald, was published in 1925. The story is set in the Jazz Age and follows the life of millionaire Jay Gatsby and his pursuit of Daisy Buchanan."
]

# Lets get the scores
# rank() scores every (query, document) pair; top_k=3 keeps the three best and
# return_documents=True adds each document's text next to its corpus_id and score.
results_ = model.rank(query, documents, return_documents=True, top_k=3)
results_

[{'corpus_id': 0,
  'score': 0.99801946,
  'text': "'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature."},
 {'corpus_id': 2,
  'score': 0.9969399,
  'text': "Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961."},
 {'corpus_id': 5,
  'score': 0.0294786,
  'text': "'The Great Gatsby', a novel written by American author F. Scott Fitzgerald, was published in 1925. The story is set in the Jazz Age and follows the life of millionaire Jay Gatsby and his pursuit of Daisy Buchanan."}]

In [54]:
# Example query and documents with page numbers
# Each entry is a (text, page_number) tuple.
query = "Who wrote 'To Kill a Mockingbird'?"
documents = [
    ("'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature.", 10),
    ("The novel 'Moby-Dick' was written by Herman Melville and first published in 1851. It is considered a masterpiece of American literature and deals with complex themes of obsession, revenge, and the conflict between good and evil.", 20),
    ("Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961.", 30),
    ("Jane Austen was an English novelist known primarily for her six major novels, which interpret, critique and comment upon the British landed gentry at the end of the 18th century.", 40),
    ("The 'Harry Potter' series, which consists of seven fantasy novels written by British author J.K. Rowling, is among the most popular and critically acclaimed books of the modern era.", 50),
    ("'The Great Gatsby', a novel written by American author F. Scott Fitzgerald, was published in 1925. The story is set in the Jazz Age and follows the life of millionaire Jay Gatsby and his pursuit of Daisy Buchanan.", 60)
]

# Lets get the scores
# Only the text part of each tuple is handed to the reranker.
results = model.rank(query, [doc[0] for doc in documents], return_documents=True, top_k=3)

# Print the results with page numbers
for idx, doc in enumerate(results):
    print(f"Document {idx+1}:")
    # corpus_id is the position of the document in the list passed to rank(),
    # so it can be used to look the page number up in `documents`.
    print("Page Number:", documents[doc['corpus_id']][1])
    print("Score:", doc['score'])
    print("Text:", doc['text'])
    print("\n")  # Adding a new line for better readability


Document 1:
Page Number: 10
Score: 0.99801946
Text: 'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature.


Document 2:
Page Number: 30
Score: 0.9969399
Text: Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961.


Document 3:
Page Number: 60
Score: 0.0294786
Text: 'The Great Gatsby', a novel written by American author F. Scott Fitzgerald, was published in 1925. The story is set in the Jazz Age and follows the life of millionaire Jay Gatsby and his pursuit of Daisy Buchanan.




In [None]:
# Implement the re-ranking on our own semantic search results:

# Step 1: Prepare the Documents
query = "good foods for protein"
print(f"Query: '{query}'\n")
print("Results:")

related_documents_with_pages = []  # Combined list for documents and page numbers

# Loop through top_results_dot_product and extract documents with page numbers
# (element [0] of the topk result holds the scores, element [1] the row indices)
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    related_document = {"text": pages_and_chunks[idx]["sentence_chunk"],
                        "page_number": pages_and_chunks[idx]["page_number"]}
    related_documents_with_pages.append(related_document)

# Step 2: Get Scores and Re-Rank
# Only the texts go to the reranker; top_k=3 keeps the three best of the retrieved chunks.
results_reranked = model.rank(query, [doc['text'] for doc in related_documents_with_pages], return_documents=True, top_k=3)

# Print re-ranked results with page numbers
for idx, item in enumerate(results_reranked):
    print(f"Rank {idx+1}:")
    # corpus_id is the position of the text in the list passed to rank(), which matches
    # the position of its dict in related_documents_with_pages.
    print("Page Number:", related_documents_with_pages[item['corpus_id']]['page_number'])
    print("Score:", item['score'])
    print("Text:", item['text'])
    print("\n")  # Adding a new line for better readability
    

In [None]:
# Open the PDF and render the page that holds the top search result
pdf_path = "human-nutrition-text.pdf"
target_page_number = 411  # page number reported for the top result of the query above
pdf_page_offset = 41      # note: page numbers of our PDF start 41+

# The context manager closes the document even if loading or rendering the page fails.
with fitz.open(pdf_path) as doc:
    page = doc.load_page(target_page_number + pdf_page_offset)

    # Get the image of the page
    img = page.get_pixmap(dpi=300)

    # Save image (optional)
    # img.save("output_filename.png")

# Convert the pixmap to a numpy array of shape (height, width, channels)
img_array = np.frombuffer(img.samples_mv,
                          dtype=np.uint8).reshape((img.h, img.w, img.n))

# Display the image using Matplotlib
import matplotlib.pyplot as plt
plt.figure(figsize=(13, 10))
plt.imshow(img_array)
plt.title(f"Query: '{query}' | Most relevant page:")
plt.axis("off")
plt.show()

### Other Similarity Measures --------------------------------

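Cosine similarity: $\cos(\theta) = \dfrac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$, where $A \cdot B$ is the dot product of the vectors and $\lVert A \rVert$ and $\lVert B \rVert$ are the Euclidean norms (magnitudes) of vectors $A$ and $B$, respectively.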

Example: Let's consider two vectors: (A = [3, 4]) and (B = [1, 2]).

- $A \cdot B = (3 \times 1) + (4 \times 2) = 3 + 8 = 11$
- $\lVert A \rVert = \sqrt{3^2 + 4^2} = 5$
- $\lVert B \rVert = \sqrt{1^2 + 2^2} = \sqrt{5}$
- Cosine similarity: $\dfrac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert} = \dfrac{11}{5\sqrt{5}} \approx 0.984$
- Interpretation: A cosine similarity value close to 1 indicates that the vectors have a high degree of similarity in direction, meaning they point in similar directions within the vector space. In our example, vectors $A$ and $B$ are relatively similar in direction, leading to a high cosine similarity value.


In [None]:
# Other Similarity Measures:

def dot_product(vector1, vector2):
    """Return the dot product of two 1-D torch tensors as a 0-d tensor."""
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two 1-D vectors.

    Defined once for the whole cell so the NumPy example further down no longer
    silently replaces the torch implementation.

    Args:
    - vector1, vector2: either two torch tensors (computed with torch, result is a
      0-d tensor on the inputs' device) or two NumPy arrays / array-likes
      (computed with NumPy, result is a NumPy scalar).

    Returns:
    - dot(vector1, vector2) / (||vector1|| * ||vector2||).

    Note: no guard is applied for a zero-length vector; the division then yields nan.
    """
    if isinstance(vector1, torch.Tensor) and isinstance(vector2, torch.Tensor):
        dot = torch.dot(vector1, vector2)

        # Get Euclidean/L2 norm
        norm_vector1 = torch.sqrt(torch.sum(vector1**2))
        norm_vector2 = torch.sqrt(torch.sum(vector2**2))
    else:
        dot = np.dot(vector1, vector2)
        norm_vector1 = np.linalg.norm(vector1)
        norm_vector2 = np.linalg.norm(vector2)

    return dot / (norm_vector1 * norm_vector2)

# Example vectors/tensors
vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# Calculate dot product
print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))

# Cosine similarity
print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))

print("----------------------------------------------------------------")
print("Cosine similarity Implemented in Numpy")

# Example vectors (NumPy arrays take the NumPy branch of cosine_similarity)
vector_a = np.array([3, 5, 2])
vector_b = np.array([1, 4, 7])

# Calculate cosine similarity
cos_sim = cosine_similarity(vector_a, vector_b)
print(f"Cosine Similarity: {cos_sim}")


### Full Fledged Functionizing our Semantic Search Pipeline:

-- Our pipeline contains two models: the first creates the embeddings and the second re-ranks the results. The re-ranking step is optional and can be removed.

-- The functions below use these two models as default arguments, because they have already been downloaded and loaded in this notebook.

In [None]:
# Load the re-ranking model (the large-sized mxbai reranker) and the embedding model.
# Both go to `device` (the same setting used for the stored embeddings) instead of a
# hard-coded "cuda", so the cell also runs on machines without a GPU.
rerank_model = CrossEncoder("mixedbread-ai/mxbai-rerank-large-v1", device=device)
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)    # Download from hugging face.

In [None]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model_encode,
                                n_resources_to_return,
                                print_time: bool = True):
    """
    Embeds a query with a model and returns top-k scores and indices from embeddings.
    
    Args:
    - query: The input query string.
    - embeddings: Torch tensor containing embeddings of all resources (one row per resource).
    - model_encode: SentenceTransformer model for encoding queries.
    - n_resources_to_return: Number of top resources to return after searching.
      Values larger than the number of embeddings are clamped to that number.
    - print_time: Boolean flag to print time taken for computation (default: True).
      On a CUDA device the printed time may only cover the kernel launch, because
      no synchronization is done here.
    
    Returns:
    - scores: Torch tensor containing top-k scores, highest first.
    - indices: Torch tensor containing indices corresponding to top-k scores.
    """
    
    # Embed the query and put it on the same device as the stored embeddings, so the
    # dot product works even when the encoder and the embeddings live on different devices.
    query_embedding = model_encode.encode(query, convert_to_tensor=True).to(embeddings.device)

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on ({len(embeddings)} embeddings): {end_time - start_time:.5f} seconds.")

    # torch.topk raises when k exceeds the number of scores, so never ask for more than exist.
    k = min(n_resources_to_return, len(embeddings))
    scores, indices = torch.topk(input=dot_scores, k=k)
    
    return scores, indices

def print_top_results_and_scores(query: str,
                                embeddings: torch.tensor,
                                model_encode: SentenceTransformer = embedding_model,
                                model_rerank: CrossEncoder = rerank_model,
                                n_resources_to_return: int = 5,
                                n_resources_to_return_rerank: int = 5,
                                pages_and_chunks: list[dict] = pages_and_chunks):
    """
    Retrieve the passages most similar to `query`, re-rank them and print the result.

    Args:
    - query: The input query string.
    - embeddings: Torch tensor with the embeddings of all chunks.
    - model_encode: Model used to embed the query.
    - model_rerank: CrossEncoder used to re-rank the retrieved chunks.
    - n_resources_to_return: How many chunks to retrieve with the dot-product search.
    - n_resources_to_return_rerank: How many chunks to keep (and print) after re-ranking.
    - pages_and_chunks: List of dicts providing "sentence_chunk" and "page_number"
      for every row of `embeddings`.

    Returns:
    - None; the rank, page number, re-rank score and text of each kept chunk are printed.
    """
    # First stage: dot-product retrieval (only the indices are needed afterwards)
    _, top_indices = retrieve_relevant_resources(
        query=query,
        embeddings=embeddings,
        model_encode=model_encode,
        n_resources_to_return=n_resources_to_return,
    )

    # Collect text and page number of every retrieved chunk, in retrieval order
    candidates = [
        {"text": pages_and_chunks[chunk_idx]["sentence_chunk"],
         "page_number": pages_and_chunks[chunk_idx]["page_number"]}
        for chunk_idx in top_indices
    ]

    # Second stage: re-rank the candidate texts.
    # rank() returns a list of dicts with keys such as corpus_id, score and text.
    candidate_texts = [candidate["text"] for candidate in candidates]
    reranked = model_rerank.rank(query,
                                 candidate_texts,
                                 return_documents=True,
                                 top_k=n_resources_to_return_rerank)

    # Print re-ranked results with page numbers
    for position, hit in enumerate(reranked, start=1):
        # corpus_id is the position of the text in candidate_texts, i.e. in candidates
        page_number = candidates[hit['corpus_id']]['page_number']
        print(f"Rank {position}:")
        print("Page Number:", page_number)
        print("Score:", hit['score'])
        print("Text:", hit['text'])
        print("\n")  # Adding a new line for better readability


In [None]:
# Execution of Above Pipeline:
# Retrieve the 5 best dot-product matches, then keep the 3 best of them after re-ranking.
query="foods high in fiber"
print_top_results_and_scores(query=query, 
                             embeddings=embeddings,
                             model_encode=embedding_model,
                             model_rerank=rerank_model,
                             n_resources_to_return=5,
                             n_resources_to_return_rerank=3,
                             pages_and_chunks=pages_and_chunks)

### Text Generation from an LLM Based on Our Encoded Query and Context & Paragraph:

In [None]:
### Check the total memory of our local GPU so that we can pick a model that fits:
import torch

if torch.cuda.is_available():
    # total_memory is the card's full capacity, not the amount that is currently free
    gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
    gpu_memory_gb = round(gpu_memory_bytes / (2**30))
    print(f"Total GPU memory: {gpu_memory_gb} GB")
else:
    # Keep both names defined so later cells can still read them
    gpu_memory_bytes = 0
    gpu_memory_gb = 0
    print("No CUDA GPU detected.")

In [None]:
!nvidia-smi

### Loading an LLM locally

We can load an LLM locally using Hugging Face `transformers`.

To get a model running local we're going to need a few things:
1. A quantization config (optional) - a config on what precision to load the model in (e.g. 8bit, 4bit, etc)
2. A model ID - this will tell transformers which model/tokenizer to load
3. A tokenizer - this turns text into numbers ready for the LLM (note: a tokenizer is different from an embedding model)
4. An LLM model - this will be what we use to generate text based on an input!

> **Note:** There are many tips and tricks on loading/making LLMs work faster. One of the best ones is flash_attn (Flash Attention 2). See the GitHub for more: https://github.com/Dao-AILab/flash-attention 

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig
# from flash_attn import flash_attn_qkvpacked_func, flash_attn_func  # Not Supported in Our Hardware so

# 1. Create a quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # Store the model weights in 4-bit precision to shrink the GPU memory footprint
    bnb_4bit_use_double_quant=True,       # Nested quantization: the quantization constants are quantized again to save more memory
    bnb_4bit_quant_type="nf4",            # "nf4" = 4-bit NormalFloat data type (introduced with QLoRA)
    bnb_4bit_compute_dtype=torch.float16  # Perform computations using float16 (half-precision)
)

# Bonus: flash attention 2 = faster attention mechanism
# Flash Attention 2 requires a GPU with a compute capability score of 8.0+ (Ampere, Ada Lovelace, Hopper and above): https://developer.nvidia.com/cuda-gpus 
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa" # scaled dot product attention
print(f"Using attention implementation: {attn_implementation}")


# 2. Pick a model we'd like to use
# Prefer the local copy of the weights when that folder exists; otherwise fall back to the
# Hugging Face Hub id so the notebook also runs on machines without the local folder.
hub_model_id = "microsoft/phi-1_5"
local_model_dir = r"C:\Coding\LLM\phi-1_5"
model_id = local_model_dir if os.path.isdir(local_model_dir) else hub_model_id
tokenizer_id = hub_model_id

# 3. Instantiate tokenizer (tokenizer turns text into tokens)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_id, trust_remote_code=True)

# Set the `pad_token` attribute of the tokenizer to be the same as the `eos_token` (end-of-sequence token)
tokenizer.pad_token = tokenizer.eos_token

# 4. Instantiate the model 
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16,
                                                 device_map={"":0},              # place the whole model on GPU 0
                                                 quantization_config=bnb_config,
                                                 low_cpu_mem_usage=True,         # keep CPU RAM usage low while the weights are loaded
                                                 attn_implementation=attn_implementation)

# if not bnb_config:
#     llm_model.to("cuda")

In [None]:
# Model Architecture:
llm_model

In [None]:
!nvidia-smi

In [1]:
# Getting Model Params:
def get_model_num_params(model: torch.nn.Module) -> int:
    """Print and return the total number of elements over all parameters of `model`.

    Args:
    - model: Any torch.nn.Module.

    Returns:
    - The summed `numel()` of every tensor yielded by `model.parameters()`.

    NOTE(review): for quantized (e.g. 4-bit) weights numel() presumably counts the packed
    storage elements rather than the original weights -- confirm before quoting the number.
    """
    num_params = sum(param.numel() for param in model.parameters())
    print(f"No. of Parameters: {num_params}")
    return num_params
    

get_model_num_params(llm_model)
print("-------------------------------------------------------------------")

# Getting Model Size:
def get_model_mem_size(model: torch.nn.Module):
    """Print the memory occupied by a model's parameters and buffers.

    Each tensor contributes its element count times its per-element byte size.
    The total is printed as a dict holding the size in bytes, megabytes (1024**2 bytes)
    and gigabytes (1024**3 bytes), the latter two rounded to two decimals.
    Nothing is returned.
    """
    def _total_bytes(tensors):
        # Sum of (number of elements * bytes per element) over an iterable of tensors
        return sum(tensor.nelement() * tensor.element_size() for tensor in tensors)

    size_in_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())

    report = {
        "model_mem_bytes": f"{size_in_bytes} Bytes",
        "model_mem_mb": f"{round(size_in_bytes / (1024**2), 2)} MB",
        "model_mem_gb": f"{round(size_in_bytes / (1024**3), 2)} GB",
    }
    print(report)

get_model_mem_size(llm_model)

NameError: name 'torch' is not defined

### Generating The Text from Model:

In [None]:
## Our model does not support the chat format, so we pass the raw prompt text directly (the chat-template code below is kept only for reference):

# input_text = "What are the macronutrients and what are their functions in the body?"
# print(f"Input text:\n{input_text}")

# # Create prompt template for instruction-tuned model
# dialogue_template = [
#     {"role": "user",
#      "content": input_text}
# ]

# # Apply the chat template
# prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
#                                        tokenize=False,
#                                        add_generation_prompt=True)
# print(f"\nPrompt (formatted):\n{prompt}")

In [None]:
%%time

# Tokenize the prompt and move the input tensors to the device the model lives on
# (instead of a hard-coded "cuda"), so inputs and model always share a device.
inputs = tokenizer('What are the macronutrients and what are their functions in the body?', 
                   return_tensors="pt", return_attention_mask=False).to(llm_model.device)
# max_length bounds the whole sequence: prompt tokens plus newly generated tokens
outputs = llm_model.generate(**inputs, max_length=250)
# batch_decode returns one string per generated sequence; we keep the first one
text = tokenizer.batch_decode(outputs)[0]

print(f"Decoded Token by Model: \n{text}")

In [None]:
# Normal Decoding Method:
# Decode only the first generated sequence (outputs[0]) back into text.
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

In [None]:
# Loading:

import numpy as np
import pandas as pd
# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it saved to CSV)
# The surrounding "[" / "]" are stripped and the remaining text is parsed as space-separated numbers.
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert our embeddings into a torch.tensor
# np.stack turns the per-row arrays into one 2-D array (one row per chunk) before the float32 cast and the move to `device`.
embeddings = torch.tensor(np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0), dtype=torch.float32).to(device)

# Convert texts and embedding df to list of dicts
# One dict per DataFrame row, in row order, so pages_and_chunks[i] describes embeddings[i].
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")
text_chunks_and_embedding_df

In [None]:
# Load the re-ranking model (the large-sized mxbai reranker) and the embedding model
from sentence_transformers import CrossEncoder
from sentence_transformers import util, SentenceTransformer

# Both models go to `device` (defined in the loading cell above) instead of a hard-coded
# "cuda", so they end up on the same device as the stored embeddings.
rerank_model = CrossEncoder("mixedbread-ai/mxbai-rerank-large-v1", device=device)
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)    # Download from hugging face.

In [None]:
!nvidia-smi

In [None]:
# Nutrition-style questions generated with GPT4
gpt4_questions = [
    "What are the macronutrients, and what roles do they play in the human body?",
    "How do vitamins and minerals differ in their roles and importance for health?",
    "Describe the process of digestion and absorption of nutrients in the human body.",
    "What role does fibre play in digestion? Name five fibre containing foods.",
    "Explain the concept of energy balance and its importance in weight management."]

# Manually created question list
# (the last entry is a bare keyword phrase rather than a full question)
manual_questions = [
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the RDI for protein per day?",
    "water soluble vitamins"]

# All example queries in one list: the GPT4-generated ones first, then the manual ones
query_list = gpt4_questions + manual_questions

In [None]:
import random

def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model_encode=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool = True):
    """
    Embeds a query with a model and returns top-k scores and indices from embeddings.
    
    Args:
    - query: The input query string.
    - embeddings: Torch tensor containing embeddings of all resources (one row per resource).
    - model_encode: SentenceTransformer model for encoding queries
      (default: the `embedding_model` that exists when this cell is run).
    - n_resources_to_return: Number of top resources to return after searching (default: 5).
      Values larger than the number of embeddings are clamped to that number.
    - print_time: Boolean flag to print time taken for computation (default: True).
      On a CUDA device the printed time may only cover the kernel launch, because
      no synchronization is done here.
    
    Returns:
    - scores: Torch tensor containing top-k scores, highest first.
    - indices: Torch tensor containing indices corresponding to top-k scores.
    """
    
    # Embed the query and put it on the same device as the stored embeddings, so the
    # dot product works even when the encoder and the embeddings live on different devices.
    query_embedding = model_encode.encode(query, convert_to_tensor=True).to(embeddings.device)

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()
    if print_time:
        print(f"[INFO] Time taken to get scores on ({len(embeddings)} embeddings): {end_time - start_time:.5f} seconds.")

    # torch.topk raises when k exceeds the number of scores, so never ask for more than exist.
    k = min(n_resources_to_return, len(embeddings))
    scores, indices = torch.topk(input=dot_scores, k=k)
    return scores, indices

In [None]:
from timeit import default_timer as timer

def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """Build the chat-formatted RAG prompt for a query.

    The text under each item's "sentence_chunk" key is rendered as a "- "
    bullet list, inserted together with the query into a few-shot instruction
    prompt, and the result is wrapped as a single user turn and passed through
    the notebook-level `tokenizer`'s chat template.

    Args:
    - query: The user question to answer.
    - context_items: Retrieved items; each must provide a "sentence_chunk" entry.

    Returns:
    - The value returned by `tokenizer.apply_chat_template` with
      `tokenize=False` and `add_generation_prompt=True`.
    """
    # One bullet per retrieved chunk, kept in the order the items were given.
    chunk_texts = (entry["sentence_chunk"] for entry in context_items)
    context_block = "- " + "\n- ".join(chunk_texts)

    # Few-shot instruction template; {context} and {query} are filled in below.
    prompt_template = """Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    \nExample 1:
    Query: What are the fat-soluble vitamins?
    Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
    \nExample 2:
    Query: What are the causes of type 2 diabetes?
    Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
    \nExample 3:
    Query: What is the importance of hydration for physical performance?
    Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
    \nNow use the following context items to answer the user query:
    {context}
    \nRelevant passages: <extract relevant passages from the context here>
    User query: {query}
    Answer:"""
    filled_prompt = prompt_template.format(context=context_block, query=query)

    # Single-turn conversation for the instruction-tuned model.
    messages = [{"role": "user", "content": filled_prompt}]

    # tokenize=False keeps the result as text; add_generation_prompt=True asks
    # the template to append the marker that cues the model's reply.
    return tokenizer.apply_chat_template(conversation=messages,
                                         tokenize=False,
                                         add_generation_prompt=True)

In [None]:
# Pick one query at random from the combined question pool
query = random.choice(query_list)
print(f"Query: {query}")

# Score the query against every stored embedding and keep the best matches
scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

# Look up the chunk records that correspond to the returned indices
context_items = [pages_and_chunks[idx] for idx in indices]

# Combine the query and its retrieved context into the final prompt
prompt = prompt_formatter(query=query, context_items=context_items)
print(prompt)

In [None]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # from 0 to 1 and the lower the value, the more deterministic the text, the higher the value, the more creative
                             do_sample=True, # whether or not to use sampling, https://huyenchip.com/2024/01/16/sampling.html
                             max_new_tokens=256)

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])
print(f"Query: {query}")
print(f"RAG answer:\m{output_text.replace(prompt, '')}")

In [None]:
# Shell escape: run the nvidia-smi command-line tool and show its output in the notebook.
!nvidia-smi

Reference video: https://www.youtube.com/watch?v=qN_2fnOPY-M&t=14944s