<a href="https://colab.research.google.com/github/chawbel/RAG_Project/blob/main/RAG_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create and run a RAG pipeline from scratch






## Import PDF document

In [7]:
import os
import requests

# Local filename of the textbook PDF; later cells read the file through `path`.
path = "human-nutrition-text.pdf"

# Download the PDF only when it is not already on disk, so re-running this
# cell does not fetch the file again.
if not os.path.exists(path):
  print("[INFO] file does not exist downloading....")

  # URL of the pdf
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2e22/open/download?type=pdf"

  # the local name to save the pdf
  file_name = path

  # send a GET request to the URL; the timeout stops the cell from hanging
  # forever when the server never answers (requests has no default timeout)
  response = requests.get(url, timeout=60)

  # check if the request was successful
  if response.status_code == 200:
    # open the file in binary mode and save the downloaded bytes
    with open(file_name, "wb") as file:
      file.write(response.content)
    print(f"the file has been downloaded and saved as {file_name}")
  else:
    print(f"failed to download the file, Statues code {response.status_code}")
else:
  print(f"file {path} exists")

file human-nutrition-text.pdf exists


In [8]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
  """Flatten text onto one line.

  Every newline character becomes a single space, then leading and trailing
  whitespace is stripped from the result.
  """
  return " ".join(text.split("\n")).strip()

def open_and_read_pdf(path: str, page_offset: int = 34) -> list[dict]:
  """Read a PDF page by page and collect simple text statistics.

  Args:
    path: Location of the PDF file to open.
    page_offset: Value subtracted from the zero-based PDF page index to form
      the stored "page number". The default of 34 keeps the numbering this
      notebook has always used.

  Returns:
    One dictionary per page with the keys "page number", "page_char_count",
    "page_word_count", "page_sentence_count_raw", "page_token_count" and
    "text".
  """
  pages_and_text = []
  # The context manager closes the document once every page has been read.
  with fitz.open(path) as doc:
    # enumerate() has no length, so pass total= to get a real progress bar.
    for page_number, page in tqdm(enumerate(doc), total=len(doc)):
      text = text_formatter(text=page.get_text())
      pages_and_text.append(
          {
              "page number": page_number - page_offset,
              "page_char_count": len(text),
              # split on single spaces; an empty page therefore counts as 1
              "page_word_count": len(text.split(" ")),
              # naive sentence count: number of pieces after splitting on ". "
              "page_sentence_count_raw": len(text.split(". ")),
              # character count divided by 4, used as a rough token estimate
              "page_token_count": len(text)/4,
              "text": text,
          }
      )
  return pages_and_text

# Read every page of the PDF into a list of per-page dictionaries.
pages_and_text = open_and_read_pdf(path=path)
# Show the single entry stored at list index 34.
pages_and_text[34:35]

0it [00:00, ?it/s]

[{'page number': 0,
  'page_char_count': 310,
  'page_word_count': 57,
  'page_sentence_count_raw': 3,
  'page_token_count': 77.5,
  'text': 'Ya-Yun Yang  Ya-Yun is a Graduate Assistant in the Distance Education Program  for the College of Tropical Agriculture and Human Resources. She is  currently a PhD student in Learning Design and Technology (LTEC)  at the University of Hawai‘i at Mānoa.  Christina Gar Lai Young  About the Contributors  |  xxxv'}]

In [9]:
import pandas as pd

# Tabulate the per-page dictionaries so their statistics can be inspected.
df = pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-34,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-33,0,1,1,0.0,
2,-32,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-31,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-30,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [10]:
# Summary statistics of the numeric per-page columns.
df.describe()

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,569.5,1148.004139,199.499172,10.519868,287.001035
std,348.86387,560.382275,95.830681,6.548495,140.095569
min,-34.0,0.0,1.0,1.0,0.0
25%,267.75,762.0,134.0,5.0,190.5
50%,569.5,1231.5,216.0,10.0,307.875
75%,871.25,1603.5,272.0,15.0,400.875
max,1173.0,2308.0,430.0,39.0,577.0


## Further text preprocessing (splitting pages into sentences)


1. we've done this by splitting on ". "  
2. we can also do this using an NLP library such as spaCy



In [11]:
from spacy.lang.en import English

# English language pipeline object used for sentence splitting below.
nlp = English()

# Add the "sentencizer" component so that doc.sents can be iterated.
nlp.add_pipe("sentencizer")

#create a document instance as an example
doc = nlp("this is a sentence. another sentence. i like elephants")
# sanity check: the example text should be split into three sentences
assert len(list(doc.sents)) == 3

list(doc.sents)

[this is a sentence., another sentence., i like elephants]

In [12]:
for page_record in tqdm(pages_and_text):
  # spaCy yields its own span objects; keep plain strings instead
  page_record["sentences"] = [str(span) for span in nlp(page_record["text"]).sents]

  # how many sentences spaCy found on this page
  page_record["page_sentence_count_spacy"] = len(page_record["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
# Inspect the record at list index 123, which now carries the spaCy sentence fields.
pages_and_text[123:124]

[{'page number': 89,
  'page_char_count': 200,
  'page_word_count': 34,
  'page_sentence_count_raw': 1,
  'page_token_count': 50.0,
  'text': 'The Cardiovascular System  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Figure 2.10 The Cardiovascular system  82  |  The Cardiovascular System',
  'sentences': ['The Cardiovascular System  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Figure 2.10 The Cardiovascular system  82  |  The Cardiovascular System'],
  'page_sentence_count_spacy': 1}]

In [14]:
# Rebuild the DataFrame so it includes the spaCy sentence count added to each page dict.
df = pd.DataFrame(pages_and_text)
df.describe()

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,569.5,1148.004139,199.499172,10.519868,287.001035,10.319536
std,348.86387,560.382275,95.830681,6.548495,140.095569,6.300843
min,-34.0,0.0,1.0,1.0,0.0,0.0
25%,267.75,762.0,134.0,5.0,190.5,5.0
50%,569.5,1231.5,216.0,10.0,307.875,10.0
75%,871.25,1603.5,272.0,15.0,400.875,15.0
max,1173.0,2308.0,430.0,39.0,577.0,28.0


## Chunking our sentences together

the process of splitting large pieces of text into smaller ones is often referred to as text splitting or chunking

there is no 100% correct way to do this

we'll keep it simple and split into groups of 10 sentences

why do we do this?


1. so our texts are easier to filter (smaller groups of text can be easier to inspect)
2. so our text chunks can fit into embedding model context window
3. so our contents passed to an LLM can be more specific and focused



In [15]:
# number of sentences grouped together into one chunk
num_sentence_chunk_size = 10

def split_list(input_list: list[str], slice_size: int=num_sentence_chunk_size)-> list[list[str]]:
  """Cut a list into consecutive slices of at most `slice_size` items.

  The slices keep the original order; the last one is shorter when the list
  length is not a multiple of `slice_size`. An empty list yields an empty list.
  """
  chunks = []
  for start in range(0, len(input_list), slice_size):
    chunks.append(input_list[start:start + slice_size])
  return chunks



In [16]:
# attach the sentence chunks, and how many there are, to every page record
for page_record in tqdm(pages_and_text):
  page_record["sentence_chunks"] = split_list(page_record["sentences"], num_sentence_chunk_size)
  page_record["num_chunks"] = len(page_record["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [17]:
# Inspect the record at list index 900, which now includes its sentence chunks.
pages_and_text[900:901]

[{'page number': 866,
  'page_char_count': 1628,
  'page_word_count': 247,
  'page_sentence_count_raw': 24,
  'page_token_count': 407.0,
  'text': 'should considering their length, and slightly more than 20 percent  of children ages two to five are overweight or have obesity.4  Some minority group children, such as Filipinos, Native  Hawaiians, and Other Pacific Islanders, in Hawai‘i have higher rates  of overweight and obesity. In 2012, 12.8% of Hawai‘i WIC (low- income) participants ages two to four years were overweight and  10.2% had obesity.567 One study that investigated 2000-2010 data  for children ages two to eight years in 51 communities in 11 United  States Affiliated Pacific (USAP) jurisdictions found that 14.4% of the  study population was overweight and 14% had obesity.8  4. Institute of Medicine. (2011). Early childhood obesity  prevention policies. The National Academies Press.  5. Oshiro C., Novotny R., Grove J., Hurwitz E. (2015). Race/ ethnic differences in birth size

In [18]:
# Rebuild the DataFrame so it includes num_chunks; round the summary to 2 decimals.
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,569.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-34.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,267.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,569.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,871.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1173.0,2308.0,430.0,39.0,577.0,28.0,3.0


## Splitting each chunk into its own item

we'd like to embed each chunk of sentences into its own numerical representation

that'll give us a good level of granularity

Meaning we can dive specifically into the text sample that was used in our model

In [19]:
import re

# flatten the per-page chunk lists: one dictionary per chunk
pages_and_chunks = []
for page_record in tqdm(pages_and_text):
  for sentence_chunk in page_record["sentence_chunks"]:
    # glue the sentences into one paragraph and collapse double spaces
    paragraph = "".join(sentence_chunk).replace("  "," ").strip()
    # put a space after a full stop that is directly followed by a capital letter (".A" -> ". A")
    paragraph = re.sub(r'\.([A-Z])',r'. \1', paragraph)

    pages_and_chunks.append({
        "page_number": page_record["page number"],
        "sentence_chunk": paragraph,
        "chunk_char_count": len(paragraph),
        "chunk_word_count": len(paragraph.split(" ")),
        # character count divided by 4, used as a rough token estimate
        "chunk_token_count": len(paragraph)/4,
    })

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [20]:
# Look at one flattened chunk record (list index 30).
pages_and_chunks[30:31]

[{'page_number': -3,
  'sentence_chunk': 'Cheryl Gibby Cheryl Gibby was born and raised in Hawai‘i and is a wife and mother of three. She received her BA, MS in Nutritional Sciences, and PhD in Nutrition from the University of Hawai‘i at Mānoa. She has served as an instructor for the introductory Nutrition course at the University of Hawai‘i at Mānoa, and her research interests include infant and child health, dental and bone health, mobile health interventions, school nutrition policies, and online education. xxxii | About the Contributors',
  'chunk_char_count': 504,
  'chunk_word_count': 82,
  'chunk_token_count': 126.0}]

In [21]:
# From here on `df` holds one row per chunk (the earlier frames were one row per page).
df = pd.DataFrame(pages_and_chunks)
df.describe()

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,590.381443,734.098752,112.742811,183.524688
std,347.78867,447.510661,71.236055,111.877665
min,-34.0,12.0,3.0,3.0
25%,287.5,315.0,45.0,78.75
50%,593.0,745.0,115.0,186.25
75%,897.0,1118.0,173.0,279.5
max,1173.0,1830.0,297.0,457.5


# Filter out short chunks of text
These chunks may not contain much useful information

In [22]:
# print five randomly sampled chunks whose token estimate is at most the minimum
min_token_length = 30
is_short_chunk = df["chunk_token_count"] <= min_token_length
for row in df[is_short_chunk].sample(5).iterrows():
  # iterrows() yields (index, row) pairs, hence row[1]
  print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 3.25 | Text: 814 | Infancy
Chunk token count: 16.25 | Text: Updated January 2015. Accessed December 4, 2017. Middle Age | 917
Chunk token count: 22.75 | Text: Building a protein involves three steps: transcription, translation, Defining Protein | 369
Chunk token count: 21.5 | Text: http://www.health.gov.fj/?page_id=1406. Accessed November 12, 2017. 652 | Introduction
Chunk token count: 5.5 | Text: You can Chloride | 193


In [23]:
# keep only the chunks whose token estimate is above the minimum, as a list of dicts
keep_mask = df["chunk_token_count"] > min_token_length
pages_and_chunks_over_min_token_len = df.loc[keep_mask].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -32,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -31,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

## Embedding our text chunks

 what we'd like to do:
  - turn our text chunks into numbers, specifically embeddings
  
  A useful numerical representation

  The best part about embeddings is that they are a "Learned representation"

In [24]:
import torch
from sentence_transformers import SentenceTransformer

# Use the GPU when CUDA is available and fall back to the CPU otherwise, so
# this cell also runs on a runtime without a GPU.
# NOTE: the misspelled name `emdbedding_model` is kept on purpose because the
# embedding cells that follow refer to the model by that name.
emdbedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                       device="cuda" if torch.cuda.is_available() else "cpu")

# three example sentences for a quick demonstration
sentences = ["the sentence transformer library provides an eady way to create embeddings",
             "sentences canbe embedded one by one or in a list",
             "i like horses"]

#sentences are encoded/embedded by calling model.encode()
embeddings = emdbedding_model.encode(sentences)
# pair every sentence with its embedding
embedding_dict = dict(zip(sentences,embeddings))

#See the embeddings
for sentence, embedding in embedding_dict.items():
  print(f"sentence: {sentence}")
  print(f"embedding: {embedding}")
  print(" ")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


sentence: the sentence transformer library provides an eady way to create embeddings
embedding: [-2.94748973e-02  2.78465692e-02 -2.35938057e-02  6.11584075e-02
 -1.83001217e-02 -8.82710703e-03  1.25020137e-02 -5.91714978e-02
  1.35191688e-02 -2.56932266e-02  3.28757055e-02  4.82225306e-02
 -3.45653556e-02  1.55104948e-02  4.45536487e-02 -5.18284328e-02
  4.24537100e-02  6.39464008e-03 -2.52390075e-02 -1.52969893e-04
  3.98194604e-02  2.91065816e-02  2.05166079e-02  3.93162668e-02
 -2.44629476e-02 -2.71279588e-02 -6.48483750e-04 -3.63872461e-02
  5.48749380e-02 -1.23578431e-02 -3.49787399e-02 -1.10805985e-02
  4.94070426e-02  3.27449711e-03  9.32209787e-07  2.80626211e-03
 -3.90615314e-02 -8.36904719e-03  1.37149123e-02  5.71032433e-05
  5.08406758e-02 -5.47374636e-02  2.56445743e-02  5.08615039e-02
 -4.79061939e-02 -1.47311939e-02  5.01348227e-02  1.81838963e-02
  7.85619691e-02  4.41439189e-02 -2.43289359e-02 -3.81822996e-02
 -1.73981325e-03 -1.61298122e-02 -2.22476050e-02  3.3129822

In [25]:
# Collect just the chunk text, in the same order as pages_and_chunks_over_min_token_len.
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]


In [26]:
#embed all text in batches

# batch_size=32 is the batch size passed to encode();
# convert_to_tensor=True asks for a torch tensor as the result.
text_chunk_embeddings = emdbedding_model.encode(text_chunks,
                                                batch_size=32,
                                                convert_to_tensor=True)

In [30]:
# Reuse the batched embeddings computed above instead of re-encoding every
# chunk one at a time. They are moved to CPU NumPy so each record stores a
# plain array, which is what the CSV round trip further down expects.
chunk_embedding_rows = text_chunk_embeddings.cpu().numpy()
# text_chunks was built from this same list, so rows and records line up 1:1
assert len(chunk_embedding_rows) == len(pages_and_chunks_over_min_token_len)
for item, embedding in zip(tqdm(pages_and_chunks_over_min_token_len), chunk_embedding_rows):
  item["embedding"] = embedding

  0%|          | 0/1680 [00:00<?, ?it/s]

In [27]:
# Display the batched embeddings tensor.
text_chunk_embeddings

tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0')

## Save embeddings to file

In [34]:
# Put the chunk records (text, counts and embedding) into a DataFrame and write it to CSV.
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
# index=False leaves the DataFrame index out of the file
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path,index=False)

In [40]:
#Import saved file and view
# NOTE: the embedding column is read back as text, not as arrays; it has to be parsed before use.
text_chunks_and_embeddings_df_load = pd.read_csv("text_chunks_and_embeddings_df.csv")

In [41]:
# Preview the first two rows of the reloaded frame.
text_chunks_and_embeddings_df_load.head(2)

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-32,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242675e-02 9.02281404e-02 -5.09548886e-...
1,-31,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156419e-02 5.92139773e-02 -1.66167244e-...


# RAG search and answer

RAG goal: retrieve passages relevant to a query and use those passages to augment the input to an LLM, so it can generate an output based on those relevant passages

### similarity search

Embeddings can be used for almost any type of data (eg. images,sound,text...)

comparing embeddings is known as similarity search, vector search, semantic search

in our case we want to query our nutrition textbook passages based on semantics or "vibe"

so if i search on macronutrients functions i should get relevant passages to that text but may not contain exactly the words "macronutrient functions"



In [60]:
import random
import torch
import numpy as np
import pandas as pd

# Run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

#import texts and embeddings
text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

#convert embeddings back to np array
# (the CSV stores each embedding as a bracketed, space-separated string)
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=" "))

#convert the embeddings to a torch tensor
# np.fromstring produces float64 values. Cast to float32 and move the tensor to
# `device` so it has the same dtype and device as the embeddings the model
# produces for a query; mixing float64/CPU with float32/GPU tensors fails in
# matrix products.
embeddings = torch.tensor(
    np.stack(text_chunks_and_embeddings_df["embedding"].tolist(), axis=0),
    dtype=torch.float32,
).to(device)

#convert texts and embeddings df to a list of dictionaries
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")



In [63]:
# Check the size of the stacked embeddings: one row per chunk.
embeddings.shape

torch.Size([1680, 768])

In [65]:
# create model
from sentence_transformers import util, SentenceTransformer

# Load "all-mpnet-base-v2" on the device chosen above.
# NOTE: this name is spelled `embedding_model`, while the earlier embedding
# cells use `emdbedding_model`; they are two separate variables.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                       device=device)




we want to search for a query (e.g. "macronutrient function")

we can do this with the following steps:
  

1. Define a query string
2. turn the query string into embedding
3. perform a dot product or cosine similarity function between the text embedding and the query embedding
4. sort the results from 3 in descending order


