# Local RAG pipeline

In [18]:
import torch
from datetime import datetime

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [20]:
def showTime():
    return str("["+datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')+" UTC]")

In [21]:
# download pdf
import os
import requests
pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print(f"{showTime()}[INFO] File does not exist, downloading...")
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    filename = pdf_path

    # send request
    response = requests.get(url)

    if response.status_code == 200:
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"{showTime()}[INFO] File downloaded and saved as {filename}")
    else:
        print(f"{showTime()}[INFO] Failed to download. {response.status_code}")
else:
    print(f"{showTime()}[INFO] File exists.")    

[2024-10-04 20:15:52.988538 UTC][INFO] File does not exist, downloading...
[2024-10-04 20:16:18.601700 UTC][INFO] File downloaded and saved as human-nutrition-text.pdf


In [31]:
# open pdf
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Performs minor forating on text"""
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number-41, # page number where real book content starts
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text)/4, # 1 token = 4 characters
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:3]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -39,
  'page_char_count': 320,
  'page_word_count': 54,
  'page_sentence_count_raw': 1,
  'page_token_count': 80.0,
  'text': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE'}]

In [32]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 472,
  'page_char_count': 641,
  'page_word_count': 105,
  'page_sentence_count_raw': 5,
  'page_token_count': 160.25,
  'text': 'Photo by  Hope House  Press on  unsplash.co m / CC0  https://unspl ash.com/ photos/ PJzc7LOt2Ig  Weight Management  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  “Obesogenic” is a word that has sprung up in the language of public  health professionals in the last two decades. The Centers for  Disease Control and Prevention (CDC) defines obesogenic as “an  environment that promotes increased food intake, non-healthful  foods, and physical inactivity.”1  1. Obesogenic Environments. Center for Disease Control  and Prevention (CDC). https://www.cdc.gov/pcd/ 472  |  Weight Management'},
 {'page_number': 780,
  'page_char_count': 1155,
  'page_word_count': 202,
  'page_sentence_count_raw': 12,
  'page_token_count': 288.75,
  'text': 'Learning Objectives  By the end of this chapter you will be

In [33]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [34]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


In [41]:
from spacy.lang.en import English

nlp = English()
# https://spacy.io/api/sentencizer
nlp.add_pipe("sentencizer")

# create example document instance
doc = nlp("This is sentence. This id second sentence. I like trains.")
assert len(list(doc.sents)) == 3

list(doc.sents)


[This is sentence., This id second sentence., I like trains.]

In [43]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # count sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [47]:
random.sample(pages_and_texts, k=1)

[{'page_number': 873,
  'page_char_count': 1677,
  'page_word_count': 291,
  'page_sentence_count_raw': 17,
  'page_token_count': 419.25,
  'text': 'children should be provided nutrient-dense food at meal- and  snack-time. However, it is important not to overfeed children, as  this can lead to childhood obesity, which is discussed in the next  section. Parents and other caregivers can turn to the MyPlate  website for guidance: http://www.choosemyplate.gov/.  Macronutrients  For carbohydrates, the Acceptable Macronutrient Distribution  Range (AMDR) is 45–65 percent of daily calories (which is a  recommended daily allowance of 135–195 grams for 1,200 daily  calories). Carbohydrates high in fiber should make up the bulk of  intake. The AMDR for protein is 10–30 percent of daily calories  (30–90 grams for 1,200 daily calories). Children have a high need for  protein to support muscle growth and development. High levels of  essential fatty acids are needed to support growth (although not as

In [50]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


In [52]:
# chunk sentences together

num_sentence_chunk_size = 10

def split_list(input_list: list[str], slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [53]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [54]:
random.sample(pages_and_texts, k=1)

[{'page_number': 769,
  'page_char_count': 1918,
  'page_word_count': 309,
  'page_sentence_count_raw': 12,
  'page_token_count': 479.5,
  'text': 'collaboration with FAO, continually reviews new research and  information  from  around  the  world  on  human  nutrient  requirements and recommended nutrient intakes. This is a vast  and never-ending task, given the large number of essential human  nutrients. These nutrients include protein, energy, carbohydrates,  fats and lipids, a range of vitamins, and a host of minerals and trace  elements.  Many countries rely on WHO and FAO to establish and  disseminate this information, which they adopt as part of their  national dietary allowances. Others use it as a base for their  standards. The establishment of human nutrient requirements is the  common foundation for all countries to develop food-based dietary  guidelines for their populations.  Establishing requirements means that the public health and  clinical significance of intake levels

In [55]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


In [58]:
# spliting each chunk into each own item

import re

pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        # join list of sentences into paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        # add " " before start of sentece and after .
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk)/4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)
        
        

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [60]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 522,
  'sentence_chunk': 'are packaged into the lipid-containing chylomicrons inside small intestine mucosal cells and then transported to the liver. In the liver, carotenoids are repackaged into lipoproteins, which transport them to cells. The retinoids are aptly named as their most notable function is in the retina of the eye where they aid in vision, particularly in seeing under low-light conditions. This is why night blindness is the most definitive sign of vitamin A deficiency. Vitamin A has several important functions in the body, including maintaining vision and a healthy immune system. Many of vitamin A’s functions in the body are similar to the functions of hormones (for example, vitamin A can interact with DNA, causing a change in protein function). Vitamin A assists in maintaining healthy skin and the linings and coverings of tissues; it also regulates growth and development. As an antioxidant, vitamin A protects cellular membranes, helps in maintaining glut

In [61]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.1,112.74,183.52
std,347.79,447.51,71.24,111.88
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,745.0,115.0,186.25
75%,890.0,1118.0,173.0,279.5
max,1166.0,1830.0,297.0,457.5


In [62]:
# filter chunks of text for short chunks
min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f"Chunk token count: {row[1]['chunk_token_count']} | Text: {row[1]['sentence_chunk']}")

Chunk token count: 16.5 | Text: Table 4.6 Sweeteners Carbohydrates and Personal Diet Choices | 281
Chunk token count: 9.0 | Text: 1088 | Nutrition, Health and Disease
Chunk token count: 26.25 | Text: Snowdon W, Osborn T. (2003). Coconut: It’s role in health. Secretariat of the Pacific. 292 | Introduction
Chunk token count: 25.25 | Text: PART XV CHAPTER 15. LIFESPAN NUTRITION IN ADULTHOOD Chapter 15. Lifespan Nutrition in Adulthood | 901
Chunk token count: 6.5 | Text: Fat-Soluble Vitamins | 537


In [64]:
# filter dataframe for rows with under 30 tokens
ages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
ages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [67]:
random.sample(ages_and_chunks_over_min_token_len, k=2)

[{'page_number': 480,
  'sentence_chunk': 'The sum of caloric expenditure is referred to as total energy expenditure (TEE). Basal metabolism refers to those metabolic pathways necessary to support and maintain the body’s basic functions (e.g. breathing, heartbeat, liver and kidney function) while at rest. The basal metabolic rate (BMR) is the amount of energy required by the body to conduct its basic functions over a certain time period. The great majority of energy expended (between 50 and 70 percent) daily is from conducting life’s basic processes. Of all the organs, the liver requires the most energy 480 | Weight Management',
  'chunk_char_count': 591,
  'chunk_word_count': 95,
  'chunk_token_count': 147.75},
 {'page_number': 1006,
  'sentence_chunk': 'Amanita Muscaria by Onder Wijsgek / CC BY 3.0 Poisonous Mushrooms Like molds, mushrooms are fungi and the poisonous kind produces mycotoxins that can cause food intoxication. Toxic mushrooms, also known as toadstools, can cause severe