# Local RAG pipeline

In [18]:
import torch
from datetime import datetime

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [20]:
def showTime():
    return str("["+datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')+" UTC]")

In [21]:
# download pdf
import os
import requests
pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print(f"{showTime()}[INFO] File does not exist, downloading...")
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    filename = pdf_path

    # send request
    response = requests.get(url)

    if response.status_code == 200:
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"{showTime()}[INFO] File downloaded and saved as {filename}")
    else:
        print(f"{showTime()}[INFO] Failed to download. {response.status_code}")
else:
    print(f"{showTime()}[INFO] File exists.")    

[2024-10-04 20:15:52.988538 UTC][INFO] File does not exist, downloading...
[2024-10-04 20:16:18.601700 UTC][INFO] File downloaded and saved as human-nutrition-text.pdf


In [31]:
# open pdf
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Performs minor forating on text"""
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number-41, # page number where real book content starts
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text)/4, # 1 token = 4 characters
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:3]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -39,
  'page_char_count': 320,
  'page_word_count': 54,
  'page_sentence_count_raw': 1,
  'page_token_count': 80.0,
  'text': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE'}]

In [32]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 472,
  'page_char_count': 641,
  'page_word_count': 105,
  'page_sentence_count_raw': 5,
  'page_token_count': 160.25,
  'text': 'Photo by  Hope House  Press on  unsplash.co m / CC0  https://unspl ash.com/ photos/ PJzc7LOt2Ig  Weight Management  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  “Obesogenic” is a word that has sprung up in the language of public  health professionals in the last two decades. The Centers for  Disease Control and Prevention (CDC) defines obesogenic as “an  environment that promotes increased food intake, non-healthful  foods, and physical inactivity.”1  1. Obesogenic Environments. Center for Disease Control  and Prevention (CDC). https://www.cdc.gov/pcd/ 472  |  Weight Management'},
 {'page_number': 780,
  'page_char_count': 1155,
  'page_word_count': 202,
  'page_sentence_count_raw': 12,
  'page_token_count': 288.75,
  'text': 'Learning Objectives  By the end of this chapter you will be