In [1]:
# Download PDF file
import os
import requests
import sys

# Add the root directory to Python's path
sys.path.append(os.path.abspath(".."))

directory = "../data"

# List all files in the directory
files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

print("Files in the directory:")
for file in files:
    print(file)

# Get PDF document
pdf_path = "../human-nutrition-text.pdf"
pdf_path2 = "../hitech.pdf"

if not os.path.exists(pdf_path):
  print("File doesn't exist...")
else:
  print(f"File {pdf_path} exists.")

Files in the directory:
hitech.pdf
human-nutrition-text.pdf
File doesn't exist...


In [22]:
import fitz 
from tqdm.auto import tqdm 

# text formatting 
def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", " ").strip() 

    return cleaned_text


# reading pdf
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts2 = open_and_read_pdf(pdf_path=pdf_path2)


1208it [00:02, 403.93it/s]
18it [00:00, 138.17it/s]


In [25]:
import random

print(random.sample(pages_and_texts, k=1))
print(random.sample(pages_and_texts2, k=1))

[{'page_number': 1151, 'page_char_count': 1164, 'page_word_count': 208, 'page_sentence_count_raw': 6, 'page_token_count': 291.0, 'text': 'Appendix A  Appendix A  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  This table compares the typical levels of recommended daily  nutrient intake to the United States Tolerable Upper Intake Levels  (ULs) and the United Kingdom’s Safe Upper Levels (SULs). The  Recommended Dietary Allowance (RDA) and Adequate Intake (AI)  values are considered to be levels of nutrient intake that meet or  exceed the needs of practically all healthy people. The Daily Value  amounts, that are currently used as reference values on food and  supplement labels, are similar to the RDA/AI values, but differ in  some cases. UL values are the amounts that are considered to be the  maximum safe level of intake from food and supplements combined.  SUL values are the maximum level of intake of a nutrient from  dietary supple

### Stats


In [27]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [28]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


### Further text processing (splitting pages into sentences)

In [None]:
# run this command first
!python -m spacy download en_core_web_sm

In [31]:
from spacy.lang.en import English

nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [32]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:04<00:00, 278.21it/s]


In [33]:
random.sample(pages_and_texts, k=1)

[{'page_number': 738,
  'page_char_count': 831,
  'page_word_count': 143,
  'page_sentence_count_raw': 5,
  'page_token_count': 207.75,
  'text': 'Eating smaller meals will diminish the size of your appetite  over time so you will feel satisfied with smaller amounts of  food.  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  738  |  Discovering Nutrition Facts',
  'sentences'

In [34]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


### chunking

In [35]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 202725.53it/s]


In [38]:
random.sample(pages_and_texts, k=1)

[{'page_number': 665,
  'page_char_count': 666,
  'page_word_count': 108,
  'page_sentence_count_raw': 4,
  'page_token_count': 166.5,
  'text': 'Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=382  \xa0 Iron  |  665',
  'sentences': ['Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.',
   '\xa0 These activities are  available in the web-based textb

In [39]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### Splitting each chunk into its own item


In [41]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 12260.96it/s]


1843

In [45]:
random.sample(pages_and_chunks, k=1)


[{'page_number': 265,
  'sentence_chunk': 'a diet high in fructose could potentially stimulate fat deposition and weight gain. In human studies, excessive fructose intake has sometimes been associated with weight gain, but results are inconsistent. Moderate fructose intake is not associated with weight gain at all. Moreover, other studies show that some fructose in the diet actually improves glucose metabolism especially in people with Type 2 diabetes.5 In fact, people with diabetes were once advised to use fructose as an alternative sweetener to table sugar. Overall, there is no good evidence that moderate fructose consumption contributes to weight gain and chronic disease. At this time conclusive evidence is not available on whether fructose is any worse than any other added sugar in increasing the risk for obesity, Type 2 diabetes, and cardiovascular disease. Do Low-Carbohydrate Diets Affect Health? Since the early 1990s, marketers of low-carbohydrate diets have bombarded us with th

In [46]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)


Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [47]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 27.75 | Text: view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=268 404 | Diseases Involving Proteins
Chunk token count: 3.0 | Text: Iodine | 681
Chunk token count: 15.25 | Text: Accessed November 30, 2017. Discovering Nutrition Facts | 737
Chunk token count: 18.0 | Text: Updated July 24, 2017. Accessed April 15, 2018. 1112 | Threats to Health
Chunk token count: 5.5 | Text: 894 | Late Adolescence


In [48]:
# OTPIONAL
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

### Embedding our text chunks


In [50]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [None]:
# if this shows error run the below command
# pip install --upgrade numpy

from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cpu") 

text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array


In [None]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "../combined_text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [55]:
# Import saved file and view
embeddings_df_save_path = "../combined_text_chunks_and_embeddings_df.csv"
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,file_name,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,Fundamentals of Physics Textbook ( PDFDrive ).pdf,-40,"Quadratic Formula If , then Binomial Theorem ....",429,105,107.25,[-4.10841927e-02 3.03728618e-02 -1.62778515e-...
1,Fundamentals of Physics Textbook ( PDFDrive ).pdf,-40,.(x2 1) Products of Vectors Let u be the small...,400,103,100.0,[-6.95691034e-02 3.50480117e-02 -3.15377116e-...
2,Fundamentals of Physics Textbook ( PDFDrive ).pdf,-40,"Then ! ""! ""axbx# ayby# azbz"" ab cos u Trigonom...",356,100,89.0,[-8.19482803e-02 2.84125227e-02 -2.79706139e-...
3,Fundamentals of Physics Textbook ( PDFDrive ).pdf,-40,"axbx# ayby# azbz"" ab cos u Trigonometric Ident...",418,125,104.5,[-9.70287994e-02 3.05141322e-02 -3.81805710e-...
4,Fundamentals of Physics Textbook ( PDFDrive ).pdf,-40,ay by az bz!$ jˆ !ax bx az bz!# kˆ !ax bx ay b...,349,101,87.25,[-4.39422689e-02 -1.34909861e-02 -2.32988819e-...
