In [28]:
import os 
import requests #download things from internet
import fitz 
from tqdm.auto import tqdm 

pdf_path="mentalhealth.pdf"

if not os.path.exists(pdf_path): 
    print(f"File {pdf_path} does not exist")
    url="https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    response=requests.get(url) 
    if response.status_code==200: 
        with open(pdf_path, "wb") as file:
            file.write(response.content)
        print (f"The file has been downloaded succesfully and saved as {pdf_path}")
    else: 
        print(f"Failed to download the file {response.status_code}")
else: 
    print("File already exists")



File already exists


In [29]:
def text_formater(text:str) -> str:
    cleaned_text= text.replace("\n", " ").strip()
    return cleaned_text 

def open_and_read_pdf(pdf_path:str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts=[]
    for page_number, page in tqdm(enumerate(doc)): 
        text= page.get_text()
        text=text_formater(text)
        pages_and_texts.append({"page_number":page_number, "text":text, "lenght_char":len(text), 
                                "page_word_count":len(text.split(" ")),
                                "page_sentence_count_raw":len(text.split(".")),
                                "page_token_count":len(text)/4})
    return pages_and_texts 

pages_and_texts=open_and_read_pdf(pdf_path)
pages_and_texts[0:2]


1208it [00:02, 407.23it/s]


[{'page_number': 0,
  'text': 'Human Nutrition: 2020 Edition',
  'lenght_char': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25},
 {'page_number': 1,
  'text': '',
  'lenght_char': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0}]

In [30]:
import pandas as pd 
df=pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,lenght_char,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,603.5,1148.0,198.3,14.18,287.0
std,348.86,560.38,95.76,9.54,140.1
min,0.0,0.0,1.0,1.0,0.0
25%,301.75,762.0,134.0,8.0,190.5
50%,603.5,1231.5,214.5,13.0,307.88
75%,905.25,1603.5,271.0,19.0,400.88
max,1207.0,2308.0,429.0,82.0,577.0


In [33]:
from spacy.lang.en import English 
nlp=English()
nlp.add_pipe("sentencizer")
doc=nlp("This is one sencence. This is another sentence.")

for element in tqdm(pages_and_texts):
    element["sentences"]=list(nlp(element["text"]).sents)
    element["sentences"] = [str(sentences) for sentences in element["sentences"]] 
    element["sentence_count"]=len(element["sentences"])

100%|██████████| 1208/1208 [00:04<00:00, 250.84it/s]


In [34]:
import random 
random.sample(pages_and_texts,2)

[{'page_number': 501,
  'text': 'The Atom  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Cells are the basic building blocks of life, but atoms are the basic  building blocks of all matter, living and nonliving. The structural  elements of an atom are protons (positively charged), neutrons (no  charge), and electrons (negatively charged). Protons and neutrons  are contained in the dense nucleus of the atom; the nucleus thus has  a positive charge. Because opposites attract, electrons are attracted  to this nucleus and move around it in the electron cloud.  Electrons contain energy, and this energy is stored within the  charge and movement of electrons and the bonds atoms make with  one another. However, this energy is not always stable, depending  on the number of electrons within an atom. Atoms are more stable  when their electrons orbit in pairs. An atom with an odd number  of electrons must have an unpaired electron. In most ca

In [35]:
df=pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,lenght_char,page_word_count,page_sentence_count_raw,page_token_count,sentence_count
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,603.5,1148.0,198.3,14.18,287.0,10.32
std,348.86,560.38,95.76,9.54,140.1,6.3
min,0.0,0.0,1.0,1.0,0.0,0.0
25%,301.75,762.0,134.0,8.0,190.5,5.0
50%,603.5,1231.5,214.5,13.0,307.88,10.0
75%,905.25,1603.5,271.0,19.0,400.88,15.0
max,1207.0,2308.0,429.0,82.0,577.0,28.0


In [36]:
num_sentences_chunk_size=10

def split_list(input_list: list[str], 
               slice_size: int=num_sentences_chunk_size) -> list[list[str]]:
 return [input_list[i:i+slice_size] for i in range(0,len(input_list),slice_size)]

for element in tqdm(pages_and_texts):
    element["chunks"]=split_list(element["sentences"])
    element["chunk_count"]=len(element["chunks"])



100%|██████████| 1208/1208 [00:00<00:00, 2817.00it/s]


In [48]:
random.sample(pages_and_texts, k=1) 

[{'page_number': 314,
  'text': 'Foods  Total  Carbohydrates  Sugars Fiber Added  Sugars  Banana  27 (1 medium)  14.40  3.1  0  Lentils  40 (1 c.)  3.50  16.0  0  Snap beans  8.7 (1 c.)  1.60  4.0  0  Green pepper  5.5 (1 medium)  2.90  2.0  0  Corn tortilla  10.7 (1)  0.20  1.5  0  Bread, wheat bran  17.2 (1 slice)  3.50  1.4  3.4  Bread, rye  15.5 (1 slice)  1.20  1.9  1.0  Bagel (plain)  53 (1 medium)  5.30  2.3  4.8  Brownie  36 (1 square)  20.50  1.2  20.0  Oatmeal cookie  22.3 (1 oz.)  12.00  2.0  7.7  Cornflakes  23 (1 c.)  1.50  0.3  1.5  Pretzels  47 (10 twists)  1.30  1.7  0  Popcorn  (homemade)  58 (100 g)  0.50  10.0  0  Skim milk  12 (1 c.)  12.00  0  0  Cream (half and  half)  0.65 (1 Tbs.)  0.02  0  0  Cream substitute  1.0 (1 tsp.)  1.00  0  1.0  Cheddar cheese  1.3 (1 slice)  0.50  0  0  Yogurt (with fruit)  32.3 (6 oz.)  32.30  0  19.4  Caesar dressing  2.8 (1 Tbs.)  2.80  0  2.4  Sources:  • National Nutrient Database for Standard Reference. US  Department of Agricul

In [49]:
df=pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,lenght_char,page_word_count,page_sentence_count_raw,page_token_count,sentence_count,chunk_count
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,603.5,1148.0,198.3,14.18,287.0,10.32,1.53
std,348.86,560.38,95.76,9.54,140.1,6.3,0.64
min,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,301.75,762.0,134.0,8.0,190.5,5.0,1.0
50%,603.5,1231.5,214.5,13.0,307.88,10.0,1.0
75%,905.25,1603.5,271.0,19.0,400.88,15.0,2.0
max,1207.0,2308.0,429.0,82.0,577.0,28.0,3.0


In [68]:
import re 
pages_and_chunks= []
for element in tqdm(pages_and_texts):
    for sentence_chunk in element["chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = element["page_number"]	
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk =re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk 
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk)/4
        pages_and_chunks.append(chunk_dict)


len(pages_and_chunks)



100%|██████████| 1208/1208 [00:00<00:00, 4440.54it/s]


1843

In [69]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 961,
  'sentence_chunk': 'the later years, the heart has to work harder because each pump is not as efficient as it used to be. Kidneys are not as effective in excreting metabolic products such as sodium, acid, and potassium, which can alter water balance and increase the risk for over- or underhydration. In addition, immune function decreases and there is lower efficiency in the absorption of vitamins and minerals. In addition, disorders of the nervous system can have profound effects. Dementia is the umbrella term for changes in the normal activity of the brain. Elderly adults who suffer from dementia may experience memory loss, agitation, and delusions. One in eight people over age sixty-four and almost half of all people over eighty- five suffer from the brain disorder Alzheimer’s disease, which is the most common form of dementia.2 Neurological disorder and psychological conditions, such as depression, can influence attitudes toward food, along with the ability to

In [70]:
df=pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,624.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,0.0,12.0,3.0,3.0
25%,321.5,315.0,44.0,78.75
50%,627.0,746.0,114.0,186.5
75%,931.0,1118.5,173.0,279.62
max,1207.0,1831.0,297.0,457.75
