In [56]:
import fitz
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [57]:
pdf_path="data/nutrition_book.pdf"
doc = fitz.open(pdf_path)

In [112]:
def text_formatter(text: str):
    cleaned_text = text.replace("\n", " ")
    return cleaned_text

def open_and_read_pdf(doc):
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        meta_text = {
            "page_number": page_number-41,  # Since actual content starts at page number 42
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text)/4, # 1 token equals nearly 4 characters
            "text": text
        }
        pages_and_texts.append(meta_text)
    return pages_and_texts


## EDA on Text

In [113]:
import pandas as pd

In [114]:
pages_and_texts=open_and_read_pdf(doc)
df = pd.DataFrame(pages_and_texts)
df

0it [00:00, ?it/s]

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,31,6,1,7.75,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.00,
2,-39,322,56,1,80.50,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,214,34,2,53.50,Human Nutrition: 2020 Edition by University of...
4,-37,799,149,3,199.75,Contents Preface University of Hawai‘i at Mā...
...,...,...,...,...,...,...
1203,1162,1677,269,18,419.25,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...
1204,1163,1618,263,10,404.50,Images / Pixabay License; “Pumpkin Cartoon Ora...
1205,1164,1716,274,13,429.00,Flashcard Images Note: Most images in the fla...
1206,1165,1734,276,13,433.50,ShareAlike 11. Organs reused “Pancreas Organ ...


In [115]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1149.01,200.51,10.52,287.25
std,348.86,560.41,95.86,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,763.0,135.0,5.0,190.75
50%,562.5,1232.5,217.0,10.0,308.12
75%,864.25,1604.5,273.0,15.0,401.12
max,1166.0,2309.0,431.0,39.0,577.25


## Spliting text into scentences

In [116]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pakistan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [117]:
for item in tqdm(pages_and_texts):
    item['sentences']=sent_tokenize(item['text'])
    # to make sure all the sentences are in string type
    item['sentences'] = [str(sentence) for sentence in item['sentences']]
    # count the sentences
    item['page_sentence_count_nltk'] = len(item['sentences'])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [118]:
random.sample(pages_and_texts,k=1)

[{'page_number': 1003,
  'page_char_count': 1308,
  'page_word_count': 222,
  'page_sentence_count_raw': 14,
  'page_token_count': 327.0,
  'text': 'Giardia lamblia is another parasite that is found in contaminated  drinking water. In addition, it lives in the intestinal tracts of animals,  and can wash into surface water and reservoirs, similar to  Cryptosporidium. Giardia causes giardiasis, with symptoms that  include abdominal cramping and diarrhea within one to three days.  Although most people recover within one to two weeks, the disease  can lead to a chronic condition, especially in people with  compromised immune systems.  The  parasite  Toxoplasma  gondii  causes  the  infection  toxoplasmosis, which is a leading cause of death attributed to  foodborne illness in the United States. More than sixty million  Americans carry Toxoplasma gondii, but very few have symptoms.  Typically, the body’s immune system keeps the parasite from  causing disease. Sources include raw or undercoo

In [119]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_nltk
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1149.01,200.51,10.52,287.25,10.29
std,348.86,560.41,95.86,6.55,140.1,6.28
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,763.0,135.0,5.0,190.75,5.0
50%,562.5,1232.5,217.0,10.0,308.12,10.0
75%,864.25,1604.5,273.0,15.0,401.12,15.0
max,1166.0,2309.0,431.0,39.0,577.25,28.0


In [120]:
df

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_nltk
0,-41,31,6,1,7.75,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1
1,-40,0,1,1,0.00,,[],0
2,-39,322,56,1,80.50,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1
3,-38,214,34,2,53.50,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1
4,-37,799,149,3,199.75,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,3
...,...,...,...,...,...,...,...,...
1203,1162,1677,269,18,419.25,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...,"[39., Exercise 10.2 & 11.3 reused “Egg Oval Fo...",18
1204,1163,1618,263,10,404.50,Images / Pixabay License; “Pumpkin Cartoon Ora...,[Images / Pixabay License; “Pumpkin Cartoon Or...,10
1205,1164,1716,274,13,429.00,Flashcard Images Note: Most images in the fla...,[Flashcard Images Note: Most images in the fl...,13
1206,1165,1734,276,13,433.50,ShareAlike 11. Organs reused “Pancreas Organ ...,"[ShareAlike 11., Organs reused “Pancreas Orga...",13


## Chunking/Grouping sentences together

In [129]:
sentence_chunk_size=10

#create a function that 
def chunk_sentences(input_list, chunk_size):
    return [input_list[i:i+chunk_size] for i in range(0,len(input_list), chunk_size)]

In [131]:
t=pages_and_texts[1044]
input_list=t['sentences']
[input_list[i:i+sentence_chunk_size] for i in range(0,len(input_list), sentence_chunk_size)]
    
# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = chunk_sentences(input_list=item["sentences"],
                                         chunk_size=sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [132]:
df=pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_nltk,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1149.01,200.51,10.52,287.25,10.29,1.52
std,348.86,560.41,95.86,6.55,140.1,6.28,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,763.0,135.0,5.0,190.75,5.0,1.0
50%,562.5,1232.5,217.0,10.0,308.12,10.0,1.0
75%,864.25,1604.5,273.0,15.0,401.12,15.0,2.0
max,1166.0,2309.0,431.0,39.0,577.25,28.0,3.0


In [136]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1840

In [137]:
df=pd.DataFrame(pages_and_chunks)
df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.00
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.50
3,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.50
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25
...,...,...,...,...,...
1835,1164,Flashcard Images Note: Most images in the flas...,1304,186,326.00
1836,1164,Hazard Analysis Critical Control Points reused...,374,51,93.50
1837,1165,ShareAlike 11. Organs reused “Pancreas Organ A...,1285,175,321.25
1838,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,410,63,102.50
