In [1]:
import fitz
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
pdf_path="data/nutrition_book.pdf"
doc = fitz.open(pdf_path)

In [3]:
def text_formatter(text: str):
    cleaned_text = text.replace("\n", " ")
    return cleaned_text

def open_and_read_pdf(doc):
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        meta_text = {
            "page_number": page_number-41,  # Since actual content starts at page number 42
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text)/4, # 1 token equals nearly 4 characters
            "text": text
        }
        pages_and_texts.append(meta_text)
    return pages_and_texts


## EDA on Text

In [4]:
import pandas as pd

In [5]:
pages_and_texts=open_and_read_pdf(doc)
df = pd.DataFrame(pages_and_texts)
df

0it [00:00, ?it/s]

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,31,6,1,7.75,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.00,
2,-39,322,56,1,80.50,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,214,34,2,53.50,Human Nutrition: 2020 Edition by University of...
4,-37,799,147,2,199.75,Contents Preface University of Hawai‘i at Mā...
...,...,...,...,...,...,...
1203,1162,1677,253,18,419.25,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...
1204,1163,1618,255,10,404.50,Images / Pixabay License; “Pumpkin Cartoon Ora...
1205,1164,1716,262,13,429.00,Flashcard Images Note: Most images in the fla...
1206,1165,1734,269,13,433.50,ShareAlike 11. Organs reused “Pancreas Organ ...


In [115]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1149.01,200.51,10.52,287.25
std,348.86,560.41,95.86,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,763.0,135.0,5.0,190.75
50%,562.5,1232.5,217.0,10.0,308.12
75%,864.25,1604.5,273.0,15.0,401.12
max,1166.0,2309.0,431.0,39.0,577.25


## Spliting text into scentences

In [6]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pakistan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
for item in tqdm(pages_and_texts):
    item['sentences']=sent_tokenize(item['text'])
    # to make sure all the sentences are in string type
    item['sentences'] = [str(sentence) for sentence in item['sentences']]
    # count the sentences
    item['page_sentence_count_nltk'] = len(item['sentences'])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [11]:
import random
random.sample(pages_and_texts,k=1)

[{'page_number': 21,
  'page_char_count': 964,
  'page_word_count': 171,
  'page_sentence_count_raw': 8,
  'page_token_count': 241.0,
  'text': 'Image by  John Towner  on  unsplash.co m / CC0  Lifestyles and Nutrition  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  In addition to nutrition, health is affected by genetics, the  environment, life cycle, and lifestyle. One facet of lifestyle is your  dietary habits. Recall that we discussed briefly how\xa0nutrition affects  health. A greater discussion of this will follow in subsequent  chapters in this book, as there is an enormous amount of  information regarding this aspect of lifestyle. Dietary habits include  what a person eats, how much a person eats during a meal, how  frequently meals are consumed, and how often a person eats out.  Other aspects of lifestyle include physical activity level, recreational  drug use, and sleeping patterns, all of which play a role in health and  

In [8]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_nltk
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1149.6,199.89,9.97,287.4,10.28
std,348.86,560.47,95.78,6.19,140.12,6.27
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,763.75,135.0,4.0,190.94,5.0
50%,562.5,1233.5,216.0,10.0,308.38,10.0
75%,864.25,1606.25,272.25,14.0,401.56,15.0
max,1166.0,2309.0,430.0,32.0,577.25,28.0


## Chunking/Grouping sentences together

In [9]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [11]:
# Spliting each chunk into its own items
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1840

In [14]:
df=pd.DataFrame(pages_and_chunks)
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
3,-37,Contents Preface University of Hawai‘i at Māno...,766,115,191.5
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,143,235.25


In [15]:
# Remove the chunks that has token number less than 30
min_token_length = 30
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")

In [18]:
# save to drive
import json
file_path="data/pages_and_chunks.json"
with open(file_path, 'w') as file:
    json.dump(pages_and_chunks_over_min_token_len, file,indent=10)