# Table of Contents
* [Exploring science textbooks for parsing and annotations](#Exploring-science-textbooks-for-parsing-and-annotations)
	* [basic parameters](#basic-parameters)
	* [checking extractability](#checking-extractability)
	* [Testing pdf miner on single page](#Testing-pdf-miner-on-single-page)
		* [drawing bounding boxes over image](#drawing-bounding-boxes-over-image)
* [END](#END)


In [1]:
%%capture
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict

In [2]:
from wand.image import Image as WImage
from IPython.display import display
import PIL.Image as Image
import cv2

In [3]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# Exploring science textbooks for parsing and annotations

## basic parameters

In [4]:
ls pdfs/ | wc -l 

In [5]:
# Capture the output of `ls pdfs/` as a list, one entry per output line.
# Later cells treat each entry as a book file name relative to ./pdfs/.
book_list = !ls pdfs/
book_list

There are 28 books in total, including a few series:

In [6]:
# Maps a group name to the list of book file names in that group.
# defaultdict(list) lets a later cell append to a group (e.g. 'misc')
# without creating it first.
book_breakdowns = defaultdict(list)

In [7]:
spectrum_science =  !ls pdfs/ | grep 'Spectrum Science'
book_breakdowns['spectrum_sci'] = spectrum_science
print('Spectrum Science,: ', len(spectrum_science), ' total')
# print('\n'.join(spectrum_science))

In [8]:
# Books in the "Daily Science" series: file names under pdfs/ that match
# the grep pattern 'Daily Sc'.
daily_science =  !ls pdfs/ | grep 'Daily Sc' 
book_breakdowns['daily_sci'] = daily_science
print('Daily Science: ', len(daily_science), ' total')
# Uncomment to list the matched titles:
# print('\n'.join(daily_science))

In [9]:
# Books in the "Read and Understand Science" series: file names under pdfs/
# that match that exact phrase.
read_understand =  !ls pdfs/ | grep 'Read and Understand Science' 
book_breakdowns['read_und_sci'] = read_understand
print('Read and Understand Science: ', len(read_understand), ' total')
# Uncomment to list the matched titles:
# print('\n'.join(read_understand))

In [10]:
workbooks =  !ls pdfs/ | grep -i  'workbook' 
book_breakdowns['workooks'] = workbooks
print('workbooks: ', len(workbooks), ' total')
# print('\n'.join(workbooks))

In [11]:
# Any book that does not appear in one of the existing groups goes to 'misc'.
for book in book_list:
    # any() states the intent (membership in at least one group) directly and
    # stops at the first hit, instead of summing a fully built list of booleans.
    already_grouped = any(book in series for series in book_breakdowns.values())
    if not already_grouped:
        book_breakdowns['misc'].append(book)

In [12]:
# Total number of books across all groups (should equal the size of book_list).
sum(map(len, book_breakdowns.values()))

All books are accounted for in the groupings.

In [13]:
# Print each group name followed by its titles, with a blank-looking
# separator line (a single space) after every group.
for series_name, titles in book_breakdowns.items():
    print(series_name)
    listing = '\n'.join(titles + [' '])
    print(listing)
    

## checking extractability 

In [15]:
# Count how many of the books allow text extraction.
extractable = 0
for textbook in book_list:
    test_book_path = './pdfs/' + textbook
    # PDFs are binary files: open them with 'rb'. Text mode ('r') decodes or
    # translates bytes and is not what PDFParser expects.
    with open(test_book_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # is_extractable is a boolean flag; adding it counts the True values.
        extractable += document.is_extractable
extractable

All of the documents are extractable!

## Testing pdf miner on single page

In [16]:
# Collect the analysed layout (one LTPage per page) of a whole document.
#
# This cell used to rely on `rsrcmgr` (only created in a later cell) and on a
# `document` whose file had already been closed by the extractability loop,
# so it could not run on a fresh kernel. It now builds everything it needs.
# NOTE(review): the last book of book_list is used because that is the
# document the previous loop left behind -- confirm this is the intended input.
pages = []
laparams = LAParams()
with open('./pdfs/' + book_list[-1], 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Pages are processed while the file is still open.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        pages.append(device.get_result())

In [17]:
# Single-page sample PDF used for the pdfminer experiments below.
# (Alternative input: book_breakdowns['daily_sci'][3].)
test_book = 'test_page.pdf'
test_book_path = './{}'.format(test_book)
test_book

In [18]:
# Open a PDF document and push every page through a plain PDFDevice.
# This is a smoke test that the pages can be interpreted at all.
# The file is opened in binary mode ('rb') because PDFs are binary data.
with open(test_book, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)

# Get the outlines of the document.
# outlines = document.get_outlines()
# for (level,title,dest,a,se) in outlines:
#     print (level, title)

In [19]:
# Run layout analysis on the test PDF.
laparams = LAParams()
# Create a PDF page aggregator object.
# The file is opened in binary mode ('rb') because PDFs are binary data.
with open(test_book, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # `layout` is overwritten on every iteration, so after the loop it
        # holds the result for the last page only.
        layout = device.get_result()

In [20]:
# Load the test PDF through Wand and show it as the cell output.
img = WImage(filename=test_book)
img

### drawing bounding boxes over image

In [21]:
import random
def random_color():
    """Return a 3-tuple of random integers, each drawn from 0..255 inclusive."""
    return tuple(random.randint(0, 255) for _ in range(3))

def get_bbox_tuple(box, y_height):
    """Return two opposite corners of ``box.bbox`` with the y axis flipped.

    ``box.bbox`` is read as ``(x0, y0, x1, y1)``. Each value is truncated to
    an int, and each y is replaced by ``y_height - y``.

    Returns:
        ``((x1, y_height - y1), (x0, y_height - y0))`` -- the corner built
        from the last two bbox values first, then the corner built from the
        first two.
    """
    bounds = box.bbox
    far_x, far_y = int(bounds[2]), int(bounds[3])
    near_x, near_y = int(bounds[0]), int(bounds[1])

    far_corner = (far_x, y_height - far_y)
    near_corner = (near_x, y_height - near_y)
    return far_corner, near_corner

In [22]:
# Draw every layout object's bounding box over the rendered page image and
# print the text of the objects that carry text.
timg_png = cv2.imread('./test.png')
if timg_png is None:
    # cv2.imread returns None for a missing or unreadable file rather than
    # raising; fail here with a clear message instead of an AttributeError
    # on `.shape` below.
    raise IOError('could not read image: ./test.png')
y_height, x_width, n_col = timg_png.shape
# The layout object is iterable, so its children are walked directly rather
# than through the private `_objs` attribute.
for box in layout:
    lr, ul = get_bbox_tuple(box, y_height)
    cv2.rectangle(timg_png, ul, lr, color=random_color(), thickness=2)
    # Only some layout objects expose get_text(). Checking for the method
    # avoids swallowing unrelated AttributeErrors raised inside it.
    if hasattr(box, 'get_text'):
        print(box.get_text())
# OpenCV loads images as BGR while PIL interprets 3-channel arrays as RGB;
# convert first so the page is not shown with red and blue swapped.
display(Image.fromarray(cv2.cvtColor(timg_png, cv2.COLOR_BGR2RGB)))

# END