# Table of Contents
* [Exploring science textbooks for parsing and annotations](#Exploring-science-textbooks-for-parsing-and-annotations)
	* [basic parameters](#basic-parameters)
	* [checking extractability](#checking-extractability)
	* [Testing pdf miner on single page](#Testing-pdf-miner-on-single-page)
		* [drawing bounding boxes over image](#drawing-bounding-boxes-over-image)
	* [Drawing multiple pages](#Drawing-multiple-pages)
* [END](#END)


In [1]:
%%capture
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict

In [192]:
%load_ext autoreload
%autoreload 2

In [2]:
from wand.image import Image as WImage
from IPython.display import display
import PIL.Image as Image
import cv2

In [221]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.converter import TextConverter

# Exploring science textbooks for parsing and annotations

## basic parameters

In [4]:
ls pdfs/ | wc -l 

In [5]:
# Capture the directory listing of pdfs/ (one entry per file) and display it.
book_list = !ls pdfs/
book_list

There are 28 books in total, including a few multi-book series:

In [6]:
# Maps a group key to the list of book filenames in that group; missing keys
# start out as an empty list.
book_breakdowns = defaultdict(list)

In [7]:
spectrum_science =  !ls pdfs/ | grep 'Spectrum Science'
book_breakdowns['spectrum_sci'] = spectrum_science
print('Spectrum Science,: ', len(spectrum_science), ' total')
# print('\n'.join(spectrum_science))

In [8]:
# Collect the "Daily Science" titles by filtering the directory listing on a
# filename substring, then record the group and report its size.
daily_science =  !ls pdfs/ | grep 'Daily Sc' 
book_breakdowns['daily_sci'] = daily_science
print('Daily Science: ', len(daily_science), ' total')
# print('\n'.join(daily_science))

In [9]:
# Collect the "Read and Understand Science" titles by filtering the directory
# listing on a filename substring, then record the group and report its size.
read_understand =  !ls pdfs/ | grep 'Read and Understand Science' 
book_breakdowns['read_und_sci'] = read_understand
print('Read and Understand Science: ', len(read_understand), ' total')
# print('\n'.join(read_understand))

In [10]:
workbooks =  !ls pdfs/ | grep -i  'workbook' 
book_breakdowns['workooks'] = workbooks
print('workbooks: ', len(workbooks), ' total')
# print('\n'.join(workbooks))

In [11]:
# Any book that did not land in one of the named series goes into 'misc'.
for book in book_list:
    # any() stops at the first series that contains the book.
    if not any(book in series for series in book_breakdowns.values()):
        book_breakdowns['misc'].append(book)

In [12]:
# Total number of books across all groups.
sum(map(len, book_breakdowns.values()))

All books are accounted for in the groupings.

In [13]:
# Print each group key followed by its titles, one per line; the extra ' '
# entry leaves a spacer line after every group.
for group in book_breakdowns:
    titles = book_breakdowns[group] + [' ']
    print(group)
    print('\n'.join(titles))

## checking extractability 

In [15]:
# Tally the is_extractable flag pdfminer reports for each book.
extractable = 0
for textbook in book_list:
    test_book_path = './pdfs/' + textbook
    # PDFs are binary files: open in 'rb' so the parser receives raw bytes
    # rather than decoded text.
    with open(test_book_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # The flag is added directly to the running total.
        extractable += document.is_extractable
extractable

all of the documents are extractable!

In [228]:
!find ./pdfs/* | xargs -L 1 pdffonts

fonts and encodings are consistent

## Testing pdf miner on single page

In [16]:
# Run pdfminer layout analysis on the sample PDF and keep one LTPage per page.
# The cell builds its own resource manager and document so it does not depend
# on variables left over from other cells.
# NOTE(review): the path is assumed to be the test_page.pdf sample used in the
# rest of this section -- confirm.
single_page_path = './test_page.pdf'
pages = []
laparams = LAParams()
rsrcmgr = PDFResourceManager()
# The file has to stay open while pages are interpreted, so the loop lives
# inside the with-block.
with open(single_page_path, 'rb') as fp:
    document = PDFDocument(PDFParser(fp))
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        pages.append(device.get_result())

In [17]:
# Sample file used for the single-page experiments.
# Alternative kept for reference: test_book = book_breakdowns['daily_sci'][3]
test_book = 'test_page.pdf'
test_book_path = ''.join(['./', test_book])
test_book

### drawing bounding boxes over image

In [85]:
# Render the sample page and outline every layout object pdfminer found on it.
page_file = 'test_page.pdf'
# make_page_layouts returns one layout per page; take the first one
# (the sample is presumably a single page -- confirm).
page_layout = make_page_layouts(page_file, None)[0]
# make_png_stream hands its argument to WImage positionally, i.e. as an image
# object, so pass a loaded wand image rather than the bare filename.
page_png_stream, y_height = make_png_stream(WImage(filename=page_file))
page_img = make_open_cv_img(page_png_stream)

# Iterate the layout container directly instead of reading its private _objs.
for box in page_layout:
    lr, ul = get_bbox_tuple(box, y_height)
    cv2.rectangle(page_img, ul, lr, color=random_color(), thickness=2)
    # Not every layout object carries text; print it only when available.
    if hasattr(box, 'get_text'):
        print(box.get_text())
# cv2 decodes images in BGR channel order; convert before handing the array to
# PIL as RGB so the page colours are not swapped.
display(Image.fromarray(cv2.cvtColor(page_img, cv2.COLOR_BGR2RGB), 'RGB'))

In [None]:
 laparams = LAParams()
    page_layouts = []
    with open(pdf_file, 'r') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            page_layouts.append(layout)
    return page_layouts

PDFMiner doesn't seem to detect figures or images.

## Drawing multiple pages

In [256]:
def make_page_layouts(pdf_file, page_range):
    """Run pdfminer layout analysis on some or all pages of ``pdf_file``.

    Parameters
    ----------
    pdf_file : str
        Path to the PDF to analyse.
    page_range : sequence of two ints, or a falsy value
        Inclusive ``(first, last)`` zero-based page indices to process. When
        falsy (``None`` or empty), every page is processed.

    Returns
    -------
    list
        One layout object per processed page, in document order.
    """
    laparams = LAParams()
    page_layouts = []
    # PDFs are binary files: open in 'rb' so the parser receives raw bytes.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_n, page in enumerate(PDFPage.create_pages(document)):
            if page_range:
                if page_n < page_range[0]:
                    continue
                # Nothing past the end of the range is wanted, so stop
                # walking the document instead of skipping page by page.
                if page_n > page_range[1]:
                    break
            interpreter.process_page(page)
            page_layouts.append(device.get_result())
    return page_layouts

In [259]:
def make_png_stream(pdf_page):
    """Convert a wand page image to PNG bytes.

    Parameters
    ----------
    pdf_page : wand image
        Passed positionally to ``WImage``, i.e. as the image to copy.

    Returns
    -------
    tuple
        ``(png_blob, y_height)``: the PNG-encoded bytes and the height of the
        rendered page in pixels. The height is what callers use to flip
        pdfminer's y coordinates.
    """
    # Context managers release the underlying ImageMagick resources as soon as
    # the blob has been produced.
    with WImage(pdf_page) as raw_pdf:
        with raw_pdf.convert('png') as page_png:
            # Read the height from the converted image itself; the previous
            # code read it from `timg_png`, a name that is not defined
            # anywhere in this notebook.
            y_height = page_png.height
            png_blob = page_png.make_blob()
    return png_blob, y_height

def make_open_cv_img(page_stream, color_flag=1):
    """Decode an encoded image byte stream into an OpenCV array.

    ``page_stream`` is copied into a uint8 array and handed to
    ``cv2.imdecode`` together with ``color_flag``; the decoded result is
    returned unchanged.
    """
    raw_bytes = bytearray(page_stream)
    encoded = np.asarray(raw_bytes, dtype=np.uint8)
    decoded = cv2.imdecode(encoded, color_flag)
    return decoded

def random_color():
    """Return a 3-tuple of random integers, each in the range 0..255."""
    import random
    return tuple(random.randint(0, 255) for _ in range(3))

def get_bbox_tuple(box, y_height):
    """Return two opposite corners of ``box.bbox`` with the y axis flipped.

    The coordinates in ``box.bbox`` are truncated to ints. Each returned
    corner is ``(x, y_height - y)``. The first corner is built from
    ``bbox[2:]`` and the second from ``bbox[:2]``.
    """
    far_corner = [int(value) for value in box.bbox[2:]]
    near_corner = [int(value) for value in box.bbox[:2]]

    flipped_far = (far_corner[0], y_height - far_corner[1])
    flipped_near = (near_corner[0], y_height - near_corner[1])
    return flipped_far, flipped_near

def display_page(raw_page_img, page_layout):
    """Display one rendered page with a rectangle around each layout object.

    Parameters
    ----------
    raw_page_img : wand image
        The page to render; forwarded to ``make_png_stream``.
    page_layout : pdfminer layout container
        Layout of the same page. Every contained object except the last one
        is outlined in a random colour.
    """
    page_png_stream, y_height = make_png_stream(raw_page_img)
    page_img = make_open_cv_img(page_png_stream)
    # Iterate the container rather than reading its private _objs list. The
    # final object is skipped, as before.
    for box in list(page_layout)[:-1]:
        lr, ul = get_bbox_tuple(box, y_height)
        cv2.rectangle(page_img, ul, lr, color=random_color(), thickness=2)
    # cv2 decodes images in BGR channel order; convert before handing the
    # array to PIL as RGB so the page colours are not swapped.
    display(Image.fromarray(cv2.cvtColor(page_img, cv2.COLOR_BGR2RGB), 'RGB'))

In [253]:
def draw_pdf_with_boxes(book_file, page_range):
    """Display pages of ``book_file`` with pdfminer's layout boxes drawn on top.

    Parameters
    ----------
    book_file : str
        Path to the PDF.
    page_range : sequence of two ints, or a falsy value
        Inclusive one-based ``(first, last)`` page numbers. When falsy, the
        whole document is drawn.
    """
    if page_range:
        # Shift to zero-based indices. A list (not map()) is used so that the
        # result can be indexed on Python 3 as well.
        page_range = [page_number - 1 for page_number in page_range]
        # The '[first-last]' filename suffix limits which pages are rendered.
        suffix = '[{}-{}]'.format(page_range[0], page_range[1])
        image_source = book_file + suffix
    else:
        image_source = book_file

    doc_page_layouts = make_page_layouts(book_file, page_range)
    # The context manager releases the rendered pages once they are displayed.
    with WImage(filename=image_source) as raw_multi_pdf:
        # Pair each rendered page with the layout at the same position.
        for page_img, page_layout in zip(raw_multi_pdf.sequence, doc_page_layouts):
            display_page(page_img, page_layout)

In [261]:
# Draw pages 50 through 55 of the DK second-grade workbook with layout boxes.
book_file = './pdfs/DK_Workbooks__Science,_Second_Grade_DK_Publishing_60p_146541729X.pdf'
draw_pdf_with_boxes(book_file, page_range=[50, 55])

# END