https://towardsdatascience.com/extracting-text-from-pdf-files-with-python-a-comprehensive-guide-9fc4003d517

In [6]:

# to read pdf
import PyPDF2

# to analyse pdf layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure

# extract tables
import pdfplumber

# extract images
from PIL import Image
from pdf2image import convert_from_path

# perform OCR to extract text from images
import pytesseract 

# standard imports
import os

#### Approach

* Use PDFMinder to split document into multiple page objects
* PDF files inherently lack structured information such as paragraphs, sentences or words
* They understand only individual characters and their position on the page

##### How does PDFMinder work?

* Reconstructs contents of hte page into its individual characters along with their positions in the file
* Compares distances of the characters from others it composes the appropriate words, sentences and paragraphs
* Converts each page into and LTPage object
* For each page it tries to identify the component as 
    * LTFigure - embedded image
    * LTText Container - block of text which is split into LTextLine and then into LTChar objects
    * LTRect objects - 2D recatngle used to frame images, figures, tables into an LTPage object.
* Based on reconstruction we apply the appropriate function.

In [7]:
# function to extract text from PDF document using PyPDF2
def text_extraction(element):
    # extract text from in-line text elements
    line_text = element.get_text()
    
    # find the format of the text
    # initialize a list with all formats that appeared in the line of text
    line_formats = []
    # iterate through all elements
    for text_line in element:
        # if the element is a text container
        if isinstance(text_line, LTTextContainer):
            # iterate through each character in the line of text
            for character in text_line:
                if isinstance(character, LTChar):
                    # append font name of character
                    line_formats.append(character.fontname)
                    # append font size of character
                    line_formats.append(character.size)
    
    # find unique font sizes and names in the line
    format_per_line = list(set(line_formats))
    
    # return a tuple with text in each line along with its format
    return (line_text, format_per_line)
    

In [8]:
# define a function to crop image elements from pdf
def crop_image(element, pageObj):
    # get the coordinates of the image
    [image_left, image_top, image_right, image_bottom] = [element.xo, element.yo, element.x1, element.y1]
    
    # crop the page using cooridnates
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)
    
    # save cropped image to a new PDF
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    
    # save cropped pdf to a new file
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)

In [9]:
# function to convert PDF to image
def convert_to_images(input_file):
    images = convert_from_path(input_file)
    image = images[0]
    output_file = "PDF_image.png"
    image.save(output_file, "PNG")

In [10]:
# create a function to read text from images
def image_to_text(image_path):
    img = Image.open(image_path)
    text = pytesseract.image_to_string(img)
    return text

In [11]:
# create a function to extract tables from image
def extract_table(pdf_path, page_num, table_num):
    # open the pdf file
    pdf = pdfplumber.open(pdf_path)
    
    # find the examined page
    table_page = pdf.pages[page_num]
    
    # extract the table
    table = table_page.extract_tables()[table_num]
    
    # return the table
    return table
    

In [12]:
# write a function to convert teh table into the appropriate format
def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaker from the wrapped texts
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the table into a string 
        table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
    # Removing the last line break
    table_string = table_string[:-1]
    return table_string

In [14]:
# load pdf file and create a pdf object
pdf_path = '../data/Example PDF.pdf'
pdfFileObj = open(pdf_path, 'rb')

# create a pdf reader object
pdfReaded = PyPDF2.PdfReader(pdfFileObj)

# create dictionary to extract text from each image
text_per_page = {}

In [None]:
# extract pages from the PDF
for pagenum, page in enumerate(extract_pages(pdf_path)):
    # initialize the variables needed for text extraction from the page
    pageObj = pdfReaded.pages[pagenum]
    page_text = []
    line_format = []
    text_from_images = []
    text_from_tables = []
    page_content = []
    
    # initialize number of examined tables
    table_num = 0
    first_element = True
    table_extraction_flag = False
    
    # open the pdf file
    pdf = pdfplumber.open(pdf_path)
    
    # find the examined page
    page_tables = pdf.pages[pagenum]
    # find number of tables
    tables = page_tables.find_tables()
    
    # find all elements
    page_elements = [(element.y1, element) for element in page._objs]
    # sort all elments as they appear
    page_elements.sort(key=lambda x: x[0], reverse=True)
    
    # find the elements that composed a page
    for i, component in enumerate(page_elements):
        # extract the position of top side of element in pdf
        pos = component[0]
        # extract the element of the page layout
        element = component[1]
        # check if element is a text element
        if isinstance(element, LTTextContainer):
            # check if text appeared in table
            if table_extraction_flag == False:
                # use the function to extract text and format for each text
                (line_text, format_per_line) = text_extraction(element)
                # append text of each line to the page text
                page_text.append(line_text)
                # append format of each line to the page format
                line_format.append(format_per_line)
                page_content.append(line_text)
            else:
                # omit the text and move on
                pass
        
        # check for image

In [1]:
# overall function within which to apply all other functions
for pagenum, page in enumerate(extract_pages(pdf_path)):
    # iterate the elements that composed a page
    for element in page:
        # check if element is a text element
        if isinstance(element, LTTextContainer):
            # function to extract text from text block
            pass
            # function to extract text format
            pass
        if isinstance(element, LTFigure):
            # function to convert PDF to Image
            pass
            # function to extract text with OCR
            pass
        # check the elements for tables
        if isinstance(element, LTRect):
            # function to extract tables
            pass
            # function to convert table content into a string

NameError: name 'extract_pages' is not defined

In [18]:
x = [4,6,2,4,76]
y = x.sort(key=lambda a: a[0], reverse=True)
print(y)

TypeError: 'int' object is not subscriptable