In [None]:
import fitz # PyMuPDF library, install with pip install pymupdf --> https://github.com/pymupdf/PyMuPDF
import pandas as pd
import re
import os

def _extract_page_words(page):
    """Return the word tuples of a PyMuPDF page under either API naming scheme."""
    # PyMuPDF deprecated its camelCase names in favour of snake_case ones, so
    # prefer the snake_case spelling and fall back to the older one.
    get_text = getattr(page, "get_text", None)
    if get_text is not None:
        return get_text("words")
    return page.getTextWords()


def _page_words_frame(words, filename, page_number):
    """Build the tidy word-level dataframe for a single page."""
    frame = pd.DataFrame(
        words,
        columns=['x1', 'y1', 'x2', 'y2', 'text', 'block', 'line', 'word'],
    )

    # Shift the block/line/word counters by one so they start at 1, like `page`
    counters = ['block', 'line', 'word']
    frame[counters] = frame[counters] + 1

    frame['filename'] = filename
    frame['page'] = page_number

    # Bounding box coordinates are not needed downstream; keep the rest in order
    return frame[['filename', 'page', 'block', 'line', 'word', 'text']]


def PDF_data_extract(pdf_path, image_resolution=300):
    """
    Extract word-level text from a PDF document into a pandas dataframe.

    Note -- this is not an OCR module, instead it uses PyMuPDF to extract text
    and metadata directly from PDF documents, one row per word.

    Parameters:
      - pdf_path: path of the PDF file to read
      - image_resolution: not used by the extraction; kept so existing callers
        that pass it keep working

    Data is returned as a pandas dataframe with columns:
      - filename: name of the PDF file without directory and extension
      - page: page number (starting at 1)
      - block: block number for a given page (starting at 1)
      - line: line number within the block (starting at 1)
      - word: word number within the line (starting at 1)
      - text: the word itself

    Pages without any extractable words are reported and skipped.

    Returns None (after printing a message) when the document cannot be
    opened, cannot be processed, or contains no extractable words at all.
    """

    # Accept both "/" and "\" separators regardless of the host OS, and strip
    # the extension properly instead of assuming it is exactly 4 characters.
    basename = re.split(r'/|\\', str(pdf_path))[-1]
    filename = os.path.splitext(basename)[0]

    # Handle whether or not a document can be opened
    try:
        pdf = fitz.open(pdf_path)
    except Exception:
        print("\n   ****** Cannot open pdf ******\n")
        return None

    # Handle whether or not an opened document can be processed
    try:
        page_frames = []

        # Index-based loop so that one unreadable page does not stop the others
        for page_index in range(len(pdf)):
            page_number = page_index + 1

            try:
                words = _extract_page_words(pdf[page_index])
                if words:
                    page_frames.append(
                        _page_words_frame(words, filename, page_number)
                    )
                else:
                    print("   ****** Page " + str(page_number) + " was empty ******")
            except Exception as exc:
                # Best effort: report the real reason and carry on with the next page
                print(f"   ****** Page {page_number} could not be processed: {exc} ******")

        # pd.concat refuses an empty list; a document without words is a failure
        if not page_frames:
            print("\n   ****** Cannot process opened pdf ******\n")
            return None

        # Concatenate all page-level dataframes of PDF into a single dataframe
        pdf_df = pd.concat(page_frames, axis=0).reset_index(drop=True)

    except Exception:
        print("\n   ****** Cannot process opened pdf ******\n")
        return None

    finally:
        # Always release the file handle, on success as well as on failure
        pdf.close()

    print('Successfully extracted data from ' + basename)
    return pdf_df

In [None]:
# Directory that is scanned for input files
file_folder_path = '../data/initial_ideas/'

# Names of all entries in that directory (os.listdir gives no ordering guarantee)
filenames = os.listdir(path=file_folder_path)

# Bare expression so the notebook cell displays the listing
filenames

In [None]:
# Run the extraction on every PDF in the folder and save one CSV per document
for file in filenames:
    # Case-insensitive check so that e.g. "REPORT.PDF" is not silently skipped
    if not file.lower().endswith('.pdf'):
        continue

    print(file)
    file_path = f"{file_folder_path}{file}"
    df = PDF_data_extract(file_path, image_resolution=300)

    # PDF_data_extract returns None when it could not produce a dataframe;
    # skip that file instead of aborting the whole batch on df.to_csv
    if df is None:
        print(f"   ****** Skipping {file}: no data extracted ******")
        continue

    save_path = f"{file_folder_path}csv_out/{file}"
    # to_csv does not create missing directories, so make sure csv_out/ exists
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df.to_csv(f'{save_path}.csv', index=False)