In [1]:
#Explore how to extract Text from PDF (binary) and PDF(images) OCR
import PyPDF2

#import for timing file conversion and text extraction
import time

import pandas as pd

#Imports for Image OCR - Extraction
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os

In [2]:
#get files and convert them to images

def pdf_convert_to_images(file_name):
    #get the pdf file and find out how many pages exists
    pages = convert_from_path(file_name, 500)

    image_counter = 1
    
    #for each page of the file, convert it to JPEG image file
    for page in pages: 
        
        #store the file as "page_1.jpg, page_2.jpg" etc.
        file_name = "page_"+str(image_counter)+".jpg"
        print("file created:"+file_name)
        
        #convert the image as jpeg file
        page.save(file_name, "JPEG")
        
        image_counter+=1
        
    #return the total number of files created    
    return image_counter

def image_convert_to_text(file_limit, out_file_name):
    
    #open a file and save the output text from the converted image as a "Text File"
    #navigate to OUPUT folder and save the text file there
    print("Output file name: "+out_file_name)
    f = open(out_file_name, "a")
    
    #iterate through each image file and converrt it to text, delete the jpeg after using it
    for page_num in range (0, file_limit-1):
        
        #generate the image file name
        file_name="page_"+str(page_num+1)+".jpg"
        print("File deleted:", file_name)
        
        #Example of adding any additional options.
        custom_oem_psm_config = r'--oem 3 --psm 3'
        #main magic line, this uses Google OCR to convert image into text
        text = str(((pytesseract.image_to_string(Image.open(file_name), config=custom_oem_psm_config)))) 
        
        #delete JPEG files after done extracting
        os.remove(file_name) 
    
        #remove partial sentence endinga with spaces
        #example:   hello what's happen-
        #-ing New York?
        #It will be changed to " hello what's happening in New York"
        text = text.replace('-\n', '')
        text = text.replace('\r', '')
    
        #print(text)
        #write the extracted text to text file
        f.write(text)
        
    f.close()
        

In [3]:
#different types of files for testing
# 'recalls.pdf' = PPT file converted to PDF
# 'ENCR1607.pdf' = Email Copy
# 'ENCR2080 Write Up.pdf' = Text converted to PDF

#NCR File Samples
# '0000000000000000000010069 corrected complaint.pdf' (4 Pages)
# '0000000000000000000010477 HVT-COMP-0000048.pdf' --- Actual Complaint File (15 pages)

#Get the proper file path
path = os.getcwd()
file_path = os.path.dirname(path)
print("Current directory: ", file_path)

#get current file path and navigate to ouput folder
out_path = os.path.join(file_path, "Output/Maximo/")
print("Current Output directory: ", out_path)


#single file testing
#file_name = "0000000000000000000010069 corrected complaint.pdf"

#batch process - provide folder name

pdf_folder_name = "PDFs/Maximo/"

path = os.path.join(os.path.dirname(os.getcwd()), pdf_folder_name)
print("New file path: ", path)


file_name_list = []
pdf_file_size_list = []
file_conversion_time_list = []
converted_txt_file_size_list = []

Current directory:  /Users/dechen/Notebooks/Python PDF Testing
Current Output directory:  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/
New file path:  /Users/dechen/Notebooks/Python PDF Testing/PDFs/Maximo/


In [4]:

#start bath process, walkthrough  folder and find each "PDF" file

for dirName, subdirList, fileList in os.walk(path):
    print("Found Directory: %s" % dirName)
    for file_name in fileList:
        
        print("\t%s"% file_name)
        #keep track of file names
        file_name_list.append(file_name)
        
        #ignore non-PDF files
        if(os.path.splitext(file_name)[1] == ".pdf"):

            out_file_name = file_name[:-4]+".txt"

            file_name = os.path.join(file_path, pdf_folder_name+file_name)
            print("PDF File size: ", os.path.getsize(file_name))
            pdf_file_size_list.append(os.path.getsize(file_name))
            #print(out_file_name)
            #print(file_name)
            
            
            #start the clock
            start_time = time.time()
            
            file_limit = pdf_convert_to_images(file_name);
    
            #find the path of the output folder and create a file based on original PDF name
            #PDF name = '0001.pdf', then text name is '0001.txt'
            outfile = os.path.join(out_path,out_file_name)
            print("Outfile is ", outfile)

            #convert the JPGs into single Text file
            image_convert_to_text(file_limit, outfile)
            
            print("File size is:", os.path.getsize(outfile))
            converted_txt_file_size_list.append(os.path.getsize(outfile))
           
            #stop the clock, print time elapsed
            end_time = time.time() - start_time 
            print('Time elapsed: ', end_time)
            file_conversion_time_list.append(end_time)
            

#batch process - ends  
file = [file_name_list, pdf_file_size_list, file_conversion_time_list, converted_txt_file_size_list]
df = pd.DataFrame(file).T
df.columns = ['File Name', 'PDF Size', 'Conversion Time', 'Text File Size']
df.to_csv(out_path+"/maximo_export.csv", index=False)

Found Directory: /Users/dechen/Notebooks/Python PDF Testing/PDFs/Maximo/
	WO120193.pdf
PDF File size:  13036
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO120193.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO120193.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File size is: 1512
Time elapsed:  9.291407108306885
	WO124691.pdf
PDF File size:  13523
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO124691.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO124691.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File size is: 1810
Time elapsed:  9.400281190872192
	WO135763.pdf
PDF File size:  13445
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO135763.txt
Output file name: /Users/dechen/Notebooks/Python 

File deleted: page_2.jpg
File deleted: page_3.jpg
File deleted: page_4.jpg
File deleted: page_5.jpg
File size is: 6353
Time elapsed:  27.823811054229736
	WO112683.pdf
PDF File size:  18580
file created:page_1.jpg
file created:page_2.jpg
file created:page_3.jpg
file created:page_4.jpg
file created:page_5.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO112683.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO112683.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File deleted: page_3.jpg
File deleted: page_4.jpg
File deleted: page_5.jpg
File size is: 6215
Time elapsed:  26.052393913269043
	WO117704.pdf
PDF File size:  13787
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO117704.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO117704.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File size is: 2055
Time elapsed:  11.34

file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO126135.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO126135.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File size is: 1595
Time elapsed:  10.823862075805664
	WO151430.pdf
PDF File size:  15701
file created:page_1.jpg
file created:page_2.jpg
file created:page_3.jpg
file created:page_4.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO151430.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO151430.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File deleted: page_3.jpg
File deleted: page_4.jpg
File size is: 3827
Time elapsed:  21.898619890213013
	WO103675.pdf
PDF File size:  15645
file created:page_1.jpg
file created:page_2.jpg
file created:page_3.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO103675.txt
Output file name: /Users/dechen/No

file created:page_1.jpg
file created:page_2.jpg
file created:page_3.jpg
file created:page_4.jpg
file created:page_5.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO133798.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO133798.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File deleted: page_3.jpg
File deleted: page_4.jpg
File deleted: page_5.jpg
File size is: 5029
Time elapsed:  31.539769172668457
	WO131616.pdf
PDF File size:  12954
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO131616.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO131616.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File size is: 1576
Time elapsed:  10.42460584640503
	WO100568.pdf
PDF File size:  13090
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO100568.txt
Output fil

File deleted: page_4.jpg
File deleted: page_5.jpg
File deleted: page_6.jpg
File size is: 5522
Time elapsed:  32.549736976623535
	WO144536.pdf
PDF File size:  15617
file created:page_1.jpg
file created:page_2.jpg
file created:page_3.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO144536.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO144536.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File deleted: page_3.jpg
File size is: 3506
Time elapsed:  17.987778902053833
	WO149210.pdf
PDF File size:  18242
file created:page_1.jpg
file created:page_2.jpg
file created:page_3.jpg
file created:page_4.jpg
file created:page_5.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO149210.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO149210.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File deleted: page_3.jpg
File deleted: page_4.jpg
File deleted: page_5.jpg
File size is: 5

Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO127268.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO127268.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File deleted: page_3.jpg
File deleted: page_4.jpg
File size is: 4032
Time elapsed:  17.854278087615967
	WO102375.pdf
PDF File size:  15498
file created:page_1.jpg
file created:page_2.jpg
file created:page_3.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO102375.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO102375.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File deleted: page_3.jpg
File size is: 3738
Time elapsed:  14.160090684890747
	WO129872.pdf
PDF File size:  13204
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO129872.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO129872.txt
File deleted: pag

File deleted: page_4.jpg
File size is: 4414
Time elapsed:  17.362263679504395
	WO123914.pdf
PDF File size:  13300
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO123914.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO123914.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File size is: 1601
Time elapsed:  8.533116817474365
	WO130759.pdf
PDF File size:  13266
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO130759.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO130759.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File size is: 1875
Time elapsed:  8.584570169448853
	WO103634.pdf
PDF File size:  14508
file created:page_1.jpg
file created:page_2.jpg
file created:page_3.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO103634.txt
Output file name: /U

File deleted: page_4.jpg
File size is: 4411
Time elapsed:  15.767268180847168
	WO120981.pdf
PDF File size:  13623
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO120981.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO120981.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File size is: 1940
Time elapsed:  7.664046049118042
	WO130370.pdf
PDF File size:  13011
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO130370.txt
Output file name: /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO130370.txt
File deleted: page_1.jpg
File deleted: page_2.jpg
File size is: 1687
Time elapsed:  7.154124975204468
	WO100139.pdf
PDF File size:  13108
file created:page_1.jpg
file created:page_2.jpg
Outfile is  /Users/dechen/Notebooks/Python PDF Testing/Output/Maximo/WO100139.txt
Output file name: /Users/dechen/Notebooks/Py

FileNotFoundError: [Errno 2] No such file or directory: '/Users/dechen/Notebooks/Python PDF Testing/PDFs/Maximo/WO102827_Attachment3.pdf'

In [5]:
file_name_list

['WO120193.pdf',
 'WO124691.pdf',
 'WO135763.pdf',
 'WO118815.pdf',
 'WO143637.pdf',
 'WO130356.pdf',
 'WO131935.pdf',
 'WO101000.pdf',
 'WO118989.pdf',
 'WO104783.pdf',
 'WO123856.pdf',
 'WO100731.pdf',
 'WO124334.pdf',
 'WO112916.pdf',
 'WO145133.pdf',
 'WO100679.pdf',
 'WO146402.pdf',
 'WO147126.pdf',
 'WO136734.pdf',
 '.DS_Store',
 'WO109089.pdf',
 'WO112683.pdf',
 'WO117704.pdf',
 'WO112668.pdf',
 'WO142739.pdf',
 'WO146211.pdf',
 'WO150301.pdf',
 'WO140272.pdf',
 'WO117699.pdf',
 'WO124482.pdf',
 'WO133101.pdf',
 'WO114031.pdf',
 'WO141959.pdf',
 'WO113944.pdf',
 'WO108357.pdf',
 'WO107058.pdf',
 'WO133100.pdf',
 'WO115892.pdf',
 'WO142128.pdf',
 'WO107306.pdf',
 'WO126492.pdf',
 'WO126135.pdf',
 'WO151430.pdf',
 'WO103675.pdf',
 'WO146705.pdf',
 'WO118876.pdf',
 'WO141094.pdf',
 'WO144113.pdf',
 'WO103927.pdf',
 'WO116286.pdf',
 'WO135846.pdf',
 'WO105005.pdf',
 'WO126179.pdf',
 'WO104482.pdf',
 'WO122082.pdf',
 'WO140834.pdf',
 'WO112425.pdf',
 'WO119570.pdf',
 'WO126144.pdf',


In [6]:
file = [file_name_list, pdf_file_size_list, file_conversion_time_list, converted_txt_file_size_list]
df = pd.DataFrame(file).T
df.columns = ['File Name', 'PDF Size', 'Conversion Time', 'Text File Size']
df.to_csv(out_path+"/maximo_export.csv", index=False)