In [8]:
import os
import shutil
from timeit import default_timer as time
import sys
from IPython.display import clear_output

from wand.image import Image as wi
import pandas as pd

In [9]:
def update_progress(progress,start_time):
    """Redraw a 50-character text progress bar plus the elapsed whole minutes.

    Parameters
    ----------
    progress : int or float
        Fraction of the work done. Ints are converted to float, any other
        non-float value is treated as 0, and the value is clamped to [0, 1].
    start_time : float
        Earlier reading of the same clock that ``time()`` returns; the
        difference is reported in whole minutes.

    The previous cell output is cleared before the two lines are printed.
    """
    total_cells = 50

    # Normalise the input type first, then clamp it into the 0..1 range.
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(total_cells * fraction))
    bar = "#" * filled + "-" * (total_cells - filled)

    # wait=True delays the clear until new output arrives, avoiding flicker.
    clear_output(wait=True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))
    elapsed_minutes = int((time() - start_time) / 60)
    print('Elapse time = ', elapsed_minutes, ' min')

# CONVERT PDF TO JPG

In [11]:
def pdftojpg(src_filename,dest_folder, dest_nameprefix=" ", dpi=300, start_fileindex=0): 
# function to convert pdf to jpg and return list of jpg file name    
    #initiate
    new_jpgfilenames = []
    
    # check available of source file 
    if not (os.path.exists(src_filename)):
        print('not available file: ', src_filename)
        return new_jpgfilenames
    
    #check available of desition folder
    if not (os.path.exists(dest_folder)):
        print('not available folder: ', dest_folder)
        return new_jpgfilenames

    # load src file
    pdf = wi(filename=src_filename, resolution=dpi)
    pdfimage = pdf.convert("jpeg")

    # naming rules
    # newname = dest_folder + "/" + dest_nameprefix + " " + new_file_number +".jpg"
    i = start_fileindex # the last file_number index
            
    # start export jpg files
    for img in pdfimage.sequence:

        # spawn new file-name
        dest_filename = dest_folder + "/"  + dest_nameprefix + " " + str(i).zfill(5) +".jpg"
        while os.path.exists(dest_filename): #check if dest_file name is exist
            i+=1
            dest_filename = dest_folder + "/" + dest_nameprefix + " " + str(i).zfill(3) +".jpg"        
        
        #save new file
        page = wi(image=img)
        page.save(filename=dest_filename)      
        
        #append new file to new_jpgfilenames
        new_jpgfilenames.append(dest_filename)
        
        # new-index
        i += 1 
        
    return new_jpgfilenames

In [12]:
def is_converted(src_filename, mapping=None):
    """Tell whether a PDF path is already recorded in the pdf -> jpg mapping.

    Parameters
    ----------
    src_filename : str
        PDF path, spelled exactly as it is stored in the ``'pdf'`` column.
    mapping : pandas.DataFrame, optional
        Table with a ``'pdf'`` column. When omitted, the module-level
        ``filemapping`` table is used, which keeps existing one-argument
        calls working.

    Returns
    -------
    bool
        True when at least one row has ``src_filename`` in its ``'pdf'`` column.

    Raises
    ------
    NameError
        If no ``mapping`` is given and ``filemapping`` has not been defined.
    """
    if mapping is None:
        try:
            mapping = filemapping
        except NameError:
            raise NameError(
                "is_converted() needs a mapping: pass one explicitly or define "
                "the module-level 'filemapping' DataFrame first"
            ) from None
    return bool(mapping['pdf'].eq(src_filename).any())

def bulk_pdftojpg(src_folder,dest_folder, jpg_nameprefix='pdftojpg'):
    """Convert every not-yet-converted PDF of a folder into JPEG pages.

    Parameters
    ----------
    src_folder : str
        Folder that is scanned (non-recursively) for ``.pdf`` files.
    dest_folder : str
        Folder that receives the JPEG files.
    jpg_nameprefix : str
        Prefix handed to ``pdftojpg`` for the generated file names.

    Returns
    -------
    pandas.DataFrame
        One row per written JPEG with the columns ``'pdf'`` (source path) and
        ``'jpg'`` (output path). Empty when nothing was converted.

    Files for which ``is_converted`` returns True are skipped. A file whose
    conversion yields no JPEG is reported in the failure list printed at the
    end.
    """
    successconvertfilename = []
    failconvertfilename = []
    records = []  # (pdf, jpg) pairs; turned into a DataFrame once at the end

    # List the folder once. The extension match is case-insensitive so that
    # "SCAN.PDF" is picked up too, and the names are sorted so the page
    # numbering is reproducible between runs.
    pdf_names = sorted(name for name in os.listdir(src_folder)
                       if name.lower().endswith('.pdf'))
    total = len(pdf_names)
    start_time = time()

    for done, fname in enumerate(pdf_names, start=1):
        src_filename = src_folder + "/" + fname

        # Skip (do not abort on) files that are already in the mapping; the
        # remaining files of the folder still have to be processed.
        if not is_converted(src_filename):
            print('converting file: ', src_filename)
            exportedfilenames = pdftojpg(src_filename,
                                         dest_folder,
                                         jpg_nameprefix,
                                         start_fileindex=len(os.listdir(dest_folder)) + 1
                                         )

            if exportedfilenames:
                successconvertfilename.append(src_filename)
                records.extend((src_filename, jpgname) for jpgname in exportedfilenames)
            else:
                # An empty list is pdftojpg's way of reporting a failure.
                failconvertfilename.append(src_filename)

        # Progress is measured against the PDF count only, so the bar reaches
        # 100% even when the folder holds other files.
        update_progress(done / total, start_time)

    if len(successconvertfilename) == 0:
        print('no file was converted')

    if len(failconvertfilename) > 0:
        print('failure to convert files as below list:')
        print(failconvertfilename)

    return pd.DataFrame(records, columns=['pdf', 'jpg'])

## MAIN

In [17]:
# Source PDF folders and the JPG output folders, paired by position.
# They must be defined in this cell so the notebook runs on a fresh kernel.
src_folders =  ['D:/tmrisc/pdf/2020','D:/tmrisc/pdf/2019','D:/tmrisc/pdf/2018']
dest_folders = ['D:/tmrisc/jpg/2020','D:/tmrisc/jpg/2019','D:/tmrisc/jpg/2018']
map_filename = 'D:/tmrisc/map/filemapping.csv'

# zip() would silently drop the unpaired tail of the longer list.
assert len(src_folders) == len(dest_folders), 'src_folders and dest_folders must pair up'

# Load the existing pdf -> jpg mapping, or start with an empty one.
if os.path.exists(map_filename):
    map_pdfile = pd.read_csv(map_filename)
else:
    map_pdfile = pd.DataFrame(columns=['pdf','jpg'])

# is_converted() looks source paths up in the module-level name `filemapping`,
# so it has to point at the loaded table before any conversion starts.
filemapping = map_pdfile

# Make sure the folder of the mapping file exists before anything is written.
os.makedirs(os.path.dirname(map_filename), exist_ok=True)

for src_folder, dest_folder in zip(src_folders, dest_folders):
    # The last four characters of the destination folder (the year) become
    # the prefix of the generated JPG names.
    jpg_nameprefix = dest_folder[-4:]

    new_mapping = bulk_pdftojpg(src_folder, dest_folder, jpg_nameprefix)
    if not new_mapping.empty:
        map_pdfile = pd.concat([map_pdfile, new_mapping], ignore_index=True)
        filemapping = map_pdfile
        # Persist after every folder: if a later folder crashes, the files
        # already converted stay recorded and are not converted again.
        map_pdfile.to_csv(map_filename, index=False)

# Final write, so the mapping file exists even when nothing was converted.
map_pdfile.to_csv(map_filename, index=False)

['2020 filemapping.csv']
no file was converted
['2019 filemapping.csv']
no file was converted
['2018 filemapping.csv']
no file was converted
