# Libraries

In [1]:
# Import necessary libraries
import csv

import os
import glob

import cv2
import pytesseract

# Tell pytesseract which Tesseract executable to run.
# NOTE(review): the leading '...' looks like a redacted placeholder; replace it
# with the real install directory before running.
pytesseract.pytesseract.tesseract_cmd = '...\\Tesseract-OCR\\tesseract.exe'

# Functions

In [2]:
def img_resize(img, scale_percent):
    '''
    Return a copy of img scaled to scale_percent percent of its size.

    Parameters:
        img: image array whose shape is (height, width, ...), as accepted
            by cv2.resize.
        scale_percent: target size as a percentage of the original
            (100 keeps the size). Must be greater than zero.

    Returns:
        The resized image produced by cv2.resize with INTER_AREA
        interpolation.

    Raises:
        ValueError: if scale_percent is not positive.
    '''
    if scale_percent <= 0:
        raise ValueError(
            'scale_percent must be greater than 0, got {!r}'.format(scale_percent)
        )

    # shape is (rows, cols, ...), i.e. (height, width); cv2.resize expects
    # the target size as (width, height).
    # Clamp to 1 px: int() truncates small results to 0 (e.g. 3 px at 10%),
    # and cv2.resize fails on an empty target size.
    width = max(1, int(img.shape[1] * scale_percent / 100))
    height = max(1, int(img.shape[0] * scale_percent / 100))

    return cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)

In [3]:
def img2txt(image_date: dict, resize_opt: list):
    '''
    Run OCR on every image at every requested scale.

    image_date maps a key (stored in the second column of each row) to an
    image; resize_opt lists the percentages handed to img_resize().
    Each resized image goes through pytesseract.image_to_string().

    Returns a list with one row per image:
    [ID, key, text for resize_opt[0], text for resize_opt[1], ...],
    where IDs count up from 1 in the dict's iteration order.
    '''
    rows = []

    # enumerate supplies the running 1-based ID
    for row_id, (name, image) in enumerate(image_date.items(), start=1):

        # one OCR result per resize percentage, in the order given
        ocr_texts = [
            pytesseract.image_to_string(img_resize(image, scale))
            for scale in resize_opt
        ]

        rows.append([row_id, name] + ocr_texts)

    return rows

# Settings

In [4]:
# Data directory: the PNG files are read from here and the CSV is written here.
# NOTE(review): the leading '...' looks like a redacted placeholder; set the real folder.
PATH = '...\\PNG\\'

# Resize percentages tried for each image (100 = original size);
# each value becomes one text column in the output CSV
RESIZE_OPT = [10, 25, 40, 50, 75, 100]

# Main

## Read files

In [5]:
# Use glob to collect every PNG file in the PATH folder. Sort the list so the
# IDs assigned during text extraction do not depend on the OS listing order.
png_files = sorted(glob.glob(os.path.join(PATH, "*.png")))

# Maps file name -> RGB image array
image_date = dict()

# Loop over the list of PNG files
for png_path in png_files:

    # File name without the directory part (works with any path separator)
    file_name = os.path.basename(png_path)

    # cv2.imread returns None (it does not raise) when a file cannot be decoded
    img = cv2.imread(png_path)
    if img is None:
        print('Skipping unreadable image: ', png_path)
        continue

    # OpenCV loads images as BGR; convert to RGB before OCR
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Store the image under its file name
    image_date[file_name] = img

In [6]:
# Report how many images were loaded into image_date
print('Number of images processed: ', len(image_date))

Number of images processed:  89


## Extract Text 

In [7]:
# Run the img2txt function: OCR every loaded image at every resize percentage.
# The result is a list of rows: [ID, file name, text per resize option...]
text_data = img2txt(image_date, RESIZE_OPT)

## Write results to CSV

In [8]:
# The output CSV is written into the image folder. os.path.join matches how the
# PNG glob pattern is built and does not rely on PATH ending with a separator.
csv_file = os.path.join(PATH, "TextReader.csv")

# Header: the two fixed columns followed by one column per resize percentage
field_names = ['ID', 'File_Name'] + [str(pct) for pct in RESIZE_OPT]

In [9]:
# newline='' lets csv.writer control line endings itself (without it, Windows
# output gets an empty line after every row). An explicit UTF-8 encoding keeps
# non-ASCII OCR characters from failing under the platform default encoding.
with open(csv_file, 'w', newline='', encoding='utf-8') as csv_handle:

    # Header row first, then one row per image
    writer = csv.writer(csv_handle)
    writer.writerow(field_names)
    writer.writerows(text_data)

In [10]:
# Completion message; printed unconditionally (it does not itself check that the CSV exists)
print('File successfully generated!')

File successfully generated!
