In [35]:
# Testing of Operational Version

In [2]:
# Libraries
import pytesseract                      # Optical Character Recognition
from PIL import Image                   # Image Accessing
import cv2                              # Image Modification
import os                               # Directory handling
import pandas as pd                     # Excel sheets
import xlsxwriter
from matplotlib import pyplot as plt
from pathlib import Path


# Tesseract Setup
# The location of the Tesseract executable is machine-specific, so it can be
# overridden with the TESSERACT_CMD environment variable. The original install
# path stays as the fallback, so existing setups keep working unchanged.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Users\Elena.Justo\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
)

In [3]:
# Target folder: relative, Windows-style path of the directory whose files are
# processed. Later cells list this directory and build each file path from it.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of
# the notebook session; it is kept because other cells reference it.
dir = r"testing\cessnock"

In [4]:
# Start from an empty frame; it becomes the user-facing table once it is
# exported to an Excel file.
user_df = pd.DataFrame([])

# Debug: show the (still empty) frame
user_df

In [39]:
# Getting file paths
# The folder is listed exactly once so names and paths are guaranteed to line
# up index-for-index, and the listing is sorted because os.listdir() returns
# entries in arbitrary order. Sub-directories are skipped: only regular files
# can be opened as images by the later cells.
fileNames = sorted(
    item for item in os.listdir(dir)
    if os.path.isfile(os.path.join(dir, item))
)

# Full path of every file, in the same order as fileNames
filePaths = [os.path.join(dir, item) for item in fileNames]

# Debug
print(fileNames)
print(filePaths)

['img (1).jpg', 'img (10).jpg', 'img (11).jpg', 'img (12).jpg', 'img (13).jpg', 'img (14).jpg', 'img (15).jpg', 'img (2).jpg', 'img (3).jpg', 'img (4).jpg', 'img (5).jpg', 'img (6).jpg', 'img (7).jpg', 'img (8).jpg', 'img (9).jpg']
['testing\\cessnock\\img (1).jpg', 'testing\\cessnock\\img (10).jpg', 'testing\\cessnock\\img (11).jpg', 'testing\\cessnock\\img (12).jpg', 'testing\\cessnock\\img (13).jpg', 'testing\\cessnock\\img (14).jpg', 'testing\\cessnock\\img (15).jpg', 'testing\\cessnock\\img (2).jpg', 'testing\\cessnock\\img (3).jpg', 'testing\\cessnock\\img (4).jpg', 'testing\\cessnock\\img (5).jpg', 'testing\\cessnock\\img (6).jpg', 'testing\\cessnock\\img (7).jpg', 'testing\\cessnock\\img (8).jpg', 'testing\\cessnock\\img (9).jpg']


In [40]:
# Put the file names into the UI table.
# assign() hands back a frame carrying the extra column; rebinding the same
# name keeps every later cell working against `user_df`.
user_df = user_df.assign(**{"File Name": fileNames})

# Debug
user_df

Unnamed: 0,File Name
0,img (1).jpg
1,img (10).jpg
2,img (11).jpg
3,img (12).jpg
4,img (13).jpg
5,img (14).jpg
6,img (15).jpg
7,img (2).jpg
8,img (3).jpg
9,img (4).jpg


In [41]:
# Add the placeholder columns that are filled in on the Excel sheet later:
#   position 1 -> "Image Preview"  (embedded source image)
#   position 2 -> "OCR Results"    (recognised text)
#   position 3 -> "Computer View"  (image with contours drawn)
# A column is only inserted when it is missing, so re-running this cell does
# not raise on the columns that already exist.
for position, column in enumerate(
    ("Image Preview", "OCR Results", "Computer View"), start=1
):
    if column not in user_df.columns:
        user_df.insert(position, column, None, False)

In [42]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter("main.xlsx", engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
user_df.to_excel(writer, sheet_name='Sheet1')

# Get the xlsxwriter workbook and worksheet objects.
workbook  = writer.book
worksheet = writer.sheets['Sheet1']

# Column widths
worksheet.set_column(0, 0, 10)      # DataFrame index column
worksheet.set_column(1, 3, 25)      # File name, image preview and OCR columns

# Embed each source image on the row of its file name.
# Sheet row 0 holds the header, so the image for fileNames[i] belongs on row
# i + 1; enumerate() supplies that row directly instead of searching the list
# once per file.
for row, image in enumerate(fileNames, start=1):
    worksheet.set_row(row, 200)

    # Sheet column 2 is "Image Preview" (sheet column 0 is the DataFrame index).
    worksheet.embed_image(row, 2, os.path.join(dir, str(image)))

In [43]:
# FUNCTION: Display image inline
# DESC: displaying-different-images-with-actual-size-in-matplotlib-subplot
def display(im_path, dpi=80):
    """Show the image stored at ``im_path`` at its native pixel size.

    Parameters
    ----------
    im_path : str or path-like
        Location of the image; it is read with ``plt.imread``.
    dpi : int or float, default 80
        Resolution used both to convert the pixel dimensions into a figure
        size in inches and to create the figure itself.

    Returns
    -------
    None
        The figure is rendered through ``plt.show()``.
    """
    im_data = plt.imread(im_path)

    # Only the first two axes are needed, which also covers grayscale images
    # that have no channel axis.
    height, width = im_data.shape[:2]

    # What size does the figure need to be in inches to fit the image?
    figsize = width / float(dpi), height / float(dpi)

    # The figure has to be created with the same dpi that figsize was computed
    # for; otherwise matplotlib applies its configured default dpi and the
    # image is no longer shown at one screen pixel per image pixel.
    fig = plt.figure(figsize=figsize, dpi=dpi)

    # One axes that takes up the full figure, without spines or ticks
    ax = fig.add_axes([0, 0, 1, 1])
    ax.axis('off')

    # Display the image.
    ax.imshow(im_data, cmap='gray')

    plt.show()

In [44]:
# Make sure the scratch folder for the pre-processing output is present
try:
    Path("temp").mkdir()
except FileExistsError:
    # Something named "temp" is already there; report it and carry on
    print("File already exists")

File already exists


In [45]:
# Pre-process every image, OCR the large contour regions and fill the sheet.

# --- Tunable parameters ------------------------------------------------------
BLUR_KERNEL_SIZE = (7, 7)       # Gaussian blur window
ERODE_KERNEL_SIZE = (3, 13)     # size passed to cv2.getStructuringElement
MIN_ROI_HEIGHT = 200            # a contour box must be taller than this ...
MIN_ROI_WIDTH = 20              # ... and wider than this to be OCR'd
BOX_COLOUR = (36, 255, 12)      # colour of the drawn bounding boxes
OCR_COLUMN = 3                  # sheet column "OCR Results"
VIEW_COLUMN = 4                 # sheet column "Computer View"


def _temp_file(prefix, name):
    """Return the path inside the ``temp`` folder for one intermediate image."""
    return os.path.join("temp", prefix + name)


# Clean temp folder
for stale in os.listdir("temp"):
    os.remove(os.path.join("temp", stale))

# OCR Results: one list of text lines per readable region
results = []

# The structuring element and the contour mode are identical for every image,
# so they are created once instead of once per iteration.
kernal = cv2.getStructuringElement(cv2.MORPH_RECT, ERODE_KERNEL_SIZE)
contMode = cv2.RETR_CCOMP

# Column widths do not depend on the image either.
worksheet.set_column(1, 4, 50)

# Sheet row 0 is the header, so the image at list position i lives on row i + 1.
for row, (name, path) in enumerate(zip(fileNames, filePaths), start=1):

    print(name)

    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals an unreadable/undecodable file by returning None
        # rather than raising; record that and move on to the next file.
        worksheet.write(row, OCR_COLUMN, "unreadable")
        continue

    # Grayscale image
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    cv2.imwrite(_temp_file("gray_", name), gray)

    # Gaussian blur on the grayscale image
    blur = cv2.GaussianBlur(gray, BLUR_KERNEL_SIZE, 0)
    cv2.imwrite(_temp_file("blur_", name), blur)

    # Inverted binary threshold (Otsu picks the level) on the blurred image
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    cv2.imwrite(_temp_file("thresh_", name), thresh)

    # Keep a copy of the kernel next to the other intermediates
    cv2.imwrite(_temp_file("kernal_", name), kernal)

    # Erode the thresholded image
    erode = cv2.erode(thresh, kernal, iterations=1)
    cv2.imwrite(_temp_file("erode_", name), erode)

    # Get image contours; the position of the contour list in the returned
    # tuple depends on its length, so both layouts are handled.
    cnts = cv2.findContours(erode, contMode, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Process regions from left to right
    cnts = sorted(cnts, key=lambda cnt: cv2.boundingRect(cnt)[0])

    # Boxes are drawn on a copy so that the pixels handed to the OCR engine
    # never contain the rectangle outlines.
    annotated = image.copy()

    image_lines = []        # text lines of every readable region of this image
    saw_candidate = False   # did any contour pass the size filter?

    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        if not (h > MIN_ROI_HEIGHT and w > MIN_ROI_WIDTH):
            continue
        saw_candidate = True

        # Crop the bounding box: rows y..y+h, columns x..x+w
        roi = image[y:y+h, x:x+w]

        # The crop keeps its existing "dilated_" file prefix; every qualifying
        # contour of the same image overwrites the previous crop.
        cv2.imwrite(_temp_file("dilated_", name), roi)
        cv2.rectangle(annotated, (x, y), (x+w, y+h), BOX_COLOUR, 2)

        # OCR
        ocr_text = pytesseract.image_to_string(roi)

        # Decide readability on the raw string, before it is split into a list
        if not ocr_text.strip():
            continue

        ocr_result = ocr_text.split("\n")
        results.append(ocr_result)
        image_lines.extend(ocr_result)

    # Write the OCR cell once per image so that a later region cannot wipe out
    # the text of an earlier one. Images without any candidate region keep an
    # empty cell.
    if image_lines:
        worksheet.write(row, OCR_COLUMN, str(image_lines))
    elif saw_candidate:
        worksheet.write(row, OCR_COLUMN, "unreadable")

    cv2.imwrite(_temp_file("contoured_", name), annotated)

    # Insert the contoured image
    worksheet.set_row(row, 200)
    worksheet.embed_image(row, VIEW_COLUMN, _temp_file("contoured_", name))

# Close the Pandas Excel writer and output the Excel file.
writer.close()

img (1).jpg
img (10).jpg
img (11).jpg
img (12).jpg
img (13).jpg
img (14).jpg
img (15).jpg
img (2).jpg
img (3).jpg
img (4).jpg
img (5).jpg
img (6).jpg
img (7).jpg
img (8).jpg
img (9).jpg
