In [169]:
# Improve preprocessing technique to increase accuracy of OCR results

In [170]:
# Key steps to preprocessing an image
# 1) Binarization
# 2) Skew Correction
# 3) Noise Removal
# 4) Thinning and Skeletonization

In [171]:
# 1) Binarization

# DESCRIPTION
# In layman’s terms Binarization means converting a coloured image into an image 
# which consists of only black and white pixels (Black pixel value=0 and White pixel value=255). 
# As a basic rule, this can be done by fixing a threshold (normally threshold=127, as it is 
# exactly half of the pixel range 0–255). If the pixel value is greater than the threshold, it 
# is considered as a white pixel, else considered as a black pixel.

# ISSUE
# A single fixed threshold is unreliable when lighting conditions across the image are not uniform

# USING METHOD
# Crucial part of binarization is determining the threshold.

# METHODS FOR DETERMINING THRESHOLD

# 1. LOCAL MAXIMA MINIMA METHOD
# Define a threshold for a defined size of locality in the image (like a 10x10 size part). 
# Using this strategy we’ll have different threshold values for different parts of the image.
# Drawback: transitions between neighbouring regions are not smooth.

# 2. OTSU'S BINARIZATION
# Gives a threshold for the whole image considering the various characteristics of the whole image 
# (like lighting conditions, contrast, sharpness etc).

# 3. ADAPTIVE THRESHOLDING
# Gives a threshold for a small part of the image depending on the characteristics of its locality and 
# neighbours i.e there is no single fixed threshold for the whole image but every small part of the 
# image has a different threshold depending upon the locality and also gives smooth transition.

In [172]:
# 2) Skew Correction

# DESCRIPTION
# A skewed image is one whose content is tilted at an angle to the horizontal.
# Skew correction rotates it back so that the text lines are level.

# METHODS FOR SKEW CORRECTION

# 1. Projection profile method [Simplest and mostly used]
# Determines the skew angle, then corrects for it

# 2. Hough transformation method

# 3. Topline method

# 4. Scanline method

In [173]:
# 3) Noise Removal

# DESCRIPTION
# Smooth the image by removing small dots/patches that have a higher
# intensity than the rest of the image. Noise removal can be performed
# on both coloured and binary images.

In [174]:
# Libraries
import pytesseract                      # Optical Character Recognition
from PIL import Image                   # Image Accessing
import cv2                              # Image Modification
import os                               # Directory handling
import pandas as pd                     # Excel sheets
import xlsxwriter
from matplotlib import pyplot as plt
from pathlib import Path


# Tesseract Setup
# The location of the Tesseract executable is machine-specific, so it can be
# overridden through the TESSERACT_CMD environment variable. When the variable
# is not set, the original hard-coded install path is used as the fallback.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Users\Elena.Justo\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
)

In [175]:
# Target folder
# Built with os.path.join so the separator matches the host OS; on Windows this
# yields exactly the same string as the previous backslash literal.
# NOTE: the name shadows the built-in dir(); it is kept because other cells read it.
dir = os.path.join("testing", "cessnock")

In [176]:
# Start from an empty frame; it is filled column by column and later exported
# to an Excel file that serves as the UI
user_df = pd.DataFrame([])

# Debug: show the (still empty) frame
user_df

In [177]:
# Getting file paths
# The target folder is listed once so the two lists are guaranteed to stay
# index-aligned (two separate os.listdir calls can disagree if the folder
# changes in between). Sorting makes the row order deterministic, and
# sub-folders are skipped because later cells treat every entry as an image file.
fileNames = sorted(
    item for item in os.listdir(dir)
    if os.path.isfile(os.path.join(dir, item))
)

# Full path of each file, in the same order as fileNames
filePaths = [os.path.join(dir, item) for item in fileNames]

# Debug
print(fileNames)
print(filePaths)

['img (1).jpg', 'img (10).jpg', 'img (11).jpg', 'img (12).jpg', 'img (13).jpg', 'img (14).jpg', 'img (15).jpg', 'img (2).jpg', 'img (3).jpg', 'img (4).jpg', 'img (5).jpg', 'img (6).jpg', 'img (7).jpg', 'img (8).jpg', 'img (9).jpg']
['testing\\cessnock\\img (1).jpg', 'testing\\cessnock\\img (10).jpg', 'testing\\cessnock\\img (11).jpg', 'testing\\cessnock\\img (12).jpg', 'testing\\cessnock\\img (13).jpg', 'testing\\cessnock\\img (14).jpg', 'testing\\cessnock\\img (15).jpg', 'testing\\cessnock\\img (2).jpg', 'testing\\cessnock\\img (3).jpg', 'testing\\cessnock\\img (4).jpg', 'testing\\cessnock\\img (5).jpg', 'testing\\cessnock\\img (6).jpg', 'testing\\cessnock\\img (7).jpg', 'testing\\cessnock\\img (8).jpg', 'testing\\cessnock\\img (9).jpg']


In [178]:
# Record every file name in the UI frame under the "File Name" column
user_df = user_df.assign(**{"File Name": fileNames})

# Debug: show the frame with the new column
user_df

Unnamed: 0,File Name
0,img (1).jpg
1,img (10).jpg
2,img (11).jpg
3,img (12).jpg
4,img (13).jpg
5,img (14).jpg
6,img (15).jpg
7,img (2).jpg
8,img (3).jpg
9,img (4).jpg


In [179]:
# Create the (initially empty) result columns, placed right after "File Name".
# The list order defines the column positions 1..5, which the later cells rely
# on when they write into the worksheet by column number.
# NOTE: header texts (including the "Guassian" spelling) are kept exactly as
# before so previously generated spreadsheets keep the same column names.
result_columns = [
    "Image Preview",            # raw image
    "OCR Results",              # recognised text
    "Guassian Blurred Result",  # blurred image
    "Binarization Result",      # thresholded image
    "Contour View",             # cropped contour region
]

for position, column in enumerate(result_columns, start=1):
    # DataFrame.insert raises ValueError when the column already exists, which
    # made this cell fail on a second run; skip columns that are already there.
    if column not in user_df.columns:
        user_df.insert(position, column, None, False)

In [180]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
# The writer stays open on purpose: later cells keep adding images/text to the
# worksheet and are responsible for calling writer.close().
writer = pd.ExcelWriter("preprocessTesting.xlsx", engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
user_df.to_excel(writer, sheet_name='Sheet1')

# Get the xlsxwriter workbook and worksheet objects.
workbook  = writer.book
worksheet = writer.sheets['Sheet1']

# Setting column widths
worksheet.set_column(first_col=0, last_col=0, width=10)      # Index column width
worksheet.set_column(first_col=1, last_col=1, width=15)      # Filename column width
worksheet.set_column(first_col=2, last_col=8, width=50)      # Image columns width

# Embed Raw Images
# Worksheet row 0 holds the header, so data rows start at 1. enumerate supplies
# the row directly instead of a list.index() lookup per image, which was
# quadratic and pointed at the wrong row when a file name occurred twice.
for row, image in enumerate(fileNames, start=1):
    worksheet.set_row(row=row, height=100)                              # Setting row height
    worksheet.embed_image(row, 2, os.path.join(dir, str(image)))        # Insert an image.

In [181]:
# Make sure the "temp" working folder for the intermediate images is available;
# an already existing folder is reported rather than treated as an error
try:
    Path("temp").mkdir()
except FileExistsError:
    print("File already exists")

File already exists


In [182]:
# Clean temp folder
for files in os.listdir("temp"):
    os.remove(os.path.join("temp", files))

# OCR Results (one list of text lines per successfully read contour region)
results  = []

# Pre-Process Images
# Worksheet row 0 is the header, so image i lives in row i+1. Iterating the
# names and paths together avoids a list.index() lookup on every use.
for row, (name, paths) in enumerate(zip(fileNames, filePaths), start=1):

    image = cv2.imread(paths)

    print(name)

    # cv2.imread returns None (it does not raise) when a file cannot be decoded
    if image is None:
        print("Could not read image: " + paths)
        worksheet.write(row, 3, "unreadable")
        continue

    # Locations of the intermediate images for this file
    gray_path = os.path.join("temp", "gray_" + name)
    blur_path = os.path.join("temp", "blur_" + name)
    thresh_path = os.path.join("temp", "thresh_" + name)
    cont_path = os.path.join("temp", "cont_" + name)

    # ***** GRAYSCALE IMAGE *****

    # Grayscale image
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Save grayscaled image to temp folder
    cv2.imwrite(gray_path, gray)

    # ***** BLUR IMAGE *****

    # Apply gaussian blur to grayscaled image
    blur = cv2.GaussianBlur(gray, (7,7), 0)

    # Save gaussian blurred image
    cv2.imwrite(blur_path, blur)

    # Insert the blurred image
    worksheet.embed_image(row, 4, blur_path)

    # ***** BINARISE IMAGE *****

    # Apply adaptive thresholding
    thresh = cv2.adaptiveThreshold(blur,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)

    thresh = cv2.bitwise_not(thresh)   # Invert the colours as opencv finds WHITE contours.

    # Save thresholded image
    cv2.imwrite(thresh_path, thresh)

    # Insert the thresholded image
    worksheet.embed_image(row, 5, thresh_path)

    # ***** FIND CONTOURS IN IMAGE  *****

    # Set contour mode
    contMode = cv2.RETR_CCOMP

    # Get image contours (findContours returns 2 or 3 values depending on the
    # OpenCV version; the contour list is picked accordingly), sorted left to right
    cnts = cv2.findContours(thresh, contMode, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[0])

    # Regions are cropped from an untouched copy: the rectangles below are drawn
    # onto `image`, and a crop of `image` itself is a view that would pick up
    # those green borders and feed them to the OCR engine.
    clean = image.copy()

    # Draw contours onto image
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        if h > 200 and w > 20:
            # Columns span the bounding-box WIDTH (previously x:x+h, which cut
            # off or over-extended the region)
            roi = clean[y:y+h, x:x+w]
            cv2.imwrite(cont_path, roi)
            cv2.rectangle(image, (x, y), (x+w, y+h), (36, 255, 12), 2)

            # Insert the contoured image
            worksheet.embed_image(row, 6, cont_path)

            # OCR
            # OpenCV arrays are BGR while pytesseract expects RGB channel order
            ocr_text = pytesseract.image_to_string(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))

            # Test the raw text for emptiness BEFORE splitting: the old check
            # compared the split list with strings and therefore never matched.
            if not ocr_text.strip():
                worksheet.write(row, 3, "unreadable")
                continue

            ocr_result = ocr_text.split("\n")
            results.append(ocr_result)
            worksheet.write(row, 3, str(ocr_result))

# Close the Pandas Excel writer and output the Excel file.
writer.close()

img (1).jpg
img (10).jpg
img (11).jpg
img (12).jpg
img (13).jpg
img (14).jpg
img (15).jpg
img (2).jpg
img (3).jpg
img (4).jpg
img (5).jpg
img (6).jpg
img (7).jpg
img (8).jpg
img (9).jpg
