# PyTesseract

#### Install libraries

In [None]:
# Install PyTesseract and setup on Colab
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Our setup: import libraries, create our imshow function and download our images
import cv2
import pytesseract
import numpy as np
from matplotlib import pyplot as plt

# Tell pytesseract which Tesseract executable to invoke
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Define our imshow function 
def imshow(title = "Image", image = None, size = 10):
    """Display an OpenCV image inline with matplotlib.

    Args:
        title: Text shown above the figure.
        image: Array exposing ``shape`` and ``ndim``. Colour input is treated
            as BGR and converted to RGB; single-channel input is expanded to
            RGB with ``cv2.COLOR_GRAY2RGB``.
        size: Figure height; the figure width is ``size * width / height``.

    Raises:
        ValueError: If ``image`` is None (``cv2.imread`` returns None when it
            cannot read a file).
    """
    if image is None:
        raise ValueError("imshow() needs an image, got None (cv2.imread returns None when a file cannot be read)")

    # shape is (rows, cols[, channels]): rows are the height, cols the width
    height, width = image.shape[:2]
    aspect_ratio = width / height
    plt.figure(figsize=(size * aspect_ratio, size))

    # BGR->RGB conversion requires a colour image, so grayscale/binary
    # arrays take the GRAY->RGB route instead.
    if image.ndim == 2 or image.shape[2] == 1:
        rgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    else:
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.imshow(rgb)
    plt.title(title)
    plt.show()

# Download and unzip our images 
!wget https://moderncomputervision.s3.eu-west-2.amazonaws.com/OCRSamples.zip
!unzip -qq OCRSamples.zip

ModuleNotFoundError: ignored

#### First Try

In [None]:
# cv2.imread returns None (it does not raise) when the file cannot be read
img = cv2.imread('OCR Samples/OCR1.png')
if img is None:
    raise FileNotFoundError("Could not read 'OCR Samples/OCR1.png'; run the download cell first")
imshow("Input Image", img)

# Passing image through PyTesseract.
# OpenCV loads pixels as BGR while pytesseract assumes RGB, so convert first.
output_txt = pytesseract.image_to_string(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

print("PyTesseract Extracted: {}".format(output_txt))

#### White Text on Black Background

In [None]:
# cv2.imread returns None (it does not raise) when the file cannot be read
img = cv2.imread('OCR Samples/OCR2.png')
if img is None:
    raise FileNotFoundError("Could not read 'OCR Samples/OCR2.png'; run the download cell first")
imshow("Input Image", img)

# OpenCV loads pixels as BGR while pytesseract assumes RGB, so convert first.
output_txt = pytesseract.image_to_string(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

print("Pytesseract Extracted: {}".format(output_txt))

#### Messy Background

In [None]:
# cv2.imread returns None (it does not raise) when the file cannot be read
img = cv2.imread('OCR Samples/OCR3.png')
if img is None:
    raise FileNotFoundError("Could not read 'OCR Samples/OCR3.png'; run the download cell first")
imshow("Input Image", img)

# OpenCV loads pixels as BGR while pytesseract assumes RGB, so convert first.
output_txt = pytesseract.image_to_string(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

print("PyTesseract Extracted: {}".format(output_txt))

#### Real Scan

In [None]:
# cv2.imread returns None (it does not raise) when the file cannot be read
img = cv2.imread('OCR Samples/scan2.jpeg')
if img is None:
    raise FileNotFoundError("Could not read 'OCR Samples/scan2.jpeg'; run the download cell first")
imshow("Input Image", img, size = 48)

# OpenCV loads pixels as BGR while pytesseract assumes RGB, so convert first.
output_txt = pytesseract.image_to_string(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

print("Pytesseract Extracted: {}".format(output_txt))

#### Cleaning up the Images

In [None]:
from skimage.filters import threshold_local

# cv2.imread returns None (it does not raise) when the file cannot be read
image = cv2.imread('OCR Samples/scan2.jpeg')
if image is None:
    raise FileNotFoundError("Could not read 'OCR Samples/scan2.jpeg'; run the download cell first")
imshow("Input Image", image, size = 48)

# Convert to HSV once and unpack the three planes (instead of converting three times)
H, S, V = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2HSV))

imshow("Split", H)
imshow("Split", S)
imshow("Split", V)

# Per-pixel threshold surface from threshold_local
# (block size 25, offset 15, Gaussian weighting), computed on the V plane
T = threshold_local(V, 25, offset = 15, method="gaussian")

# Pixels above their local threshold become 255, everything else 0
thresh = (V > T).astype("uint8") * 255
imshow("threshold Local", thresh, size = 48)

output_txt = pytesseract.image_to_string(thresh)
print("PyTesseract Extracted: {}".format(output_txt))

# Thresholding

Good practices for OCR

* Blurring
* Thresholding
* Deskewing
* Dilation / Erosion / Opening / Closing
* Noise Removal

In [None]:
# Get our new test image
!wget https://github.com/rajeevratan84/ModernComputerVision/raw/main/Receipt-woolworth.jpg

In [None]:
from skimage.filters import threshold_local

# cv2.imread returns None (it does not raise) when the file cannot be read;
# fail here with a clear message instead of inside cvtColor.
image = cv2.imread('Receipt-woolworth.jpg')
if image is None:
    raise FileNotFoundError("Could not read 'Receipt-woolworth.jpg'; run the download cell first")

# Keep only the V (third) plane of the HSV representation
V = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2HSV))[2]
# Per-pixel threshold surface (block size 25, offset 15, Gaussian weighting)
T = threshold_local(V, 25, offset = 15, method="gaussian")

# Pixels above their local threshold become 255, everything else 0
thresh = (V>T).astype("uint8") * 255
imshow("Threshold Local", thresh)

output_txt = pytesseract.image_to_string(thresh)
print("PyTesseract Extracted: {}".format(output_txt))

In [None]:
from pytesseract import Output

# Request the OCR results as a dictionary rather than a plain string
d = pytesseract.image_to_data(
    thresh,
    output_type=Output.DICT,
)
# Show which fields the dictionary provides
print(d.keys())

Using this dictionary, we can get each detected word along with its bounding box, its text, and its confidence score.

In [None]:
n_boxes = len(d['text'])

for i in range(n_boxes):
    # Depending on the pytesseract/Tesseract version, conf can be a float or a
    # float-formatted string (e.g. '96.5'), which int() rejects; float() accepts
    # those as well as plain integers such as -1.
    if float(d['conf'][i]) > 60:
        (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
        # Draw a green, 2-pixel box around this detection on the image
        image = cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

imshow('Output', image, size = 12)