# Tesseract

In [1]:
import re
import cv2
import glob
import pytesseract
import numpy as np
import pandas as pd

# Location of the Tesseract executable that pytesseract invokes.
pytesseract.pytesseract.tesseract_cmd = r"<local_path>/Tesseract-OCR/tesseract.exe"

# Directory prefix (with trailing slash) prepended to every example image file name below.
imagelink = "<local_path>/Google-Tesseract/Images/"

## OPERATIONS ON IMAGES

In [2]:
# DISPLAY IMAGE
# Load the example image in grayscale and show it in an OpenCV window.

link = imagelink + "example_1.jpg"

# cv2.IMREAD_GRAYSCALE is the named form of the flag value 0.
image = cv2.imread(link, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing/unreadable file; it returns None.
if image is None:
    raise FileNotFoundError("Could not read image: {}".format(link))

cv2.imshow("Image Displayed", image)
key_code = cv2.waitKey(0)  # wait (no timeout) for a key press in the window

# Close the window explicitly; otherwise it stays open after the cell finishes.
cv2.destroyAllWindows()

# Keep the key code as the cell's displayed value.
key_code

-1

In [3]:
# RESIZE IMAGE
# Load the example image in grayscale, resize it to a fixed size and show it.

link = imagelink + "example_1.jpg"

# cv2.IMREAD_GRAYSCALE is the named form of the flag value 0.
image = cv2.imread(link, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing/unreadable file; it returns None.
if image is None:
    raise FileNotFoundError("Could not read image: {}".format(link))

# dsize is given as (width, height); the aspect ratio is not preserved.
image = cv2.resize(image, (500, 700))

cv2.imshow("Image Resized", image)
key_code = cv2.waitKey(0)  # wait (no timeout) for a key press in the window

# Close the window explicitly; otherwise it stays open after the cell finishes.
cv2.destroyAllWindows()

# Keep the key code as the cell's displayed value.
key_code

-1

In [4]:
# CROPPED IMAGE
# Load the example image in grayscale, crop it with array slicing and show it.

link = imagelink + "example_1.jpg"

# cv2.IMREAD_GRAYSCALE is the named form of the flag value 0.
image = cv2.imread(link, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing/unreadable file; it returns None,
# which would otherwise fail below with an unhelpful "not subscriptable" error.
if image is None:
    raise FileNotFoundError("Could not read image: {}".format(link))

# Rows come first, then columns: drop the top 50 rows, keep the first 200 columns.
image = image[50:, :200]

cv2.imshow("Image Cropped", image)
key_code = cv2.waitKey(0)  # wait (no timeout) for a key press in the window

# Close the window explicitly; otherwise it stays open after the cell finishes.
cv2.destroyAllWindows()

# Keep the key code as the cell's displayed value.
key_code

-1

In [5]:
# ROTATE IMAGE
# Load the example image in grayscale, rotate it 90 degrees clockwise and show it.

link = imagelink + "example_1.jpg"

# cv2.IMREAD_GRAYSCALE is the named form of the flag value 0.
image = cv2.imread(link, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing/unreadable file; it returns None.
if image is None:
    raise FileNotFoundError("Could not read image: {}".format(link))

# Use the top-level constant: the nested `cv2.cv2` alias is not available in
# recent opencv-python builds and raises AttributeError there.
image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)

cv2.imshow("Image Rotated", image)
key_code = cv2.waitKey(0)  # wait (no timeout) for a key press in the window

# Close the window explicitly; otherwise it stays open after the cell finishes.
cv2.destroyAllWindows()

# Keep the key code as the cell's displayed value.
key_code

-1

In [6]:
# TRANSLATED IMAGE
# Load the example image in grayscale, shift it by a quarter of its size along
# both axes with an affine warp, and show the result.

link = imagelink + "example_1.jpg"

# cv2.IMREAD_GRAYSCALE is the named form of the flag value 0.
image = cv2.imread(link, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing/unreadable file; it returns None,
# which would otherwise fail below on `.shape`.
if image is None:
    raise FileNotFoundError("Could not read image: {}".format(link))

height, width = image.shape[:2]

# Offsets: a quarter of the width along x and a quarter of the height along y.
tx, ty = width / 4, height / 4

# 2x3 affine matrix [[1, 0, tx], [0, 1, ty]] encodes a pure translation.
translation_matrix = np.array([[1, 0, tx], [0, 1, ty]], dtype=np.float32)

# Output canvas keeps the original size, so shifted-out content is cut off.
image = cv2.warpAffine(src=image, M=translation_matrix, dsize=(width, height))

cv2.imshow("Image Translated", image)
key_code = cv2.waitKey(0)  # wait (no timeout) for a key press in the window

# Close the window explicitly; otherwise it stays open after the cell finishes.
cv2.destroyAllWindows()

# Keep the key code as the cell's displayed value.
key_code

-1

## TEXT EXTRACTION

### Simple Extraction

In [7]:
# Run Tesseract on the first example image without any preprocessing.
link = imagelink + "example_1.jpg"

# cv2.IMREAD_GRAYSCALE is the named form of the flag value 0.
image = cv2.imread(link, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing/unreadable file; it returns None.
# Fail here with the path instead of letting None reach pytesseract.
if image is None:
    raise FileNotFoundError("Could not read image: {}".format(link))

data = pytesseract.image_to_string(image)

print(data)

A simple image with text to demonstrate
extraction of text using python and tesseract

“Two things are infinite: the universe and human stupidity; and I'm not sure
about the universe.” - Albert Einstein



In [8]:
# Run Tesseract on the second example image without any preprocessing.
link = imagelink + "example_2.jpg"

# cv2.IMREAD_GRAYSCALE is the named form of the flag value 0.
image = cv2.imread(link, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing/unreadable file; it returns None.
# Fail here with the path instead of letting None reach pytesseract.
if image is None:
    raise FileNotFoundError("Could not read image: {}".format(link))

data = pytesseract.image_to_string(image)

print(data)

“You've gotta dance like there's nobody watching,
Love like you'll never be hurt,
Sing like there's nobody listening,
And live like it’s heaven on earth.”
— William w. Pu rkey



### Text Extraction With Manipulations

In [9]:
# OCR a screenshot after resizing, cropping and binarising it, then pull the
# view count out of the recognised text.
link = imagelink + "example_3.jpg"

# cv2.IMREAD_GRAYSCALE is the named form of the flag value 0.
image = cv2.imread(link, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing/unreadable file; it returns None.
if image is None:
    raise FileNotFoundError("Could not read image: {}".format(link))

image = cv2.resize(image, (500, 700))
# Keep rows 25..299 (all columns) of the resized image.
image = image[25:300, :]

# Inverted binary threshold; with THRESH_OTSU the threshold value is chosen
# automatically, so the explicit 0 is ignored. [1] selects the thresholded image.
thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

# --psm 6: treat the image as a single uniform block of text.
Data = pytesseract.image_to_string(thresh, lang='eng', config='--psm 6')

print("\n{}.".format(Data))
print("-"*20)
print("\nWe notice that views on the screenshot are visible after a special character '©'.\nTherefore we use regex to extract the number of views.")

# Lines of OCR text starting at the '©' marker (from '© ' to end of line).
marker_lines = re.findall(r'© .*', Data)
if not marker_lines:
    raise ValueError("No '©' marker found in the OCR text; cannot extract views.")

# Whitespace-separated tokens of the first marker line that are plain numbers.
# isdecimal() (not isdigit()) so that every accepted token is parseable by int().
numeric_tokens = [int(token) for token in marker_lines[0].split() if token.isdecimal()]
if not numeric_tokens:
    raise ValueError("No numeric token found after the '©' marker: {!r}".format(marker_lines[0]))

Views = numeric_tokens[0]

print("-"*20)
print("\nExample 3 has {} views.".format(Views))


it © 13 ~*~ wu
C) Kimmy Long
C) Le Fevre Taylor
.
--------------------

We notice that views on the screenshot are visible after a special character '©'.
Therefore we use regex to extract the number of views.
--------------------

Example 3 has 13 views.
