In [50]:
# USAGE
# python text_detection_video.py --east frozen_east_text_detection.pb

# import the necessary packages
from imutils.video import VideoStream
from imutils.video import FPS
from imutils.object_detection import non_max_suppression
import numpy as np
import argparse
import imutils
import time
import cv2
import pytesseract
import math

In [51]:

def decode_predictions(scores, geometry):
    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability,
            # ignore it
            if scoresData[x] < args["min_confidence"]:
                continue

            # compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # use the geometry volume to derive the width and height
            # of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            # add the bounding box coordinates and probability score
            # to our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences)

In [52]:
# construct the argument parser and parse the arguments
# ap = argparse.ArgumentParser()
# ap.add_argument("-east", "--east", type=str, required=True,
# 	help="path to input EAST text detector")
# ap.add_argument("-v", "--video", type=str,
# 	help="path to optinal input video file")
# ap.add_argument("-c", "--min-confidence", type=float, default=0.5,
# 	help="minimum probability required to inspect a region")
# ap.add_argument("-w", "--width", type=int, default=320,
# 	help="resized image width (should be multiple of 32)")
# ap.add_argument("-e", "--height", type=int, default=320,
# 	help="resized image height (should be multiple of 32)")
# args = vars(ap.parse_args())

min_confidence = 0.2
width = 640
height = 640
args = { 'east' : './frozen_east_text_detection.pb', 
        'video' : './videos/faster.mp4',
        'min_confidence' : min_confidence,
        'width' : width, 
        'height' : height}

In [53]:
# initialize the original frame dimensions, new frame dimensions,
# and ratio between the dimensions
(W, H) = (None, None)
(newW, newH) = (args["width"], args["height"])
(rW, rH) = (None, None)

# define the two output layer names for the EAST detector model that
# we are interested -- the first is the output probabilities and the
# second can be used to derive the bounding box coordinates of text
layerNames = [
	"feature_fusion/Conv_7/Sigmoid",
	"feature_fusion/concat_3"]

In [54]:
# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet(args["east"])

[INFO] loading EAST text detector...


In [55]:
# if a video path was not supplied, grab the reference to the web cam
if not args.get("video", False):
	print("[INFO] starting video stream...")
	vs = VideoStream(src=0).start()
	time.sleep(1.0)

# otherwise, grab a reference to the video file
else:
	vs = cv2.VideoCapture(args["video"])

In [56]:
# start the FPS throughput estimator
fps = FPS().start()

# loop over frames from the video stream
while True:
    # grab the current frame, then handle if we are using a
    # VideoStream or VideoCapture object
    frame = vs.read()
    frame = frame[1] if args.get("video", False) else frame

    # check to see if we have reached the end of the stream
    if frame is None:
        break

    # resize the frame, maintaining the aspect ratio
    frame = imutils.resize(frame, width=1000)
    orig = frame.copy()

    # if our frame dimensions are None, we still need to compute the
    # ratio of old frame dimensions to new frame dimensions
    if W is None or H is None:
        (H, W) = frame.shape[:2]
        rW = W / float(newW)
        rH = H / float(newH)

    # resize the frame, this time ignoring aspect ratio
    frame = cv2.resize(frame, (newW, newH))

    # construct a blob from the frame and then perform a forward pass
    # of the model to obtain the two output layer sets
    blob = cv2.dnn.blobFromImage(frame, 1.0, (newW, newH),
        (123.68, 116.78, 103.94), swapRB=True, crop=False)
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    # decode the predictions, then  apply non-maxima suppression to
    # suppress weak, overlapping bounding boxes
    (rects, confidences) = decode_predictions(scores, geometry)
    boxes = non_max_suppression(np.array(rects), probs=confidences)

    # loop over the bounding boxes
    #---Tesseract 
    tesseractOutputImage = np.zeros_like(orig)
    origTesseract = orig.copy()
    for (startX, startY, endX, endY) in boxes:
        # scale the bounding box coordinates based on the respective
        # ratios
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        #----tessaract
        extraPixelsX = 0.99
        extraPixelsY = 0.90
        croppedImage = origTesseract[int(startY*extraPixelsY):int(endY/extraPixelsY), int(startX*extraPixelsX):int(endX/extraPixelsX)]
        config = ('-l eng --oem 3 --psm 6')
        text = pytesseract.image_to_string(croppedImage, config=config).encode('utf-8')
        print(startX, startY, endX, endY)
        textScale = (2.0*(endX-startX)/200.0)/(len(text)+0.002)*5
        textThickness = max(4*(endX-startX)/200/(len(text)+1)*4, 0)
        print("Text Scale: ", textScale)
        print("Text Thickness: ", textThickness)
        cv2.putText(tesseractOutputImage, str(text), (startX,endY), cv2.FONT_HERSHEY_SIMPLEX, textScale, (0, 255, 0), textThickness, lineType = cv2.LINE_AA)
    #         plt.imshow(croppedImage)
    #         plt.show()
        print("Text:", str(text))


    for (startX, startY, endX, endY) in boxes:
        # scale the bounding box coordinates based on the respective
        # ratios
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        # draw the bounding box on the frame
        cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

    # update the FPS counter
    fps.update()

    # show the output frame
    cv2.imshow("Text Detection", orig)
    cv2.imshow("Text recognition", tesseractOutputImage)
    key = cv2.waitKey(1) & 0xFF

    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
        break

# stop the timer and display FPS information
fps.stop()
print("[INFO] elasped time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

# if we are using a webcam, release the pointer
if not args.get("video", False):
    vs.stop()

# otherwise, release the file pointer
else:
    vs.release()

# close all windows
cv2.destroyAllWindows()

(643, 94, 723, 110)
('Text Scale: ', 2000.0)
('Text Thickness: ', 4)
('Text:', '')
(535, 390, 657, 418)
('Text Scale: ', 0.6776271939568984)
('Text Thickness: ', 0)
('Text:', 'ace\nUSIVE')
(725, 93, 809, 111)
('Text Scale: ', 0.46656298600311036)
('Text Thickness: ', 0)
('Text:', '. EDITION')
(704, 391, 810, 419)
('Text Scale: ', 2.6473526473526476)
('Text Thickness: ', 0)
('Text:', '+,')
(465, 430, 590, 450)
('Text Scale: ', 0.6248750249950009)
('Text Thickness: ', 0)
('Text:', 'PEACLUSIVE')
(296, 85, 382, 109)
('Text Scale: ', 0.25291142218562523)
('Text Thickness: ', 0)
('Text:', 'eeneeennem\neT HIF')
(292, 432, 384, 450)
('Text Scale: ', 0.24207978107567626)
('Text Thickness: ', 0)
('Text:', 'AKSHA\nsai) Ta ot\nes')
(221, 36, 693, 71)
('Text Scale: ', 0.9832513957170236)
('Text Thickness: ', 0)
('Text:', '@\xc2\xae tAkshavSpeaksToArnab')
(400, 87, 546, 109)
('Text Scale: ', 0.6082319613397766)
('Text Thickness: ', 0)
('Text:', 'NEWSY INTERV')
(364, 391, 528, 420)
('Text Scale: ', 0

(192, 85, 234, 112)
('Text Scale: ', 1.048951048951049)
('Text Thickness: ', 0)
('Text:', 'ie')
(650, 390, 703, 417)
('Text Scale: ', 0.33116720819795054)
('Text Thickness: ', 0)
('Text:', '\xe2\x80\x94-\nTOs')
(643, 94, 723, 110)
('Text Scale: ', 2000.0)
('Text Thickness: ', 4)
('Text:', '')
(725, 94, 807, 111)
('Text Scale: ', 0.3726595164515542)
('Text Thickness: ', 0)
('Text:', '. EDITION |')
(535, 390, 657, 418)
('Text Scale: ', 0.6776271939568984)
('Text Thickness: ', 0)
('Text:', 'ace\nUSIVE')
(704, 391, 810, 419)
('Text Scale: ', 2.6473526473526476)
('Text Thickness: ', 0)
('Text:', 'Po')
(465, 430, 590, 450)
('Text Scale: ', 0.6248750249950009)
('Text Thickness: ', 0)
('Text:', 'pEACLUSIVE')
(292, 432, 384, 450)
('Text Scale: ', 0.24207978107567626)
('Text Thickness: ', 0)
('Text:', 'AKSHA\nsai) Ta ot\nes')
(228, 36, 693, 71)
('Text Scale: ', 1.0567221161712572)
('Text Thickness: ', 0)
('Text:', '@ tAkshaySpeaksToArnab')
(298, 85, 385, 109)
('Text Scale: ', 2.172827172827173)


(398, 87, 589, 109)
('Text Scale: ', 1.5911362879040318)
('Text Thickness: ', 0)
('Text:', 'Ma Mea')
(373, 391, 545, 420)
('Text Scale: ', 0.5374328208973879)
('Text Thickness: ', 0)
('Text:', 'Bea\nSUPER EACLUS')
(387, 430, 470, 450)
('Text Scale: ', 1.0369815092453774)
('Text Thickness: ', 0)
('Text:', 'A\nae')
(235, 86, 293, 112)
('Text Scale: ', 1450.0)
('Text Thickness: ', 4)
('Text:', '')
(192, 85, 234, 112)
('Text Scale: ', 0.6995336442371752)
('Text Thickness: ', 0)
('Text:', 'THE')
(548, 130, 770, 188)
('Text Scale: ', 5.544455544455546)
('Text Thickness: ', 4)
('Text:', 'la')
(593, 89, 632, 110)
('Text Scale: ', 0.487256371814093)
('Text Thickness: ', 0)
('Text:', '_\xe2\x80\x94')
(643, 95, 723, 111)
('Text Scale: ', 2000.0)
('Text Thickness: ', 4)
('Text:', '')
(725, 94, 807, 111)
('Text Scale: ', 0.4554543434792268)
('Text Thickness: ', 0)
('Text:', '. EDITION')
(290, 432, 382, 450)
('Text Scale: ', 0.3285244965004999)
('Text Thickness: ', 0)
('Text:', 'AKSHA\nToate\nes')
(

(642, 95, 723, 112)
('Text Scale: ', 2025.0)
('Text Thickness: ', 4)
('Text:', '')
(725, 94, 806, 111)
('Text Scale: ', 0.44990002221728503)
('Text Thickness: ', 0)
('Text:', '. EDITION')
(290, 432, 384, 450)
('Text Scale: ', 0.46990601879624067)
('Text Thickness: ', 0)
('Text:', 'it\noad\nez,')
(464, 430, 592, 450)
('Text Scale: ', 0.639872025594881)
('Text Thickness: ', 0)
('Text:', 'LEACLUSIVE')
(228, 36, 693, 71)
('Text Scale: ', 1.0567221161712572)
('Text Thickness: ', 0)
('Text:', 'a tAkshaySpeaksToArnab')
(298, 85, 384, 110)
('Text Scale: ', 0.5373656585853536)
('Text Thickness: ', 0)
('Text:', 'Py\nANDID')
(395, 87, 587, 109)
('Text Scale: ', 4800.0)
('Text Thickness: ', 12)
('Text:', '')
(387, 429, 471, 449)
('Text Scale: ', 0.46656298600311036)
('Text Thickness: ', 0)
('Text:', 'TENE:\n\xe2\x80\x94')
(454, 390, 520, 418)
('Text Scale: ', 0.2356806170547065)
('Text Thickness: ', 0)
('Text:', 'Ball Ka\nR EAGL')
(273, 389, 457, 419)
('Text Scale: ', 0.6132515664578055)
('Text Th