# Form cleaner

In [None]:
import cv2
import math
import numpy as np
import imutils
import subprocess
from IPython.display import Image

# Name of the form image to process; it is looked up inside the files/ folder.
inputFile = "form11.png"

# Show the unprocessed input inline as this cell's output.
Image(filename="files/{}".format(inputFile))

## Detect scanned page (if applicable)
- Detect edges
- Use the edges in the image to find the contour (outline) representing the piece of paper being scanned.
- Apply a perspective transform to obtain the top-down view of the document.

Objective:
- We want to work on the scanned page (if applicable) or the page itself.

See:
- http://bretahajek.com/2017/01/scanning-documents-photos-opencv/
- https://www.pyimagesearch.com/2014/09/01/build-kick-ass-mobile-document-scanner-just-5-minutes/
- https://github.com/Breta01/handwriting-ocr/blob/master/PageDetection.ipynb

In [None]:
# Load the input form. cv2.imread does not raise when the file is missing or
# cannot be decoded -- it returns None -- so check explicitly and fail with a
# clear message instead of an obscure error further down.
input_path = "files/" + inputFile
img = cv2.imread(input_path)
if img is None:
    raise FileNotFoundError("Could not read image: " + input_path)

# TODO: page detection / perspective transform is not implemented yet, so the
# image is currently passed through unchanged.

# Persist the stage-0 result; later cells read it back from disk.
# cv2.imwrite reports failure through its boolean return value.
if not cv2.imwrite("files/result-0.jpg", img):
    raise IOError("Could not write files/result-0.jpg")
Image(filename='files/result-0.jpg')

## Remove form / table structure

Objective:
- Remove horizontal and vertical lines
- Make it easier to detect text (remove unnecessary elements on page)

In [None]:
# ImageMagick arguments, kept as a list so the command runs without a shell
# (no quoting is needed around the morphology kernel specification).
options = [
    "files/result-0.jpg",                  # input: stage-0 image
    "-type", "Grayscale",
    "-negate",                             # invert before the morphology step
    "-define", "morphology:compose=darken",
    "-morphology", "Thinning", "Rectangle:15x1+0+0<",
    "-negate",                             # invert back
    "files/result-1.jpg",                  # output: stage-1 image
]

# Make sure to install imagemagick, otherwise the following call will fail.
# subprocess.getoutput never raises and discards the exit status, so a missing
# or failing `convert` would go unnoticed and a stale result-1.jpg could be
# displayed. Run the command directly and check the return code instead.
result = subprocess.run(
    ["convert"] + options,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    universal_newlines=True,
)
if result.returncode != 0:
    raise RuntimeError(
        "ImageMagick convert failed (exit code %d):\n%s"
        % (result.returncode, result.stdout)
    )
Image(filename='files/result-1.jpg')

## Remove noise and make text clearer

Objectives:
- Make text clearer
- Apply Otsu threshold to clean up the result

In [None]:
# Read back the stage-1 image and reduce it to one grayscale channel, then run
# it through a box blur with a 1x1 kernel.
img = cv2.imread("files/result-1.jpg")
gray = cv2.blur(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), (1, 1))

# Global binarisation: the THRESH_OTSU flag asks OpenCV to choose the
# threshold itself; only the thresholded image (second return value) is kept.
otsu_flags = cv2.THRESH_BINARY | cv2.THRESH_OTSU
gray = cv2.threshold(gray, 0, 255, otsu_flags)[1]

# Save and display the stage-2 result.
cv2.imwrite("files/result-2.jpg", gray)
Image(filename="files/result-2.jpg")

In [None]:
def my_blur(img):
    """Binarise, invert, thicken and re-binarise an image.

    Steps, in order:
      1. Gaussian adaptive threshold (block size 115, constant 4).
      2. Morphological opening with a 1x1 kernel.
      3. Inverted binary threshold at 0 (non-zero pixels -> 0, zero -> 255).
      4. Dilation with a 2x2 kernel.
      5. 2x2 box blur followed by an Otsu binary threshold.

    Args:
        img: input image; callers in this notebook pass a grayscale image
            obtained from cv2.cvtColor(..., cv2.COLOR_BGR2GRAY).

    Returns:
        The thresholded image produced by the final Otsu step.
    """
    binarized = cv2.adaptiveThreshold(
        img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 115, 4
    )

    # Opening with a single-pixel structuring element.
    unit_kernel = np.ones((1, 1), np.uint8)
    opened = cv2.morphologyEx(binarized, cv2.MORPH_OPEN, unit_kernel)

    # Flip foreground and background.
    inverted = cv2.threshold(opened, 0, 255, cv2.THRESH_BINARY_INV)[1]

    # Grow the (now non-zero) strokes with a 2x2 structuring element.
    grow_kernel = np.ones((2, 2), np.uint8)
    thickened = cv2.morphologyEx(inverted, cv2.MORPH_DILATE, grow_kernel)

    # Soften, then snap back to a two-level image using Otsu's threshold.
    softened = cv2.blur(thickened, (2, 2))
    return cv2.threshold(softened, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

# Run the stage-0 image and the stage-2 image through the same my_blur
# pipeline (each is converted to grayscale first).
img1 = my_blur(cv2.cvtColor(cv2.imread("files/result-0.jpg"), cv2.COLOR_BGR2GRAY))
img2 = my_blur(cv2.cvtColor(cv2.imread("files/result-2.jpg"), cv2.COLOR_BGR2GRAY))

# Per-pixel absolute difference of the two results, then inverted.
img = cv2.bitwise_not(cv2.absdiff(img1, img2))

# Optional extra opening step, currently disabled:
#kernel = np.ones((1,1),np.uint8)
#img = cv2.morphologyEx(img, cv2.MORPH_OPEN, kernel)

# Save and display the stage-3 result.
cv2.imwrite("files/result-3.jpg", img)
Image(filename="files/result-3.jpg")

## Shape detection and extraction

Objective:
- Extract text lines

Possibilities:
- MSER
- Threshold (OTSU) and findContours

See:
- http://opencvpython.blogspot.ca/2012/06/hi-this-article-is-tutorial-which-try.html

In [None]:
# TODO

## OCR and handwriting recognition

See:
- Tesseract 4
- https://github.com/Breta01/handwriting-ocr (Handwriting recognition)

In [None]:
# TODO