In [47]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see this cell's output); `%matplotlib inline` would be enough
# if no cell relies on those bare names — confirm before switching.
# This cell also carries a later execution count than the import cell below.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [1]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import numpy as np
try:
    from PIL import Image
    from io import BytesIO
    import pytesseract
    import re
    import math
    import cv2
    import matplotlib.pyplot as plt
except Exception as e:
    raise Exception("Be sure to set the right code env. {}".format(e))

### Set folder id

You must manually enter your input folder id (it can be found in the folder URL).

In [3]:
# Enter your input folder id manually (it can be found in the folder URL).
# It is left empty here and must be filled in before the folder is opened below.
input_folder_id = ''

### Some image processing functions
Here are the definitions of some image processing functions. You can add more functions and/or modify the existing ones.

In [5]:
import cv2

# noise removal
def blurring(image):
    """Reduce noise in `image` with a median filter.

    The image is handed to `cv2.medianBlur` with an aperture size of 5 and
    the filtered image is returned.
    """
    aperture_size = 5
    return cv2.medianBlur(image, aperture_size)
 
# thresholding
def thresholding(image):
    """Binarize `image` with a threshold chosen by Otsu's method.

    Calls `cv2.threshold` with the `THRESH_BINARY` and `THRESH_OTSU` flags
    combined, a threshold argument of 0 and a maximum value of 255, and
    returns the second element of its result (the thresholded image).
    """
    # NOTE(review): OpenCV's Otsu mode presumably needs a single-channel
    # 8-bit image — confirm the inputs are grayscale.
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

# resizing
def resizing(image):
    """Enlarge `image` by a factor of 1.5 along both axes.

    The output size is left to `cv2.resize` (dsize is None) and derived from
    the scale factors; `cv2.INTER_CUBIC` interpolation is used.
    """
    scale = 1.5
    return cv2.resize(image, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

# extract a rectangle from the image starting at (x, y) and ending at (x+width, y+height)
def cropping(image, x, y, width, height):
    """Return the sub-image of `image` covering the given rectangle.

    Args:
        image: array indexed as [row, column].
        x: first column of the rectangle.
        y: first row of the rectangle.
        width: number of columns to keep.
        height: number of rows to keep.

    Returns:
        The slice `image[y:y+height, x:x+width]`.
    """
    rows = slice(y, y + height)
    columns = slice(x, x + width)
    return image[rows, columns]

# draw a bounding rectangle on the image starting at (x, y) and ending at (x+width, y+height)
# use this to know what rectangle to extract in the cropping function above
def draw_rectangle(image, x, y, width, height):
    """Return a copy of `image` with a rectangle outlined on it.

    The rectangle goes from (x, y) to (x + width, y + height) and is drawn
    with colour (0, 255, 0) and a thickness of 5. The input image is left
    untouched because the drawing is done on a copy.
    """
    top_left = (x, y)
    bottom_right = (x + width, y + height)
    outline_color = (0, 255, 0)
    annotated = image.copy()
    cv2.rectangle(annotated, pt1=top_left, pt2=bottom_right, color=outline_color, thickness=5)
    return annotated

# dilation
def dilate(image):
    """Dilate `image` once with a 5x5 kernel of ones.

    Returns the result of `cv2.dilate`.
    """
    kernel_shape = (5, 5)
    structuring_element = np.ones(kernel_shape, np.uint8)
    return cv2.dilate(image, structuring_element, iterations=1)
    
# erosion
def erode(image):
    """Erode `image` once with a 5x5 kernel of ones.

    Returns the result of `cv2.erode`.
    """
    kernel_shape = (5, 5)
    structuring_element = np.ones(kernel_shape, np.uint8)
    return cv2.erode(image, structuring_element, iterations=1)

# opening - erosion followed by dilation
def opening(image):
    """Apply a morphological opening to `image` with a 5x5 kernel of ones.

    Delegates to `cv2.morphologyEx` with the `cv2.MORPH_OPEN` operation and
    returns its result.
    """
    kernel_shape = (5, 5)
    structuring_element = np.ones(kernel_shape, np.uint8)
    operation = cv2.MORPH_OPEN
    return cv2.morphologyEx(image, operation, structuring_element)

# canny edge detection
def canny(image):
    """Detect edges in `image` with the Canny detector.

    Calls `cv2.Canny` with thresholds 100 and 200 and returns its result.
    """
    low_threshold, high_threshold = 100, 200
    return cv2.Canny(image, low_threshold, high_threshold)

In [6]:
#deskew image based on skew angle found by package deskew
from deskew import determine_skew
# import math
# import cv2

def deskew(image):
    """Rotate `image` to cancel the skew angle detected in it.

    The skew angle (in degrees) is estimated with `determine_skew` and the
    image is rotated by that angle on a canvas enlarged just enough to hold
    the whole rotated picture.

    Args:
        image: numpy array with rows on axis 0 and columns on axis 1; an
            optional third axis holds the channels.

    Returns:
        The rotated image, or `image` itself when no skew angle could be
        determined.
    """
    def _as_grayscale(image):
        # The skew detector works on a 2-D array, so collapse the channel
        # axis when there is one.
        if image.ndim != 3:
            return image
        channels = image.shape[2]
        if channels < 3:
            # Fewer than three channels: keep the first one.
            return image[:, :, 0]
        # NOTE(review): channel order assumed to be RGB(A), as produced by
        # PIL in this notebook — confirm if images come from elsewhere.
        code = cv2.COLOR_RGBA2GRAY if channels == 4 else cv2.COLOR_RGB2GRAY
        return cv2.cvtColor(image, code)

    def _rotate(image, angle):
        # numpy shape is (rows, columns[, channels]), i.e. (height, width).
        old_height, old_width = image.shape[:2]
        angle_radian = math.radians(angle)
        sin = abs(np.sin(angle_radian))
        cos = abs(np.cos(angle_radian))
        # Size of the bounding box of the rotated image.
        new_width = sin * old_height + cos * old_width
        new_height = sin * old_width + cos * old_height

        # shape[1::-1] is (columns, rows): the centre is given as (x, y).
        image_center = tuple(np.array(image.shape[1::-1]) / 2)
        rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
        # Shift the picture so it stays centred in the enlarged canvas.
        rot_mat[0, 2] += (new_width - old_width) / 2
        rot_mat[1, 2] += (new_height - old_height) / 2
        # The output size is passed as (width, height).
        return cv2.warpAffine(image, rot_mat, (int(round(new_width)), int(round(new_height))))

    angle = determine_skew(_as_grayscale(image))
    if angle is None:
        # No skew angle could be detected: nothing to correct.
        return image
    return _rotate(image, angle)

### Test processing on some images
Before running the recipe, you can explore what the image processing does to your images and to the text extracted from them.
First we get the folder object and the filenames of the images

In [7]:
# Handle on the folder identified by `input_folder_id`
input_folder = dataiku.Folder(input_folder_id)
# Paths of the files in the folder; positions in this list are the indexes
# expected by read_image_from_index
input_filenames = input_folder.list_paths_in_partition()

#### Select index of image on which you want to test processing
Here you can print the filenames and their corresponding indexes, so that you can pick specific images on which to try the processing functions

In [48]:
# Print each file path next to its position in `input_filenames`; that position
# is the value to use as `test_index`.
for idx, filename in enumerate(input_filenames):
    print("Index: %s => Filename: %s" % (idx, filename))

Here we read the image into a numpy array

In [10]:
def read_image_from_index(test_index):
    """Load the image at position `test_index` of `input_filenames`.

    The file is downloaded from `input_folder`, decoded with PIL and
    returned as a numpy array.
    """
    path = input_filenames[test_index]
    with input_folder.get_download_stream(path) as download:
        payload = BytesIO(download.read())
    decoded = Image.open(payload)
    return np.array(decoded)

#### Visualizing the image processing

In [27]:
def display_images_before_after(raw_image, processed_image):
    """Show the raw and the processed image side by side.

    Both images are drawn without axes, using the 'Greys_r' colormap, on a
    one-row, two-column figure: raw image on the left, processed image on
    the right.
    """
    _, axes = plt.subplots(1, 2, figsize=(50, 100))
    for axis, picture in zip(axes, (raw_image, processed_image)):
        axis.axis('off')
        axis.imshow(picture, cmap='Greys_r')

Populating the interactive namespace from numpy and matplotlib


Here you can apply multiple processing functions on 'raw_image' to get a 'processed_image'.

In [32]:
def test_processing(img):
    """Apply the processing steps under test to `img` and return the result.

    Uncomment or add `img = <function>(img)` lines below, in the order they
    should run. With every step commented out, `img` is returned unchanged.
    """
    # add the right functions here
    
    # img = blurring(img)
    # img = thresholding(img)
    # img = resizing(img)
    
    return img

You can visualize side by side the 'before' and 'after' processing using a test image index

In [49]:
# Position, in `input_filenames`, of the image to experiment with
test_index = 5
# Keep the untouched image so it can be compared with the processed one
raw_image = read_image_from_index(test_index)
processed_image = test_processing(raw_image)
display_images_before_after(raw_image, processed_image)

#### Analyse the impact on text extraction

In [39]:
def text_extraction_before_after(raw_image, processed_image, lang='eng'):
    """Run OCR on both images and return the results side by side.

    Args:
        raw_image: image before processing, passed to
            `pytesseract.image_to_string`.
        processed_image: image after processing, passed to
            `pytesseract.image_to_string`.
        lang: language argument forwarded to pytesseract.

    Returns:
        A DataFrame with the columns 'before' and 'after'; the first row
        holds the extracted text and the second row its length.

    Side effects:
        Sets the pandas option 'display.max_colwidth' to unlimited so the
        extracted text is not truncated when the frame is displayed.
    """
    raw_image_text = pytesseract.image_to_string(raw_image, lang=lang)
    processed_image_text = pytesseract.image_to_string(processed_image, lang=lang)

    data = {
        'before': [raw_image_text, len(raw_image_text)],
        'after': [processed_image_text, len(processed_image_text)],
    }
    df = pd.DataFrame.from_dict(data)

    # `None` is the supported way to disable truncation; -1 is deprecated
    # since pandas 1.0 and rejected by recent versions. Older versions only
    # accept an integer, hence the fallback.
    try:
        pd.set_option('display.max_colwidth', None)
    except (ValueError, TypeError):
        pd.set_option('display.max_colwidth', -1)

    return df[['before', 'after']]

You can compare the text extracted 'before' and 'after' processing the image

In [50]:
# Compare the OCR output of the raw and the processed test image, using lang='eng'
text_extraction_before_after(raw_image, processed_image, lang='eng')

### Run processing on all images

Once you have found the processing functions to apply to the images, you can copy them into the Image Processing recipe form of the plugin.
The recipe will then apply your functions to all images in its input folder.