In [0]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import numpy as np
from PIL import Image
from io import BytesIO
import pytesseract
import re
import cv2
from deskew import determine_skew
import math
import matplotlib.pyplot as plt

### Get folder id

In [0]:
# Open a DSS API client and get a handle on the default project;
# `project` is used below to list the project's managed folders.
client = dataiku.api_client()
project_name = dataiku.default_project_key()
project = client.get_project(project_name)

You can either enter your input folder id manually

In [0]:
# Manually enter your input folder id (it can be found in the folder URL).
# NOTE: the lookup-by-name cell that follows overwrites this value whenever
# it finds a folder with the requested name.
input_folder_id = '9KqYNNmD'

Or you can just enter your input folder name and let the notebook look up its id

In [0]:
# Look up the folder id from the folder name.
# Every managed folder of the project whose name matches is collected;
# the first match wins. When nothing matches, a message is printed and
# `input_folder_id` is left untouched.
input_folder_name = 'small-test'
input_folder_id_list = []
for folder in project.list_managed_folders():
    if folder['name'] == input_folder_name:
        input_folder_id_list.append(folder['id'])

if not input_folder_id_list:
    print("Wrong input folder name")
else:
    input_folder_id = input_folder_id_list[0]

### Some image processing functions
Here are the definitions of some image processing functions. You can add more functions and/or modify the existing ones.

In [0]:
# noise removal
def blurring(image):
    """Return a median-blurred copy of `image` (aperture size 5)."""
    aperture_size = 5
    denoised = cv2.medianBlur(image, aperture_size)
    return denoised
 
#thresholding
def thresholding(image):
    """Binarize an image using Otsu's automatically chosen threshold.

    Otsu thresholding only works on single-channel images, whereas the
    images in this notebook are loaded through PIL and are therefore
    usually RGB arrays. Colour input (3 or 4 channels, RGB order) is
    converted to grayscale first; single-channel input is used as is.

    Parameters
    ----------
    image : numpy.ndarray
        Grayscale (H, W) or colour (H, W, 3 | 4) image in RGB(A) order.

    Returns
    -------
    numpy.ndarray
        Single-channel image whose pixels are either 0 or 255.
    """
    if image.ndim == 3:
        channel_count = image.shape[2]
        if channel_count == 3:
            image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        elif channel_count == 4:
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2GRAY)

    # With THRESH_OTSU the explicit threshold (0) is ignored and computed
    # from the histogram; [1] picks the image out of (threshold, image).
    return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

#dilation
def dilate(image):
    """Dilate `image` once with a 5x5 all-ones structuring element."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    dilated = cv2.dilate(image, structuring_element, iterations=1)
    return dilated
    
#erosion
def erode(image):
    """Erode `image` once with a 5x5 all-ones structuring element."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    eroded = cv2.erode(image, structuring_element, iterations=1)
    return eroded

#opening - erosion followed by dilation
def opening(image):
    """Apply a morphological opening to `image` with a 5x5 all-ones kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened

#canny edge detection
def canny(image):
    """Return the Canny edge map of `image` (hysteresis thresholds 100 and 200)."""
    lower_threshold, upper_threshold = 100, 200
    edges = cv2.Canny(image, lower_threshold, upper_threshold)
    return edges

In [0]:
#deskew image based on skew angle found by package deskew
def deskew(image):
    """Straighten an image using the skew angle estimated by `determine_skew`.

    The angle is estimated on a grayscale version of the image (colour
    input in RGB(A) order is converted first), then the ORIGINAL image is
    rotated by that angle on a canvas enlarged so that no corner is cut off.

    Parameters
    ----------
    image : numpy.ndarray
        Grayscale (H, W) or colour (H, W, 3 | 4) image.

    Returns
    -------
    numpy.ndarray
        The rotated image, or `image` itself when no skew angle could be
        determined.
    """
    def _rotate(image, angle):
        # numpy image shapes are (rows, columns[, channels]) = (height, width).
        old_height, old_width = image.shape[:2]
        angle_radian = math.radians(angle)
        # Size of the bounding box of the rotated image.
        new_width = abs(np.sin(angle_radian) * old_height) + abs(np.cos(angle_radian) * old_width)
        new_height = abs(np.sin(angle_radian) * old_width) + abs(np.cos(angle_radian) * old_height)

        # OpenCV expects the centre as (x, y), hence the reversed shape.
        image_center = tuple(np.array(image.shape[1::-1]) / 2)
        rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
        # Shift the result so that it is centred in the enlarged canvas.
        rot_mat[0, 2] += (new_width - old_width) / 2
        rot_mat[1, 2] += (new_height - old_height) / 2
        # dsize is given as (width, height).
        return cv2.warpAffine(image, rot_mat, (int(round(new_width)), int(round(new_height))))

    # The skew estimation works on a 2-D (grayscale) array.
    grayscale = image
    if image.ndim == 3:
        channel_count = image.shape[2]
        if channel_count == 3:
            grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        elif channel_count == 4:
            grayscale = cv2.cvtColor(image, cv2.COLOR_RGBA2GRAY)

    angle = determine_skew(grayscale)
    if angle is None:
        # No dominant angle was found: leave the image as it is.
        return image
    return _rotate(image, angle)

### Test processing on some images
Before running the recipe, you can explore what the image processing does to your images and to their text extraction
First we get the folder object and the filenames of the images

In [0]:
# Handle on the input managed folder, and the list of file paths it contains
input_folder = dataiku.Folder(input_folder_id)
input_filenames = input_folder.list_paths_in_partition()

#### Select index of image on which you want to test processing
Here you can print the filenames and their corresponding indexes so you can choose some specific images to try some processing functions

In [0]:
# Print each file path next to its position in `input_filenames`
for position in range(len(input_filenames)):
    print("Index: %s => Filename: %s" % (position, input_filenames[position]))

Choose the index of the image you want to test the processing on

In [0]:
# Position, in `input_filenames`, of the image used for the experiments below
test_index = 1

Here we read the image into a numpy array 'before_image'

In [0]:
# Download the selected file and decode it into a numpy array
test_file = input_filenames[test_index]
with input_folder.get_download_stream(test_file) as stream:
    data = stream.readlines()
encoded_image = BytesIO(b"".join(data))
before_image = np.array(Image.open(encoded_image))

#### Transform image using processing function
Here you can apply multiple processing functions to 'before_image' to get an 'after_image' that has been processed

In [0]:
# Start from the unprocessed image, then uncomment the steps you want to try.
# The steps are chained: each one takes the result of the previous one.
after_image = before_image
# after_image = blurring(after_image)
# after_image = thresholding(after_image)
# after_image = deskew(after_image)

#### Visualizing the image processing

In [0]:
def display_images_before_after(before_image, after_image):
    """Show `before_image` (left) and `after_image` (right) side by side.

    Both panels are drawn without axes and with the reversed grey colormap.
    """
    _, panels = plt.subplots(1, 2, figsize=(50, 100))
    for panel, picture in zip(panels, (before_image, after_image)):
        panel.axis('off')
        panel.imshow(picture, cmap='Greys_r')

You can visualize side by side the 'before' and 'after' processing images 

In [0]:
# display_images_before_after(before_image, after_image)

In [0]:
#### Analyse the impact on text extraction

In [0]:
def text_extraction_before_after(before_image, after_image):
    """Run OCR on both images and return the two texts side by side.

    As a side effect, pandas is configured to never truncate cell contents,
    so that the whole extracted text is visible when the result is displayed.

    Parameters
    ----------
    before_image, after_image :
        Images accepted by `pytesseract.image_to_string`.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns 'before' and 'after'.
    """
    before_text = pytesseract.image_to_string(before_image)
    after_text = pytesseract.image_to_string(after_image)

    # `None` means "no limit" in current pandas. Older releases only accept
    # the legacy value -1, which recent releases reject.
    try:
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        pd.set_option('display.max_colwidth', -1)

    return pd.DataFrame(
        {'before': [before_text], 'after': [after_text]},
        columns=['before', 'after'],
    )

You can see the extracted text 'before' and 'after' processing images

In [0]:
# text_extraction_before_after(before_image, after_image)

### Run processing on all images
Once you have chosen the processing to apply to the images, you need to add the functions here so you can run this notebook as a recipe

In [0]:
def final_processing(before_image):
    """Apply the selected processing steps, in order, and return the result.

    Add the right functions to `steps` (or uncomment the ones listed there).
    """
    steps = [
        blurring,
        # thresholding,
        # deskew,
    ]

    after_image = before_image
    for step in steps:
        after_image = step(after_image)
    return after_image

Enter your output folder id here:

In [0]:
# Id of the managed folder that receives the processed images.
# It is empty by default and must be filled in before running the final script.
output_folder_id = ""

Here is the final script that processes the images and writes them into the output folder

In [0]:
# Fail early, with an explicit message, when the output folder was not configured.
if not output_folder_id:
    raise ValueError("output_folder_id is empty: enter the id of your output folder before running this cell")

input_folder = dataiku.Folder(input_folder_id)
input_filenames = input_folder.list_paths_in_partition()
output_folder = dataiku.Folder(output_folder_id)

for sample_file in input_filenames:
    # Only JPEG files are processed. The extension is matched
    # case-insensitively and both '.jpg' and '.jpeg' are accepted.
    if not sample_file.lower().endswith(('.jpg', '.jpeg')):
        continue

    # Download the file and decode it into a numpy array
    with input_folder.get_download_stream(sample_file) as stream:
        data = stream.read()
    before_image = np.array(Image.open(BytesIO(data)))

    after_image = final_processing(before_image)

    # Re-encode the processed image as JPEG, in memory
    buf = BytesIO()
    Image.fromarray(after_image).save(buf, format='JPEG')
    img_bytes = buf.getvalue()

    # Write it to the output folder under the same path as the input file
    output_folder.upload_data(sample_file, img_bytes)