# Image preprocessing

The images were saved as PNG screenshots of the full browser window.
They need to be cropped to their content and saved as RGB.

1. Convert the images to JPG
1. Crop the images to their content
1. Resize the images
1. Calculate the mean RGB values and the mean standard deviation for the image dataset

In [1]:
from PIL import Image
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np

In [28]:
def get_list_of_existing_images(image_dir):
    """Recursively list the path of every file found below ``image_dir``.

    No filtering by extension is done: every file reported by ``os.walk`` is
    included. The number of files found is printed as a side effect.

    Args:
        image_dir: root directory to walk.

    Returns:
        A list of file paths, each built by joining the containing directory
        (as reported by ``os.walk``) with the file name.
    """
    existing_flist = [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(image_dir)
        for name in names
    ]
    print("found {:d} existing images".format(len(existing_flist)))

    return existing_flist

# def center_crop_image_to_dims(img, h=400, w=400):

#     width, height = img.size   # Get dimensions

#     new_width = h
#     new_height = w

#     left = (width - new_width)/2
#     top = (height - new_height)/2
#     right = (width + new_width)/2
#     bottom = (height + new_height)/2

#     # Crop the center of the image
#     img = img.crop((left, top, right, bottom))
    
#     return img


def crop_to_non_black_region(img_raw, min_threshold=20, plot=False):
    """Convert a PIL image to RGB and remove its black rows and columns.

    A pixel is treated as black when its grayscale value is less than or equal
    to ``min_threshold``. Every row and every column that contains no pixel
    above the threshold is dropped from the RGB image.

    Args:
        img_raw: PIL image to crop.
        min_threshold: grayscale value a pixel must exceed to count as content.
        plot: when True, show the original and the cropped image side by side
            with matplotlib.

    Returns:
        A new RGB PIL image holding only the rows and columns with content.

    Raises:
        ValueError: if no pixel of the image exceeds ``min_threshold``.
    """
    img_arr_rgb = np.array(img_raw.convert("RGB"))
    img_arr_gray = np.array(img_raw.convert('L'))

    # Mask of the pixels that are not black. "Strictly greater than" is the
    # same rule cv2.THRESH_BINARY applies, without the hard-coded threshold.
    non_black = img_arr_gray > min_threshold

    # any() instead of argmax() > 0: argmax returns 0 both for an all-black
    # line and for a line whose very first pixel is non-black, so content
    # touching the top or left edge used to be thrown away.
    y_bool = non_black.any(axis=1)
    x_bool = non_black.any(axis=0)

    if not y_bool.any():
        # Fail here with a clear message rather than later on an empty image.
        raise ValueError(
            "image has no pixel brighter than min_threshold={}".format(min_threshold)
        )

    # Keep only the rows, then only the columns, that contain content.
    img_arr_cropped = img_arr_rgb[y_bool, :][:, x_bool]

    if plot:
        plt.subplot(1, 2, 1)
        plt.title('original image')
        plt.imshow(img_arr_rgb)
        plt.subplot(1, 2, 2)
        plt.title('cropped image')
        plt.imshow(img_arr_cropped)
        plt.show()

    return Image.fromarray(img_arr_cropped)


def calc_resize_with_apect(size, min_dimension):
    """Compute the size that scales ``size`` so its shortest side is ``min_dimension``.

    Both sides are multiplied by the same factor, so the aspect ratio is kept
    up to the truncation done by ``int()``.

    Args:
        size: sequence whose first two items are the width and the height.
        min_dimension: target length of the shortest side.

    Returns:
        A ``(width, height)`` tuple of ints.
    """
    width, height = size[0], size[1]
    shortest = min(size)

    scaled_width = (width / shortest) * min_dimension
    scaled_height = (height / shortest) * min_dimension

    return (int(scaled_width), int(scaled_height))


def resize_image(pil_image, min_dimension):
    """Resize a PIL image so that its shortest side becomes ``min_dimension``.

    The target size is computed by ``calc_resize_with_apect``, which scales
    both sides by the same factor.

    Args:
        pil_image: PIL image to resize.
        min_dimension: desired length of the shortest side, in pixels.

    Returns:
        The resized PIL image.
    """
    new_size = calc_resize_with_apect(pil_image.size, min_dimension=min_dimension)

    # Image.ANTIALIAS was only an alias of Image.LANCZOS and was removed in
    # Pillow 10; LANCZOS is the same filter and exists in old and new releases.
    return pil_image.resize(new_size, resample=Image.LANCZOS)


def preprocess_image(input_img_path, output_img_path):
    """Crop one image to its non-black content, resize it and save it.

    The image is cropped with ``crop_to_non_black_region`` (threshold 20),
    resized with ``resize_image`` so the shortest side is 224 pixels, and
    written to ``output_img_path``.

    Args:
        input_img_path: path of the image to read.
        output_img_path: path the processed image is saved to.
    """
    # The context manager closes the source file once the pixels are copied;
    # without it a handle stays open for every image processed.
    with Image.open(input_img_path) as img:
        cropped = crop_to_non_black_region(img, min_threshold=20, plot=False)

    resized = resize_image(cropped, 224)
    resized.save(output_img_path)
    

def process_dir_of_images(input_image_dir, output_image_dir):
    """Preprocess every not-yet-processed image of a directory tree.

    Files below ``input_image_dir`` are run through ``preprocess_image`` and
    saved below ``output_image_dir`` under the same relative path. A file is
    skipped when a file with the same name already exists anywhere below
    ``output_image_dir``, or when its exact output path already exists.
    Images that fail to process are reported and skipped. Progress is printed
    roughly every tenth of the work (at most every 1000 images).

    Args:
        input_image_dir: root directory of the images to process.
        output_image_dir: root directory the processed images are written to.
    """
    print('checking input directory:')
    existing_fpaths_input = get_list_of_existing_images(input_image_dir)
    print('checking output directory:')
    existing_fpaths_output = get_list_of_existing_images(output_image_dir)

    # A set keeps the "already processed?" lookup constant-time per file.
    existing_fnames_output = {os.path.basename(f) for f in existing_fpaths_output}

    fpaths_to_process = [
        f for f in existing_fpaths_input
        if os.path.basename(f) not in existing_fnames_output
    ]
    num_images_to_proc = len(fpaths_to_process)

    # Calculate when to print status output. The step must be at least 1:
    # with fewer than 10 images it used to be 0 and the modulo below raised
    # ZeroDivisionError.
    print_step = max(1, min(num_images_to_proc // 10, 1000))
    print('num images to process {:,}'.format(num_images_to_proc))

    for i, input_img_path in enumerate(fpaths_to_process):

        # Mirror the input sub-path below the output directory. relpath works
        # with any path separator and never yields a leading separator, which
        # would make os.path.join discard output_image_dir.
        output_subpath = os.path.relpath(input_img_path, input_image_dir)
        output_img_path = os.path.join(output_image_dir, output_subpath)

        if os.path.exists(output_img_path):
            print('    info:image already exists {}'.format(output_img_path))
            continue

        # make output folder if needed
        output_fldr_path = os.path.dirname(output_img_path)
        if output_fldr_path:
            os.makedirs(output_fldr_path, exist_ok=True)

        # Load, process and save the altered image. One bad image must not
        # stop the batch, but Ctrl-C still has to work, so catch Exception
        # rather than everything, and report why the image failed.
        try:
            preprocess_image(input_img_path, output_img_path)
        except Exception as err:
            print("    warning: could not preprocess image {} ({!r})".format(input_img_path, err))

        if (((i + 1) % print_step) == 0) or ((i + 1) == num_images_to_proc):
            print("finished processing {:,} of {:,} images".format(i + 1, num_images_to_proc))

# Main

In [29]:
# Locations of the scraped source images and of the processed output,
# both relative to the directory the notebook is run from.
data_dir = os.path.join(os.pardir, 'data', 'raw', 'scraped')
input_image_dir = os.path.join(data_dir, 'images')
output_image_dir = os.path.join(os.pardir, 'data', 'processed', 'images')

In [32]:
# Preprocess the images found under input_image_dir and write the results
# under output_image_dir.
process_dir_of_images(input_image_dir, output_image_dir)

checking input directory:
found 9995 existing images
checking output directory:
found 9814 existing images
num images to process 187
finished processing 18 of 187 images
finished processing 36 of 187 images
finished processing 54 of 187 images
finished processing 72 of 187 images
finished processing 90 of 187 images
finished processing 108 of 187 images
finished processing 126 of 187 images
finished processing 144 of 187 images
finished processing 162 of 187 images
finished processing 180 of 187 images
finished processing 187 of 187 images
