In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from itertools import compress
from lxml import etree
import shutil
import os
import glob
import cv2

In [2]:
# Input locations: the source images and their annotation XML files.
image_dir = './plastrial/images/'
annotation_dir = './plastrial/annotation/'
# Alternative input locations, kept for reference:
# image_dir = './plasmodium-images/images/'
# annotation_dir = './plasmodium-images/annotation/'
# Output locations for the extracted positive / negative patches.
save_dir_pos = './plastrial/extracted/positive/'
save_dir_neg = './plastrial/extracted/negative/'
size = 50 # side length, in pixels, of each (square) extracted patch
step = 50 # stride of the grid used for negative patches; step == size gives non-overlapping patches

In [3]:
def delete_files_in_folder(folder):
    '''
    Delete every regular file that sits directly inside `folder`.

    Sub-directories (and whatever they contain) are left untouched. A
    `folder` that does not exist, or is not a directory, is treated as
    "nothing to delete" instead of raising, so the clean-up step also works
    before the output folders have been created. A failure on an individual
    file is printed and does not stop the remaining deletions.
    '''
    if not os.path.isdir(folder):
        return
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except OSError as e:
            # Best effort: report the problem and carry on with the next file
            print(e)

def remove_folder(folder):
    '''
    Recursively delete `folder` if it is an existing directory.

    Anything that is not a directory is ignored. Errors raised while
    deleting are printed rather than propagated.
    '''
    try:
        if not os.path.isdir(folder):
            return
        shutil.rmtree(folder)
    except Exception as err:
        print(err)

#http://air.ug/microscopy
def get_bounding_boxes_for_single_image(filename):
    '''
    Given an annotation XML filename, get the bounding boxes around each
    object (the ground truth object locations).

    Every `bndbox` element found anywhere in the document contributes one
    row `(xmin, xmax, ymin, ymax)` -- note the order: both x values first,
    then both y values. Coordinates are rounded to integers, and `xmin` and
    `ymin` are clamped so that they are never smaller than 1.

    Returns an integer array with one row of four values per box, or an
    empty 1-D array when the file does not exist or holds no `bndbox`.
    '''
    if not os.path.exists(filename):
        return np.array([])

    def _coord(box, tag):
        # Text of the first child element called `tag`, as a rounded integer
        return int(round(float(box.xpath(tag)[0].text)))

    # Read the bounding boxes from the xml annotation
    tree = etree.parse(filename)
    boundingboxes = []
    for box in tree.xpath('//bndbox'):
        xmin = max(_coord(box, 'xmin'), 1)
        xmax = _coord(box, 'xmax')
        ymin = max(_coord(box, 'ymin'), 1)
        ymax = _coord(box, 'ymax')
        boundingboxes.append((xmin, xmax, ymin, ymax))

    if not boundingboxes:
        return np.array([])

    return np.vstack(boundingboxes)

#http://codereview.stackexchange.com/questions/31352/overlapping-rectangles
def range_overlap(a_min, a_max, b_min, b_max):
    '''Two ranges overlap when each one starts no later than the other one
    ends; ranges that merely touch at an end point count as overlapping.'''
    a_starts_in_time = a_min <= b_max
    if not a_starts_in_time:
        return a_starts_in_time
    return b_min <= a_max

def overlap(r1, r2):
    '''Rectangles given as (xmin, xmax, ymin, ymax) overlap when their
    extents overlap along the first pair of values AND along the second.'''
    first_axis = range_overlap(r1[0], r1[1], r2[0], r2[1])
    if not first_axis:
        return first_axis
    return range_overlap(r1[2], r1[3], r2[2], r2[3])

def get_image_negatives(save_dir, img, imgbasename, boundingboxes, size, step=25):
    '''Negative-labelled patches, taken on a regular grid from every part of
    the image that does not overlap an annotated bounding box.

    A `size` x `size` window is moved over the image in increments of `step`
    pixels, left to right and top to bottom. A window is saved when it
    overlaps none of `boundingboxes` (rows of xmin, xmax, ymin, ymax;
    touching a box counts as overlapping). The number of patches is
    therefore determined by the step size; step == size returns
    non-overlapping patches.

    Each patch is written to `save_dir` as
    `imgbasename[:-3] + 'neg' + <index> + '.jpg'`.

    Returns the number of patches that were actually written.
    Raises ValueError when `step` is not positive (the scan would never end).
    '''
    if step <= 0:
        raise ValueError("step must be a positive number, got %r" % (step,))

    # Only the first two dimensions are needed, so 2-D (single channel)
    # images are accepted as well as multi-channel ones.
    height, width = img.shape[:2]
    num_patches = 0
    failed_writes = 0
    y = 0
    # `<=` so that a window ending exactly on the image border is kept
    while y + size <= height:
        x = 0
        while x + size <= width:
            left, right = int(x), int(x + size)
            top, bottom = int(y), int(y + size)
            window = [left, right, top, bottom]

            if not any(overlap(window, bb) for bb in boundingboxes):
                patch = img[top:bottom, left:right]
                imgname = imgbasename[:-3] + 'neg' + str(num_patches) + '.jpg'
                imgpath = os.path.join(save_dir, imgname)
                # cv2.imwrite reports failure through its return value
                if cv2.imwrite(imgpath, patch):
                    num_patches += 1
                else:
                    failed_writes += 1
            x += step
        y += step
    if failed_writes:
        print("WARNING: %d patches could not be written to %s"%(failed_writes, save_dir))
    print("%d negative images extracted from %s"%(num_patches, imgbasename))
    return num_patches

def extract_positive_patches(save_dir, img, imgbasename , boundingboxes, size):
    '''Extract positive examples from the annotation files.

    For every box in `boundingboxes` (rows of xmin, xmax, ymin, ymax) a
    `size` x `size` patch centred on the middle of the box is cut out of
    `img`. Boxes whose patch would reach outside the image are skipped.

    Each patch is written to `save_dir` as
    `imgbasename[:-3] + 'pos' + <index> + '.jpg'`.

    Returns the number of patches that were actually written.
    '''
    half = size/2
    saved = 0
    failed_writes = 0
    for bb in boundingboxes:
        center_x = (bb[0] + (bb[1]-bb[0])/2)
        center_y = (bb[2] + (bb[3]-bb[2])/2)
        # Rows are indexed by y, columns by x
        top, bottom = int(center_y-half), int(center_y+half)
        left, right = int(center_x-half), int(center_x+half)
        # A negative start would be read as "count from the end" by the
        # slice below, so reject patches crossing the top/left border here
        if top < 0 or left < 0:
            continue
        patch = img[top:bottom, left:right]
        # Slicing silently clips at the bottom/right border
        if patch.shape[0] < size or patch.shape[1] < size:
            continue
        imgname = imgbasename[:-3] + 'pos' + str(saved) + '.jpg'
        imgpath = os.path.join(save_dir, imgname)
        # cv2.imwrite reports failure through its return value
        if cv2.imwrite(imgpath, patch):
            saved += 1
        else:
            failed_writes += 1
    if failed_writes:
        print("WARNING: %d patches could not be written to %s"%(failed_writes, save_dir))
    print("%d images extracted and saved to %s"%(saved, save_dir))
    return saved

In [4]:
# Clean up: empty both patch folders, then remove the train / test /
# validation folders
for patch_dir in (save_dir_neg, save_dir_pos):
    delete_files_in_folder(patch_dir)
for split_dir in ('./plastrial/data/train/',
                  './plastrial/data/test/',
                  './plastrial/data/validation/'):
    remove_folder(split_dir)

In [5]:
# Collect the annotation XML files and the JPEG images.
# Note: the count printed below is the number of annotation files.
annofiles = glob.glob("{0}*.xml".format(annotation_dir))
imgfiles = glob.glob("{0}*.jpg".format(image_dir))
print("%d image files found in annotation folder" % (len(annofiles),))

11 image files found in annotation folder


In [6]:
# https://www.pyimagesearch.com/2014/09/15/python-compare-two-images/
def mse(imgA, imgB):
    '''
    'Mean Squared Error' between two image files; the lower the value,
    the more "similar" the two images are.

    `imgA` and `imgB` are paths read with cv2.imread. The squared
    differences are summed over every array element and divided by the
    product of the first image's first two dimensions, so for a
    multi-channel image the channels are summed rather than averaged.
    NOTE: the two images must have the same dimension.
    '''
    pixels_a = cv2.imread(imgA)
    pixels_b = cv2.imread(imgB)
    difference = pixels_a.astype("float") - pixels_b.astype("float")
    pixel_count = float(pixels_a.shape[0] * pixels_a.shape[1])
    return np.sum(difference ** 2) / pixel_count

def mseproto(prototype, imgB):
    '''
    MSE between a prototype of the positive class and a negative image.

    `prototype` is an array already in memory; `imgB` is a path read with
    cv2.imread. The squared differences are summed over every array element
    and divided by the product of the prototype's first two dimensions.
    '''
    candidate = cv2.imread(imgB).astype("float")
    squared_error = np.sum((prototype - candidate) ** 2)
    pixel_count = float(prototype.shape[0] * prototype.shape[1])
    return squared_error / pixel_count

In [7]:
# Extract patches from annotation files.
# For every annotated image: save the positive patches, save the candidate
# negative patches, then keep only the negatives whose MSE against the
# positive prototype (element-wise mean of that image's positive patches) is
# above the 90th percentile -- i.e. roughly the 10 percent that differ MOST
# from the positives. (Keeping the closest 10 percent was tried previously.)
positiveimages = 0
negativeimages = 0
for annoimage in annofiles:
    boundingboxes = get_bounding_boxes_for_single_image(annoimage)
    # No annotated object for this image: nothing to extract
    if boundingboxes.size == 0:
        continue

    imgbasename = os.path.basename(annoimage)[:-3] + 'jpg'
    imgfile = image_dir + imgbasename
    # cv2.imread returns None instead of raising when it cannot read a file
    img = cv2.imread(imgfile) if os.path.isfile(imgfile) else None
    if img is None:
        print("Skipping %s: image could not be read from %s"%(imgbasename, imgfile))
        continue

    #Create positive examples
    countpos = extract_positive_patches(save_dir_pos, img, imgbasename , boundingboxes, size)
    if not countpos:
        continue

    #Create negative examples
    get_image_negatives(save_dir_neg, img, imgbasename, boundingboxes, size, step)

    #Select which negative examples to keep
    # imgbasename[:-3] keeps the trailing dot, so the patterns only match the
    # patches of this image ('<name>.pos<i>.jpg' / '<name>.neg<i>.jpg')
    patchprefix = imgbasename[:-3]
    posimgs = glob.glob(save_dir_pos + patchprefix + "pos*.jpg")
    negimgs = glob.glob(save_dir_neg + patchprefix + "neg*.jpg")
    if not posimgs:
        print("Skipping %s: no positive patches found in %s"%(imgbasename, save_dir_pos))
        continue

    selected_negimgs = []
    if negimgs:
        # The mean over a list of equally sized patches has the shape of one
        # patch, also when the list holds a single patch, so one code path
        # covers every case.
        posprototype = np.mean([cv2.imread(posimg) for posimg in posimgs], axis=0)
        scores = np.array([mseproto(posprototype, im) for im in negimgs])
        threshold = np.percentile(scores, 90) #Try with worst 10 percent

        # Select images from negative classes based on threshold
        selected_negimgs = list(compress(negimgs, scores > threshold))
        for nimg in set(negimgs) - set(selected_negimgs):
            os.remove(nimg)

    print("%d negative images were retained for %s"%(len(selected_negimgs), imgbasename))
    positiveimages += len(posimgs)
    negativeimages += len(selected_negimgs)

print("\nTotal images: %d positive images and %d negative images"%(positiveimages, negativeimages))

5 images extracted and saved to ./plastrial/extracted/positive/
288 negative images extracted from plasmodium-0009.jpg
29 negative images were retained for plasmodium-0009.jpg
3 images extracted and saved to ./plastrial/extracted/positive/
288 negative images extracted from plasmodium-0008.jpg
29 negative images were retained for plasmodium-0008.jpg
1 images extracted and saved to ./plastrial/extracted/positive/
294 negative images extracted from plasmodium-0006.jpg
30 negative images were retained for plasmodium-0006.jpg
10 images extracted and saved to ./plastrial/extracted/positive/
262 negative images extracted from plasmodium-0007.jpg
27 negative images were retained for plasmodium-0007.jpg
37 images extracted and saved to ./plastrial/extracted/positive/
176 negative images extracted from plasmodium-0005.jpg
18 negative images were retained for plasmodium-0005.jpg
23 images extracted and saved to ./plastrial/extracted/positive/
229 negative images extracted from plasmodium-0010.jp