## Desc
This workbook is a refactoring of the previous data pre-processing workbook.  It takes the JSON output of the Labelbox service and turns it into data that can be plugged into a PyTorch or fastai dataloader.  Sources will include both bounding box data and segmentation data.  The output is a CSV file, ready to load, with rows of the form:

filename | Bounding Box Coordinates (separated by spaces)
--- | --- |
0003.jpg | 96 155 269 350 
0004.jpeg | 46 125 149 210

**Note** that all bounding boxes are in pixel coordinates, given as the top-left corner followed by the bottom-right corner of the box.


In [278]:
import os
import json
import ast
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import copy
import random
import cv2
import pandas as pd

import io
import requests
import collections
from matplotlib import patches, patheffects

In [279]:
class labelDataParser():
    """Turn a Labelbox-style JSON export into per-image bounding-box records.

    After construction ``self.data`` is a list of dicts with the keys
    ``'url'``, ``'old name'``, ``'name'`` and ``'boxes'``.  Each box is a
    four-element list built from the extremes of one labelled polygon.

    Args:
        raw_json_file: path of the JSON export to read.
        download_images: when true, every kept image is fetched from its
            URL and saved into ``images_directory``.
        images_directory: folder that holds (or receives) the images.  An
            empty string means paths are used as given, relative to the
            current working directory.
        remove_negatives: clamp negative box coordinates to 0.
        number_of_boxes: when not ``None``, keep only images that have
            exactly this many boxes.
        formats_to_allow: file extensions (with leading dot) to keep;
            defaults to ``['.jpg', '.jpeg']``.
        label_for_data: label keys to look for in each entry, in order of
            preference; defaults to ``["Standard"]``.
    """

    def __init__(self,
                 raw_json_file,
                 download_images=False,
                 images_directory="",
                 remove_negatives=True,
                 number_of_boxes=None,
                 formats_to_allow=None,
                 label_for_data=None):

        # Build the default lists per instance: they are stored on ``self``,
        # so a shared default object would leak edits between parsers.
        self.formats_to_allow = (['.jpg', '.jpeg'] if formats_to_allow is None
                                 else formats_to_allow)
        self.label_for_data = (["Standard"] if label_for_data is None
                               else label_for_data)
        self.images_directory = images_directory
        self.download_images = download_images
        self.data = None

        # os.makedirs("") raises FileNotFoundError, so only create the
        # folder when a directory name was actually given.
        if self.images_directory:
            os.makedirs(self.images_directory, exist_ok=True)

        with open(raw_json_file) as data_file:
            self.raw_json_data = json.load(data_file)

        self.convert_to_bounding()

        self.remove_negatives = remove_negatives

        # Standard processing
        if self.remove_negatives:
            self._remove_negatives()
        if number_of_boxes is not None:
            self.filter_data_by_number_of_boxes(number_of_boxes)
        self.check_color_channels()
        self.invert_y_axis()

    def get_img(self, url="", path=""):
        """Return a PIL image fetched from ``url`` if given, else opened from ``path``."""
        if url != "":
            r = requests.get(url, allow_redirects=True)
            img = Image.open(io.BytesIO(r.content))
        else:
            img = Image.open(path)
        return img

    def filter_data_by_number_of_boxes(self, number_of_boxes=1):
        """Keep only the records that have exactly ``number_of_boxes`` boxes."""
        self.data = [entry for entry in self.data
                     if len(entry['boxes']) == number_of_boxes]

    def check_data_plausibility(self):
        """Drop records containing a box with ``b[0] > b[2]`` or ``b[1] < b[3]``.

        The test matches the ``[min_x, max_y, max_x, min_y]`` layout that
        ``convert_to_bounding`` produces.
        NOTE(review): ``invert_y_axis`` (always run by ``__init__``) flips the
        y values, after which ``b[1] < b[3]`` holds for ordinary boxes --
        confirm at which stage this check is meant to be called.
        """
        kept = []
        for entry in self.data:
            if any(b[0] > b[2] or b[1] < b[3] for b in entry['boxes']):
                print("Omitting:", entry)
                continue
            kept.append(entry)
        self.data = kept

    def _remove_negatives(self):
        """Clamp every box coordinate to be at least 0, in place."""
        for entry in self.data:
            for box in entry['boxes']:
                # Slice assignment keeps the same list object alive.
                box[:] = [max(0, value) for value in box]

    def check_color_channels(self):
        """Keep only records whose image on disk loads with three channels.

        Records whose image cannot be read (``cv2.imread`` returns ``None``)
        are dropped with a message instead of failing with AttributeError.
        """
        kept = []
        for entry in self.data:
            image_path = os.path.join(self.images_directory, entry['name'])
            img = cv2.imread(image_path)
            if img is None:
                print("Omitting unreadable image:", image_path)
                continue
            if img.ndim == 3 and img.shape[2] == 3:
                kept.append(entry)
        self.data = kept

    def invert_y_axis(self):
        """Replace both y values of every box by ``max(image_height - y, 0)``.

        Raises:
            OSError: if the image of a record cannot be read from disk.
        """
        for entry in self.data:
            image_path = os.path.join(self.images_directory, entry['name'])
            img = cv2.imread(image_path)
            if img is None:
                raise OSError("Could not read image: {}".format(image_path))
            height = img.shape[0]
            for box in entry['boxes']:
                box[1] = max(height - box[1], 0)
                box[3] = max(height - box[3], 0)

    def convert_to_bounding(self):
        """Build ``self.data`` from ``self.raw_json_data``.

        Entries are skipped when their label is ``'Skip'``, when the
        extension of ``'External ID'`` is not in ``formats_to_allow``, when
        the download fails with an OSError, or when none of the keys in
        ``label_for_data`` is present in the entry's label.  Each polygon
        becomes one box ``[min_x, max_y, max_x, min_y]``.
        """
        data = []

        if self.download_images:
            print("Downloading Images: ", end="")
        dl = 0

        for img in self.raw_json_data:
            _, file_ext = os.path.splitext(img['External ID'])

            if (img['Label'] == 'Skip') or (file_ext not in self.formats_to_allow):
                continue

            img_dic = {
                'url': img['Labeled Data'],
                'old name': img['External ID'],
                # Images are stored under their ID, keeping the extension.
                'name': img['ID'] + file_ext,
            }

            ## Download and Save image in a folder
            if self.download_images:
                dl += 1
                print(dl, end="")
                try:
                    target = os.path.join(self.images_directory, img_dic['name'])
                    self.get_img(url=img_dic['url']).save(target)
                except OSError as e:
                    print("Error downloading {} \n Error: {}".format(img_dic['url'], e))
                    continue

            # Use the first configured label key that the entry contains.
            polys = None
            for l in self.label_for_data:
                if l in img['Label']:
                    polys = img['Label'][l]
                    break
            if polys is None:
                continue

            poly_list = []
            for poly in polys:
                points = np.array([[point['x'], point['y']] for point in poly])
                min_x, min_y = np.min(points, axis=0)
                max_x, max_y = np.max(points, axis=0)
                # .item() converts numpy scalars to plain Python numbers.
                poly_list.append([min_x.item(), max_y.item(),
                                  max_x.item(), min_y.item()])

            img_dic['boxes'] = poly_list
            data.append(img_dic)

        self.data = data
            

In [280]:
def turn_to_fast_ai_format(data, fn):
    """Write the first box of every record to a two-column CSV file.

    ``data`` is left untouched (a deep copy is edited).  For each record the
    first entry of ``'boxes'`` is passed through ``lb_bb`` and joined with
    single spaces; the CSV at ``fn`` has the columns ``fn`` (taken from
    ``'name'``) and ``bbox``, without an index column.
    """
    records = copy.deepcopy(data)
    for record in records:
        first_box = lb_bb(record['boxes'][0])
        record['boxes'] = " ".join(map(str, first_box))

    frame = pd.DataFrame(records)[['name', 'boxes']]
    frame.columns = ['fn', 'bbox']
    frame.to_csv(fn, index=False)

In [304]:
# Inputs: two JSON exports and the folder that holds their images.
json_file_1 = "google_weights.json"
json_file_2 = "youtube_weights.json"
images_directory = "weight_images"
download_images = False
# Same folder with a trailing slash; show_idx builds image paths from it by
# plain string concatenation.
IMG_PATH = "weight_images/"

# Both exports are parsed with the same settings: negative coordinates are
# clamped, only images with exactly one box are kept, and boxes are read
# from the "standard" label key.
googleParser = labelDataParser(json_file_1, 
                             images_directory=images_directory, 
                             remove_negatives=True, 
                             number_of_boxes=1,
                             download_images= download_images,
                            label_for_data=["standard"])

youTubeParser = labelDataParser(json_file_2, 
                             images_directory=images_directory, 
                             remove_negatives=True, 
                             number_of_boxes=1, 
                             download_images= download_images,
                            label_for_data=["standard"])

combined_data = googleParser.data + youTubeParser.data

# Independent copy, so later edits cannot reach the parsers' own records.
combined_data_clean = copy.deepcopy(combined_data)

# Record counts before and after dropping repeated image names.
# NOTE: remove_duplicates is defined further down in this file, so that
# cell has to be run before this one.
print(len(combined_data_clean))
print(len(remove_duplicates(combined_data_clean)))
print(len(googleParser.data))
print(len(remove_duplicates(googleParser.data)))

# Export the de-duplicated records as CSV files.
turn_to_fast_ai_format(remove_duplicates(combined_data_clean), "weights.csv")
turn_to_fast_ai_format(remove_duplicates(googleParser.data), "google_weights.csv")

648
494
154
154


In [301]:
def remove_duplicates(data):
    """Return deep copies of the records in ``data``, one per ``'name'``.

    When several records share a name only the first one is kept; the
    original order is preserved and ``data`` itself is not modified.
    """
    seen_names = set()
    unique = []
    for record in copy.deepcopy(data):
        name = record['name']
        if name in seen_names:
            continue
        seen_names.add(name)
        unique.append(record)
    return unique

In [294]:
# Print the names of goatParser records that do not occur in bearParser.
# NOTE(review): neither bearParser nor goatParser is created anywhere in
# this file -- presumably they come from an earlier session; confirm before
# re-running this cell.
d = {}

for i in bearParser.data:
    d[i['name']] = True

for i in goatParser.data:
    if i['name'] not in d:
        # Records are indexed by key above, so read the name the same way
        # (attribute access ``i.name`` fails on a dict).
        print(i['name'])

In [269]:
def show_idx(i):
    """Display image ``i`` of ``combined_data`` with its first box drawn on top."""
    record = combined_data[i]
    image = open_image(IMG_PATH + record['name'])
    axes = show_img(image)
    # lb_hw turns the stored box into the form draw_rect expects.
    rect = lb_hw(record['boxes'][0])
    draw_rect(axes, rect)


[547, 186, 806, 500] [186 547 500 806] [186 547 500 806]
[607, 435, 734, 562] [435 607 562 734] [435 607 562 734]
[94, 60, 732, 641] [ 60  94 641 732] [ 60  94 641 732]
[130, 38, 939, 735] [ 38 130 735 939] [ 38 130 735 939]
[143, 37, 552, 498] [ 37 143 498 552] [ 37 143 498 552]
[0, 0, 383, 317] [  0   0 317 383] [  0   0 317 383]
[317, 248, 835, 829] [248 317 829 835] [248 317 829 835]
[320, 189, 661, 441] [189 320 441 661] [189 320 441 661]
[588, 217, 774, 436] [217 588 436 774] [217 588 436 774]
[380, 119, 841, 578] [119 380 578 841] [119 380 578 841]
[13, 0, 282, 247] [  0  13 247 282] [  0  13 247 282]
[155, 87, 584, 314] [ 87 155 314 584] [ 87 155 314 584]
[81, 56, 590, 653] [ 56  81 653 590] [ 56  81 653 590]
[223, 12, 670, 596] [ 12 223 596 670] [ 12 223 596 670]
[167, 33, 447, 460] [ 33 167 460 447] [ 33 167 460 447]
[245, 83, 623, 485] [ 83 245 485 623] [ 83 245 485 623]
[602, 403, 2052, 1946] [ 403  602 1946 2052] [ 403  602 1946 2052]
[239, 97, 559, 399] [ 97 239 399 559] 

In [267]:
def lb_hw(a):
    """Return ``[a[0], a[1], a[2]-a[0]+1, a[3]-a[1]+1]`` as a numpy array.

    Presumably converts two corner points into a start point plus
    inclusive extents -- the result is what show_idx hands to draw_rect.
    """
    start_0, start_1, end_0, end_1 = a[0], a[1], a[2], a[3]
    extent_0 = end_0 - start_0 + 1
    extent_1 = end_1 - start_1 + 1
    return np.array([start_0, start_1, extent_0, extent_1])

def lb_bb(a):
    """Return ``a`` with each coordinate pair swapped: ``[a[1], a[0], a[3], a[2]]``."""
    swapped_order = (1, 0, 3, 2)
    return np.array([a[position] for position in swapped_order])

def bb_hw(a):
    """Return ``[a[1], a[0], a[3]-a[1]+1, a[2]-a[0]+1]`` as a numpy array."""
    first_extent = a[3] - a[1] + 1
    second_extent = a[2] - a[0] + 1
    return np.array([a[1], a[0], first_extent, second_extent])

def hw_bb(bb):
    """Return ``[bb[1], bb[0], bb[3]+bb[1]-1, bb[2]+bb[0]-1]`` as a numpy array."""
    first_end = bb[3] + bb[1] - 1
    second_end = bb[2] + bb[0] - 1
    return np.array([bb[1], bb[0], first_end, second_end])

In [158]:


def show_img(im, figsize=None, ax=None):
    """Draw ``im`` on ``ax`` and return the axes.

    Args:
        im: image data accepted by ``ax.imshow``.
        figsize: passed to ``plt.subplots`` when a new figure is created.
        ax: axes to draw on; a new figure and axes are created when ``None``.

    Returns:
        The axes the image was drawn on.
    """
    # Test for None explicitly: truthiness of a caller-supplied object is
    # not a reliable "was an axes given?" signal.
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    ax.imshow(im)
#     ax.get_xaxis().set_visible(False)
#     ax.get_yaxis().set_visible(False)
    return ax

def draw_outline(o, lw):
    """Give artist ``o`` a black stroke of width ``lw`` beneath its normal rendering."""
    black_stroke = patheffects.Stroke(linewidth=lw, foreground='black')
    o.set_path_effects([black_stroke, patheffects.Normal()])
    
def draw_rect(ax, b):
    """Add an unfilled white rectangle with a black outline to ``ax``.

    The first two entries of ``b`` are the rectangle's anchor point and the
    last two are passed on as its width and height.
    """
    anchor, size = b[:2], b[-2:]
    rectangle = patches.Rectangle(anchor, *size, fill=False,
                                  edgecolor='white', lw=2)
    draw_outline(ax.add_patch(rectangle), 4)

In [91]:
def open_image(fn):
    """ Opens an image using OpenCV given the file path.
    Arguments:
        fn: the file path of the image
    Returns:
        The image converted from BGR to RGB as a float32 numpy array whose
        values were divided by 255 (i.e. in 0.0 - 1.0 for 8-bit images)
    Raises:
        OSError: if ``fn`` does not exist, is a directory, or cannot be
            read or converted by OpenCV (the underlying error is chained).
    """
    if not os.path.exists(fn):
        raise OSError('No such file or directory: {}'.format(fn))
    if os.path.isdir(fn):
        raise OSError('Is a directory: {}'.format(fn))

    flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR
    try:
        im = cv2.imread(str(fn), flags)
        # imread signals failure by returning None, so this has to be
        # checked before any array method is called on the result.
        if im is None: raise OSError(f'File not recognized by opencv: {fn}')
        im = im.astype(np.float32)/255
        return cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    except Exception as e:
        raise OSError('Error handling image at: {}'.format(fn)) from e