For regression to work properly, all the labels must be in a consistent order: starting from the top-left corner, the coordinates should proceed clockwise.  This notebook automatically identifies and fixes all labels whose coordinates are out of order.  Such bad labels are likely to be particularly insidious for training, as they dramatically throw off all eight predictions in the image.

## SETUP

In [24]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [25]:
import pandas as pd
import numpy as np
import os
import pathlib

In [26]:
# helper functions to reorder points

# point 1: of two smallest x, the one with smallest y
def p1(nums):
    """Return the first corner as an ``[x, y]`` list.

    ``nums`` is a flat sequence of 8 numbers ``x, y, x, y, ...``.  Of the
    two points that come first when the pairs are sorted ascending (by x,
    then y), the one with the smaller y is returned.
    """
    # group the flat sequence into four [x, y] pairs
    pairs = [[nums[k], nums[k + 1]] for k in range(0, 8, 2)]
    # order by x (ties broken by y) and keep the two left-most points
    pairs.sort()
    leftmost = pairs[:2]
    # of those two, pick the one with the smallest y
    return min(leftmost, key=lambda pt: pt[1])

# point 2: of two largest x, the one with smallest y
def p2(nums):
    """Return the second corner as an ``[x, y]`` list.

    ``nums`` is a flat sequence of 8 numbers ``x, y, x, y, ...``.  Of the
    two points that come first when the pairs are sorted descending (by x,
    then y), the one with the smaller y is returned.
    """
    # group the flat sequence into four [x, y] pairs
    pairs = [[nums[k], nums[k + 1]] for k in range(0, 8, 2)]
    # order by x descending and keep the two right-most points
    pairs.sort(reverse=True)
    rightmost = pairs[:2]
    # of those two, pick the one with the smallest y
    return min(rightmost, key=lambda pt: pt[1])

# point 3: of two largest x, the one with largest y
def p3(nums):
    """Return the third corner as an ``[x, y]`` list.

    ``nums`` is a flat sequence of 8 numbers ``x, y, x, y, ...``.  Of the
    two points that come first when the pairs are sorted descending (by x,
    then y), the one with the larger y is returned.
    """
    # group the flat sequence into four [x, y] pairs
    pairs = [[nums[k], nums[k + 1]] for k in range(0, 8, 2)]
    # order by x descending and keep the two right-most points
    pairs.sort(reverse=True)
    rightmost = pairs[:2]
    # of those two, pick the one with the largest y
    return max(rightmost, key=lambda pt: pt[1])

# point 4: of two smallest x, the one with largest y
def p4(nums):
    """Return the fourth corner as an ``[x, y]`` list.

    ``nums`` is a flat sequence of 8 numbers ``x, y, x, y, ...``.  Of the
    two points that come first when the pairs are sorted ascending (by x,
    then y), the one with the larger y is returned.
    """
    # group the flat sequence into four [x, y] pairs
    pairs = [[nums[k], nums[k + 1]] for k in range(0, 8, 2)]
    # order by x (ties broken by y) and keep the two left-most points
    pairs.sort()
    leftmost = pairs[:2]
    # of those two, pick the one with the largest y
    return max(leftmost, key=lambda pt: pt[1])

In [27]:
# base data directory: '.fastai/data' inside the current user's home folder
path = pathlib.Path.home()/'.fastai/data'

In [28]:
# copy label files from git
! mkdir -p $path/labels
! cp -r labels $path/

In [29]:
# 'labels' folder under the data directory
labels_dir = path/'labels'
# sub-folder whose JSON files are listed and loaded below
gc_labels_path = labels_dir/'good_community_labels'

In [18]:
os.listdir(gc_labels_path)

['trainingdata_5000_to_5499_good.json',
 'trainingdata_500_to_999_good.json',
 'trainingdata_2500_to_2999_good.json',
 'trainingdata_1500_to_1999_good.json',
 'trainingdata_0_to_499_good.json',
 'trainingdata_7000_to_7499_good.json',
 'trainingdata_3500_to_3999_good.json',
 'trainingdata_9000_to_9499_good_2.json',
 'trainingdata_8500_to_8999_good.json',
 'trainingdata_4000_to_4499_good.json',
 'trainingdata_6500_to_6999_good.json',
 'trainingdata_1000_to_1499_good.json',
 'trainingdata_2000_to_2499_good.json',
 'trainingdata_5500_to_5999_good.json',
 'trainingdata_4500_to_4999_good.json',
 'trainingdata_8000_to_8499_good.json',
 'trainingdata_3000_to_3499_good.json',
 'trainingdata_7500_to_7999_good.json']

In [None]:
# load all community labels
# Read every JSON label file in the directory (sorted so the resulting column
# order is deterministic; non-JSON entries are skipped), then join them
# column-wise with a single concat once all files have been read.
df_files = [
    pd.read_json(gc_labels_path/file)
    for file in sorted(os.listdir(gc_labels_path))
    if file.endswith('.json')
]
if df_files:
    df = pd.concat(df_files, axis=1)
else:
    # nothing to concatenate: leave df untouched and say so
    print(f'No JSON label files found in {gc_labels_path}')

In [30]:
# process a different label file instead: this reassigns df, replacing any
# labels loaded earlier; comment this line out to keep the previous df
df = pd.read_json(labels_dir/'training_superclean_labels.json')

## FIND

In [31]:
len(df.keys())

9253

In [32]:
# collect the key of every label whose corners are out of order
results = []
for key, value in df.items():
    nums = value[0]
    # only labels made of exactly 8 numbers (four x, y pairs) are checked
    if len(nums) != 8:
        continue
    # compare the pair stored in each slot with the corner expected there;
    # a single mismatch is enough to flag the label
    if any(nums[start:start + 2] != finder(nums)
           for start, finder in zip(range(0, 8, 2), (p1, p2, p3, p4))):
        results.append(key)

In [33]:
results[:5]

[]

In [34]:
len(results)

0

## FIX

In [None]:
# takes a filename and returns correctly ordered coordinates
def fix(fname):
    """Return the coordinates stored under ``fname`` in corner order.

    Reads the first entry of column ``fname`` from the global ``df`` and
    returns a flat list built from the results of ``p1``, ``p2``, ``p3``
    and ``p4``, in that order.
    """
    coords = df[fname][0]
    ordered = []
    # append each corner's [x, y] pair in turn
    for corner in (p1, p2, p3, p4):
        ordered = ordered + corner(coords)
    return ordered

In [None]:
# rewrite every flagged label with its corners in clockwise order,
# starting from the top-left
for fname in results:
    reordered = fix(fname)
    df[fname] = [reordered]
# every entry in results was rewritten exactly once
count = len(results)
print(f'Updated {count} items.')

In [None]:
# wrap each label in an extra list level (double nested array) per AlphaPilot spec
for column in list(df.keys()):
    first_entry = df[column][0]
    df[column] = [[first_entry]]

In [None]:
# write out clean json file with all the community labels

# convert dataframe to JSON string and strip out outer '[]'
json_str = df.to_json(orient='records')[1:-1]

# write to file; the with-block closes the handle even if the write fails
with open(labels_dir/'training_community_labels.json', 'w', encoding='utf-8') as f:
    f.write(json_str)

## DEBUGGING

In [None]:
df['IMG_1571.JPG']

In [None]:
# find ones out of order
results = []
for key, value in df.items():
    coords = value[0]
    # Skip labels that do not hold exactly 8 numbers; testing them would
    # reuse x1..y4 from an earlier label (or fail if none was unpacked yet).
    if len(coords) != 8:
        continue
    x1, y1, x2, y2, x3, y3, x4, y4 = coords
    # flag the label if any neighbouring pair of corners violates the
    # expected ordering
    if x1 > x2 or y2 > y3 or x3 < x4 or y4 < y1:
        results.append(key)

In [None]:
# takes a filename and returns correctly ordered coordinates
def fix_old(fname):
    """Return the coordinates stored under ``fname`` reordered by sum/difference.

    Reads the first entry of column ``fname`` from the global ``df``, views
    it as four (x, y) rows and returns a flat list of the rows picked, in
    order, by: smallest x + y, largest x - y, largest x + y, smallest x - y.
    """
    corners = np.array(df[fname][0]).reshape(4, 2)
    sums = corners.sum(axis=1)               # x + y per point
    diffs = corners[:, 0] - corners[:, 1]    # x - y per point

    # row index of each corner, in output order
    order = [sums.argmin(), diffs.argmax(), sums.argmax(), diffs.argmin()]

    # select the four rows in that order and flatten to x, y, x, y, ...
    return list(corners[order].ravel())