In [1]:
import pandas as pd
import numpy as np

## Sample Submission

In [5]:
sample_sub = pd.read_csv("./datasets/train_val/sample_submission.csv")

In [6]:
sample_sub.describe()

Unnamed: 0,ImageId,EncodedPixels
count,88486,88486
unique,88486,1
top,9d3cd0de1.jpg,1 2
freq,1,88486


In [10]:
sample_sub[sample_sub["ImageId"] == "9d3cd0de1.jpg"]

Unnamed: 0,ImageId,EncodedPixels
54267,9d3cd0de1.jpg,1 2


In [41]:
# 14 unwanted test images
unwanted_test_images = ['13703f040.jpg',
 '14715c06d.jpg',
 '33e0ff2d5.jpg',
 '4d4e09f2a.jpg',
 '877691df8.jpg',
 '8b909bb20.jpg',
 'a8d99130e.jpg',
 'ad55c3143.jpg',
 'c8260c541.jpg',
 'd6c7f17c7.jpg',
 'dc3e7c901.jpg',
 'e44dffe88.jpg',
 'ef87bad36.jpg',
 'f083256d8.jpg',]

In [43]:
print("Submission ids", len(sample_sub["ImageId"].tolist()))
print("Unwanted", len(unwanted_test_images))

# Check if any unwanted images are in sample submission csv
len(np.setdiff1d(sample_sub['ImageId'].unique(), unwanted_test_images, assume_unique=True))


Submission ids 88486
Unwanted 14


88486

## Actual Submission

In [3]:
sub_blanks = pd.read_csv("./submission_20180817T1649.csv")
sub_blanks.head()

Unnamed: 0,ImageId,EncodedPixels
0,0001124c7.jpg,
1,000194a2d.jpg,
2,0001b1832.jpg,
3,00052ed46.jpg,
4,000532683.jpg,569080 1 569846 5 570613 8 571380 10 572148 10...


## Corrected Submission

In [5]:
df = sub_blanks.replace(r'^\s*$', np.nan, regex=True)
df.head()

In [4]:
# df.to_csv("submission_20180817T1649_2.csv", index=False)

## Remove duplicate pixels

In [140]:
def rle_encode(img):
    '''
    img: numpy array, 1 - mask, 0 - background
    Returns run length as string formated
    '''
    pixels = img.T.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

def rle_decode(mask_rle, shape=(768, 768)):
    '''
    mask_rle: run-length as string formated (start length)
    shape: (height,width) of array to return
    Returns numpy array, 1 - mask, 0 - background

    '''
    s = mask_rle.split()
    starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
    starts -= 1
    ends = starts + lengths
    img = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo:hi] = 1
    return img.reshape(shape).T

In [11]:
sub_df = pd.read_csv("~/Downloads/submission_20180818T1208.csv")
sub_df.describe()
sub_df.head()

Unnamed: 0,ImageId,EncodedPixels
0,0001124c7.jpg,
1,000194a2d.jpg,1565 3 1594 1 2322 124 2450 9 3087 144 3852 15...
2,000194a2d.jpg,362787 1 363554 4 364321 5 365089 6 365857 7 3...
3,000194a2d.jpg,259770 8 260537 9 261305 9 262073 9 262841 9 2...
4,0001b1832.jpg,


In [128]:
multiple_instances_mask = sub_df.duplicated(subset="ImageId", keep=False)
multiple_instances_df = sub_df[dup_mask]
multiple_instances_df.head()

Unnamed: 0,ImageId,EncodedPixels
1,000194a2d.jpg,1565 3 1594 1 2322 124 2450 9 3087 144 3852 15...
2,000194a2d.jpg,362787 1 363554 4 364321 5 365089 6 365857 7 3...
3,000194a2d.jpg,259770 8 260537 9 261305 9 262073 9 262841 9 2...
8,0005d6d95.jpg,265139 6 265906 9 266673 11 267439 13 268206 1...
9,0005d6d95.jpg,265911 3 266678 5 267445 7 268213 7 268980 9 2...


In [127]:
example_mask = multiple_instances_df["ImageId"] == "000194a2d.jpg"
image_id_df = multiple_instances_df[example_mask]
print(image_id_df)

         ImageId                                      EncodedPixels
1  000194a2d.jpg  1565 3 1594 1 2322 124 2450 9 3087 144 3852 15...
2  000194a2d.jpg  362787 1 363554 4 364321 5 365089 6 365857 7 3...
3  000194a2d.jpg  259770 8 260537 9 261305 9 262073 9 262841 9 2...


In [189]:
unique_ids_multiple_instances = multiple_instances_df.ImageId.unique()
print("Checking ", len(unique_ids_multiple_instances), "unique images")
out_pred_rows = []
for image_id in unique_ids_multiple_instances:
    
#     print(image_id)
    # Image masks in RLE
    img_masks = multiple_instances_df.loc[multiple_instances_df['ImageId'] == image_id, 'EncodedPixels'].tolist()
#     print("Initial", img_masks)
    
    # Mask array placeholder
    mask_array = np.zeros([768, 768, len(img_masks)],dtype=np.uint8)

    # Build mask array
    for index, mask in enumerate(img_masks):
        mask_array[:,:,index] = rle_decode(mask, [768, 768])

#     print("mask_array shape", mask_array.shape)
    loop_range = np.array(mask_array).shape[-1] - 1
#     print("loop_range", np.arange(loop_range))
    
    # Check for overlap and remove overlapped pixels
    for i in np.arange(loop_range):
#         print("i: ", i)
        mask = mask_array[:,:,i]
        
        loop_j = np.arange(loop_range-i)+i+1
#         print(loop_j)
        for j in loop_j:
#             print("j: ", j)
            next_mask = mask_array[:,:,j]
            
            index_of_overlap = np.logical_and(mask, next_mask)
            if any(index_of_overlap.flatten()):
                    print("OVERLAP ", image_id)
                    next_mask[index_of_overlap] = 0
#             else: 
#                 print("NO OVERLAP")
        
    # Convert back into RLE encoding
    re_encoded_to_rle_list = []
    for i in np.arange(np.array(mask_array).shape[-1]):
        boolean_mask = mask_array[:,:,i]
        re_encoded_to_rle = rle_encode(boolean_mask)
        re_encoded_to_rle_list.append(re_encoded_to_rle)

#     print("---------------------")

#     print("Ending", re_encoded_to_rle_list)
    
    if len(re_encoded_to_rle_list) == 0:
        out_pred_rows += [{'ImageId': image_id, 'EncodedPixels': None}]
    else:
        for rle_mask in re_encoded_to_rle_list:
            out_pred_rows += [{'ImageId': image_id, 'EncodedPixels': rle_mask}]
            
no_overlap_df = pd.DataFrame(out_pred_rows)[['ImageId', 'EncodedPixels']]
    
#     print(no_overlap_df)
    
    
    

Checking  10065 unique images
---------------------
OVERLAP  0005d6d95.jpg
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
OVERLAP  006da0c7b.jpg
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------
---------------------


KeyError: 'the label [EncodedPixels] is not in the [columns]'

In [179]:
        
np.arange(2)+1
        

array([1, 2])

In [175]:
a1 = np.array([[1,1,0,0],
               [1,1,0,0]])
a2 = np.array([[0,0,0,0],
               [0,1,1,0]])
a3 = np.array([[0,1,1,0],
               [0,0,0,0]])
masks = np.zeros([2,4,3])
masks[:,:,0] = a1
masks[:,:,1] = a2
masks[:,:,2] = a3
masks.shape

(2, 4, 3)

In [176]:
# print("Masks", masks)
for index, mask in enumerate(masks):
    for next_mask in masks[index+1:]:
        index_of_overlap = np.logical_and(mask, next_mask)
        if any(index_of_overlap.flatten()):
            next_mask[index_of_overlap] = 0
    
print(masks)

[[[ 1.  0.  0.]
  [ 1.  0.  1.]
  [ 0.  0.  1.]
  [ 0.  0.  0.]]

 [[ 0.  0.  0.]
  [ 0.  1.  0.]
  [ 0.  1.  0.]
  [ 0.  0.  0.]]]


In [80]:
index_of_overlap = np.logical_and(a1,a2,a3)
index_of_overlap

a2[index_of_overlap] = 0
a3[index_of_overlap] = 0

In [81]:
print(a1)
print(a2)
print(a3)

[[0 0 0]
 [1 0 0]]
[[0 0 0]
 [0 0 0]]
[[0 0 0]
 [0 0 0]]
