# Parse LabelMe Annotations and Images into Masks, train.csv, val.csv, and test.csv

## Goal: 
1. Download the collection folder from Google Drive.
2. Create a separate folder to store 'masks', 'labels', and 'images'.
3. Create the train, validation, and test CSV files.

# 1. Import Essential Libraries

In [1]:
import numpy as np
import os
from PIL import Image, ImageDraw
import cv2
import xmltodict
import sys
import errno
import shutil
import csv
import random
import gdown
from zipfile import ZipFile

In [2]:
def check_folder_exist(directory):
    """Report whether ``directory`` exists and is a directory.

    A warning is printed when it does not exist (or is not a directory).

    Args:
        directory: path to check.

    Returns:
        True when ``directory`` is an existing directory, False otherwise.
    """
    # os.path.isdir never raises for a missing path, so no blanket ``except``
    # is needed, and unlike os.stat it rejects regular files.
    exists = os.path.isdir(directory)
    if not exists:
        print('folder: ', directory, ' is not existed, please check')
    return exists

def check_and_create_folder(directory):
    """Create ``directory`` unless something already exists at that path.

    When the path already exists a notice is printed and nothing is changed;
    the existing content is never removed.

    Args:
        directory: path of the folder to create. Missing parent folders are
            created as well.

    Raises:
        OSError: if the folder cannot be created.
    """
    if os.path.exists(directory):
        print('folder: ', directory, 'is existed, do you want to remove it')
        return

    # makedirs (rather than mkdir) so a nested target such as
    # "<output>/masks" works even when "<output>" is not there yet.
    os.makedirs(directory)
    print('create ', directory)
        
# Copy Images and Labels dir to target dir
def copy_folder(src, dest):
    """Copy ``src`` to ``dest``, echoing both paths first.

    ``src`` is copied recursively as a directory tree. When that fails
    because ``src`` is not a directory, it is copied as a single file
    instead. Any other OSError (for example ``dest`` already existing) is
    printed and otherwise ignored.

    Args:
        src: source directory (or file) path.
        dest: destination path.
    """
    try:
        for path in (src, dest):
            print (path)
        shutil.copytree(src, dest)
    except OSError as copy_error:
        if copy_error.errno != errno.ENOTDIR:
            print('Directory not copied. Error: %s' % copy_error)
            return
        # The source was a plain file rather than a directory.
        shutil.copy(src, dest)

In [3]:
def xml2mask(xml_path, product_list, height, width):
    """
    Convert an xml annotation file to a single-channel label image.

    Every object that is not marked as deleted and whose name appears in
    'product_list' has its polygon filled with
    ``product_list.index(name) + 1``. All other pixels stay 0 (background).
    When polygons overlap, objects later in the file overwrite earlier ones.

    Args:
        xml_path: path of the xml annotation file
        product_list: list of class names; a class at position i is drawn
            with pixel value i + 1
        height: height of the generated mask in pixels
        width: width of the generated mask in pixels
    Returns:
        uint8 numpy array of shape (height, width). It is all zeros when the
        xml file does not exist or holds no matching object.
    """
    # start from an all-background mask
    mask = np.zeros([height, width], dtype=np.uint8)

    if not os.path.isfile(xml_path):
        print (xml_path, 'does not exist')
        return mask

    with open(xml_path) as fd:
        label_dict = xmltodict.parse(fd.read())

    if 'object' not in label_dict['annotation']:
        return mask

    # xmltodict yields a single mapping for one <object> element and a list
    # for several; normalise to a list.
    objects = label_dict['annotation']['object']
    if not isinstance(objects, list):
        objects = [objects]

    for object_ in objects:
        # Handle failures per object so one malformed entry does not drop
        # every object that follows it. Best effort: report and continue.
        try:
            if object_['name'] not in product_list or object_['deleted'] != '0':
                continue

            points = object_['polygon']['pt']
            if not isinstance(points, list):
                # a lone <pt> element is not wrapped in a list either
                points = [points]

            # int(float(...)) also accepts coordinates written as "12.0"
            poly_vertice = np.array(
                [[int(float(pt['x'])), int(float(pt['y']))] for pt in points],
                np.int32)
            object_index = product_list.index(object_['name']) + 1

            # fillPoly handles arbitrary (including concave) outlines;
            # fillConvexPoly is only correct for convex/monotone polygons.
            cv2.fillPoly(mask, [poly_vertice], object_index)
        except Exception as e:
            print (e)

    return mask

## Download the dataset using the "gdown" Python package

In [4]:
# Dataset configuration -- adjust these values to match your own collection.
collection_url = 'https://drive.google.com/uc?id=1bULPUhuQ6BHjVgn_owuswF9qrOCC1fLO'
collection_name = 'shoes_collection'
source_annotations_dir = os.path.join(collection_name, 'Annotations/users/lab605/shoes_new')
source_images_dir = os.path.join(collection_name, 'Images/users/lab605/shoes_new')
target_output_dir = 'shoes_dataset_folder'
# Position in this list + 1 is the pixel value used for the class in a mask.
classes_list = ['left_shoe', 'right_shoe']
# Size of the generated masks, in pixels.
width = 640
height = 480
# NOTE(review): training_ratio / val_ratio appear unused in the visible
# cells; the split below reads train_val_test_ratio. Kept in case other
# code relies on them.
training_ratio = 0.8
val_ratio = 0.1
# Fractions of the dataset for [train, validation, test].
train_val_test_ratio = [0.8, 0.1, 0.1]


# Download and unpack the archive only when the collection folder is missing,
# so re-running the cell does not fetch the data again.
if not os.path.isdir(collection_name):
    zip_path = collection_name + '.zip'
    gdown.download(collection_url, output=zip_path, quiet=False)
    # The context manager closes the archive even if extraction fails.
    with ZipFile(zip_path) as zip1:
        zip1.extractall('./')
# print() call (not a print statement) so the cell also runs on Python 3.
print('Finished downloading dataset.')

Finished downloading dataset.


# 2. Set Up Paths of Annotations, Images, Output Folder, and Classes

In [5]:
# do not modify
# Output sub-folders; all of them live directly under target_output_dir.
target_masks_dir, target_images_dir, target_labels_dir = (
    os.path.join(target_output_dir, 'masks'),
    os.path.join(target_output_dir, 'images'),
    os.path.join(target_output_dir, 'labels'),
)

# Report source folders that are missing.
for source_dir in (source_annotations_dir, source_images_dir):
    check_folder_exist(source_dir)

# The output root must be created before the masks folder inside it.
for new_dir in (target_output_dir, target_masks_dir):
    check_and_create_folder(new_dir)

# 'labels' and 'images' are not created above: copy_folder creates them
# while copying the annotation and image folders.
for source_dir, target_dir in (
        (source_annotations_dir, target_labels_dir),
        (source_images_dir, target_images_dir)):
    copy_folder(source_dir, target_dir)

('folder: ', 'shoes_dataset_folder', 'is existed, do you want to remove it')
('folder: ', 'shoes_dataset_folder/masks', 'is existed, do you want to remove it')
shoes_collection/Annotations/users/lab605/shoes_new
shoes_dataset_folder/labels
Directory not copied. Error: [Errno 17] File exists: 'shoes_dataset_folder/labels'
shoes_collection/Images/users/lab605/shoes_new
shoes_dataset_folder/images
Directory not copied. Error: [Errno 17] File exists: 'shoes_dataset_folder/images'


# 4. Create Mask Images

In [6]:
# Write one mask PNG into target_masks_dir for every xml annotation found in
# target_labels_dir. The mask keeps the annotation's base name.
label_xmls = os.listdir(target_labels_dir)
for xml_name in sorted(label_xmls):
    # splitext only strips the real extension; splitting on "xml" would
    # truncate any file whose name itself contains "xml".
    xml_stem, xml_extension = os.path.splitext(xml_name)
    if xml_extension.lower() != '.xml':
        # not an annotation file (e.g. a hidden/system file or a sub-folder)
        continue

    # create mask depending on xml
    xml_path = os.path.join(target_labels_dir, xml_name)
    mask = xml2mask(xml_path, classes_list, height, width)

    # create mask file path
    mask_file_name = xml_stem + '.png'
    save_path = os.path.join(target_masks_dir, mask_file_name)

    # PNG is lossless, which keeps the class indices stored in the mask
    # exact; a JPEG quality flag has no meaning for this format.
    if not cv2.imwrite(save_path, mask):
        print('failed to write mask: ' + save_path)

### At this point, a new dataset folder has been created in the project root.
### Inside the dataset folder there are 3 subfolders: 'labels', 'masks', and 'images'.

# 5. Randomly Generate Train, Validation, and Test CSV Files

In [7]:
def _write_pairs_csv(csv_path, pairs):
    """Write (image, mask) path pairs to ``csv_path``, one pair per row."""
    # The csv module needs newline translation switched off on the file:
    # binary mode on Python 2, newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(csv_path, 'w', newline='')
    else:
        csvfile = open(csv_path, 'wb')
    with csvfile:
        csv.writer(csvfile).writerows(pairs)


image_files = os.listdir(target_images_dir)
mask_files = os.listdir(target_masks_dir)

# Index the masks by base name so each image is matched with one lookup.
mask_by_stem = {}
for mask_file in mask_files:
    mask_stem, mask_extension = os.path.splitext(mask_file)
    if mask_extension == '.png':
        mask_by_stem[mask_stem] = mask_file

# Match images and masks into dataset. Both base names come from splitext so
# that file names containing extra dots still pair up. Each entry is an
# (image path, mask path) tuple relative to target_output_dir.
dataset = []
for image_file in sorted(image_files):
    mask_file = mask_by_stem.get(os.path.splitext(image_file)[0])
    if mask_file is not None:
        dataset.append(('images/' + image_file, 'masks/' + mask_file))

# Compare with a tolerance: exact float equality rejects valid ratios such as
# [0.7, 0.2, 0.1], whose sum is 0.9999999999999999.
if abs(sum(train_val_test_ratio) - 1.0) > 1e-9:
    raise ValueError('The sum of train_val_test_ratio have to be 1.0')

train_offset = 0
val_offset = int(np.floor(len(dataset) * train_val_test_ratio[0]))
test_offset = val_offset + int(np.floor(len(dataset) * train_val_test_ratio[1]))

# Shuffle, then cut into three consecutive slices.
random.shuffle(dataset)
train_dataset = dataset[train_offset:val_offset]  # first slice: training set
val_dataset = dataset[val_offset:test_offset]     # second slice: validation set
test_dataset = dataset[test_offset:]              # remainder: test set

print('train_dataset_amount:\t{}\nval_dataset_amount:\t{}\ntest_dataset_amount:\t{}\n'.format(
    len(train_dataset), len(val_dataset), len(test_dataset)))

# write train.csv, val.csv and test.csv
_write_pairs_csv(os.path.join(target_output_dir, 'train.csv'), train_dataset)
_write_pairs_csv(os.path.join(target_output_dir, 'val.csv'), val_dataset)
_write_pairs_csv(os.path.join(target_output_dir, 'test.csv'), test_dataset)

train_dataset_amount:	80
val_dataset_amount:	10
test_dataset_amount:	10

