In [1]:
import numpy as np
import os
from xml.etree import ElementTree
import json



### 打标签工具
[项目地址（labelImg）](https://github.com/tzutalin/labelImg)

![image](http://raw.githubusercontent.com/tzutalin/labelImg/master/demo/demo3.jpg)

```
Ctrl + u	Load all of the images from a directory
Ctrl + r	Change the default annotation target dir
Ctrl + s	Save
Ctrl + d	Copy the current label and rect box
Space	Flag the current image as verified
w	Create a rect box
d	Next image
a	Previous image
del	Delete the selected rect box
Ctrl++	Zoom in
Ctrl--	Zoom out
↑→↓←	Keyboard arrows to move selected rect box
```

In [2]:
# Change this to the root directory of your own dataset.
base_dir = '/Users/mac/tmp/ssd_data4/'

# Input folders: images and their XML annotations.
image_save_path = base_dir + 'JPEGImages/'
annotations_save_path = base_dir + 'Annotations/'

# Fractions of the data reserved for validation and test.
val_ratio = 0.2
test_ratio = 0.05

# Class names; the position in this list is the class id.
label_lists = ['little', 'big']

# Output folders for the split listings and the generated JSON annotations.
layout_path = base_dir + 'Imagesets/Layout/'
main_path = base_dir + 'Imagesets/Main/'
json_save_path = base_dir + 'json/'

# Dataset layout required by SageMaker.
sagemaker_path = base_dir + 'sagemaker/'
train_path = sagemaker_path + 'train/'
train_annotation_path = sagemaker_path + 'train_annotation/'
validation_path = sagemaker_path + 'validation/'
validation_annotation_path = sagemaker_path + 'validation_annotation/'

# os.makedirs also creates missing parents (e.g. 'Imagesets/'), which a plain
# os.mkdir would fail on, and exist_ok makes the cell safe to re-run.
for _output_dir in (
        layout_path,
        main_path,
        json_save_path,
        sagemaker_path,
        train_path,
        train_annotation_path,
        validation_path,
        validation_annotation_path,
):
    os.makedirs(_output_dir, exist_ok=True)



In [3]:
class XML_preprocessor(object):
    """Convert a directory of XML bounding-box annotations into training data.

    Every non-hidden ``.xml`` file in ``data_path`` is read.  Each file must
    provide ``filename``, ``size/width``, ``size/height`` and may contain any
    number of ``object`` elements with a ``name`` and a ``bndbox``
    (``xmin``/``ymin``/``xmax``/``ymax``).

    On construction all annotations are parsed into ``self.data``: a dict that
    maps the image file name to an array with one row per object,
    ``[xmin, ymin, xmax, ymax, <one-hot class>]``, where the coordinates are
    divided by the image width/height.

    Args:
        data_path: Directory that contains the XML annotation files.
        label_list: Class names; the index of a name is its class id.
        json_save_path: Directory in which ``to_json`` writes its JSON files.

    Raises:
        ValueError: If an annotation uses a class name missing from
            ``label_list``.
    """

    def __init__(self, data_path, label_list, json_save_path):
        self.path_prefix = data_path
        self.num_classes = len(label_list)
        self.data = dict()
        self._label_list = label_list
        self._json_save_path = json_save_path
        self._preprocess_XML()

    def _annotation_files(self):
        """Return the sorted names of the non-hidden ``.xml`` files to process."""
        return sorted(
            name for name in os.listdir(self.path_prefix)
            if name.endswith('.xml') and not name.startswith('.')
        )

    def _class_index(self, name):
        """Return the class id of ``name``; raise ValueError for unknown labels."""
        # list.index() raises instead of returning a negative number, so the
        # membership test is what actually detects an unknown label.
        if name not in self._label_list:
            raise ValueError(
                'Annotations 中的label 和配置文件中 不一致 unknown label: %s' % name)
        return self._label_list.index(name)

    @staticmethod
    def _last_bndbox(object_tree):
        """Return the last ``bndbox`` element below ``object_tree``, or None."""
        boxes = list(object_tree.iter('bndbox'))
        return boxes[-1] if boxes else None

    def _preprocess_XML(self):
        """Parse every annotation file and fill ``self.data``."""
        for filename in self._annotation_files():
            # os.path.join works whether or not path_prefix ends with a slash.
            tree = ElementTree.parse(os.path.join(self.path_prefix, filename))
            root = tree.getroot()
            size_tree = root.find('size')
            width = float(size_tree.find('width').text)
            height = float(size_tree.find('height').text)

            bounding_boxes = []
            one_hot_classes = []
            for object_tree in root.findall('object'):
                box = self._last_bndbox(object_tree)
                if box is None:
                    # Same fallback as to_json(): an object without a box gets
                    # an all-zero box instead of reusing the previous object's
                    # coordinates.
                    bounding_boxes.append([0.0, 0.0, 0.0, 0.0])
                else:
                    bounding_boxes.append([
                        float(box.find('xmin').text) / width,
                        float(box.find('ymin').text) / height,
                        float(box.find('xmax').text) / width,
                        float(box.find('ymax').text) / height,
                    ])
                class_name = object_tree.find('name').text
                one_hot_classes.append(self._to_one_hot(class_name))

            image_name = root.find('filename').text
            self.data[image_name] = np.hstack(
                (np.asarray(bounding_boxes), np.asarray(one_hot_classes)))

    def _to_one_hot(self, name):
        """Return a list of ``num_classes`` zeros with a 1 at the class id of ``name``.

        Raises:
            ValueError: If ``name`` is not in the configured label list.
        """
        one_hot_vector = [0] * self.num_classes
        one_hot_vector[self._class_index(name)] = 1
        return one_hot_vector

    def _save_file(self, json_object, path):
        """Serialize ``json_object`` as JSON into the file at ``path``."""
        with open(path, "w") as f:
            json.dump(json_object, f)

    def to_json(self):
        """Write one JSON annotation file per XML file and list all objects.

        Each JSON file is named after the image (extension removed) and holds
        ``file``, ``image_size`` (a one-element list with ``width``,
        ``height`` and a fixed ``depth`` of 3), ``annotations`` (``class_id``,
        ``top``, ``left``, ``width``, ``height`` per object, in pixels) and
        ``categories`` (``class_id`` and ``name`` per object).

        Returns:
            A list with one dict per annotated object:
            ``{'name': <image name without extension>, 'label': class_id + 1}``.

        Raises:
            ValueError: If an annotation uses a class name missing from the
                configured label list.
        """
        train_val_list = list()

        for filename in self._annotation_files():
            tree = ElementTree.parse(os.path.join(self.path_prefix, filename))
            root = tree.getroot()
            size_tree = root.find('size')
            image_name = root.find('filename').text
            # splitext keeps dots that are part of the name ('a.b.jpg' -> 'a.b').
            image_stem = os.path.splitext(image_name)[0]

            json_object = dict()
            json_object['file'] = image_name

            width = int(size_tree.find('width').text)
            height = int(size_tree.find('height').text)

            annotations = list()
            categories = list()

            for object_tree in root.findall('object'):
                # An object without a bndbox keeps this all-zero box.
                _top = 0
                _left = 0
                _width = 0
                _height = 0

                box = self._last_bndbox(object_tree)
                if box is not None:
                    _top = int(box.find('ymin').text)
                    _left = int(box.find('xmin').text)
                    _width = int(box.find('xmax').text) - _left
                    _height = int(box.find('ymax').text) - _top

                class_name = object_tree.find('name').text
                class_id = self._class_index(class_name)

                annotations.append({
                    'class_id': class_id,
                    'top': _top,
                    'left': _left,
                    'width': _width,
                    'height': _height,
                })
                categories.append({'class_id': class_id, 'name': class_name})

                # Labels in the split listings are 1-based.
                train_val_list.append({'name': image_stem, 'label': class_id + 1})

            json_object['image_size'] = [
                {'width': width, 'height': height, 'depth': 3}]
            json_object['annotations'] = annotations
            json_object['categories'] = categories

            path = os.path.join(self._json_save_path, image_stem + '.json')
            self._save_file(json_object, path)

        return train_val_list



In [4]:

# Build the preprocessor; its constructor immediately parses every annotation
# XML under annotations_save_path into xml.data.
xml = XML_preprocessor(annotations_save_path, label_lists, json_save_path)


In [5]:
# Write one JSON file per annotation into json_save_path and collect one
# {'name', 'label'} entry per annotated object.
all_data = xml.to_json()

In [6]:
import random

# Fixed seed so that re-running the notebook reproduces the same split.
split_seed = 42

data_length = len(all_data)
print('length: ', len(all_data))

# all_data holds one entry per annotated object, so an image with several
# objects appears several times.  Split on the unique image names instead of
# on the entries, otherwise the same image can land in more than one split.
image_names = sorted({entry['name'] for entry in all_data})
random.Random(split_seed).shuffle(image_names)

val_count = int(len(image_names) * val_ratio)
test_count = int(len(image_names) * test_ratio)

val_names = set(image_names[:val_count])
test_names = set(image_names[val_count:val_count + test_count])

val_list = [entry for entry in all_data if entry['name'] in val_names]
test_list = [entry for entry in all_data if entry['name'] in test_names]
train_list = [
    entry for entry in all_data
    if entry['name'] not in val_names and entry['name'] not in test_names
]

print('val   :', len(val_list))
print('test  :', len(test_list))
print('train :', len(train_list))


def dict_to_set(list_dict):
    """Return the set of distinct ``'name'`` values found in ``list_dict``.

    Args:
        list_dict: Iterable of dicts that each have a ``'name'`` key.
    """
    return {entry['name'] for entry in list_dict}


# NOTE(review): the returned set is not assigned to anything here, so this
# call has no effect on later cells; it looks like a leftover check.
dict_to_set(val_list)

def write_list_to_file(_list, file_path):
    """Write one ``"<name> <label>"`` line per entry of ``_list`` to ``file_path``.

    Args:
        _list: Iterable of dicts with ``'name'`` and ``'label'`` keys.
        file_path: Destination file; it is overwritten.
    """
    with open(file_path, "w") as out:
        out.writelines(
            '{} {}\n'.format(entry['name'], entry['label']) for entry in _list
        )

def write_set_to_file(_set, file_path):
    """Write every element of ``_set`` on its own line to ``file_path``.

    The elements are written in sorted order (by their string form) so that
    the file content does not depend on the iteration order of the set.

    Args:
        _set: Iterable of values to write, typically a set of image names.
        file_path: Destination file; it is overwritten.
    """
    with open(file_path, "w") as f:
        for i in sorted(_set, key=str):
            f.write('{}\n'.format(i))
            
            




length:  264
val   : 52
test  : 13
train : 199


### 将数据写入到sagemaker 文件夹里， 给sagemaker 训练使用

In [7]:
from shutil import copyfile

train_set = dict_to_set(train_list)
val_set = dict_to_set(val_list)
test_set = dict_to_set(test_list)

# For each split: copy every image and its JSON annotation into the matching
# image / annotation folder.  Only train and validation are copied; test_set
# is computed but not written anywhere in this cell.
for split_names, image_dir, annotation_dir in (
        (train_set, train_path, train_annotation_path),
        (val_set, validation_path, validation_annotation_path),
):
    for item in split_names:
        copyfile('{}{}.jpg'.format(image_save_path, item),
                 '{}{}.jpg'.format(image_dir, item))
        copyfile('{}{}.json'.format(json_save_path, item),
                 '{}{}.json'.format(annotation_dir, item))


In [8]:
# Show the image names held out for testing.
print(test_set)

{'0002', '0045', '0101', '0025', '0001', '0163', '0029', '0183', '0121', '0135', '0082', '0168', '0019'}


In [9]:
# Which split goes into which listing file.
# NOTE(review): val_list is written to 'trainval.txt' and test_list to
# 'val.txt'; in the Pascal VOC convention 'trainval' is train + val, so
# confirm that this mapping is what the downstream training code expects.
split_files = (
    (train_list, 'train.txt'),
    (val_list, 'trainval.txt'),
    (test_list, 'val.txt'),
)

# Layout listings: one "<name> <label>" line per annotated object.
for split_entries, split_file in split_files:
    write_list_to_file(split_entries, layout_path + split_file)

# Main listings: one line per distinct image name.
for split_entries, split_file in split_files:
    write_set_to_file(dict_to_set(split_entries), main_path + split_file)

