# Dataset Preparation
I will merge the Kaggle training data and the Stanford ImageNet Dogs dataset into a single training set for this competition.
The Kaggle dataset can be downloaded with kaggle-cli.

In [1]:
import os
import pandas as pd
import shutil
import xml.etree.ElementTree
from mxnet import gluon
from collections import Counter
import math

In [2]:
from mxnet.gluon.model_zoo import vision as models
import mxnet as mx

In [3]:
# Root folder of the Stanford dogs data and its image / annotation sub-folders.
stanford_dataset_dir = "/home/ubuntu/nbs/data/ImageNetDogs"
stanford_images_root_dir, stanford_annotations_root_dir = (
    os.path.join(stanford_dataset_dir, sub_dir)
    for sub_dir in ('Images', 'Annotation')
)

In [4]:
# Root folder of the Kaggle data, followed by the label CSV, the sample
# submission CSV and the train / test image folders inside it.
# NOTE(review): the test folder is spelled "tests" here - confirm it matches
# the folder name on disk.
kaggle_dataset_dir = "/home/ubuntu/nbs/data/dog-breed-identification"
(
    kaggle_labels_file_path,
    kaggle_submission_file_path,
    kaggle_train_images_root_dir,
    kaggle_tests_images_root_dir,
) = (
    os.path.join(kaggle_dataset_dir, entry)
    for entry in ("labels.csv", "sample_submission.csv", "train", "tests")
)

In [5]:
def parse_stanford_annotation(path):
    """Read one annotation XML file and summarise its first ``object`` element.

    Args:
        path: Location of the annotation XML file to parse.

    Returns:
        A dict holding the lower-cased ``name`` text of the first ``object``
        under ``'breed'``, plus the raw text of that object's ``bndbox``
        children under ``'bndbox_xmin'``, ``'bndbox_ymin'``, ``'bndbox_xmax'``
        and ``'bndbox_ymax'`` (kept as strings, not converted to numbers).

    Raises:
        IndexError: If one of the expected elements is absent.
    """
    def first_child(node, tag):
        # Only the first matching direct child is ever used.
        return node.findall(tag)[0]

    document_root = xml.etree.ElementTree.parse(path).getroot()
    first_object = first_child(document_root, 'object')

    summary = {'breed': first_child(first_object, 'name').text.lower()}
    box = first_child(first_object, 'bndbox')
    for corner in ('xmin', 'ymin', 'xmax', 'ymax'):
        summary['bndbox_' + corner] = first_child(box, corner).text
    return summary

In [6]:
def get_stanford_image_path(breed_dir, filename):
    """Build the path of ``<filename>.jpg`` inside ``breed_dir`` under the Stanford images root.

    Args:
        breed_dir: Name of the breed sub-folder below ``stanford_images_root_dir``.
        filename: Image file name without the ``.jpg`` extension.

    Returns:
        The joined path string; the file is not checked for existence.
    """
    jpg_name = filename + '.jpg'
    return os.path.join(stanford_images_root_dir, breed_dir, jpg_name)

# Place the Stanford and Kaggle images into per-breed folders

In [7]:
# Output folders that will receive the train / validation / test symlinks.
train_img_dir = 'input/train'
valid_img_dir = 'input/valid'
test_img_dir = 'input/test'

# Remove whatever an earlier run left behind so each folder is rebuilt from scratch.
for stale_dir in (train_img_dir, valid_img_dir, test_img_dir):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

In [8]:
# Fraction used to size the validation split: it is multiplied by an image
# count and floored to get the number of validation images per breed.
valid_ratio = 0.3

In [9]:
# Create symlinks for the Stanford dataset images, splitting every breed
# folder between the validation and the training directories.
for breed_dir in sorted(os.listdir(stanford_annotations_root_dir)):
    breed_annotations_dir = os.path.join(stanford_annotations_root_dir, breed_dir)

    # The validation quota is derived from the number of image files of the breed.
    num_image = len(os.listdir(os.path.join(stanford_images_root_dir, breed_dir)))
    num_valid = math.floor(num_image * valid_ratio)

    # os.listdir returns entries in arbitrary order; sorting makes the
    # train/validation assignment identical on every run.
    for cnt_image, annotation_file in enumerate(sorted(os.listdir(breed_annotations_dir))):
        annotation = parse_stanford_annotation(os.path.join(breed_annotations_dir, annotation_file))
        breed = annotation['breed']

        image_path = get_stanford_image_path(breed_dir, annotation_file)
        image_name = os.path.basename(image_path).split('.')[0]

        # Exactly the first num_valid images (zero-based index < num_valid) go
        # to validation. The previous `cnt_image > num_valid` test put
        # num_valid + 1 images there.
        if cnt_image < num_valid:
            breed_img_dir = '%s/%s' % (valid_img_dir, breed)
        else:
            breed_img_dir = '%s/%s' % (train_img_dir, breed)

        if not os.path.exists(breed_img_dir):
            os.makedirs(breed_img_dir)
        os.symlink(image_path, '%s/%s.jpg' % (breed_img_dir, image_name))

In [10]:
# Create symlinks for the Kaggle dataset training images.
kg_labels = pd.read_csv(kaggle_labels_file_path)

# Image count of the rarest breed; the per-breed validation quota is derived
# from it, so every breed contributes the same number of validation images.
min_num_train_per_label = min(Counter(kg_labels['breed']).values())
# Number of images of each breed that go to the validation set.
num_valid_per_label = math.floor(min_num_train_per_label * valid_ratio)
# Running count of images seen so far for each breed.
breed_count = Counter()

# Rows are unpacked positionally as (image id, breed).
for image_name, breed in kg_labels.itertuples(index=False):
    breed_count[breed] += 1
    if breed_count[breed] > num_valid_per_label:
        # Validation quota for this breed is already filled: link into train.
        breed_imgs_dir = '%s/%s' % (train_img_dir, breed)
    else:
        # Still filling the validation quota: link into valid.
        breed_imgs_dir = '%s/%s' % (valid_img_dir, breed)
    if not os.path.exists(breed_imgs_dir):
        os.makedirs(breed_imgs_dir)
    os.symlink('%s/%s.jpg' % (kaggle_train_images_root_dir, image_name), '%s/%s.jpg' % (breed_imgs_dir, image_name))

In [11]:
# Create symlinks for the Kaggle dataset test images.
kg_submission = pd.read_csv(kaggle_submission_file_path)

# Every test image is linked into one single sub-folder named '0'; the target
# folder never changes, so it is computed and created once, before the loop.
breed = '0'
breed_imgs_dir = '%s/%s' % (test_img_dir, breed)
if not os.path.exists(breed_imgs_dir):
    os.makedirs(breed_imgs_dir)

for fname in kg_submission['id']:
    os.symlink('%s/%s.jpg' % (kaggle_tests_images_root_dir, fname), '%s/%s.jpg' % (breed_imgs_dir, fname))