## Purpose: Creates .tfrecord files for training and testing

In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

import os
import io
import pandas as pd
import tensorflow as tf
import json

from PIL import Image
from object_detection.utils import dataset_util
from collections import namedtuple, OrderedDict

def class_text_to_int(row_label, label_map):
    """Look up the integer id of a class name in the label map.

    Args:
        row_label: Class name to look up.
        label_map: Mapping of class name -> dict holding an ``'id'`` entry.

    Returns:
        The value stored under ``label_map[row_label]['id']``, or ``None``
        when ``row_label`` is not a key of ``label_map``.
    """
    if row_label in label_map:
        return label_map[row_label]['id']
    # Make the "unknown class" result explicit instead of relying on a bare
    # ``None`` expression followed by the implicit function return.
    return None

def split(df, group):
    """Bundle the rows of ``df`` by the values found in column ``group``.

    Returns a list of ``data(filename, object)`` namedtuples, one per group
    key, where ``filename`` is the key and ``object`` is the sub-frame that
    ``groupby(...).get_group`` returns for that key.
    """
    record = namedtuple('data', ['filename', 'object'])
    grouped = df.groupby(group)
    bundles = []
    for key in grouped.groups:
        bundles.append(record(key, grouped.get_group(key)))
    return bundles


def create_tf_example(group, path, label_map):
    """Build a ``tf.train.Example`` for one image and its annotated boxes.

    Args:
        group: Item exposing ``filename`` (image file name) and ``object``
            (a frame whose rows carry ``xmin``, ``xmax``, ``ymin``, ``ymax``
            and ``class``).
        path: Directory that contains the image file.
        label_map: Mapping handed to ``class_text_to_int`` to resolve class ids.

    Returns:
        A ``tf.train.Example`` holding the raw image bytes, the image size,
        and the per-box coordinates divided by the image width/height.
    """
    image_path = os.path.join(path, '{}'.format(group.filename))
    with tf.gfile.GFile(image_path, 'rb') as image_file:
        encoded_jpg = image_file.read()

    # Decode from memory only to learn the pixel dimensions.
    width, height = Image.open(io.BytesIO(encoded_jpg)).size

    filename = group.filename.encode('utf8')
    image_format = b'jpg'

    rows = [row for _, row in group.object.iterrows()]
    xmins = [row['xmin'] / width for row in rows]
    xmaxs = [row['xmax'] / width for row in rows]
    ymins = [row['ymin'] / height for row in rows]
    ymaxs = [row['ymax'] / height for row in rows]
    classes_text = [row['class'].encode('utf8') for row in rows]
    classes = [class_text_to_int(row['class'], label_map) for row in rows]

    feature = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(filename),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [8]:
def createTFRecord(image_dir, csv_input, output_path, label_map):
    """Serialize every annotated image listed in a CSV into one TFRecord file.

    Args:
        image_dir: Directory that contains the image files named in the CSV.
        csv_input: Path of the annotation CSV; its rows are grouped by the
            ``filename`` column so each image yields exactly one example.
        output_path: Path of the TFRecord file to write.
        label_map: Mapping forwarded to ``create_tf_example``.
    """
    writer = tf.python_io.TFRecordWriter(output_path)
    try:
        examples = pd.read_csv(csv_input)
        for group in split(examples, 'filename'):
            tf_example = create_tf_example(group, image_dir, label_map)
            writer.write(tf_example.SerializeToString())
    finally:
        # Close the writer even when reading the CSV or building an example
        # fails, so the handle on the output file is not leaked.
        writer.close()

In [9]:
def createLabelMap(label_map_output_path, classes_path):
    """Write a label-map file and return the matching in-memory mapping.

    Reads one class name per line from ``classes_path``, assigns ids starting
    at 1 in file order, and emits one record per class through
    ``writeLabelMapRecord``.

    Args:
        label_map_output_path: Path of the label-map file to create.
        classes_path: Path of the text file listing one class name per line.

    Returns:
        dict mapping each class name to ``{"id": <int>}``.
    """
    label_map = {}
    # Write to the path given by the caller; the previous body opened the
    # unrelated name ``label_map_path`` and only worked when a global of that
    # name happened to exist.
    with open(classes_path, "r") as fin, open(label_map_output_path, "w") as fout:
        item_id = 1
        for class_item in fin:
            name = class_item.rstrip()
            if not name:
                # Blank lines (e.g. a trailing newline) are not classes.
                continue
            writeLabelMapRecord(fout, item_id, name)
            label_map[name] = {"id": item_id}
            item_id += 1
    return label_map

In [10]:
def writeLabelMapRecord(fout, item_id, name):
    """Append one ``item { id, name }`` record to the writable ``fout``.

    The record is followed by an empty line.
    """
    record_lines = [
        "item {\n",
        "  id: %d\n" % item_id,
        "  name: \'%s\'\n" % name,
        "}\n\n",
    ]
    fout.write("".join(record_lines))

In [11]:
import glob
import pandas as pd
import pdb
import os
import re

def txt_to_csv(img_dir):
    """Collect the annotations under ``<img_dir>/Label/*.txt`` into a DataFrame.

    Each non-blank line of a label file is expected to look like
    ``<class name> <xmin> <ymin> <xmax> <ymax>``. The image file name is
    derived from the label file name by swapping its extension for ``.jpg``.

    Args:
        img_dir: Directory that contains a ``Label`` sub-directory.

    Returns:
        DataFrame with columns ``filename``, ``class``, ``xmin``, ``ymin``,
        ``xmax``, ``ymax``; coordinates are truncated to ``int``. The frame is
        empty when no label file is found.

    Raises:
        IndexError: A non-blank row has fewer than four numeric fields.
        ValueError: A coordinate field cannot be parsed as a number.
    """
    # A field boundary is a space directly followed by a digit, so class
    # names that contain spaces (e.g. "Green apple") stay in one piece.
    field_sep = re.compile(r' (?=[0-9])')
    txt_list = []
    for txt_file in glob.glob(img_dir + '/Label/*.txt'):
        image_name = os.path.splitext(os.path.basename(txt_file))[0] + '.jpg'
        # ``with`` guarantees the file is closed even if a row fails to parse.
        with open(txt_file, "r") as f:
            for row in f:
                if not row.strip():
                    # Blank lines carry no annotation; skip instead of crashing.
                    continue
                row_items = field_sep.split(row)
                txt_list.append((
                    image_name,
                    row_items[0],
                    int(float(row_items[1])),
                    int(float(row_items[2])),
                    int(float(row_items[3])),
                    int(float(row_items[4].rstrip())),
                ))
    column_name = ['filename', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    txt_df = pd.DataFrame(txt_list, columns=column_name)
    return txt_df

In [12]:
def createSingularCSV(output_path, img_root_dir):
    """Merge the label files of every sub-directory into a single CSV.

    Walks ``img_root_dir`` and runs ``txt_to_csv`` on each directory found at
    any depth, then writes all collected rows to ``output_path``.

    Args:
        output_path: Path of the CSV file to write.
        img_root_dir: Root directory whose sub-directories are scanned.
    """
    columns = ['filename', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    frames = []
    for dirpath, dirnames, _filenames in os.walk(img_root_dir):
        for directory in dirnames:
            txt_df = txt_to_csv(os.path.join(dirpath, directory))
            if not txt_df.empty:
                frames.append(txt_df)
    # Concatenate once at the end: ``DataFrame.append`` was removed in
    # pandas 2.0 and copied the whole frame on every call.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)
    df.to_csv(output_path)

In [13]:
import os
import itertools
import shutil

def flattenDirectoryStructure(root_path):
    """Move every file found below ``root_path`` directly into ``root_path``.

    Files already sitting in ``root_path`` itself are left alone, and files
    named ``.DS_Store`` are never moved. The full list of files is gathered
    before anything is moved.
    """
    # Dropping the first walk entry skips ``root_path`` itself.
    nested_levels = itertools.islice(os.walk(root_path), 1, None)
    pending = [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in nested_levels
        for entry in entries
        if entry != ".DS_Store"
    ]
    for source in pending:
        destination = os.path.join(root_path, os.path.basename(source))
        shutil.move(source, destination)

In [16]:
# Create the label map.
# createLabelMap is given the output path for the label-map file and the path
# of the class list; the mapping it returns is kept in `label_map` and passed
# to createTFRecord by the train and test cells below.
label_map_path = '/Users/Shanth/Documents/instaFresh_ml/data/labelMap.pbtxt'
classes_path = '/Users/Shanth/Documents/instaFresh_ml/classes.txt'
label_map = createLabelMap(label_map_path, classes_path)

In [20]:
# Train set
# Step 1: merge the label files found under train_root_path into one CSV.
train_root_path = '/Users/Shanth/OIDv4_ToolKit/OID/Dataset/train'
train_csv_path = '/Users/Shanth/Documents/instaFresh_ml/data/train.csv'
createSingularCSV(train_csv_path, train_root_path)

# Step 2: flatten the images directory so every file sits directly in it.
# NOTE(review): the CSV is built from train_root_path while the images are
# read from train_images_path; confirm both locations hold the same files.
train_images_path = '/Users/Shanth/Documents/instaFresh_ml/data/train'
flattenDirectoryStructure(train_images_path)

# Step 3: write the TFRecord from the flattened images and the CSV.
train_record_path = '/Users/Shanth/Documents/instaFresh_ml/data/train.record'
createTFRecord(train_images_path, train_csv_path, train_record_path, label_map)

In [173]:
# Test set
# Step 1: merge the label files found under test_root_path into one CSV.
test_root_path = '/Users/Shanth/OIDv4_ToolKit/OID/Dataset/test'
test_csv_path = '/Users/Shanth/Documents/instaFresh_ml/data/test.csv'
createSingularCSV(test_csv_path, test_root_path)

# Step 2: flatten the images directory so every file sits directly in it.
# NOTE(review): the CSV is built from test_root_path while the images are
# read from test_images_path; confirm both locations hold the same files.
test_images_path = '/Users/Shanth/Documents/instaFresh_ml/data/test'
flattenDirectoryStructure(test_images_path)

# Step 3: write the TFRecord from the flattened images and the CSV.
test_record_path = '/Users/Shanth/Documents/instaFresh_ml/data/test.record'
createTFRecord(test_images_path, test_csv_path, test_record_path, label_map)