In [1]:
import numpy as np
import tensorflow as tf
import glob
from PIL import Image
from tqdm.notebook import tqdm

In [2]:
# Expected layout (example):
# IMG_DIR = '/data/images' # a folder of .jpg images
# ANN_DIR = '/data/annotations' # folder of annotations, one '<image name>.txt' per image

# NOTE(review): machine-specific absolute paths; point these at your own data before running.
ANN_DIR = "/Users/codyfalkosky/Documents/hidden_desktop/PORTFOLIO_PROJ/working/YOLOv2/steps/data/annotations"
IMG_DIR = "/Users/codyfalkosky/Documents/hidden_desktop/PORTFOLIO_PROJ/working/YOLOv2/steps/data/images"

# glob returns matches in file-system order, which is not guaranteed to be stable;
# sorting keeps the image -> shard assignment reproducible across runs and machines.
image_paths = sorted(glob.glob(IMG_DIR+'/*.jpg'))

In [3]:
def extract_filename(path):
    '''
    extracts filename (no directory, no extension) from path

    Args:
        path (str): '/'-separated file path, e.g. '/data/images/frame_01.jpg'

    Returns:
        str: the final path component with its last '.ext' suffix removed,
             e.g. 'frame_01'. Only the last suffix is dropped
             ('archive.tar.gz' -> 'archive.tar').
    '''
    # Index the rsplit result instead of tuple-unpacking it: a path without '/'
    # or a name without '.' yields a one-element list, which unpacking into two
    # names rejects with ValueError.
    file_name = path.rsplit('/', 1)[-1]
    return file_name.rsplit('.', 1)[0]

def load_annotation(file_name):
    '''
    read the raw annotation text that belongs to an image

    file_name is the image name without directory or extension; the text is
    read from ANN_DIR/<file_name>.txt and returned unparsed
    '''
    annotation_path = '/'.join((ANN_DIR, file_name)) + '.txt'

    with open(annotation_path, 'r') as handle:
        return handle.read()

def load_image(path):
    '''
    load image from path

    Args:
        path: location of an image file that PIL can open

    Returns:
        list: the pixel data as nested python lists, i.e. the result of
              np.array(image).tolist()
    '''
    # The context manager closes the underlying file as soon as the pixels have
    # been copied into the array, rather than leaving it to garbage collection.
    with Image.open(path) as image:
        pixels = np.array(image)
    return pixels.tolist()

def annotation_to_labels_boxes(annotation):
    '''
    parse annotation text into labels and boxes

    the text is read as whitespace-separated numbers, five per row:
    the first value of each row is the label, the remaining four are the box

    returns (labels, boxes) as nested python lists with one entry per row
    '''
    values = np.array(annotation.split(), dtype=np.float32)
    rows   = values.reshape(-1, 5)

    # slice with 0:1 (not 0) so each label stays wrapped in its own row: [[l0], [l1], ...]
    return rows[:, 0:1].tolist(), rows[:, 1:5].tolist()

def image_labels_boxes(image_path):
    '''
    one-call wrapper: image path in, (image, labels, boxes) out

    loads the image, then looks up and parses the annotation file that
    shares the image's file name
    '''
    image = load_image(image_path)

    stem           = extract_filename(image_path)
    raw_annotation = load_annotation(stem)
    labels, boxes  = annotation_to_labels_boxes(raw_annotation)

    return image, labels, boxes

In [4]:
def _bytes_feature(value):
    """Wrap a single string / byte value in a bytes_list tf.train.Feature."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor, so pull the value out first.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _float_feature(value):
    """Returns a float_list from a float / double.

    Args:
        value: a number, a (nested) list of numbers, or a numpy array.

    Returns:
        tf.train.Feature whose float_list holds the values flattened to 1-D.
    """
    # Coerce before flattening: annotation_to_labels_boxes hands back plain
    # python lists (via .tolist()), and lists have no .reshape method.
    flat = np.asarray(value, dtype=np.float32).reshape(-1)
    return tf.train.Feature(float_list=tf.train.FloatList(value=flat))


def serialize_example(image, labels, boxes):
    """Serialize one (image, labels, boxes) example to the byte string stored in a TFRecord.

    The image is JPEG-encoded and stored under 'image'; labels and boxes are
    stored as flat float lists under 'labels' and 'boxes'.
    """
    encoded_image = tf.io.encode_jpeg(image)

    features = tf.train.Features(feature={
        'image'  : _bytes_feature(encoded_image),
        'labels' : _float_feature(labels),
        'boxes'  : _float_feature(boxes),
    })

    return tf.train.Example(features=features).SerializeToString()

In [1]:
# SHARD DATA
# Writes every image in image_paths, together with its labels and boxes, into
# TFRecord files of at most `full_shard` examples each, named
# REC_DIR + 'hollywood_traffic_shard_<n>.tfrecords'.
REC_DIR = '/data/records/'
full_shard = 256
example_no = 0
shard_no   = 0
writer     = None  # no shard is open until the first example arrives

# TFRecordWriter does not create missing directories; makedirs is a no-op if it already exists.
tf.io.gfile.makedirs(REC_DIR)

print('Sharding: Data')
try:
    for image_path in tqdm(image_paths, total=len(image_paths)):
        if example_no % full_shard == 0:
            # Current shard is full (or this is the very first example): roll over.
            if writer is not None:
                writer.close()
                writer = None  # so the finally-clause never closes the same writer twice
                shard_no += 1
            shard_filename = REC_DIR + f'hollywood_traffic_shard_{shard_no}.tfrecords'
            writer = tf.io.TFRecordWriter(shard_filename)

        serialized_example = serialize_example(*image_labels_boxes(image_path))
        writer.write(serialized_example)
        example_no += 1
finally:
    # Flush and close the last shard even if an example failed part-way through;
    # writer is still None when image_paths was empty, so there is nothing to close.
    if writer is not None:
        writer.close()

In [None]:
# push to hf hub

In [10]:
# from datasets import Dataset, load_dataset, load_from_disk

# DATASET = []

# for img_path in tqdm(image_paths):
#     image, labels, boxes = image_labels_boxes(img_path)
#     DATASET.append({'image':image, 'labels':labels, 'boxes':boxes})

# dataset = Dataset.from_list(DATASET)
# dataset.push_to_hub('codyfalkosky/hollywood_traffic', token='')