In [0]:
#  !wget http://files.fast.ai/data/dogscats.zip

In [0]:
# !unzip dogscats.zip

In [0]:
import tensorflow as tf
import threading, random, sys, os
from datetime import datetime
import numpy as np
import six

In [0]:
tf.enable_eager_execution()

In [0]:
NUM_THREADS = 2 #@param {type:"integer"}
OUTPUT_DIR = './' #@param {type:"string"}
TRAIN_DIR = './dogscats/train' #@param {type:"string"}
VALID_DIR = './dogscats/valid' #@param {type:"string"}
NUM_TRAIN_SHARDS = 2 #@param {type:"integer"}
NUM_VALID_SHARDS = 2 #@param {type:"integer"}
JPEG_FILE_EXT = '.jpg' #@param {type:"string"}
BUCKET = 'gs://gs_colab' #@param {type:"string"}


In [0]:
from google.colab import auth
auth.authenticate_user()

In [0]:
def _int64_feature(value):
  if not isinstance(value, list):
    value = [value]
  return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _bytes_feature(value):
  if six.PY3 and isinstance(value, str):
    value = six.binary_type(value, encoding='utf-8')
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [0]:
def _process_image_files_batch(thread_index, ranges, name, filenames, labels, num_shards):
  num_threads = len(ranges)
  assert not num_shards % num_threads
  num_shards_per_batch = num_shards // num_threads

  shard_ranges = np.linspace(ranges[thread_index][0], ranges[thread_index][1], num_shards_per_batch + 1).astype(int)
  num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]

  counter = 0
  for s in range(num_shards_per_batch):
    # Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
    shard = thread_index * num_shards_per_batch + s
    output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
    output_file = os.path.join(OUTPUT_DIR, output_filename)
    writer = tf.io.TFRecordWriter(output_file)

    shard_counter = 0
    files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)

    for i in files_in_shard:
      filename = filenames[i]
      label = labels[i]

      with tf.gfile.GFile(filename, 'rb') as f:
        image_buffer = f.read()
        example = tf.train.Example(features=tf.train.Features(feature={
          'image': _bytes_feature(image_buffer),
          'label': _int64_feature(label),
        }))
        writer.write(example.SerializeToString())
        
      shard_counter += 1
      counter += 1

      if not counter % 1000:
        print(f'{datetime.now()} [thread {thread_index}]: Processed {counter} of {num_files_in_thread} images in thread batch.')
        sys.stdout.flush()

    writer.close()
    print(f'{datetime.now()} [thread {thread_index}]: Wrote {shard_counter} images to {output_file}')
    sys.stdout.flush()
    shard_counter = 0
  print(f'{datetime.now()} [thread {thread_index}]: Wrote {counter} images to {num_files_in_thread} shards.')
  sys.stdout.flush()

In [0]:
def _process_image_files(name, filenames, labels, num_shards):
  spacing = np.linspace(0, len(filenames), NUM_THREADS + 1).astype(np.int)
  ranges = []
  threads = []
  for i in range(len(spacing) - 1):
    ranges.append([spacing[i], spacing[i + 1]])

  print(f'Launching {NUM_THREADS} threads for spacings: {ranges}')
  sys.stdout.flush()

  coord = tf.train.Coordinator()

  threads = []
  for thread_index in range(len(ranges)):
    args = (thread_index, ranges, name, filenames, labels, num_shards)
    t = threading.Thread(target=_process_image_files_batch, args=args)
    t.start()
    threads.append(t)

  coord.join(threads)
  
  print(f'{datetime.now()}: Finished writing all {len(filenames)} images in data set.')
  sys.stdout.flush()

In [0]:
def _find_image_files(data_dir):
  synsets = ['cats', 'dogs']

  labels = []
  filenames = []

  for i, synset in enumerate(synsets):
    jpeg_file_path = f'{data_dir}/{synset}/*{JPEG_FILE_EXT}'
    matching_files = tf.gfile.Glob(jpeg_file_path)
    labels.extend([i] * len(matching_files))
    filenames.extend(matching_files)

  print(f'Found {len(filenames)} JPEG files across {len(synsets)} labels inside {data_dir}.')
  return filenames, labels

In [0]:
def _process_dataset(name, directory, num_shards):
  filenames, labels = _find_image_files(directory)
  _process_image_files(name, filenames, labels, num_shards)

In [0]:
_process_dataset('valid', VALID_DIR, NUM_VALID_SHARDS)

Found 2000 JPEG files across 2 labels inside ./dogscats/valid.
Launching 2 threads for spacings: [[0, 1000], [1000, 2000]]
2019-03-22 08:42:23.758890 [thread 0]: Processed 1000 of 1000 images in thread batch.
2019-03-22 08:42:23.760710 [thread 0]: Wrote 1000 images to ./valid-00000-of-00002
2019-03-22 08:42:23.763128 [thread 1]: Processed 1000 of 1000 images in thread batch.
2019-03-22 08:42:23.764385 [thread 1]: Wrote 1000 images to ./valid-00001-of-00002
2019-03-22 08:42:23.765273 [thread 1]: Wrote 1000 images to 1000 shards.
2019-03-22 08:42:23.769293 [thread 0]: Wrote 1000 images to 1000 shards.
2019-03-22 08:42:24.282481: Finished writing all 2000 images in data set.


In [0]:
_process_dataset('train', TRAIN_DIR, NUM_TRAIN_SHARDS)

Found 23000 JPEG files across 2 labels inside ./dogscats/train.
Launching 2 threads for spacings: [[0, 11500], [11500, 23000]]
2019-03-22 08:42:32.902963 [thread 1]: Processed 1000 of 11500 images in thread batch.
2019-03-22 08:42:32.912562 [thread 0]: Processed 1000 of 11500 images in thread batch.
2019-03-22 08:42:33.455519 [thread 1]: Processed 2000 of 11500 images in thread batch.
2019-03-22 08:42:33.460451 [thread 0]: Processed 2000 of 11500 images in thread batch.
2019-03-22 08:42:34.124575 [thread 0]: Processed 3000 of 11500 images in thread batch.
2019-03-22 08:42:34.157232 [thread 1]: Processed 3000 of 11500 images in thread batch.
2019-03-22 08:42:34.743346 [thread 0]: Processed 4000 of 11500 images in thread batch.
2019-03-22 08:42:34.774315 [thread 1]: Processed 4000 of 11500 images in thread batch.
2019-03-22 08:42:35.258749 [thread 0]: Processed 5000 of 11500 images in thread batch.
2019-03-22 08:42:35.295373 [thread 1]: Processed 5000 of 11500 images in thread batch.
201

In [0]:
ls -l

total 1397072
-rw-r--r-- 1 root root      2520 Mar 22 08:40 adc.json
drwxrwxr-x 7 root root      4096 Oct  9  2016 [0m[01;34mdogscats[0m/
-rw-r--r-- 1 root root 857214334 Apr  1  2017 dogscats.zip
drwxr-xr-x 1 root root      4096 Mar  8 17:26 [01;34msample_data[0m/
-rw-r--r-- 1 root root 241835086 Mar 22 08:42 train-00000-of-00002
-rw-r--r-- 1 root root 285894746 Mar 22 08:42 train-00001-of-00002
-rw-r--r-- 1 root root  20797080 Mar 22 08:42 valid-00000-of-00002
-rw-r--r-- 1 root root  24829150 Mar 22 08:42 valid-00001-of-00002


In [0]:
!gsutil cp train* $BUCKET/dvc_tfrec

Copying file://train-00000-of-00002 [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/230.6 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://train-00001-of-00002 [Content-Type=application/octet-stream]...
|
Operation completed over 2 objects/503.3 MiB.                                    


In [0]:
!gsutil cp valid* $BUCKET/dvc_tfrec

Copying file://valid-00000-of-00002 [Content-Type=application/octet-stream]...
Copying file://valid-00001-of-00002 [Content-Type=application/octet-stream]...
- [2 files][ 43.5 MiB/ 43.5 MiB]                                                
Operation completed over 2 objects/43.5 MiB.                                     
