In [0]:
import numpy as np
import tensorflow as tf
import functools, os, json

In [0]:
# Notebook-wide settings (editable through the Colab form fields).
#   BATCH_SIZE: batch size used for the dataset and passed to TPUEstimator for
#               both training and prediction.
#   BUCKET:     GCS bucket that holds the input TFRecords, the checkpoint files
#               and the generated outputs.
#   IMAGE_SIZE: side length in pixels of the square image produced by _preprocess.
BATCH_SIZE = 1000 #@param {type: "integer"}
BUCKET = 'gs://gs_colab' #@param {type: "string"}
IMAGE_SIZE = 299 #@param {type: "integer"}

In [3]:
# gRPC endpoint of the TPU worker, built from the "host:port" value found in the
# COLAB_TPU_ADDR environment variable. Raises KeyError if the variable is not set
# (presumably when the runtime has no TPU attached - confirm).
TPU_ADDRESS = f'grpc://{os.environ["COLAB_TPU_ADDR"]}'
TPU_ADDRESS

'grpc://10.27.254.50:8470'

In [4]:
from google.colab import auth

# Interactive Google sign-in for this Colab runtime.
auth.authenticate_user()

# Load the credentials JSON up front; it is the only input the TPU-side setup needs.
# NOTE(review): /content/adc.json is presumably produced by authenticate_user() - confirm.
with open('/content/adc.json', 'r') as f:
  auth_info = json.load(f)

# Hand the credentials to the TPU worker so that it can read and write GCS itself.
with tf.Session(TPU_ADDRESS) as sess:
  tf.contrib.cloud.configure_gcs(sess, credentials=auth_info)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [0]:
def _preprocess(image_bytes, central_fraction=0.875):
  """Decode a JPEG string into a fixed-size float image scaled to [-1, 1].

  Args:
    image_bytes: string tensor holding one JPEG-encoded image.
    central_fraction: fraction of the image kept by the central crop.

  Returns:
    A float32 tensor with static shape [IMAGE_SIZE, IMAGE_SIZE, 3].
  """
  target_hw = [IMAGE_SIZE, IMAGE_SIZE]

  decoded = tf.image.decode_jpeg(image_bytes, channels=3)
  # convert_image_dtype rescales integer pixel values into the [0, 1] float range.
  as_float = tf.image.convert_image_dtype(decoded, dtype=tf.float32)
  cropped = tf.image.central_crop(as_float, central_fraction=central_fraction)

  # resize_bilinear operates on batches: add a leading batch axis, resize, drop it again.
  batched = tf.expand_dims(cropped, 0)
  resized = tf.squeeze(
      tf.image.resize_bilinear(batched, target_hw, align_corners=False), [0])

  # Map [0, 1] onto [-1, 1].
  scaled = (resized - 0.5) * 2.0
  scaled.set_shape(target_hw + [3])
  return scaled

In [0]:
def _parse(serialized_example):
  """Turn one serialized tf.Example into an (image, label) pair.

  Args:
    serialized_example: scalar string tensor containing a serialized tf.Example
      with a bytes feature "image" and an int64 feature "label".

  Returns:
    A tuple (image, label): the image as returned by _preprocess and the
    scalar int64 label.
  """
  feature_spec = {
      "image": tf.FixedLenFeature([], tf.string),
      "label": tf.FixedLenFeature([], tf.int64),
  }
  parsed = tf.parse_single_example(serialized_example, features=feature_spec)
  return _preprocess(parsed['image']), parsed['label']

In [0]:
def get_ds_from_tfrec(data_dir, training, batch_size=BATCH_SIZE, num_parallel_calls=2,
                      drop_remainder=True):
  """Build a batched (image, label) dataset from TFRecord shards.

  Args:
    data_dir: directory (local or GCS) containing the shards.
    training: if True read files matching 'train-*', otherwise 'valid-*'.
    batch_size: number of examples per batch.
    num_parallel_calls: number of files read concurrently and number of
      batches produced in parallel.
    drop_remainder: whether to drop a final batch smaller than batch_size.
      With the default (True) up to batch_size - 1 trailing examples are
      silently left out of the dataset.

  Returns:
    A tf.data.Dataset yielding (image, label) batches, with prefetching.
  """
  file_pattern = os.path.join(data_dir, 'train-*' if training else 'valid-*')
  dataset = tf.data.Dataset.list_files(file_pattern)

  def fetch_dataset(filename):
    buffer_size = 8 * 1024 * 1024  # 8 MiB per file
    return tf.data.TFRecordDataset(filename, buffer_size=buffer_size)

  # sloppy=True lets records come out in a nondeterministic order in exchange
  # for throughput.
  dataset = dataset.apply(
    tf.data.experimental.parallel_interleave(
      fetch_dataset, cycle_length=num_parallel_calls, sloppy=True))

  dataset = dataset.apply(
    tf.data.experimental.map_and_batch(
      _parse,
      batch_size=batch_size,
      num_parallel_batches=num_parallel_calls,
      drop_remainder=drop_remainder))

  # Same namespace as the other tf.data.experimental calls above instead of
  # the deprecated tf.contrib alias.
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return dataset

In [0]:
data_dir = f'{BUCKET}/dvc_tfrec'

def train_input_fn(params):
  """Input function for the 'train-*' shards.

  Uses the batch size supplied in params['batch_size'] so the pipeline matches
  what the estimator expects; falls back to BATCH_SIZE when it is absent.
  """
  batch_size = (params or {}).get('batch_size', BATCH_SIZE)
  return get_ds_from_tfrec(data_dir, training=True, batch_size=batch_size)

def valid_input_fn(params):
  """Input function for the 'valid-*' shards; see train_input_fn for batch size."""
  batch_size = (params or {}).get('batch_size', BATCH_SIZE)
  return get_ds_from_tfrec(data_dir, training=False, batch_size=batch_size)

In [0]:
def model_fn(features, labels, mode, params):
  """TPUEstimator model function that exposes InceptionResNetV2 pooled features.

  The network is built without its classification head and with global average
  pooling; Keras does not load any weights itself (weights=None).

  Args:
    features: batch of images fed to the network.
    labels: batch of labels; passed through unchanged in PREDICT mode.
    mode: a tf.estimator.ModeKeys value. Only PREDICT and TRAIN are supported.
    params: estimator params (unused here).

  Returns:
    A TPUEstimatorSpec. In PREDICT mode its predictions are
    {'neck': pooled features, 'label': labels}.

  Raises:
    NotImplementedError: for any mode other than PREDICT or TRAIN.
  """
  # NOTE(review): constructing the Keras model without input_tensor makes it
  # create its own input placeholder, which is the likely source of the
  # "Placeholder (input_1) is not supported on the TPU" log message - consider
  # input_tensor=features after verifying checkpoint variable names still match.
  ir2 = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(
      include_top=False, weights=None, pooling='avg')
  pred = ir2(features)

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.contrib.tpu.TPUEstimatorSpec(mode, predictions={
        'neck': pred,
        'label': labels,
    })
  elif mode == tf.estimator.ModeKeys.TRAIN: # dummy code to get a checkpoint
    # The loss is identically zero, so its gradients are zero as well.
    loss = tf.reduce_sum(pred - pred)
    opt = tf.train.GradientDescentOptimizer(0.1)
    opt = tf.contrib.tpu.CrossShardOptimizer(opt)
    # Pass the global step so that each training step increments it; without
    # this the step counter never advances and step-based stopping cannot work.
    train_op = opt.minimize(loss, global_step=tf.train.get_global_step())
    return tf.contrib.tpu.TPUEstimatorSpec(mode, loss=loss, train_op=train_op)
  else:
    raise NotImplementedError()

In [0]:
model_dir = f'{BUCKET}/dvc_ir2'

# Number of steps run on the TPU per loop. Kept as its own constant and passed
# by keyword: TPUConfig's first positional parameter is iterations_per_loop, so
# passing BATCH_SIZE positionally would tie the loop length to the batch size.
ITERATIONS_PER_LOOP = 1000

training_config = tf.contrib.tpu.RunConfig(
    cluster=tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS),
    model_dir=model_dir,
    tpu_config=tf.contrib.tpu.TPUConfig(iterations_per_loop=ITERATIONS_PER_LOOP),
)

In [11]:
# Checkpoint files stored under <BUCKET>/ir2 that get copied into model_dir.
weight_files = f'{BUCKET}/ir2/*'

# WARNING: the first command recursively deletes everything under model_dir,
# including output files written there by earlier runs, before the checkpoint
# files are copied in.
!gsutil rm -r $model_dir
!gsutil cp $weight_files $model_dir

Removing gs://gs_colab/dvc_ir2/train_neck.tfrecord#1553327326126039...
Removing gs://gs_colab/dvc_ir2/valid_neck.tfrecord#1553327451338370...
/ [2 objects]                                                                   
Operation completed over 2 objects.                                              
Copying gs://gs_colab/ir2/checkpoint [Content-Type=application/octet-stream]...
Copying gs://gs_colab/ir2/keras_model.ckpt.data-00000-of-00001 [Content-Type=application/octet-stream]...
Copying gs://gs_colab/ir2/keras_model.ckpt.index [Content-Type=application/octet-stream]...
Copying gs://gs_colab/ir2/keras_model.ckpt.meta [Content-Type=application/octet-stream]...
- [4 files][212.1 MiB/212.1 MiB]                                                
Operation completed over 4 objects/212.1 MiB.                                    


In [12]:
# TPUEstimator built around model_fn; the same batch size is used for training
# and prediction, and model_dir / TPU settings come from training_config.
estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,
    train_batch_size=BATCH_SIZE,
    predict_batch_size=BATCH_SIZE,
    config=training_config,
)

INFO:tensorflow:Using config: {'_model_dir': 'gs://gs_colab/dvc_ir2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.27.254.50:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f2a666ae9e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.27.254.50:8470', '_evaluation_master': 'grpc://10.27.254.50:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=None, num_cores_per_replica=N

In [0]:
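# Optional, intentionally disabled: a single dummy training step. model_fn's
# TRAIN branch exists only to make the estimator write a checkpoint.
# estimator.train(train_input_fn, steps=1)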

In [0]:
def _float32_feature(value):
  """Wrap one float or a sequence of floats in a float_list tf.train.Feature.

  Args:
    value: a scalar, a list or tuple of numbers, or a numpy array. Arrays of
      any rank are flattened; a scalar becomes a one-element list.

  Returns:
    A tf.train.Feature whose float_list holds the values.
  """
  if isinstance(value, np.ndarray):
    value = value.ravel().tolist()
  elif isinstance(value, tuple):
    value = list(value)
  elif not isinstance(value, list):
    value = [value]
  return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def _int64_feature(value):
  """Wrap one integer or a sequence of integers in an int64_list tf.train.Feature.

  Args:
    value: a scalar, a list or tuple of integers, or a numpy array. Arrays of
      any rank are flattened; a scalar becomes a one-element list.

  Returns:
    A tf.train.Feature whose int64_list holds the values.
  """
  if isinstance(value, np.ndarray):
    value = value.ravel().tolist()
  elif isinstance(value, tuple):
    value = list(value)
  elif not isinstance(value, list):
    value = [value]
  return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _bytes_feature(value):
  """Wrap a single value in a bytes_list tf.train.Feature.

  A str is UTF-8 encoded first; any other value is passed through unchanged.
  """
  payload = bytes(value, encoding='utf-8') if isinstance(value, str) else value
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[payload]))

In [0]:
def neck_to_tfrec(input_fn, output_file):
  """Run prediction over input_fn and store every result as a tf.Example.

  Relies on the module-level `estimator`. Each prediction becomes one record
  with a float feature 'neck' and an int64 feature 'label'.

  Args:
    input_fn: input function handed to estimator.predict.
    output_file: path of the TFRecord file to write.
  """
  def to_example(prediction):
    feature_map = {
      'neck': _float32_feature(prediction['neck'].tolist()),
      'label': _int64_feature(prediction['label']),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

  with tf.io.TFRecordWriter(output_file) as writer:
    for prediction in estimator.predict(input_fn):
      writer.write(to_example(prediction).SerializeToString())

In [16]:
# Destination TFRecord files; both live inside model_dir.
train_output_file = f'{model_dir}/train_neck.tfrecord'
valid_output_file = f'{model_dir}/valid_neck.tfrecord'

# Write the predictions for the 'train-*' shards.
neck_to_tfrec(train_input_fn, train_output_file)

Instructions for updating:
Colocations handled automatically by placer.
INFO:tensorflow:Querying Tensorflow master (grpc://10.27.254.50:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 14529340411014639260)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 4494893675167142098)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 48285550346468396)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 2792205893177002525)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 10141940

In [17]:
# Write the predictions for the 'valid-*' shards.
neck_to_tfrec(valid_input_fn, valid_output_file)

INFO:tensorflow:Calling model_fn.
ERROR:tensorflow:Operation of type Placeholder (input_1) is not supported on the TPU. Execution will fail if this op is used in the graph. 
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:TPU job name worker
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from gs://gs_colab/dvc_ir2/keras_model.ckpt
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Init TPU system
INFO:tensorflow:Initialized TPU in 7 seconds
INFO:tensorflow:Starting infeed thread controller.
INFO:tensorflow:Starting outfeed thread controller.
INFO:tensorflow:Initialized dataset iterators in 0 seconds
INFO:tensorflow:Enqueue next (1) batch(es) of data to infeed.
INFO:tensorflow:Dequeue next (1) batch(es) of data from outfeed.
INFO:tensorflow:Enqueue next (1) batch(es) of data to infeed.
INFO:tensorflow:Dequeue next (1) batch(es) of data from outfeed.
INFO:tensorflow:Enqueue next (1) batch(es) of data to inf

In [18]:
# Clean-up: delete the copied checkpoint files (keras* and the 'checkpoint'
# index file) from model_dir; other files in model_dir are not touched.
!gsutil rm -r $model_dir/keras*
!gsutil rm -r $model_dir/checkpoint

Removing gs://gs_colab/dvc_ir2/keras_model.ckpt.data-00000-of-00001#1553372496550134...
Removing gs://gs_colab/dvc_ir2/keras_model.ckpt.index#1553372496741214...
Removing gs://gs_colab/dvc_ir2/keras_model.ckpt.meta#1553372497060471...
/ [3 objects]                                                                   
Operation completed over 3 objects.                                              
Removing gs://gs_colab/dvc_ir2/checkpoint#1553372496195846...
/ [1 objects]                                                                   
Operation completed over 1 objects.                                              
