In [0]:
import numpy as np
import tensorflow as tf
import functools, os, json, datetime

In [0]:
BUCKET = 'gs://gs_colab' #@param {type: "string"}
NUM_NECK = 1536

BATCH_SIZE = 512 #@param {type: "integer"}
EPOCHS = 50 #@param {type:"integer"}


In [3]:
TPU_ADDRESS = f'grpc://{os.environ["COLAB_TPU_ADDR"]}'
TPU_ADDRESS

'grpc://10.27.254.50:8470'

In [4]:
from google.colab import auth
auth.authenticate_user()
  
# Upload the credentials to TPU.
with tf.Session(TPU_ADDRESS) as sess:    
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(sess, credentials=auth_info)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [0]:
def _parse(serialized_example):
  features = tf.parse_single_example(
    serialized_example,
    features={
        'neck': tf.FixedLenFeature([NUM_NECK], tf.float32),
        "label": tf.FixedLenFeature([], tf.int64),
    })

  neck = features['neck']
  label = features['label']
  return neck, label

In [0]:
def get_ds_from_tfrec(data_dir, training, batch_size=BATCH_SIZE, num_parallel_calls=2):
  file_pattern = os.path.join(data_dir, 'train*' if training else 'valid*')
  dataset = tf.data.Dataset.list_files(file_pattern)

  def fetch_dataset(filename):
    buffer_size = 8 * 1024 * 1024  # 8 MiB per file
    dataset = tf.data.TFRecordDataset(filename, buffer_size=buffer_size)
    return dataset

  dataset = dataset.apply(
    tf.data.experimental.parallel_interleave(
      fetch_dataset, cycle_length=num_parallel_calls, sloppy=True))
  
  if training:
    dataset = dataset.shuffle(50000, reshuffle_each_iteration=True)
    
  dataset = dataset.repeat()
  
  dataset = dataset.apply(
    tf.data.experimental.map_and_batch(
      _parse,
      batch_size=batch_size,
      num_parallel_batches=num_parallel_calls,
      drop_remainder=True))

  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  return dataset

In [0]:
data_dir = f'{BUCKET}/dvc_ir2'

train_input_fn = lambda params: get_ds_from_tfrec(data_dir, training=True)
valid_input_fn = lambda params: get_ds_from_tfrec(data_dir, training=False)

In [0]:
def model_fn(features, labels, mode, params):
  phase = 1 if mode == tf.estimator.ModeKeys.TRAIN else 0
  tf.keras.backend.set_learning_phase(phase)

  dense1 = tf.keras.layers.Dense(256, activation='relu')
#   dropout1 = tf.keras.layers.Dropout(0.5)
  dense2 = tf.keras.layers.Dense(2)
#   logits = dense2(dropout1(dense1(features)))
  logits = dense2(dense1(features))

  loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
  step = tf.train.get_or_create_global_step()
    
  opt = tf.train.AdamOptimizer()
  opt = tf.contrib.tpu.CrossShardOptimizer(opt)
  train_op = opt.minimize(loss, global_step=step)

  classes = tf.math.argmax(logits, axis=-1)
  metric_fn = lambda classes, labels: {'accuracy': tf.metrics.accuracy(classes, labels)}
  tpu_metrics = (metric_fn, [classes, labels])
    
  return tf.contrib.tpu.TPUEstimatorSpec(mode, loss=loss, train_op=train_op, 
                                           eval_metrics = tpu_metrics)

In [9]:
len_train = 23000
len_valid = 2000
steps_per_epoch = len_train // BATCH_SIZE

now = datetime.datetime.now()
MODEL_DIR = BUCKET+"/dvc_jobs/job-{}-{:02d}-{:02d}-{:02d}:{:02d}:{:02d}".format(now.year, now.month, now.day, now.hour, now.minute, now.second)

training_config = tf.contrib.tpu.RunConfig(
    cluster=tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS),
    model_dir=MODEL_DIR,
    tpu_config=tf.contrib.tpu.TPUConfig(
    iterations_per_loop=steps_per_epoch,
    per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
   
estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,
    model_dir=MODEL_DIR,
    train_batch_size=BATCH_SIZE,
    eval_batch_size=len_valid,
    config=training_config)

INFO:tensorflow:Using config: {'_model_dir': 'gs://gs_colab/dvc_jobs/job-2019-03-23-20:39:21', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.27.254.50:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f2717735470>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.27.254.50:8470', '_evaluation_master': 'grpc://10.27.254.50:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=44, num_shards=None, 

In [10]:
estimator.train(train_input_fn, steps=steps_per_epoch*EPOCHS)

INFO:tensorflow:Querying Tensorflow master (grpc://10.27.254.50:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 14529340411014639260)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 4494893675167142098)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 48285550346468396)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 2792205893177002525)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 10141940144573909372)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/j

<tensorflow.contrib.tpu.python.tpu.tpu_estimator.TPUEstimator at 0x7f27177350b8>

In [11]:
estimator.evaluate(input_fn=valid_input_fn, steps=1)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-23T20:42:55Z
INFO:tensorflow:TPU job name worker
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from gs://gs_colab/dvc_jobs/job-2019-03-23-20:39:21/model.ckpt-2200
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Init TPU system
INFO:tensorflow:Initialized TPU in 7 seconds
INFO:tensorflow:Starting infeed thread controller.
INFO:tensorflow:Starting outfeed thread controller.
INFO:tensorflow:Initialized dataset iterators in 0 seconds
INFO:tensorflow:Enqueue next (1) batch(es) of data to infeed.
INFO:tensorflow:Dequeue next (1) batch(es) of data from outfeed.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Stop infeed thread control

{'accuracy': 0.7441406, 'global_step': 2200, 'loss': 0.5057915}