Copyright (C) 2018 Software Platform Lab, Seoul National University


Licensed under the Apache License, Version 2.0 (the "License");


you may not use this file except in compliance with the License.


You may obtain a copy of the License at


    http://www.apache.org/licenses/LICENSE-2.0
    
    
Unless required by applicable law or agreed to in writing, software


distributed under the License is distributed on an "AS IS" BASIS,


WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.


See the License for the specific language governing permissions and


limitations under the License.

# DS2_HW2: Training CIFAR-10 with Resnet50 model
Train CIFAR-10 dataset with Resnet50 model, save checkpoint and evaluate the model.
- Create input pipeline using Tensorflow Dataset API
- Define optimizer for train
- Save checkpoints while training, and use those checkpoints for evaluation.
- Add summaries on Tensorboard.

In [7]:
#@title Run me to download the CIFAR-10 dataset!
# https://blog.shichao.io/2012/10/04/progress_speed_indicator_for_urlretrieve_in_python.html

import os, sys, time
import tarfile
import urllib

def reporthook(count, block_size, total_size):
  global start_time
  if count == 0:
    start_time = time.time()
    return
  duration = time.time() - start_time
  progress_size = int(count * block_size)
  percent = int(count * block_size * 100 / total_size)
  sys.stdout.write('\r...%d%%, %d MB, %d seconds passed' %
                   (percent, progress_size / (1024 * 1024), duration))
  sys.stdout.flush()

cifar10url = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
cifar10 = cifar10url.split('/')[-1]

if not os.path.isfile(cifar10):
  urllib.urlretrieve(cifar10url, cifar10, reporthook)
print()
print('Download finished!')

cifar10_extracted = 'cifar-10-batches-bin'

if not os.path.isdir(cifar10_extracted):
  tarfile.open(cifar10, 'r:gz').extractall()
print('Uncompression finished!')

()
Download finished!
Uncompression finished!


In [8]:
!mkdir train_ckpt

mkdir: cannot create directory ‘train_ckpt’: File exists


In [9]:
!ls

cifar-10-batches-bin  cifar-10-binary.tar.gz  sample_data  train_ckpt


## Define Resnet50 Model

In [0]:
"""ResNet model.

Related papers:
https://arxiv.org/pdf/1603.05027v2.pdf
https://arxiv.org/pdf/1512.03385v1.pdf
https://arxiv.org/pdf/1605.07146v1.pdf
"""
from collections import namedtuple

import numpy as np
import tensorflow as tf
import six

from tensorflow.python.training import moving_averages


HParams = namedtuple('HParams',
                     'batch_size, num_classes, min_lrn_rate, lrn_rate, '
                     'num_residual_units, use_bottleneck, weight_decay_rate, '
                     'relu_leakiness')


class ResNet(object):
  """ResNet model."""

  def __init__(self, hps, images, labels, mode):
    """ResNet constructor.

    Args:
      hps: Hyperparameters.
      images: Batches of images. [batch_size, image_size, image_size, 3]
      labels: Batches of labels. [batch_size, num_classes]
      mode: One of 'train' and 'eval'.
    """
    self.hps = hps
    self._images = images
    self.labels = labels
    self.mode = mode

    self._extra_train_ops = []

  def build_graph(self):
    """Build a whole graph for the model."""
    self.global_step = tf.train.get_or_create_global_step()
    self._build_model()
    if self.mode == 'train':
      self._build_train_op()
    self.summaries = tf.summary.merge_all()

  def _stride_arr(self, stride):
    """Map a stride scalar to the stride array for tf.nn.conv2d."""
    return [1, stride, stride, 1]

  def _build_model(self):
    """Build the core model within the graph."""
    with tf.variable_scope('init'):
      x = self._images
      x = self._conv('init_conv', x, 3, 3, 16, self._stride_arr(1))

    strides = [1, 2, 2]
    activate_before_residual = [True, False, False]
    if self.hps.use_bottleneck:
      res_func = self._bottleneck_residual
      filters = [16, 64, 128, 256]
    else:
      res_func = self._residual
      filters = [16, 16, 32, 64]
      # Uncomment the following codes to use w28-10 wide residual network.
      # It is more memory efficient than very deep residual network and has
      # comparably good performance.
      # https://arxiv.org/pdf/1605.07146v1.pdf
      # filters = [16, 160, 320, 640]
      # Update hps.num_residual_units to 4

    with tf.variable_scope('unit_1_0'):
      x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
                   activate_before_residual[0])
    for i in six.moves.range(1, self.hps.num_residual_units):
      with tf.variable_scope('unit_1_%d' % i):
        x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)

    with tf.variable_scope('unit_2_0'):
      x = res_func(x, filters[1], filters[2], self._stride_arr(strides[1]),
                   activate_before_residual[1])
    for i in six.moves.range(1, self.hps.num_residual_units):
      with tf.variable_scope('unit_2_%d' % i):
        x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)

    with tf.variable_scope('unit_3_0'):
      x = res_func(x, filters[2], filters[3], self._stride_arr(strides[2]),
                   activate_before_residual[2])
    for i in six.moves.range(1, self.hps.num_residual_units):
      with tf.variable_scope('unit_3_%d' % i):
        x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)

    with tf.variable_scope('unit_last'):
      x = self._batch_norm('final_bn', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._global_avg_pool(x)

    with tf.variable_scope('logit'):
      logits = self._fully_connected(x, self.hps.num_classes)
      self.predictions = tf.nn.softmax(logits)

    with tf.variable_scope('costs'):
      xent = tf.nn.softmax_cross_entropy_with_logits(
          logits=logits, labels=self.labels)
      self.cost = tf.reduce_mean(xent, name='xent')
      self.cost += self._decay()

      tf.summary.scalar('cost', self.cost)

  def _build_train_op(self):
    """Build training specific ops for the graph."""
    self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
    tf.summary.scalar('learning_rate', self.lrn_rate)

    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(self.cost, trainable_variables)

    
    #########################################################################
    #### FIXME: Create an optimizer using self.lrn_rate as learning rate ####
    #########################################################################
#     optimizer = tf.train.AdamOptimizer(self.lrn_rate)
    optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)

    apply_op = optimizer.apply_gradients(
        zip(grads, trainable_variables),
        name='train_step')
    with tf.control_dependencies([apply_op]):
      apply_op = tf.assign_add(self.global_step, 1)
   
    train_ops = [apply_op] + self._extra_train_ops
    self.train_op = tf.group(*train_ops)

  # TODO(xpan): Consider batch_norm in contrib/layers/python/layers/layers.py
  def _batch_norm(self, name, x):
    """Batch normalization."""
    with tf.variable_scope(name):
      params_shape = [x.get_shape()[-1]]

      beta = tf.get_variable(
          'beta', params_shape, tf.float32,
          initializer=tf.constant_initializer(0.0, tf.float32))
      gamma = tf.get_variable(
          'gamma', params_shape, tf.float32,
          initializer=tf.constant_initializer(1.0, tf.float32))

      if self.mode == 'train':
        mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')

        moving_mean = tf.get_variable(
            'moving_mean', params_shape, tf.float32,
            initializer=tf.constant_initializer(0.0, tf.float32),
            trainable=False)
        moving_variance = tf.get_variable(
            'moving_variance', params_shape, tf.float32,
            initializer=tf.constant_initializer(1.0, tf.float32),
            trainable=False)

        self._extra_train_ops.append(moving_averages.assign_moving_average(
            moving_mean, mean, 0.9))
        self._extra_train_ops.append(moving_averages.assign_moving_average(
            moving_variance, variance, 0.9))
      else:
        mean = tf.get_variable(
            'moving_mean', params_shape, tf.float32,
            initializer=tf.constant_initializer(0.0, tf.float32),
            trainable=False)
        variance = tf.get_variable(
            'moving_variance', params_shape, tf.float32,
            initializer=tf.constant_initializer(1.0, tf.float32),
            trainable=False)
        tf.summary.histogram(mean.op.name, mean)
        tf.summary.histogram(variance.op.name, variance)
      # epsilon used to be 1e-5. Maybe 0.001 solves NaN problem in deeper net.
      y = tf.nn.batch_normalization(
          x, mean, variance, beta, gamma, 0.001)
      y.set_shape(x.get_shape())
      return y

  def _residual(self, x, in_filter, out_filter, stride,
                activate_before_residual=False):
    """Residual unit with 2 sub layers."""
    if activate_before_residual:
      with tf.variable_scope('shared_activation'):
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)
        orig_x = x
    else:
      with tf.variable_scope('residual_only_activation'):
        orig_x = x
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)

    with tf.variable_scope('sub1'):
      x = self._conv('conv1', x, 3, in_filter, out_filter, stride)

    with tf.variable_scope('sub2'):
      x = self._batch_norm('bn2', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])

    with tf.variable_scope('sub_add'):
      if in_filter != out_filter:
        orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
        orig_x = tf.pad(
            orig_x, [[0, 0], [0, 0], [0, 0],
                     [(out_filter-in_filter)//2, (out_filter-in_filter)//2]])
      x += orig_x

    tf.logging.debug('image after unit %s', x.get_shape())
    return x

  def _bottleneck_residual(self, x, in_filter, out_filter, stride,
                           activate_before_residual=False):
    """Bottleneck residual unit with 3 sub layers."""
    if activate_before_residual:
      with tf.variable_scope('common_bn_relu'):
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)
        orig_x = x
    else:
      with tf.variable_scope('residual_bn_relu'):
        orig_x = x
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)

    with tf.variable_scope('sub1'):
      x = self._conv('conv1', x, 1, in_filter, out_filter/4, stride)

    with tf.variable_scope('sub2'):
      x = self._batch_norm('bn2', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._conv('conv2', x, 3, out_filter/4, out_filter/4, [1, 1, 1, 1])

    with tf.variable_scope('sub3'):
      x = self._batch_norm('bn3', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._conv('conv3', x, 1, out_filter/4, out_filter, [1, 1, 1, 1])

    with tf.variable_scope('sub_add'):
      if in_filter != out_filter:
        orig_x = self._conv('project', orig_x, 1, in_filter, out_filter, stride)
      x += orig_x

    tf.logging.info('image after unit %s', x.get_shape())
    return x

  def _decay(self):
    """L2 weight decay loss."""
    costs = []
    for var in tf.trainable_variables():
      if var.op.name.find(r'DW') > 0:
        costs.append(tf.nn.l2_loss(var))
        # tf.summary.histogram(var.op.name, var)

    return tf.multiply(self.hps.weight_decay_rate, tf.add_n(costs))

  def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
    """Convolution."""
    with tf.variable_scope(name):
      n = filter_size * filter_size * out_filters
      kernel = tf.get_variable(
          'DW', [filter_size, filter_size, in_filters, out_filters],
          tf.float32, initializer=tf.random_normal_initializer(
              stddev=np.sqrt(2.0/n)))
      return tf.nn.conv2d(x, kernel, strides, padding='SAME')

  def _relu(self, x, leakiness=0.0):
    """Relu, with optional leaky support."""
    return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')

  def _fully_connected(self, x, out_dim):
    """FullyConnected layer for final output."""
    x = tf.reshape(x, [self.hps.batch_size, -1])
    w = tf.get_variable(
        'DW', [x.get_shape()[1], out_dim],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    b = tf.get_variable('biases', [out_dim],
                        initializer=tf.constant_initializer())
    return tf.nn.xw_plus_b(x, w, b)

  def _global_avg_pool(self, x):
    assert x.get_shape().ndims == 4
    return tf.reduce_mean(x, [1, 2])

## CIFAR10 input

In [0]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""CIFAR dataset input module.
"""

import tensorflow as tf

def build_input(dataset, data_path, batch_size, mode):
  """Build CIFAR image and labels.

  Args:
    dataset: Either 'cifar10' or 'cifar100'.
    data_path: Filename for data.
    batch_size: Input batch size.
    mode: Either 'train' or 'eval'.
  Returns:
    images: Batches of images. [batch_size, image_size, image_size, 3]
    labels: Batches of labels. [batch_size, num_classes]
  Raises:
    ValueError: when the specified dataset is not supported.
  """
  image_size = 32
  if dataset == 'cifar10':
    label_bytes = 1
    label_offset = 0
    num_classes = 10
  elif dataset == 'cifar100':
    label_bytes = 1
    label_offset = 1
    num_classes = 100
  else:
    raise ValueError('Not supported dataset %s', dataset)

  depth = 3
  image_bytes = image_size * image_size * depth
  record_bytes = label_bytes + label_offset + image_bytes

  def parse_data(value): 
    # Convert these examples to dense labels and processed images.
    record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
    label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)

    # Convert from string to [depth * height * width] to [depth, height, width].
    depth_major = tf.reshape(tf.slice(record, [label_offset + label_bytes], [image_bytes]),
                           [depth, image_size, image_size])
    # Convert from [depth, height, width] to [height, width, depth].
    image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)

    if mode == 'train':
      image = tf.image.resize_image_with_crop_or_pad(
        image, image_size+4, image_size+4)
      image = tf.random_crop(image, [image_size, image_size, 3])
      image = tf.image.random_flip_left_right(image)
      # Brightness/saturation/constrast provides small gains .2%~.5% on cifar.
      # image = tf.image.random_brightness(image, max_delta=63. / 255.)
      # image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
      # image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
      image = tf.image.per_image_standardization(image)

    else:
      image = tf.image.resize_image_with_crop_or_pad(
        image, image_size, image_size)
      image = tf.image.per_image_standardization(image)

    return image, label

  data_files = tf.gfile.Glob(data_path)
  data_files.sort()
  ds = tf.data.Dataset.from_tensor_slices(data_files)
  #############################################################################
  #### FIXME: Create an input pipline using tf.data.Dataset and parse_data ####
  #############################################################################
#   ??? 
#   iterator = ???
  ds = tf.data.FixedLengthRecordDataset(data_files,record_bytes)
  ds = ds.map(parse_data)
  # Dataset.shuffle: shuffle data
  ds = ds.shuffle(10000)
  # Dataset.repeat: repeate the dataset (not only 1 epoch)
  ds = ds.repeat()
  # Dataset.batch: Combine elements to generate a batch of data
  ds = ds.batch(batch_size)
  iterator = ds.make_one_shot_iterator()
  
  images, labels = iterator.get_next()
  

  images = tf.reshape(images, [batch_size, image_size, image_size, depth])
  labels = tf.reshape(labels, [batch_size, 1])
  indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
  labels = tf.sparse_to_dense(
      tf.concat(values=[indices, labels], axis=1),
      [batch_size, num_classes], 1.0, 0.0) 
  return images, labels

## Resnet Train
Do not be frightened if you face such an error: 
`An exception has occurred, use %tb to see the full traceback.  SystemExit`

In [6]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""ResNet Train/Eval module.
"""
import time
import six
import sys

import numpy as np
import tensorflow as tf

# Global
train_data_path = './cifar-10-batches-bin/data_batch*'
image_size = 32
ckpt_dir = './train_ckpt'
ckpt_prefix = ckpt_dir + '/cifar10-train'

def train(hps , batch_size): #batch_size 넣음
  """Training loop."""
  with tf.Graph().as_default():
    images, labels = build_input(
      'cifar10', train_data_path, hps.batch_size, 'train')
    model = ResNet(hps, images, labels, 'train')
    model.build_graph()

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    init = tf.global_variables_initializer()

    ###########################################
    #### FIXME: Create an checkpoint Saver ####
    #### WARNING: use max_to_keep arg      ####
    ###########################################
    saver = tf.train.Saver(max_to_keep=30)

    with tf.Session() as sess:
      sess.run(init)
      for i in range(3001):
        _, global_step, cost, precision_ = \
          sess.run([model.train_op, model.global_step, model.cost, precision])

        if global_step % 100 == 0:
#           elapsed = time.time() - start # 시간 추가
#           print('step: %d, loss: %.3f, precision: %.3f , throughput = %.3f img/sec' % (global_step, cost, precision_ , 100 * batch_size / elapsed))
          print('step: %d, loss: %.3f, precision: %.3f ' % (global_step, cost, precision_ ))
          #################################################
          #### FIXME: Save the model using Saver       ####
          #### Use ckpt_prefix as checkpoint save path ####
          #################################################
          saver.save(sess, ckpt_prefix , global_step=global_step)
#           start = time.time()

        
def main(_):
  batch_size = 128

  hps = HParams(batch_size=batch_size,
                             num_classes=10,
                             min_lrn_rate=0.0001,
                             lrn_rate=0.1,
                             num_residual_units=5,
                             use_bottleneck=False,
                             weight_decay_rate=0.0002,
                             relu_leakiness=0.1)

  train(hps , batch_size)

if __name__ == '__main__':
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.app.run()

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

step: 0, loss: 2.591, precision: 0.125 
step: 100, loss: 1.951, precision: 0.328 
step: 200, loss: 1.774, precision: 0.375 
step: 300, loss: 1.624, precision: 0.406 
step: 400, loss: 1.562, precision: 0.547 
step: 500, loss: 1.460, precision: 0.531 
step: 600, loss: 1.320, precision: 0.578 
step: 700, loss: 1.219, precision: 0.648 
step: 800, loss: 1.326, precision: 0.562 
step: 900, loss: 1.073, precision: 0.727 
step: 1000, loss: 1.336, precision: 0.547 
step: 1100, loss: 1.106, precision: 0.664 
step: 1200, loss: 1.053, precision: 0.734 
step: 1300, loss: 1.031, precision: 0.719 
step: 1400, loss: 1.136, precision: 0.648 
step: 1500, loss: 1.221, precision: 0.602 
step: 1

SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


### You can see *cifar10-train-0~3000* checkpoint files when you run following code, after you train model.

In [12]:
!ls train_ckpt

checkpoint				cifar10-train-2300.data-00000-of-00001
cifar10-train-1000.data-00000-of-00001	cifar10-train-2300.index
cifar10-train-1000.index		cifar10-train-2300.meta
cifar10-train-1000.meta			cifar10-train-2400.data-00000-of-00001
cifar10-train-100.data-00000-of-00001	cifar10-train-2400.index
cifar10-train-100.index			cifar10-train-2400.meta
cifar10-train-100.meta			cifar10-train-2500.data-00000-of-00001
cifar10-train-1100.data-00000-of-00001	cifar10-train-2500.index
cifar10-train-1100.index		cifar10-train-2500.meta
cifar10-train-1100.meta			cifar10-train-2600.data-00000-of-00001
cifar10-train-1200.data-00000-of-00001	cifar10-train-2600.index
cifar10-train-1200.index		cifar10-train-2600.meta
cifar10-train-1200.meta			cifar10-train-2700.data-00000-of-00001
cifar10-train-1300.data-00000-of-00001	cifar10-train-2700.index
cifar10-train-1300.index		cifar10-train-2700.meta
cifar10-train-1300.meta			cifar10-train-2800.data-00000-of-00001
cifar10-train-1400.data-00000-of-00001	cifar10-train-2

## Resnet Eval

Before you run this code
click Runtime->**restart runtime**

(If you want to erase all the local files, then click *RESET ALL RUNTIMES* or **DO NOT CLICK!**)

and restart **Define Resnet50 Model**,  **CIFAR10 input**

Do not be frightened if you face such an error: 
`An exception has occurred, use %tb to see the full traceback.  SystemExit`

In [13]:
!rm -rf './tensorboard'
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""ResNet Train/Eval module.
"""
import time
import six
import sys

import numpy as np
import tensorflow as tf

eval_data_path = './cifar-10-batches-bin/test_batch.bin'
ckpt_dir = './train_ckpt'
tensorboard_path = './tensorboard'

def evaluate(hps):
  """Eval loop."""
  with tf.Graph().as_default():
    images, labels = build_input(
        'cifar10', './cifar-10-batches-bin/test_batch.bin', hps.batch_size, 'eval')
    model = ResNet(hps, images, labels, 'eval')
    model.build_graph()

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    saver = tf.train.Saver() 

    ######################################
    #### FIXME: Make a summary writer ####
    #### Use tensorboard_path as path ####
    ######################################
    summary_writer = tf.summary.FileWriter(tensorboard_path, sess.graph)

    try:
      ########################################
      #### FIXME: Create checkpoint state ####
      #### Use ckpt_dir as path           ####
      ########################################
#       ckpt_state = ???
      # Get checkpoint state from checkpoint dir
      ckpt_state = tf.train.get_checkpoint_state(ckpt_dir)
      
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
    if not (ckpt_state):
      tf.logging.info('No model to eval yet at %s', ckpt_dir)

    best_precision = 0.
    ############################################################
    #### FIXME: Restore checkpoint from first one to latest ####
    #### Use ckpt_state                                     ####
    ###########################################################
    for i in range(len(ckpt_state.all_model_checkpoint_paths)): #range 를 어떻게 해야할지 모르겠다. 아직 Code 분석 없이 돌아가게만 만들었음.10/6 02:13
#     for i in range(30):
#       tf.logging.info('Loading checkpoint %s', ???)
#       saver.restore(sess, ???)
      if ckpt_state is not None:
        # all_model_checkpoint_paths: the array of checkpoints
        tf.logging.info('Loading checkpoint %s', ckpt_state.all_model_checkpoint_paths[i])
        # Restore the lastest checkpoint
        saver.restore(sess, ckpt_state.all_model_checkpoint_paths[i])

      total_prediction, correct_prediction = 0, 0

      for _ in six.moves.range(100):
        (summaries, loss, predictions, truth, train_step) = sess.run(
            [model.summaries, model.cost, model.predictions,
             model.labels, model.global_step])

        truth = np.argmax(truth, axis=1)
        predictions = np.argmax(predictions, axis=1)
        correct_prediction += np.sum(truth == predictions)
        total_prediction += predictions.shape[0]

      precision = 1.0 * correct_prediction / total_prediction
      best_precision = max(precision, best_precision)


      ############################################################
      #### FIXME: Add summary of precision and best_precision ####
      ############################################################
      precisionSum = tf.Summary()
      precisionSum.value.add(tag='precisionSum' , simple_value=precision)
      summary_writer.add_summary(precisionSum , train_step)
      best_precisionSum = tf.Summary()
      best_precisionSum.value.add(tag='best_precisionSum' , simple_value=best_precision)
      summary_writer.add_summary(best_precisionSum , train_step)

      tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                      (loss, precision, best_precision))
      summary_writer.flush()

      tf.logging.info('loss: %.3f, precision: %.3f' %
                      (loss, precision))


def main(_):

  hps = HParams(batch_size=100,
                num_classes=10,
                min_lrn_rate=0.0001,
                lrn_rate=0.1,
                num_residual_units=5,
                use_bottleneck=False,
                weight_decay_rate=0.0002,
                relu_leakiness=0.1)

  evaluate(hps)


if __name__ == '__main__':
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.app.run()

INFO:tensorflow:Loading checkpoint ./train_ckpt/cifar10-train-100
INFO:tensorflow:Restoring parameters from ./train_ckpt/cifar10-train-100
INFO:tensorflow:loss: 1.981, precision: 0.326, best precision: 0.326
INFO:tensorflow:loss: 1.981, precision: 0.326
INFO:tensorflow:Loading checkpoint ./train_ckpt/cifar10-train-200
INFO:tensorflow:Restoring parameters from ./train_ckpt/cifar10-train-200
INFO:tensorflow:loss: 2.080, precision: 0.349, best precision: 0.349
INFO:tensorflow:loss: 2.080, precision: 0.349
INFO:tensorflow:Loading checkpoint ./train_ckpt/cifar10-train-300
INFO:tensorflow:Restoring parameters from ./train_ckpt/cifar10-train-300
INFO:tensorflow:loss: 1.535, precision: 0.456, best precision: 0.456
INFO:tensorflow:loss: 1.535, precision: 0.456
INFO:tensorflow:Loading checkpoint ./train_ckpt/cifar10-train-400
INFO:tensorflow:Restoring parameters from ./train_ckpt/cifar10-train-400
INFO:tensorflow:loss: 1.778, precision: 0.417, best precision: 0.456
INFO:tensorflow:loss: 1.778, p

SystemExit: ignored

In [14]:
ckpt_state = tf.train.get_checkpoint_state(ckpt_dir)
ckpt_state.all_model_checkpoint_paths

[u'./train_ckpt/cifar10-train-100', u'./train_ckpt/cifar10-train-200', u'./train_ckpt/cifar10-train-300', u'./train_ckpt/cifar10-train-400', u'./train_ckpt/cifar10-train-500', u'./train_ckpt/cifar10-train-600', u'./train_ckpt/cifar10-train-700', u'./train_ckpt/cifar10-train-800', u'./train_ckpt/cifar10-train-900', u'./train_ckpt/cifar10-train-1000', u'./train_ckpt/cifar10-train-1100', u'./train_ckpt/cifar10-train-1200', u'./train_ckpt/cifar10-train-1300', u'./train_ckpt/cifar10-train-1400', u'./train_ckpt/cifar10-train-1500', u'./train_ckpt/cifar10-train-1600', u'./train_ckpt/cifar10-train-1700', u'./train_ckpt/cifar10-train-1800', u'./train_ckpt/cifar10-train-1900', u'./train_ckpt/cifar10-train-2000', u'./train_ckpt/cifar10-train-2100', u'./train_ckpt/cifar10-train-2200', u'./train_ckpt/cifar10-train-2300', u'./train_ckpt/cifar10-train-2400', u'./train_ckpt/cifar10-train-2500', u'./train_ckpt/cifar10-train-2600', u'./train_ckpt/cifar10-train-2700', u'./train_ckpt/cifar10-train-2800', 

### Display our graph on tensorboard!

In [15]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

#run tensorboard
LOG_DIR = './tensorboard'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)
#run ngrok
get_ipython().system_raw('./ngrok http 6006 &')

--2018-10-11 04:21:17--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 52.73.9.93, 52.72.251.164, 54.152.208.69, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.73.9.93|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5363700 (5.1M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2018-10-11 04:21:18 (9.42 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [5363700/5363700]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [16]:
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

https://1a547024.ngrok.io
