# Cifar10: TF Custom training: walkthrough



# 0. 환경 셋업

In [23]:
# ! pip install tensorflow-gpu==2.4.1

In [24]:
import tensorflow as tf
import numpy as np

import os

from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
print(tf.__version__)

2.4.1


In [25]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # 텐서플로가 첫 번째 GPU만 사용하도록 제한
  try:
    tf.config.experimental.set_visible_devices(gpus[7], 'GPU')
  except RuntimeError as e:
    # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
    print(e)

# 1. 데이터 준비

In [26]:
HEIGHT = 32
WIDTH = 32
DEPTH = 3
NUM_CLASSES = 10
NUM_DATA_BATCHES = 5
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 10000 * NUM_DATA_BATCHES
INPUT_TENSOR_NAME = 'inputs_input'  # needs to match the name of the first layer + "_input"



def get_filenames(channel_name, channel):
    if channel_name in ['train', 'validation', 'eval']:
        return [os.path.join(channel, channel_name + '.tfrecords')]
    else:
        raise ValueError('Invalid data subset "%s"' % channel_name)

def _input(epochs, batch_size, channel, channel_name):

    print(f"\nChannel Name: {channel_name}\n")     
    filenames = get_filenames(channel_name, channel)
    dataset = tf.data.TFRecordDataset(filenames)
    #dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=3)

    ds_size = sum(1 for _ in dataset)    
    # print("# of batches loading TFRecord : {0}".format(tf.data.experimental.cardinality(dataset).numpy()))    
    print("# of batches loading TFRecord : {0}".format(ds_size)) 
    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    dataset = dataset.repeat(1)    
    
    # Potentially shuffle records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random
        # shuffling.
        # buffer_size = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        buffer_size = ds_size
        dataset = dataset.shuffle(buffer_size=buffer_size)
 
        print("buffer_size: ", buffer_size)
    
    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset


def _train_preprocess_fn(image):
    """Preprocess a single training image of layout [height, width, depth]."""
    # Resize the image to add four extra pixels on each side.
    image = tf.image.resize_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8)

    # Randomly crop a [HEIGHT, WIDTH] section of the image.
    image = tf.image.random_crop(image, [HEIGHT, WIDTH, DEPTH])

    # Randomly flip the image horizontally.
    image = tf.image.random_flip_left_right(image)

    return image


def _dataset_parser(value):
    """Parse a CIFAR-10 record from value."""
    featdef = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }

    example = tf.io.parse_single_example(value, featdef)
    image = tf.io.decode_raw(example['image'], tf.uint8)
    image.set_shape([DEPTH * HEIGHT * WIDTH])

    # Reshape from [depth * height * width] to [depth, height, width].
    image = tf.cast(
        tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),
        tf.float32)
    label = tf.cast(example['label'], tf.int32)

    image = _train_preprocess_fn(image)

    return image, label    
#    return image, tf.one_hot(label, NUM_CLASSES)

def save_model(model, output):
    tf.saved_model.save(model, output+'/1/')
    logging.info("Model successfully saved at: {}".format(output))
    return



In [27]:
train_dir = '../../data/cifar10/train'
validation_dir = '../../data/cifar10/validation'
eval_dir = '../../data/cifar10/eval'

In [34]:
train_dataset = _input(5, 8, train_dir, 'train')
train_batch_size = sum(1 for _ in train_dataset)    
print("# of batches in train: ", train_batch_size)

train_dataset2 = _input(5, 8, train_dir, 'train')
train_batch_size = sum(1 for _ in train_dataset2)    
print("# of batches in train: ", train_batch_size)

train_dataset3 = _input(5, 8, train_dir, 'train')
train_batch_size = sum(1 for _ in train_dataset3)    
print("# of batches in train: ", train_batch_size)




Channel Name: train

# of batches loading TFRecord : 40000
buffer_size:  40000
# of batches in train:  5000

Channel Name: train

# of batches loading TFRecord : 40000
buffer_size:  40000
# of batches in train:  5000

Channel Name: train

# of batches loading TFRecord : 40000
buffer_size:  40000
# of batches in train:  5000


In [36]:
batch_num = 1
for images, labels in train_dataset.take(batch_num):
    labels = labels.numpy()
    print(labels)
    # print(labels.numpy().mean())
    break

batch_num = 1
for images, labels in train_dataset2.take(batch_num):
    labels = labels.numpy()
    print(labels)
    # print(labels.numpy().mean())
    break

batch_num = 1
for images, labels in train_dataset3.take(batch_num):
    labels = labels.numpy()
    print(labels)
    # print(labels.numpy().mean())
    break

    



[1 8 9 3 6 9 7 0]
[1 1 8 6 4 1 7 2]
[2 9 2 2 9 5 4 2]


In [6]:
train_dataset = _input(5, 256, train_dir, 'train')
train_batch_size = sum(1 for _ in train_dataset)    
print("# of batches in train: ", train_batch_size)


validation_dataset = _input(5, 10000, validation_dir, 'validation')
validation_batch_size = sum(1 for _ in validation_dataset)    
print("# of batches in validation: ", validation_batch_size)


eval_dataset = _input(5, 10000, eval_dir, 'eval')
eval_batch_size = sum(1 for _ in eval_dataset)    
print("# of batches in eval: ", eval_batch_size)


Channel Name: train

# of batches loading TFRecord : 40000
buffer_size:  40000
# of batches in train:  156

Channel Name: validation

# of batches loading TFRecord : 10000
# of batches in validation:  1

Channel Name: eval

# of batches loading TFRecord : 10000
# of batches in eval:  1


# 2. 모델 정의

In [7]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
        tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
)

# 3. 모델 생성

## Gradient 생성 함수 정의

In [8]:

loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

@tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    return loss



In [9]:
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
#test_accuracy = tf.keras.metrics.CategoricalCrossentropy(name='test_accuracy')

@tf.function
def test_step(images, labels):
    predictions = model(images)
    t_loss = loss_object(labels, predictions)
    print("t_loss: ", t_loss)

    test_loss(t_loss)
    test_accuracy(labels, predictions)


In [11]:
@tf.function
def test_step2(test_loss,test_accuracy, images, labels):
    predictions = model(images)
    t_loss = loss_object(labels, predictions)
    print("t_loss: ", t_loss)

    test_loss(t_loss)
    test_accuracy(labels, predictions)



# 4. 모델 훈련

In [12]:
print_interval = 200
EPOCHS = 2

for epoch in range(EPOCHS):
    for batch, (images, labels) in enumerate(train_dataset):
        loss_value = train_step(images, labels)
        
        if batch % print_interval == 0:
            print("Step #%d\tLoss: %.6f" %  (batch, loss_value))
        
        
    # Reset the metrics at the start of the next epoch
    test_loss.reset_states()
    test_accuracy.reset_states()
        

    for test_images, test_labels in validation_dataset:
#         test_step(test_images, test_labels)
        test_step2(test_loss, test_accuracy, test_images, test_labels)        


    template = 'Epoch {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch+1,
                        test_loss.result(),
                        test_accuracy.result()*100))


print('Training Finished.')

Step #0	Loss: 1.730120
t_loss:  Tensor("sparse_categorical_crossentropy/weighted_loss/value:0", shape=(), dtype=float32)
Epoch 1, Test Loss: 1.7549176216125488, Test Accuracy: 36.869998931884766
Step #0	Loss: 1.836365
Epoch 2, Test Loss: 1.6288703680038452, Test Accuracy: 40.88999938964844
Training Finished.


# 5. 추론

In [12]:
# Reset the metrics at the start of the next epoch
test_loss.reset_states()
test_accuracy.reset_states()

n_batch = 10

#for batch_id, (test_images, test_labels) in enumerate(validation_dataset.take(n_batch)):
for batch_id, (test_images, test_labels) in enumerate(eval_dataset):
    print(batch_id)
    test_step(test_images, test_labels)

    print(
    #ㅁ    f'Epoch {epoch + 1}, '
    f'Test Loss: {test_loss.result()}, '
    f'Test Accuracy: {test_accuracy.result() * 100}'
    )
    break

0
Test Loss: 1.22633695602417, Test Accuracy: 56.849998474121094
