In [1]:
import math
import pandas as pd

from absl import app
from absl import flags

import tensorflow as tf
import functools
from tqdm import tqdm
import os
from pathlib import Path
from model.model import GeMPoolingLayer, DelfArcFaceModel, ArcFaceLayer
from utils.preprocessing import CreateDataset

# Distribution strategy shared by the later cells: it supplies num_replicas_in_sync
# for the global batch size, the scope in which the datasets and model are built,
# and the run/reduce calls used by the distributed train/test steps.
strategy = tf.distribute.MirroredStrategy()


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [2]:
# ---------------------------------------------------------------------------
# Training hyperparameters
# ---------------------------------------------------------------------------
EPOCHS = 10
batch_size = 32
image_size = 224
learning_rate = 1e-4  # should be smaller than training on single GPU
feature_size = 2048  # Embedding size before the output layer
save_interval = 1000

# ---------------------------------------------------------------------------
# ArcFace head
# ---------------------------------------------------------------------------
# DELG used 0.1, original ArcFace paper used 0.5. When margin is 0, it should
# be the same as doing a normal softmax but with embedding and weight normalised.
margin = 0.1
# Logit scale is the integer part of sqrt(feature_size).
logit_scale = int(math.sqrt(feature_size))

# ---------------------------------------------------------------------------
# GeM pooling
# ---------------------------------------------------------------------------
gem_p = 3.0
train_p = False  # whether to learn gem_p or not

# ---------------------------------------------------------------------------
# Training metadata read from train.csv
# ---------------------------------------------------------------------------
data_dir = "/home/ubuntu/Dacon/jin/NIA"
training_csv_path = os.path.join(data_dir, "train.csv")
train_csv = pd.read_csv(training_csv_path)

# One row per training sample, counted through the "id" column.
num_samples = len(train_csv["id"])

# Distinct landmark ids, in order of first appearance, as an int64 tensor.
unique_landmark_ids = tf.convert_to_tensor(
    train_csv["landmark_id"].unique().tolist(),
    dtype=tf.int64,
)

In [3]:
# Set to the number of rows in train.csv.
# NOTE(review): not referenced by any of the visible cells — presumably meant as a
# shuffle buffer size; confirm or remove.
BUFFER_SIZE = num_samples

# Per-replica batch size, multiplied by the replica count to get the batch size
# that the datasets are batched with and the loss is averaged over.
BATCH_SIZE_PER_REPLICA = 128
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

# NOTE(review): repeats the EPOCHS = 10 already set in the hyperparameter cell;
# keep the two in sync or drop one of them.
EPOCHS = 10

In [4]:
# Checkpoint location and TFRecord sources for the two splits.
checkpoint_dir = "/home/ubuntu/Dacon/jin/NIA/checkpoint/"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
train_tf_records_dir = "/home/ubuntu/Dacon/jin/NIA/tfrecords/train-*"
test_tf_records_dir = "/home/ubuntu/Dacon/jin/NIA/tfrecords/validation-*"


def _batched_and_distributed(records_path):
    """Build the batched dataset for ``records_path`` and its distributed view.

    ``records_path`` is passed straight to ``CreateDataset`` together with
    ``unique_landmark_ids``; the result is batched with ``GLOBAL_BATCH_SIZE``.

    Returns:
        A ``(batched_dataset, distributed_dataset)`` pair, where the second
        element comes from ``strategy.experimental_distribute_dataset``.
    """
    batched = CreateDataset(records_path, unique_landmark_ids).batch(GLOBAL_BATCH_SIZE)
    return batched, strategy.experimental_distribute_dataset(batched)


with strategy.scope():
    training_set, train_dist_dataset = _batched_and_distributed(train_tf_records_dir)
    test_set, test_dist_dataset = _batched_and_distributed(test_tf_records_dir)
    


In [None]:
     with strategy.scope():
        # Model, optimizer, loss, metrics and checkpoint are all created inside
        # the strategy scope.
        model = DelfArcFaceModel(
            input_shape=(image_size, image_size, 3),
            n_classes=len(unique_landmark_ids),
            margin=margin,
            logit_scale=logit_scale,
            p=gem_p,
            train_p=train_p,
            feature_size=feature_size,
        )
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        # Reduction.NONE keeps per-example losses so compute_loss can average
        # them over the global batch size itself.
        loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        train_loss = tf.keras.metrics.Mean(name='train_loss')
        train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
        test_loss = tf.keras.metrics.Mean(name='test_loss')
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
        checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

        def compute_loss(labels, predictions):
            """Return the per-example losses averaged over GLOBAL_BATCH_SIZE."""
            per_example_loss = loss_object(labels, predictions)
            return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

        def train_step(images, labels):
            """Run one optimisation step on a single replica and return its loss.

            Also updates the ``train_loss`` and ``train_accuracy`` metrics.
            """
            with tf.GradientTape() as tape:
                # training=True is only needed if there are layers with different
                # behavior during training versus inference (e.g. Dropout).
                predictions = model((images, labels), training=True)
                loss = compute_loss(labels, predictions)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            # compute_loss divides by the global batch size, so scale back up by
            # the replica count before recording the value in the metric.
            train_loss(loss * strategy.num_replicas_in_sync)
            train_accuracy(labels, predictions)
            return loss

        def test_step(images, labels):
            """Evaluate one batch on a single replica and update the test metrics."""
            # training=False is only needed if there are layers with different
            # behavior during training versus inference (e.g. Dropout).
            # NOTE(review): train_step feeds (images, labels) to the model while this
            # call feeds images only — confirm DelfArcFaceModel accepts both forms.
            predictions = model(images, training=False)
            t_loss = loss_object(labels, predictions)

            test_loss(t_loss)
            test_accuracy(labels, predictions)

        @tf.function
        def distributed_train_step(images, labels):
            """Run train_step on every replica and return the summed loss."""
            per_replica_losses = strategy.run(train_step, args=(images, labels,))
            return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                                   axis=None)

        @tf.function
        def distributed_test_step(images, labels):
            """Run test_step on every replica."""
            return strategy.run(test_step, args=(images, labels,))

        progress_template = 'Epoch {}, Training, Loss: {:.4f}, Accuracy: {:.4f}'
        summary_template = ("에포크 {}, 손실: {}, 정확도: {}, 테스트 손실: {}, "
                            "테스트 정확도: {}")
        # Progress-bar length in batches. The 0.8 factor is carried over from the
        # original total — presumably the training share of num_samples; TODO confirm.
        estimated_steps = math.ceil(int(num_samples) * 0.8 / GLOBAL_BATCH_SIZE)

        for epoch in range(EPOCHS):
            # Training loop
            total_loss = 0.0
            num_batches = 0
            with tqdm(total=estimated_steps) as pbar:
                for x, y in train_dist_dataset:
                    total_loss += distributed_train_step(x, y)
                    num_batches += 1
                    pbar.update(1)
                    pbar.set_description(
                        progress_template.format(
                            epoch + 1, train_loss.result(), train_accuracy.result() * 100
                        )
                    )
            # Stored under its own name so the train_loss metric object is not
            # overwritten; an empty training dataset yields NaN instead of a
            # ZeroDivisionError.
            epoch_train_loss = total_loss / num_batches if num_batches else float("nan")

            # Evaluation loop
            for x, y in test_dist_dataset:
                distributed_test_step(x, y)

            # Checkpoint after every epoch.
            checkpoint.save(checkpoint_prefix)

            print(summary_template.format(epoch + 1, epoch_train_loss,
                                          train_accuracy.result() * 100, test_loss.result(),
                                          test_accuracy.result() * 100))

            # Clear every metric so the next epoch starts from a clean state.
            train_loss.reset_states()
            test_loss.reset_states()
            train_accuracy.reset_states()
            test_accuracy.reset_states()

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu