In [1]:
import math
import pandas as pd

from absl import app
from absl import flags

import tensorflow as tf
import functools
from tqdm import tqdm
import os
from pathlib import Path
from model.model import GeMPoolingLayer, DelfArcFaceModel, ArcFaceLayer
from utils.preprocessing import CreateDataset

In [2]:
FLAGS = flags.FLAGS

flags.DEFINE_string('train_directory', '/tmp/', 'Training data directory.')
flags.DEFINE_string('test_directory', None,
                    '(Optional) Testing data directory. Required only if '
                    'test_csv_path is not None.')
flags.DEFINE_string('output_directory', '/tmp/', 'Output data directory.')
flags.DEFINE_string('train_csv_path', '/tmp/train.csv',
                    'Training data csv file path.')
flags.DEFINE_string('test_csv_path', None,
                    '(Optional) Testing data csv file path. If None or absent,'
                    'TFRecords for the images in the test dataset are not'
                    'generated')
flags.DEFINE_integer('num_shards', 128, 'Number of shards in output data.')
flags.DEFINE_boolean('generate_train_validation_splits', False,
                     '(Optional) Whether to split the train dataset into'
                     'TRAIN and VALIDATION splits.')
flags.DEFINE_float('validation_split_size', 0.2,
                   '(Optional) The size of the VALIDATION split as a fraction'
                   'of the train dataset.')
flags.DEFINE_integer('seed', 0,
                     '(Optional) The seed to be used while shuffling the train'
                     'dataset when generating the TRAIN and VALIDATION splits.'
                     'Recommended for splits reproducibility purposes.')

In [3]:
EPOCHS = 10
image_size = 224
learning_rate = 1e-5  # should be smaller than training on single GPU
feature_size = 2048  # Embedding size before the output layer
save_interval = 1000

# ArcFace params
margin = 0.1  # DELG used 0.1, original ArcFace paper used 0.5. When margin is 0, it should be the same as doing a normal softmax but with embedding and weight normalised.
logit_scale = int(math.sqrt(feature_size))

# GeM params
gem_p = 3.
train_p = False  # whether to learn gem_p or not

data_dir = "/home/ubuntu/Dacon/jin/NIA"

In [4]:
checkpoint_dir = "/home/ubuntu/Dacon/jin/NIA/checkpoint/"
train_tf_records_dir = "/home/ubuntu/Dacon/jin/NIA/tfrecords/train-00000-of-00128"
test_tf_records_dir = "/home/ubuntu/Dacon/jin/NIA/tfrecords/validation-00000-of-00128"

strategy = tf.distribute.MirroredStrategy()

training_csv_path = os.path.join(data_dir, "train.csv")
train_csv = pd.read_csv(str(training_csv_path))
num_samples = len(train_csv["id"].tolist())
unique_landmark_ids = train_csv["landmark_id"].unique().tolist()
unique_landmark_ids = tf.convert_to_tensor(unique_landmark_ids, dtype=tf.int64)


with strategy.scope():

    training_set = CreateDataset(train_tf_records_dir, unique_landmark_ids)
    training_set = strategy.experimental_distribute_dataset(training_set)

    test_set = CreateDataset(test_tf_records_dir, unique_landmark_ids)
    test_set = strategy.experimental_distribute_dataset(test_set)

    train_iter = iter(training_set)
    validation_iter = iter(test_set)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [7]:
with strategy.scope():
    def train_step(images, labels):
        with tf.GradientTape() as tape:
            # training=True is only needed if there are layers with different
            # behavior during training versus inference (e.g. Dropout).
            predictions = model((images, labels), training=True)

            loss = compute_loss(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        train_loss(loss * strategy.num_replicas_in_sync)
        train_accuracy(labels, predictions)
        return loss


    @tf.function
    def distributed_train_steps(training_set_iter, steps_per_call):
        for _ in tf.range(steps_per_call):
            per_replica_losses = strategy.run(train_step, next(training_set_iter))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                           axis=None)


    def test_step(images, labels):
        # training=False is only needed if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        predictions = model(images, training=False)
        t_loss = loss_object(labels, predictions)

        test_loss(t_loss)
        test_accuracy(labels, predictions)


    @tf.function
    def distributed_test_step(images, labels):
        return strategy.run(test_step, args=(images, labels, ))    
    
    model = DelfArcFaceModel(
            input_shape=(image_size, image_size, 3), n_classes=len(unique_landmark_ids), margin=margin, logit_scale=logit_scale,
            p=gem_p, train_p=train_p, feature_size=feature_size
        )
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

    def compute_loss(labels, predictions):
        per_example_loss = loss_object(labels, predictions)
        return tf.nn.compute_average_loss(per_example_loss, global_batch_size=batch_size)
    
    
    for epoch in range(EPOCHS):
        # Reset the metrics at the start of the next epoch
        train_loss.reset_states()
        train_accuracy.reset_states()
        test_loss.reset_states()
        test_accuracy.reset_states()
        step = 0

        with tqdm(total=int(num_samples)*0.8) as pbar:
            while True:
                distributed_train_step(x, y)
                template = 'Epoch {}, Training, Loss: {:.4f}, Accuracy: {:.4f}'
                pbar.set_description(template.format(epoch + 1, train_loss.result(), train_accuracy.result() * 100))
                if step % save_interval == 0:
                    if step == 0:
                        model.summary()
                        print()
                        print("\nlearning rate: {}\nmargin: {}\nlogit_scale: {}\ngem_p: {}\ntrain_p{}\n".format(learning_rate, margin, logit_scale, gem_p, train_p))

                    checkpoint_path = str(os.path.join(checkpoint_dir, "cp_epoch_{}_step_{}".format(epoch, step)))
                    model.save_weights(checkpoint_path)
                    print("Model saved to {}".format(checkpoint_path))
                step += batch_size * STEPS_PER_TPU_CALL
                pbar.update(batch_size * STEPS_PER_TPU_CALL)
                if step >= int(num_samples)*0.8:
                    break

        with tqdm(total=int(num_samples)*0.2) as pbar:
            for test_images, test_labels in test_set:
                distributed_test_step(validation_iter, test_labels)
                template = 'Epoch {}, Validation, Loss: {:.4f}, Accuracy: {:.4f}'
                pbar.set_description(template.format(epoch + 1, test_loss.result(), test_accuracy.result() * 100))
                pbar.update(batch_size)

        template = 'Epoch {}, \nTraining Loss: {}, Accuracy: {}\nTest Loss: {}, Accuracy: {}'
        print(template.format(epoch + 1, train_loss.result(), train_accuracy.result() * 100, test_loss.result(), test_accuracy.result() * 100))

  0%|          | 0/23728.0 [00:00<?, ?it/s]

Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
conv1_bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1_conv[0][0]                 
___________________________________________________________________________________________




ValueError: in user code:

    <ipython-input-5-2bc346cf8aa0>:19 distributed_train_steps  *
        for _ in tf.range(steps_per_call):
    /home/ubuntu/anaconda3/envs/landmark/lib/python3.7/site-packages/tensorflow/python/autograph/operators/control_flow.py:338 for_stmt
        symbol_names, opts)
    /home/ubuntu/anaconda3/envs/landmark/lib/python3.7/site-packages/tensorflow/python/autograph/operators/control_flow.py:543 _tf_range_for_stmt
        opts)
    /home/ubuntu/anaconda3/envs/landmark/lib/python3.7/site-packages/tensorflow/python/autograph/operators/control_flow.py:876 _tf_while_stmt
        _verify_loop_init_vars(init_vars, symbol_names)
    /home/ubuntu/anaconda3/envs/landmark/lib/python3.7/site-packages/tensorflow/python/autograph/operators/control_flow.py:114 _verify_loop_init_vars
        raise ValueError("'{}' must be defined before the loop.".format(name))

    ValueError: 'per_replica_losses' must be defined before the loop.
