In [1]:
import os
import math
import json
import pickle
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow import keras
from keras_vggface.vggface import VGGFace

from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score
from tensorflow.keras.optimizers import RMSprop, SGD, Adam
from tensorflow.keras.applications import MobileNet, ResNet50, InceptionV3
from tensorflow.keras.applications.mobilenet import preprocess_input as mobilenet_preprocess
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.applications.inception_v3 import preprocess_input as inception_preprocess
from keras_vggface.utils import preprocess_input as vggface_preprocess
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing import image
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda, Conv1D, Attention, GlobalAveragePooling1D, BatchNormalization, Layer
from keras_facenet import FaceNet

random.seed(123)
tf.random.set_seed(12)
np.random.seed(123)

2021-09-24 23:19:16.461959: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Enable memory growth on every GPU that TensorFlow reports.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

2021-09-24 23:19:19.225728: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcuda.so.1
2021-09-24 23:19:19.287113: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-24 23:19:19.287460: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2070 SUPER computeCapability: 7.5
coreClock: 1.815GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s
2021-09-24 23:19:19.287486: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0
2021-09-24 23:19:19.316450: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcublas.so.11
2021-09-24 23:19:19.316523: I tensorflow/stream_execu

In [3]:
# Shape of a single network input; the first two entries are used as the image resize target.
input_shape = (224, 224, 3)
# Root directory that the person paths of every relation pair are joined onto.
train_path = './data/train'

# Generating training pairs

The training pairs available in the csv files are split into train and validation sets. Those pairs are positive (there is a blood relation). For each set (train/valid) we additionally generate negative pairs.

Positive pairs are generated according to the input csv file. For each person of a positive pair we create one negative pair.
In total we'll have twice as many negative pairs as positive pairs.

In [4]:
def make_image_pair(pair, input_shape, shuffle=True, slice_imgs=1):
    '''
    Lazily generate image pairs for two persons.

    Arguments:
    pair -- two person directories, relative to the module-level `train_path`
            (presumably familyID/personID -- confirm against the dataset layout)
    input_shape -- network input shape; only its first two entries are used
                   as the resize target of every loaded image
    shuffle -- when True, each person's image list is shuffled with `random.shuffle`
    slice_imgs -- maximum number of images taken per person

    Yields:
    (img1, img2) -- float32 arrays, one tuple for every combination of the
                    selected images (at most slice_imgs * slice_imgs tuples).
                    The same array object can appear in several tuples, so
                    consumers should treat the arrays as read-only.

    Raises:
    FileNotFoundError -- on first iteration, if a person directory does not exist
    '''
    p1, p2 = [os.path.join(train_path, p) for p in pair]

    # os.listdir returns entries in arbitrary order; sorting first makes the
    # selection reproducible (given the random seed when shuffle is True).
    p1_imgs = sorted(os.listdir(p1))
    p2_imgs = sorted(os.listdir(p2))

    if shuffle:
        random.shuffle(p1_imgs)
        random.shuffle(p2_imgs)

    p1_imgs = p1_imgs[:slice_imgs]
    p2_imgs = p2_imgs[:slice_imgs]

    target_size = (input_shape[0], input_shape[1])

    def load(directory, file_name):
        # Decode one image, resize it and convert it to a float32 array.
        img = image.load_img(os.path.join(directory, file_name), target_size=target_size)
        return np.array(img).astype('float32')

    # Each file is decoded at most once: the first image is loaded once per
    # outer iteration and the second images are cached across outer iterations.
    p2_cache = {}
    for name1 in p1_imgs:
        img1 = None
        for name2 in p2_imgs:
            if img1 is None:
                img1 = load(p1, name1)
            if name2 not in p2_cache:
                p2_cache[name2] = load(p2, name2)
            yield img1, p2_cache[name2]

In [5]:
def pairs_set(input_pairs, input_shape, shuffle=True, slice_imgs=1):
    """Yield `(image_pair, label)` for every labelled relation in `input_pairs`.

    Each element of `input_pairs` is unpacked into `(pair, label)`; the pair is
    expanded by `make_image_pair` and every resulting image pair is emitted
    together with the relation's label.

    A relation whose expansion raises KeyError or FileNotFoundError is skipped;
    anything already yielded for that relation is kept.
    """
    for persons, label in input_pairs:
        try:
            for image_pair in make_image_pair(persons, input_shape, shuffle, slice_imgs):
                yield image_pair, label
        except (KeyError, FileNotFoundError):
            # Move on to the next relation.
            pass

def batched_pairs(input_pairs, batch_size, dataset_period, input_shape, preprocess, shuffle=True, slice_imgs=1):
    """Group the samples produced by `pairs_set` into fixed-size batches.

    Only every `dataset_period`-th sample is kept. Each yielded item is
    `(inputs, labels)`, where `inputs` is a dict with keys 'input_1' and
    'input_2' holding the outputs of `preprocess` on the stacked images, and
    `labels` is a float array. Samples left over once the stream ends, fewer
    than `batch_size`, are not yielded.
    """
    first_imgs, second_imgs, targets = [], [], []
    seen = 0
    stream = pairs_set(input_pairs, input_shape, shuffle, slice_imgs)
    for sample in stream:
        seen += 1
        # Sub-sample the stream: keep only every dataset_period-th element.
        if seen % dataset_period != 0:
            continue

        (first, second), target = sample
        first_imgs.append(first)
        second_imgs.append(second)
        targets.append(target)

        if len(targets) != batch_size:
            continue

        inputs = {
            'input_1': preprocess(np.array(first_imgs)),
            'input_2': preprocess(np.array(second_imgs)),
        }
        yield inputs, np.array(targets).astype(float)
        first_imgs, second_imgs, targets = [], [], []

In [6]:
# Load the pre-computed relation lists.
with open('train_val_set.json', 'r') as f:
    train_val_set = json.load(f)

# The four lists are taken in the order in which the JSON object stores them.
train_rlt_list, neg_train_rltshps, valid_rlt_list, neg_valid_rltshps = train_val_set.values()
# Repeat the positive training relations four times (new list; the loaded data is untouched).
train_rlt_list = 4 * train_rlt_list

# Attach labels: True for the positive lists, False for the negative lists.
train_rlts = [(rlt, True) for rlt in train_rlt_list] + [(rlt, False) for rlt in neg_train_rltshps]
val_rlts = [(rlt, True) for rlt in valid_rlt_list] + [(rlt, False) for rlt in neg_valid_rltshps]

for labelled_rlts in (train_rlts, val_rlts):
    random.shuffle(labelled_rlts)

# Siamese network

Initial experiments are done with a Conv1D deep neural network; as an additional option for experimenting there is a simple attention module.

In [7]:
def mobilenet(input_shape, l2_value, dropout):
    """Build an embedding network on top of an ImageNet-initialised MobileNet.

    The backbone (alpha=1, no top, global average pooling) is followed by a
    512-unit ReLU Dense layer, and the result is L2-normalised along axis 1.
    All backbone layers are marked trainable.

    NOTE(review): `kernel_regularizer` is assigned on layers that are already
    constructed; this presumably does not register a regularisation loss --
    confirm by inspecting `model.losses`.
    """
    backbone = MobileNet(
        weights='imagenet',
        alpha=1.,
        pooling='avg',
        include_top=False,
        dropout=dropout,
        input_shape=input_shape,
    )

    for backbone_layer in backbone.layers:
        backbone_layer.trainable = True
        if hasattr(backbone_layer, 'kernel_regularizer'):
            backbone_layer.kernel_regularizer = keras.regularizers.l2(l2_value)

    embedding = Dense(512, activation='relu', kernel_regularizer=l2(l2_value))(backbone.output)
    embedding = Lambda(lambda t: K.l2_normalize(t, axis=1))(embedding)
    return Model(backbone.input, embedding)

def inception(input_shape, l2_value, dropout):
    """Build an embedding network on top of an ImageNet-initialised InceptionV3.

    The backbone (no top, global average pooling) is followed by a 128-unit
    ReLU Dense layer, and the result is L2-normalised along axis 1. All
    backbone layers are marked trainable. `dropout` is accepted for a uniform
    builder signature but is not used here.

    NOTE(review): `kernel_regularizer` is assigned on layers that are already
    constructed; this presumably does not register a regularisation loss --
    confirm by inspecting `model.losses`.
    """
    backbone = InceptionV3(
        weights='imagenet',
        pooling='avg',
        include_top=False,
        input_shape=input_shape,
    )

    for backbone_layer in backbone.layers:
        backbone_layer.trainable = True
        if hasattr(backbone_layer, 'kernel_regularizer'):
            backbone_layer.kernel_regularizer = keras.regularizers.l2(l2_value)

    embedding = Dense(128, activation='relu', kernel_regularizer=l2(l2_value))(backbone.output)
    embedding = Lambda(lambda t: K.l2_normalize(t, axis=1))(embedding)
    return Model(backbone.input, embedding)

def resnet50(input_shape, l2_value, dropout):
    """Build an embedding network on top of an ImageNet-initialised ResNet50.

    The backbone (no top, global average pooling) is followed by a 32-unit
    ReLU Dense layer, and the result is L2-normalised along axis 1. All
    backbone layers are marked trainable. `dropout` is accepted for a uniform
    builder signature but is not used here.

    NOTE(review): `kernel_regularizer` is assigned on layers that are already
    constructed; this presumably does not register a regularisation loss --
    confirm by inspecting `model.losses`.
    """
    backbone = ResNet50(
        weights='imagenet',
        pooling='avg',
        include_top=False,
        input_shape=input_shape,
    )

    for backbone_layer in backbone.layers:
        backbone_layer.trainable = True
        if hasattr(backbone_layer, 'kernel_regularizer'):
            backbone_layer.kernel_regularizer = keras.regularizers.l2(l2_value)

    embedding = Dense(32, activation='relu', kernel_regularizer=l2(l2_value))(backbone.output)
    embedding = Lambda(lambda t: K.l2_normalize(t, axis=1))(embedding)
    return Model(backbone.input, embedding)

def vggface_resnet50(input_shape, l2_value, dropout):
    """Build an embedding network from the VGGFace ResNet50 model.

    The output of the backbone's 'avg_pool' layer is flattened and
    L2-normalised along axis 1; no extra Dense layer is added. All backbone
    layers are marked trainable. `dropout` is accepted for a uniform builder
    signature but is not used here.

    NOTE(review): `kernel_regularizer` is assigned on layers that are already
    constructed; this presumably does not register a regularisation loss --
    confirm by inspecting `model.losses`.
    """
    backbone = VGGFace(model='resnet50', include_top=False, input_shape=input_shape)

    for backbone_layer in backbone.layers:
        backbone_layer.trainable = True
        if hasattr(backbone_layer, 'kernel_regularizer'):
            backbone_layer.kernel_regularizer = keras.regularizers.l2(l2_value)

    pooled = backbone.get_layer('avg_pool').output
    embedding = Flatten(name='flatten')(pooled)
    embedding = Lambda(lambda t: K.l2_normalize(t, axis=1))(embedding)
    return Model(backbone.input, embedding)

# Loss and metrics functions

In [8]:
# Base distance margin; also used as the classification threshold during validation.
MARGIN = 0.28

# Separate margins for positive and negative pairs: positives are pushed below
# 80% of MARGIN and negatives above 120% of it.
margin_pos, margin_neg = 0.8 * MARGIN, 1.2 * MARGIN

def euclidean_distance(vectors):
    """Row-wise Euclidean distance between the two tensors in `vectors`.

    The squared distance is floored at `K.epsilon()` before the square root is
    taken, so the result is never exactly zero.
    """
    left, right = vectors
    squared_dist = K.sum(K.square(left - right), axis=1)
    floored = K.maximum(squared_dist, K.epsilon())
    return K.sqrt(floored)

def cosine_distance(vectors):
    """Row-wise cosine distance (1 - cosine similarity) between two tensors.

    `K.epsilon()` is added to the product of the norms to avoid division by zero.
    """
    left, right = vectors
    left_norm = tf.norm(left, axis=1)
    right_norm = tf.norm(right, axis=1)
    # Row-wise dot product.
    dot = tf.einsum('ij,ij->i', left, right)
    similarity = dot / (left_norm * right_norm + K.epsilon())
    return 1. - similarity

def cos_euc_dist(vectors):
    """Euclidean distance scaled by (1 - cosine distance) of the same pair of tensors."""
    euclid = euclidean_distance(vectors)
    cosine = cosine_distance(vectors)
    scale = 1. - cosine
    return scale * euclid

def eucl_dist_output_shape(shapes):
    """Return `(batch, 1)`, taking the batch dimension from the first of two input shapes."""
    first_shape, _ = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

def contrastive_loss(y_true, y_pred):
    """Weighted double-margin contrastive loss plus a small spread penalty.

    `y_pred` is a predicted distance; `y_true` is 1 for related pairs and 0
    otherwise. Related pairs are penalised for distances above `margin_pos`
    (weight 1.4) and unrelated pairs for distances below `margin_neg`
    (weight 1.0). The spread term, scaled by 0.1, is the mean squared
    deviation of the masked distances from their batch mean.

    NOTE(review): the masked tensors keep zeros at the positions of the other
    class, so those zeros enter both the means and the deviations -- confirm
    that this is intended.
    """
    weight_pos, weight_neg = 1.4, 1.

    # Distances masked by class membership.
    related_dist = y_true * y_pred
    unrelated_dist = (1 - y_true) * y_pred

    related_mean = K.mean(related_dist, axis=0)
    unrelated_mean = K.mean(unrelated_dist, axis=0)

    related_spread = K.mean(K.square(related_mean - related_dist))
    unrelated_spread = K.mean(K.square(unrelated_mean - unrelated_dist))
    variance_loss = 0.5 * (related_spread + unrelated_spread)

    # Hinge terms: too far for a related pair / too close for an unrelated pair.
    too_far = K.square(K.maximum(y_pred - margin_pos, 0))
    too_close = K.square(K.maximum(margin_neg - y_pred, 0))

    related_term = y_true * weight_pos * too_far
    unrelated_term = (1 - y_true) * weight_neg * too_close

    return K.mean(related_term + unrelated_term) + 0.1 * variance_loss

# Run training

In [9]:
# Optimizer learning rate.
learning_rate = 8e-7
# L2 factor and dropout rate handed to the backbone builder.
l2_value = 1e-9
dropout = 0.
# Number of epochs requested from model.fit.
epochs = 1000
# Batch sizes for training and for the per-epoch validation pass.
batch_size = 24
eval_batch_size = 128
# Keep every n-th generated sample (1 keeps all) for training / validation.
dataset_period = 1
eval_dataset_period = 1
# Name of the backbone builder function defined in this notebook.
model_type = 'vggface_resnet50'
# Image preprocessing function applied to every batch.
preprocess = vggface_preprocess
# Name of a distance function defined in this notebook:
# 'euclidean_distance', 'cosine_distance' or 'cos_euc_dist'
distance_type = 'cosine_distance'
# Name of the optimizer class (RMSprop, SGD and Adam are imported).
optimizer = 'Adam'

# Learning rate scheduler
def scheduler(epoch, lr):
    """Return the learning rate for `epoch`.

    At a fixed set of milestone epochs the incoming rate is multiplied by the
    factor registered for that epoch; at every other epoch it is returned
    unchanged.
    """
    milestone_factors = {
        10: 0.5,
        180: 0.8,
        250: 0.5,
        300: 0.7,
        400: 0.3,
        700: 0.6,
    }
    factor = milestone_factors.get(epoch)
    return lr if factor is None else factor * lr
    
# Keras callback wrapping `scheduler`. It is not included in the callbacks
# list of the model.fit call in this notebook, so the schedule is inactive
# unless it is added there.
lr_callback = LearningRateScheduler(scheduler)

# Snapshot the hyper-parameters above into a dict so the configuration can be
# saved next to the checkpoints. Values are looked up by variable name.
config_fields = (
    'learning_rate',
    'l2_value',
    'dropout',
    'epochs',
    'batch_size',
    'model_type',
    'dataset_period',
    'eval_dataset_period',
    'distance_type',
    'optimizer',
    'MARGIN',
)
train_config = {}
for name in config_fields:
    train_config[name] = globals()[name]

In [10]:
# Random augmentations, applied in this order: horizontal flip, translation,
# contrast change and zoom.
aug_layers = keras.layers.experimental.preprocessing
data_augmentation = keras.Sequential([
    aug_layers.RandomFlip("horizontal"),
    aug_layers.RandomTranslation(height_factor=0.2, width_factor=0.2),
    aug_layers.RandomContrast(factor=0.2),
    aug_layers.RandomZoom(height_factor=0.2),
])

2021-09-24 23:19:19.579357: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-24 23:19:19.579635: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2070 SUPER computeCapability: 7.5
coreClock: 1.815GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s
2021-09-24 23:19:19.579735: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-24 23:19:19.579984: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-24 23:19:19.580196: I tensorflow/c

In [11]:
# Resolve the backbone builder through an explicit registry instead of eval(),
# so that a typo in `model_type` produces a clear error and no arbitrary
# expression is evaluated.
model_builders = {
    'mobilenet': mobilenet,
    'inception': inception,
    'resnet50': resnet50,
    'vggface_resnet50': vggface_resnet50,
}
if model_type not in model_builders:
    raise ValueError(
        f'Unknown model_type {model_type!r}; expected one of {sorted(model_builders)}'
    )
base_network = model_builders[model_type](input_shape, l2_value, dropout)
# base_network.load_weights('pretrained/checkpoints/model_001_mobile_512/weights.609.hdf5')
base_network.count_params()

23561152

In [12]:
# Creation of the Siamese network: two image inputs share one embedding network.
# The input names match the dict keys produced by `batched_pairs`.
input1 = Input(shape=input_shape, name='input_1')
input2 = Input(shape=input_shape, name='input_2')

# Each branch runs through the augmentation pipeline and then the shared
# `base_network`, so both branches use the same weights.
processed1 = base_network(data_augmentation(input1))
processed2 = base_network(data_augmentation(input2))



In [13]:
# Resolve the configured distance by name through an explicit table instead of eval().
distance_functions = {
    'euclidean_distance': euclidean_distance,
    'cosine_distance': cosine_distance,
    'cos_euc_dist': cos_euc_dist,
}
if distance_type not in distance_functions:
    raise ValueError(
        f'Unknown distance_type {distance_type!r}; expected one of {sorted(distance_functions)}'
    )
dist_function = distance_functions[distance_type]
distance = Lambda(dist_function,
                  output_shape=eucl_dist_output_shape)([processed1, processed2])

model = Model([input1, input2], distance)

# `optimizer` starts out as a class name and is rebound to an optimizer
# instance below. The name is therefore read from `train_config`, which keeps
# the original string, so this cell can be re-run without failing.
optimizer_classes = {'RMSprop': RMSprop, 'SGD': SGD, 'Adam': Adam}
optimizer_name = train_config['optimizer']
if optimizer_name not in optimizer_classes:
    raise ValueError(
        f'Unknown optimizer {optimizer_name!r}; expected one of {sorted(optimizer_classes)}'
    )
optimizer = optimizer_classes[optimizer_name](learning_rate=learning_rate)
model.compile(loss=contrastive_loss, optimizer=optimizer)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 224, 224, 3)  0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
model (Functional)              (None, 2048)         23561152    sequential[0][0]           

## Run the TensorBoard plugin to track training progress

In [14]:
# Load the TensorBoard notebook extension
%reload_ext tensorboard

In [15]:
%tensorboard --logdir=./logs --port=7007

Reusing TensorBoard on port 7007 (pid 773877), started 3 days, 2:04:05 ago. (Use '!kill 773877' to kill it.)

## Training Callbacks

In [16]:
def _count_listable_pairs(rlts):
    """Count the labelled pairs whose two person directories can both be listed."""
    found = 0
    for persons, _ in rlts:
        try:
            first_dir, second_dir = [os.path.join(train_path, p) for p in persons]
            os.listdir(first_dir)
            os.listdir(second_dir)
        except FileNotFoundError:
            continue
        found += 1
    return found


# Training set length: one sample is counted per available pair.
train_len = _count_listable_pairs(train_rlts) // dataset_period

# Validation set length: 2 * 2 samples are counted per available pair.
val_len = (2 * 2 * _count_listable_pairs(val_rlts)) // eval_dataset_period

print(f'Train set length: {train_len}')
print(f'Valid set length: {val_len}')

Train set length: 19429
Valid set length: 8084


In [17]:
def val_distance_stats(predictions, labels):
    """Mean and standard deviation of predicted distances, split by class.

    Arguments:
    predictions -- numpy array of predicted distances
    labels -- numpy array of 0/1 labels aligned with `predictions`

    Returns:
    (pos_mean, pos_std, neg_mean, neg_std) -- statistics of the distances
    whose label is 1, followed by those whose label is 0
    """
    # The builtin `bool` is used as dtype: the `np.bool` alias was removed
    # from NumPy and raises AttributeError on current releases.
    val_pos = predictions[labels.astype(bool)]
    val_neg = predictions[(1 - labels).astype(bool)]
    val_pos_m, val_pos_s = np.mean(val_pos), np.std(val_pos)
    val_neg_m, val_neg_s = np.mean(val_neg), np.std(val_neg)

    return val_pos_m, val_pos_s, val_neg_m, val_neg_s

# Distance interval that is mapped onto probabilities: it extends from MARGIN
# by twice the gap to each class margin, limited to the range [0, 2].
lower_lim = max(MARGIN - 2 * (MARGIN - margin_pos), 0.)
upper_lim = min(MARGIN + 2 * (margin_neg - MARGIN), 2.)

def dist_to_prob(predictions):
    """Map distances linearly onto [0, 1]: `lower_lim` and below give 1, `upper_lim` and above give 0."""
    clipped = np.clip(predictions, lower_lim, upper_lim)
    scaled = (clipped - lower_lim) / (upper_lim - lower_lim)
    return 1 - scaled
  
class MetricCallback(keras.callbacks.Callback):
    """Evaluate the model on the validation pairs after every epoch.

    At the end of each epoch the callback predicts distances for `val_rlts`,
    computes loss, per-class precision/recall, accuracy, ROC AUC and distance
    statistics, and writes them to TensorBoard. It also stores 'val_loss' and
    'val_roc_auc' in the epoch `logs`, so callbacks that come after it in the
    callbacks list can monitor those keys.
    """

    def __init__(self, logdir):
        """Create TensorBoard writers under `logdir`/train and `logdir`/valid."""
        # Zero-argument super() so that Callback.__init__ actually runs;
        # super(Callback, self) would start the lookup after Callback and skip it.
        super().__init__()
        os.makedirs(logdir, exist_ok=True)
        self.train_writer = tf.summary.create_file_writer(logdir + '/train')
        self.valid_writer = tf.summary.create_file_writer(logdir + '/valid')
        # Label value -> tag suffix used in the TensorBoard scalar names.
        self.class_encoded = {
            0: 'not_related',
            1: 'related'
        }

    def tb_writer(self, items_to_write, wtype, epoch):
        """Write every `name: value` of `items_to_write` as a scalar at step `epoch`.

        `wtype` equal to 'train' selects the train writer; any other value
        selects the validation writer.
        """
        writer = self.train_writer if wtype == 'train' else self.valid_writer

        with writer.as_default():
            for name, value in items_to_write.items():
                tf.summary.scalar(name, value, epoch)
            writer.flush()

    def on_epoch_end(self, epoch, logs=None):
        """Run the validation pass and publish its metrics."""
        # Avoid a shared mutable default: this method writes into `logs`.
        if logs is None:
            logs = {}

        val_true = []
        val_pred = []
        batches = batched_pairs(
            val_rlts,
            eval_batch_size,
            eval_dataset_period,
            input_shape,
            preprocess,
            shuffle=False,
            slice_imgs=2
        )

        for inputs, labels in batches:
            val_pred.append(self.model.predict(inputs))
            val_true.extend(list(labels))

        val_true = np.array(val_true)
        # Flatten to 1-D; reshape keeps a single prediction as a length-1 array.
        val_pred = np.concatenate(val_pred, axis=0).reshape(-1)
        # Convert the loss to a plain float so that monitoring callbacks and
        # the summary writer receive a number rather than a tensor.
        val_loss = float(K.get_value(
            contrastive_loss(K.constant(val_true), K.constant(val_pred))
        ))

        val_true = val_true.astype(int)
        val_pos_m, val_pos_s, val_neg_m, val_neg_s = val_distance_stats(val_pred, val_true)
        threshold = MARGIN

        # Precision and recall: a pair is classified as related when its
        # distance is below the threshold.
        val_cls = (val_pred < threshold).astype(int)
        val_precision, val_recall, _, _ = precision_recall_fscore_support(val_true, val_cls, labels=[0, 1])
        val_accuracy = accuracy_score(val_true, val_cls)

        # Area under ROC
        val_probs = dist_to_prob(val_pred)
        val_roc_auc = roc_auc_score(val_true, val_probs)

        if 'loss' in logs:
            self.tb_writer({'train/loss': logs['loss']}, wtype='train', epoch=epoch)

        tb_logs = {}
        tb_logs['valid/loss'] = val_loss
        logs['val_loss'] = val_loss
        for k, v in self.class_encoded.items():
            tb_logs['valid/precision/' + v] = val_precision[k]
            tb_logs['valid/recall/' + v] = val_recall[k]
            tb_logs['valid/dist_mean/' + v] = val_pos_m if k else val_neg_m
            tb_logs['valid/dist_std/' + v] = val_pos_s if k else val_neg_s

        tb_logs['valid/accuracy'] = val_accuracy
        logs['val_roc_auc'] = val_roc_auc
        tb_logs['valid/roc_auc'] = val_roc_auc

        self.tb_writer(tb_logs, wtype='valid', epoch=epoch)

In [18]:
model_name = 'vggface_res50_2048_cos_005'

# Save training configuration. The target directory is created first;
# otherwise open() fails on a fresh checkout.
os.makedirs('configs', exist_ok=True)
with open(f'configs/{model_name}.json', 'w') as f:
    json.dump(train_config, f)

logdir = os.path.join('logs', model_name)
ckpt_dir = os.path.join('checkpoints', model_name)
os.makedirs(ckpt_dir, exist_ok=True)

# Created here but not included in the callbacks list of the model.fit call in this notebook.
tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)
# Keeps only the best model according to the 'val_roc_auc' entry that
# MetricCallback writes into the epoch logs.
ckpt_callback = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(ckpt_dir, 'model.hdf5'),
    monitor='val_roc_auc',
    save_best_only=True,
    mode='max',
    save_weights_only=False,
    verbose=1,
)
metric_callback = MetricCallback(logdir)

# Monitors the 'val_loss' entry that MetricCallback writes into the epoch logs.
reduce_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.4,
    patience=10,
    verbose=1,
    min_delta=1e-5,
    min_lr = 5e-9
)

2021-09-24 23:19:23.430412: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2021-09-24 23:19:23.430431: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.
2021-09-24 23:19:23.431296: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1611] Profiler found 1 GPUs
2021-09-24 23:19:23.447961: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcupti.so.11.4
2021-09-24 23:19:23.554701: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2021-09-24 23:19:23.554858: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1744] CUPTI activity buffer flushed


In [19]:
def repeat_generator(rlts, batch_size, dataset_period, input_shape, preprocess):
    """Yield batches from `batched_pairs` endlessly, restarting it after every full pass."""
    while True:
        yield from batched_pairs(rlts, batch_size, dataset_period, input_shape, preprocess)
            
# Train on the endlessly repeating batch generator; steps_per_epoch bounds
# each epoch. metric_callback comes first in the list: it writes 'val_loss'
# and 'val_roc_auc' into the epoch logs, which ckpt_callback and
# reduce_on_plateau monitor. lr_callback and tensorboard_callback are not
# passed here.
model.fit(
    repeat_generator(train_rlts, batch_size, dataset_period, input_shape, preprocess),
    epochs=epochs,
    steps_per_epoch=train_len//batch_size,
    callbacks=[metric_callback, ckpt_callback, reduce_on_plateau]
)

Epoch 1/1000


2021-09-24 23:19:26.532213: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-09-24 23:19:26.552904: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3600000000 Hz




2021-09-24 23:19:31.438564: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudnn.so.8
2021-09-24 23:19:32.594049: I tensorflow/stream_executor/cuda/cuda_dnn.cc:380] Loaded cuDNN version 8202
2021-09-24 23:19:33.966891: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcublas.so.11
2021-09-24 23:19:35.353791: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcublasLt.so.11



Epoch 00001: val_roc_auc improved from -inf to 0.55227, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 2/1000

Epoch 00002: val_roc_auc improved from 0.55227 to 0.76979, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 3/1000

Epoch 00003: val_roc_auc improved from 0.76979 to 0.79424, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 4/1000

Epoch 00004: val_roc_auc improved from 0.79424 to 0.80903, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 5/1000

Epoch 00005: val_roc_auc improved from 0.80903 to 0.81587, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 6/1000

Epoch 00006: val_roc_auc improved from 0.81587 to 0.82312, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 7/1000

Epoch 00007: val_roc_auc improved from 0.82312 to 0.82791, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 8/1000

Epoch 00008: val_roc_auc improved from 0.82791 to 0.83191, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 9/1000

Epoch 00009: val_roc_auc improved from 0.83191 to 0.83406, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 10/1000

Epoch 00010: val_roc_auc improved from 0.83406 to 0.83832, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 11/1000

Epoch 00011: val_roc_auc did not improve from 0.83832
Epoch 12/1000

Epoch 00012: val_roc_auc improved from 0.83832 to 0.83912, saving model to checkpoints/vggface_res50_2048_cos_005/model.hdf5




Epoch 13/1000
 99/809 [==>...........................] - ETA: 3:15 - loss: 0.0042

KeyboardInterrupt: 

# Submission

In [None]:
# Load the submission pairs; the columns read later are 'img_pair' and 'is_related'.
submission_path = 'data/sample_submission.csv'
submission_df = pd.read_csv(submission_path)

In [None]:
# Load trained weights into the model built above.
# NOTE(review): this path names a different run ('vggface_res50_512_cos_002')
# than the `model_name` used in the training cell
# ('vggface_res50_2048_cos_005') -- confirm the checkpoint matches the
# architecture built in this notebook.
ckpt_path = 'checkpoints/vggface_res50_512_cos_002/model.hdf5'
model.load_weights(ckpt_path)
# model = keras.models.load_model(ckpt_path, custom_objects={'contrastive_loss': contrastive_loss})

In [None]:
# Score every submission pair: predict the distance between the two test
# images and convert it into a relatedness probability.
submission_df = submission_df.astype({'is_related': 'float'})
predictions = []
# Resize to the spatial size the network was built with, as the training loader does.
target_size = (input_shape[0], input_shape[1])
for idx, row in submission_df.iterrows():
    # Load images
    img1_name, img2_name = row['img_pair'].split('-')
    img1_path = os.path.join('data/test', img1_name)
    img2_path = os.path.join('data/test', img2_name)
    img1 = image.load_img(img1_path, target_size=target_size)
    img2 = image.load_img(img2_path, target_size=target_size)
    img1 = preprocess(np.array(img1).astype('float32'))
    img2 = preprocess(np.array(img2).astype('float32'))
    img1 = np.expand_dims(img1, 0)
    img2 = np.expand_dims(img2, 0)

    # Do an inference, and calculate probability according to distance
    y_pred = model.predict({'input_1': img1, 'input_2': img2})
    y_prob = dist_to_prob(y_pred.squeeze())
    predictions.append(y_prob)

    # Print step
    if idx % 100 == 0:
        print(f'Processed rows: {idx}')

# Assign the whole column on the DataFrame itself. Writing through a Series
# taken from the frame (chained assignment) does not reliably update the
# frame, so the saved file could miss the predictions.
submission_df['is_related'] = np.asarray(predictions, dtype=float)
is_related = submission_df['is_related']

submission_df.to_csv('submission.csv', index=False)

In [None]:
# Histogram of the predicted probabilities (1000 bins) with dense, rotated x ticks.
fig, ax = plt.subplots(figsize=(30, 8))
ax.hist(predictions, 1000)
ax.locator_params(axis='x', nbins=100)
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Rebuild the submission from the sample file and mark selected rows with 1.
submission_df = pd.read_csv(submission_path)
# print(is_related.sum())
# NOTE(review): `predictions` holds the outputs of dist_to_prob, so this marks
# rows whose score is BELOW 0.149 -- confirm that the comparison direction
# and the threshold value are intended.
flagged_rows = [i for i, pred in enumerate(predictions) if pred < 0.149]
# Write through .loc on the DataFrame; assigning into a Series taken from the
# frame (chained assignment) does not reliably update the frame.
submission_df.loc[flagged_rows, 'is_related'] = 1
is_related = submission_df['is_related']
submission_df.to_csv('submission.csv', index=False)