In [1]:
import os
import json
import pickle
import keras
import pandas as pd
import numpy as np
import tensorflow as tf

from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from keras.optimizers import RMSprop, SGD, Adam
from keras.regularizers import l2
from keras.preprocessing import image
from keras import backend as K
from keras.models import Model
from keras.callbacks import Callback, LearningRateScheduler
from keras.layers import Input, Flatten, Dense, Dropout, Lambda, Conv1D, Attention, GlobalAveragePooling1D, BatchNormalization
from keras_facenet import FaceNet

tf.random.set_seed(12)
np.random.seed(123)

In [2]:
# Turn on memory growth for every GPU TensorFlow can see, so the notebook
# does not ask for memory growth to be disabled on any device it trains on.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# The input embeddings

The input pickle file stores a dictionary that maps every person to the list of embeddings of their images:
```
{
    'FAMILY_ID/PERSON_ID': [EMB_1, EMB_2, ..., EMB_N],
    .
    .
    .
}
```

In [3]:
# Load the precomputed embeddings dictionary from disk.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that come from a trusted source.
with open('data/train_img_embeddings.pkl', 'rb') as f:
       train_embeddings = pickle.load(f)
print(f'The keys examples: {list(train_embeddings.keys())[:5]}')

# Shape of a single embedding: the first embedding of the first dictionary entry.
embedding_shape = list(train_embeddings.values())[0][0].shape
print(f'Embeddings shape: {embedding_shape}')

The keys examples: ['F0002\\MID1', 'F0002\\MID2', 'F0002\\MID3', 'F0005\\MID1', 'F0005\\MID2']
Embeddings shape: (512,)


# Generating training pairs

The training pairs available in the csv file are split into training and validation sets. These pairs are positive (there is a blood relation). For each set (train/valid) we additionally generate negative pairs.

Positive pairs are taken from the input csv file. For each person in a positive pair we create one negative pair.
In total we'll have twice as many negative pairs as positive pairs.

In [4]:
def create_negative_paris(train_rltshps):
    '''
    Create negative pairs: for each person of a positive pair create one negative
    pair by picking a random person with whom they are not in a relationship.

    Arguments:
    train_rltshps -- pandas DataFrame with columns 'p1' and 'p2', one positive pair per row

    Returns:
    pandas DataFrame with columns 'p1' and 'p2' holding two negative pairs per
    input row. 'p1' is the person taken from the positive pair (first the row's
    p1, then its p2) and 'p2' is the randomly drawn unrelated person.

    Raises:
    ValueError -- if some person has no remaining valid partner (every other
                  person is either related to them or already paired with them)
    '''
    # Keep the (p1, p2) order of the csv. Round-tripping through a set would make
    # the order depend on string hashing, and would collapse a self-pair into a
    # single element.
    positive_pairs = list(zip(train_rltshps['p1'], train_rltshps['p2']))

    # Order-preserving de-duplication, so a person present in both columns is
    # not sampled twice as often as everybody else.
    all_persons = list(dict.fromkeys(
        train_rltshps['p1'].tolist() + train_rltshps['p2'].tolist()))
    n = len(all_persons)

    # Hash sets give O(1) membership checks; frozenset makes pairs unordered.
    positive_keys = {frozenset(pair) for pair in positive_pairs}
    negative_keys = set()
    negative_rltshps = []

    def is_valid(person, candidate):
        key = frozenset((person, candidate))
        return (candidate != person
                and key not in positive_keys
                and key not in negative_keys)

    def sample_negative(person):
        # Rejection sampling is cheap because almost every candidate is valid.
        for _ in range(max(100, 10 * n)):
            candidate = all_persons[np.random.randint(n)]
            if is_valid(person, candidate):
                return candidate
        # Degenerate data: enumerate the candidates instead of looping forever.
        candidates = [c for c in all_persons if is_valid(person, c)]
        if not candidates:
            raise ValueError(f'No valid negative partner left for {person!r}.')
        return candidates[np.random.randint(len(candidates))]

    for p1, p2 in positive_pairs:
        for person in (p1, p2):
            negative_sample = sample_negative(person)
            negative_keys.add(frozenset((person, negative_sample)))
            negative_rltshps.append((person, negative_sample))

    return pd.DataFrame(negative_rltshps, columns=['p1', 'p2'])

In [5]:
def make_pairs(p1, p2):
    '''
    Build every combination of embeddings of two persons.

    Arguments:
    p1, p2 -- paths to persons' images directories (familyID/personID)

    Returns:
    pairs -- list of [embedding_of_p1, embedding_of_p2] entries, one for each
             combination of an embedding of p1 with an embedding of p2; every
             embedding gets a trailing axis of size 1

    Raises:
    KeyError -- if one of the persons is missing in train_embeddings
    '''
    # The embeddings dictionary is keyed with the other path separator,
    # so convert the csv-style path before the lookup.
    key1 = p1.replace('/', '\\')
    key2 = p2.replace('/', '\\')

    first_person = np.expand_dims(train_embeddings[key1], axis=-1)
    second_person = np.expand_dims(train_embeddings[key2], axis=-1)

    return [[emb1, emb2] for emb1 in first_person for emb2 in second_person]

In [6]:
def pairs_set(input_pairs, positive=True):
    '''
    Expand person-level relationships into embedding pairs with labels.

    Arguments:
    input_pairs -- pandas DataFrame with pair paths (columns 'p1' and 'p2')
    positive -- if pair is positive (persons are related)

    Returns:
    pairs -- list of pairs of embeddings
    labels -- one label per pair, 1. for positive (in blood relation) and 0. for negative

    Rows that raise a KeyError are skipped; the number of skipped rows is printed.
    '''
    label = 1. if positive else 0.
    all_pairs, all_labels = [], []
    skipped = 0

    for _, relation in input_pairs.iterrows():
        try:
            person_pairs = make_pairs(relation['p1'], relation['p2'])
        except KeyError:
            skipped += 1
            continue
        all_pairs.extend(person_pairs)
        all_labels.extend([label] * len(person_pairs))

    print(f'\nThere are {skipped} key errors of relationships.')
    return all_pairs, all_labels

In [7]:
# Read relatives' pairs: every row is one positive (related) pair whose
# p1/p2 columns hold 'FAMILY_ID/PERSON_ID' paths.
train_rltshps = pd.read_csv("data/train_relationships.csv")
train_rltshps.head()

Unnamed: 0,p1,p2
0,F0002/MID1,F0002/MID3
1,F0002/MID2,F0002/MID3
2,F0005/MID1,F0005/MID2
3,F0005/MID3,F0005/MID2
4,F0009/MID1,F0009/MID4


In [8]:
# Create negative relationships: two per positive pair, one for each person of the pair.
# They are generated from the full positive set, before the train/validation split.
negative_rltshps = create_negative_paris(train_rltshps)

In [9]:
# Shuffle rows in pandas DataFrame.
# A fixed random_state keeps the shuffle repeatable; reset_index(drop=True)
# renumbers the rows so the positional split in the next cell starts at 0.
train_rltshps = train_rltshps.sample(frac=1, random_state=123).reset_index(drop=True)
negative_rltshps = negative_rltshps.sample(frac=1, random_state=123).reset_index(drop=True)

In [10]:
# Create training and validation sets
# Split positive pairs: the first VAL_FACTOR share of the shuffled rows is
# used for validation, the remainder for training.
# NOTE(review): train_rltshps is overwritten with its own tail, so running
# this cell a second time shrinks the training set again.
VAL_FACTOR = 0.12
val_threshold = int(len(train_rltshps.index) * VAL_FACTOR)
val_rltshps = train_rltshps.iloc[:val_threshold]
train_rltshps = train_rltshps.iloc[val_threshold:]
print(train_rltshps.shape)
print(val_rltshps.shape)

# Split negative pairs with the same fraction.
# NOTE(review): positives and negatives are split independently, so the same
# person may appear on both sides of the split - confirm this is acceptable.
val_threshold = int(len(negative_rltshps.index) * VAL_FACTOR)
val_neg_rltshps = negative_rltshps.iloc[:val_threshold]
train_neg_rltshps = negative_rltshps.iloc[val_threshold:]
print(train_neg_rltshps.shape)
print(val_neg_rltshps.shape)

(3167, 2)
(431, 2)
(6333, 2)
(863, 2)


In [11]:
# Load embeddings for the split data: positive pairs get label 1., negative pairs 0.
train_pairs, train_labels = pairs_set(train_rltshps, True)
val_pairs, val_labels = pairs_set(val_rltshps, True)

train_neg_pairs, train_neg_labels = pairs_set(train_neg_rltshps, False)
val_neg_pairs, val_neg_labels = pairs_set(val_neg_rltshps, False)

# Concatenate positives and negatives (positives first) and convert the
# Python lists to numpy arrays. The names train_pairs/val_pairs are reused,
# so this cell cannot be re-run without re-running the loading lines above.
train_pairs = np.array(train_pairs + train_neg_pairs)
train_labels = np.array(train_labels + train_neg_labels)
val_pairs = np.array(val_pairs + val_neg_pairs)
val_labels = np.array(val_labels + val_neg_labels)


There are 204 key errors of relationships.

There are 27 key errors of relationships.

There are 426 key errors of relationships.

There are 61 key errors of relationships.


In [12]:
# Sanity check of the array layout: (number of pairs, 2 persons, embedding length, 1).
print(train_pairs.shape)
print(val_pairs.shape)

(329529, 2, 512, 1)
(46301, 2, 512, 1)


# Siamese network

Initial experiments are done with a deep Conv1D neural network; a simple attention module is available as an additional option for experimenting.

In [13]:
def conv1D_model(input_shape, l2_value, dropout):
    '''
    Create the deep Conv1D Keras model used as the shared branch of the Siamese network.

    Arguments:
    input_shape -- shape of the input layer (without the batch axis)
    l2_value -- L2 regularization factor of every Conv1D and Dense kernel
    dropout -- dropout rate applied after every convolution stage and after the dense layer

    Returns:
    Model -- Keras model
    '''
    def residual(x, kernel, l2_value, activation='relu'):
        # 'same' padding and an unchanged number of filters keep the shape,
        # which is what allows adding the block input back onto its output.
        shortcut = x
        branch = Conv1D(shortcut.shape[-1], kernel, kernel_regularizer=l2(l2_value),
                        activation=activation, padding='same')(shortcut)
        branch = BatchNormalization()(branch)
        return shortcut + branch

    inputs = Input(shape=input_shape)
    length = inputs.shape[1]

    # (number of filters, kernel size) of the consecutive convolution stages.
    stages = [
        (length // 64, 7),
        (length // 64, 11),
        (length // 32, 17),
        (length // 32, 17),
        (length // 16, 19),
        (length // 16, 19),
        (length // 8, 19),
    ]

    x = inputs
    for filters, kernel in stages:
        # Unpadded convolution -> batch norm -> dropout -> residual block.
        x = Conv1D(filters, kernel, kernel_regularizer=l2(l2_value), activation='tanh')(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout)(x)
        x = residual(x, kernel, l2_value, 'tanh')

    x = Flatten()(x)

    x = Dense(length // 16, kernel_regularizer=l2(l2_value), activation='relu')(x)
    x = Dropout(dropout)(x)
    return Model(inputs, x)

def attention_model(input_shape, train_mode=True):
    '''
    Build an alternative branch model based on a dot-product attention layer.

    Inspired by code example:
    https://www.tensorflow.org/api_docs/python/tf/keras/layers/Attention

    Arguments:
    input_shape -- shape of the input layer (without the batch axis)
    train_mode -- value forwarded as the `training` argument of the Attention layer

    Returns:
    Model -- Keras model whose output concatenates the pooled query encoding
             and the pooled attention output

    NOTE(review): the input is declared as int32 and goes through an Embedding
    lookup, as in the token-based example; confirm this is intended for
    embedding inputs.
    '''
    inputs = Input(shape=input_shape, dtype='int32')
    # The same squeezed tensor is used both as query and as value.
    sequence = K.squeeze(inputs, -1)

    # Embedding lookup, shared by query and value.
    shared_embedding = tf.keras.layers.Embedding(input_dim=input_shape[1], output_dim=64)
    # Query embeddings of shape [batch_size, Tq, dimension].
    query_embeddings = shared_embedding(sequence)
    # Value embeddings of shape [batch_size, Tv, dimension].
    value_embeddings = shared_embedding(sequence)

    # Query and value get their own (not shared) convolutional encoders.
    n_filters = inputs.shape[1] // 4
    query_seq_encoding = Conv1D(n_filters, 5, activation='relu', padding='same')(query_embeddings)
    value_seq_encoding = Conv1D(n_filters, 5, activation='relu', padding='same')(value_embeddings)

    attention_seq = tf.keras.layers.Attention()(
        [query_seq_encoding, value_seq_encoding], training=train_mode)

    # Average over the sequence axis to get encodings of shape [batch_size, filters].
    pooled_query = tf.keras.layers.GlobalAveragePooling1D()(query_seq_encoding)
    pooled_attention = tf.keras.layers.GlobalAveragePooling1D()(attention_seq)

    # Concatenate query and attention encodings into the branch output.
    branch_output = tf.keras.layers.Concatenate()([pooled_query, pooled_attention])
    return Model(inputs, branch_output)

# Loss and metrics functions

In [14]:
# Margin of the contrastive losses below: negative pairs only contribute to
# the loss while their predicted distance is smaller than this value.
MARGIN = 1.

def euclidean_distance(vectors):
    '''
    Euclidean distance between two batches of vectors.

    Arguments:
    vectors -- pair (x, y) of tensors; the squared differences are summed over axis 1

    Returns:
    Tensor of distances with the summed axis kept (keepdims=True).
    '''
    left, right = vectors
    squared_diff = K.square(left - right)
    total = K.sum(squared_diff, axis=1, keepdims=True)
    # Clamp to epsilon before the square root so the argument is never zero.
    return K.sqrt(K.maximum(total, K.epsilon()))

def cosine_similarity(vectors):
    '''
    Cosine distance between two batches of row vectors.

    Despite its name the function returns 1 - cosine similarity, i.e. a
    distance that is small for vectors pointing in the same direction.

    Arguments:
    vectors -- pair (x, y) of rank-2 tensors (the einsum expects 'ij' operands)

    Returns:
    Rank-1 tensor with one distance per row.
    NOTE(review): unlike euclidean_distance, the reduced axis is not kept, so
    the result does not have the (batch, 1) shape reported by
    eucl_dist_output_shape - confirm this is intended.
    '''
    left, right = vectors
    # Row-wise dot product.
    dot = tf.einsum('ij,ij->i', left, right)
    norm_product = tf.norm(left, axis=1) * tf.norm(right, axis=1)
    # Epsilon guards against division by zero for all-zero vectors.
    return 1. - dot / (norm_product + K.epsilon())

def eucl_dist_output_shape(shapes):
    '''
    Output shape of the distance Lambda layer: one value per batch element.

    Arguments:
    shapes -- pair of input shapes; only the batch size of the first one is used

    Returns:
    Tuple (batch_size, 1).
    '''
    first_shape, _ = shapes
    batch_size = first_shape[0]
    return (batch_size, 1)

def contrastive_loss_euc(y_true, y_pred):
    '''
    Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Arguments:
    y_true -- labels, 1 for related pairs and 0 for unrelated pairs
    y_pred -- predicted distances

    Returns:
    Mean over all elements of: y_true * d^2 + (1 - y_true) * max(MARGIN - d, 0)^2
    '''
    # Related pairs are penalised by their squared distance.
    related_term = y_true * K.square(y_pred)
    # Unrelated pairs are penalised only while they are closer than MARGIN.
    unrelated_term = (1 - y_true) * K.square(K.maximum(MARGIN - y_pred, 0))
    return K.mean(related_term + unrelated_term)

def contrastive_loss_cos(y_true, y_pred):
    '''
    Contrastive loss without squaring, used together with the cosine distance.

    Arguments:
    y_true -- labels, 1 for related pairs and 0 for unrelated pairs
    y_pred -- predicted distances

    Returns:
    Mean over all elements of: y_true * d + (1 - y_true) * max(MARGIN - d, 0)
    '''
    related_term = y_true * y_pred
    unrelated_term = (1 - y_true) * K.maximum(MARGIN - y_pred, 0)
    return K.mean(related_term + unrelated_term)

# Run training

In [15]:
# Hyper-parameters of the run; they are also stored in train_config below.
# Initial learning rate of the RMSprop optimizer.
learning_rate = 1e-5
# L2 regularization factor and dropout rate passed to conv1D_model.
l2_value = 1e-7
dropout = 0.05
# Arguments of model.fit.
epochs = 2000
batch_size = 8
# Subsampling stride: only every dataset_period-th pair is kept for training/validation.
dataset_period = 2
# 'euclidian' or 'cosine' (key of the distances_losses dictionary)
distance_type = 'cosine'

# Learning rate scheduler
def scheduler(epoch, lr, base_lr=None):
    '''
    Piecewise-constant learning rate schedule relative to the base learning rate.

    Arguments:
    epoch -- index of the epoch that is about to start
    lr -- current learning rate as passed by LearningRateScheduler; it is
          deliberately ignored, because dividing the *current* rate would
          compound the division every epoch and drive the rate towards zero
    base_lr -- learning rate the schedule is derived from; defaults to the
               notebook-level `learning_rate`

    Returns:
    base_lr for epochs < 50, base_lr / 2 for epochs < 100,
    base_lr / 5 for epochs < 200 and base_lr / 10 afterwards.
    '''
    if base_lr is None:
        base_lr = learning_rate

    if epoch < 50:
        return base_lr
    elif epoch < 100:
        return base_lr / 2
    elif epoch < 200:
        return base_lr / 5
    else:
        return base_lr / 10
    
lr_callback = LearningRateScheduler(scheduler)

# Create dictionary of parameters for saving configuration.
# The values are listed explicitly instead of being looked up with eval(),
# so a missing or misspelled name is caught by linters and no string is evaluated as code.
train_config = {
    'learning_rate': learning_rate,
    'l2_value': l2_value,
    'dropout': dropout,
    'epochs': epochs,
    'batch_size': batch_size,
    'dataset_period': dataset_period,
    'distance_type': distance_type,
}

In [16]:
# Keep only every dataset_period-th pair (and its label) to shrink the dataset.
# NOTE(review): the arrays are overwritten in place, so re-running this cell
# subsamples the already subsampled data again.
train_pairs = train_pairs[::dataset_period,...]
train_labels = train_labels[::dataset_period,...]
val_pairs = val_pairs[::dataset_period,...]
val_labels = val_labels[::dataset_period,...]

In [17]:
# Input has 512 embeddings
# The last two axes of train_pairs, (512, 1), are the shape of one sample:
# a single embedding with a trailing axis of size 1.
base_network = conv1D_model(train_pairs.shape[-2:], l2_value, dropout)

In [18]:
# Print the layer-by-layer structure of the shared branch.
base_network.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512, 1)]     0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 506, 8)       64          input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 506, 8)       32          conv1d[0][0]                     
__________________________________________________________________________________________________
dropout (Dropout)               (None, 506, 8)       0           batch_normalization[0][0]        
_______________________________________________________________________________________

In [19]:
# Creation of Siamese network: both inputs go through the same base_network
# instance, so the two branches share their weights.
# NOTE(review): the inputs are declared with shape=embedding_shape[0] (the
# integer 512) while base_network was built for (512, 1) samples - confirm
# that Keras reconciles the missing trailing axis on the installed version.
input1 = Input(shape=embedding_shape[0])
input2 = Input(shape=embedding_shape[0])
processed1 = base_network(input1)
processed2 = base_network(input2)

In [20]:
# Every supported distance type is paired with the contrastive loss written for it.
distances_losses = {
    'euclidian': {
        'dist': euclidean_distance,
        'loss': contrastive_loss_euc
    },
    'cosine': {
        'dist': cosine_similarity,
        'loss': contrastive_loss_cos
    }
}
# The model output is the distance between the two branch outputs.
distance = Lambda(distances_losses[distance_type]['dist'],
                  output_shape=eucl_dist_output_shape)([processed1, processed2])

model = Model([input1, input2], distance)
optimizer = RMSprop(learning_rate=learning_rate)
model.compile(loss=distances_losses[distance_type]['loss'], optimizer=optimizer)

## Run the TensorBoard plugin to track the training progress

In [21]:
# Load the TensorBoard notebook extension
%reload_ext tensorboard

In [22]:
%tensorboard --logdir=./logs --port=7007

Reusing TensorBoard on port 7007 (pid 857), started 6:24:59 ago. (Use '!kill 857' to kill it.)

## Training Callbacks

In [23]:
class MetricCallback(keras.callbacks.Callback):
    '''
    Write per-epoch losses, distance statistics and validation
    precision/recall to TensorBoard.

    The callback reads the notebook-level arrays train_pairs, train_labels,
    val_pairs and val_labels and runs a full prediction pass over both sets
    at the end of every epoch.

    Arguments:
    model_name -- name of the sub-directory of logs_dir used for this run
    logs_dir -- root directory of the TensorBoard logs
    '''
    def __init__(self, model_name, logs_dir='./logs'):
        # super() must start from this class; starting from Callback would
        # skip Callback.__init__ entirely.
        super().__init__()
        logdir = os.path.join(logs_dir, model_name)
        os.makedirs(logdir, exist_ok=True)
        self.train_writer = tf.summary.create_file_writer(logdir + '/train')
        self.valid_writer = tf.summary.create_file_writer(logdir + '/valid')
        self.step_number = 0

    def tb_writer(self, items_to_write, wtype):
        '''
        Write a dictionary of scalars at the current step.

        Arguments:
        items_to_write -- mapping of tag name to scalar value
        wtype -- 'train' selects the training writer, anything else the validation writer
        '''
        writer = self.train_writer if wtype == 'train' else self.valid_writer

        with writer.as_default():
            for name, value in items_to_write.items():
                tf.summary.scalar(name, value, self.step_number)
            writer.flush()

    @staticmethod
    def _distance_stats(pred, true):
        '''
        Mean and standard deviation of the predicted distances per class.

        Returns:
        Dictionary keyed by class label: {1: (mean, std) of related pairs,
        0: (mean, std) of unrelated pairs}.
        '''
        # Builtin bool instead of the np.bool alias, which newer NumPy versions no longer provide.
        related = true.astype(bool)
        pos, neg = pred[related], pred[~related]
        return {1: (np.mean(pos), np.std(pos)), 0: (np.mean(neg), np.std(neg))}

    def on_epoch_end(self, epoch, logs=None):
        '''
        Compute the metrics of the finished epoch and write them to TensorBoard.

        Arguments:
        epoch -- index of the finished epoch (unused, the callback keeps its own step counter)
        logs -- Keras logs of the epoch; 'loss' and 'val_loss' are written when present
        '''
        logs = logs or {}
        class_encoded = {
            0: 'not_related',
            1: 'related'
        }

        train_pred = self.model.predict([train_pairs[:, 0], train_pairs[:, 1]])
        train_true = train_labels.astype(int)
        train_stats = self._distance_stats(train_pred, train_true)

        val_pred = self.model.predict([val_pairs[:, 0], val_pairs[:, 1]])
        val_true = val_labels.astype(int)
        val_stats = self._distance_stats(val_pred, val_true)

        # Precision and recall.
        # Decision threshold: midpoint between (related mean + std) and
        # (unrelated mean - std); a distance below it is predicted as related.
        (val_pos_m, val_pos_s), (val_neg_m, val_neg_s) = val_stats[1], val_stats[0]
        threshold = ((val_pos_m + val_pos_s) + (val_neg_m - val_neg_s)) / 2
        val_pred = (val_pred.squeeze() < threshold).astype(int)
        valid_precision, valid_recall, _, _ = precision_recall_fscore_support(
            val_true, val_pred, labels=[0, 1])

        train_items = {}
        if logs.get('loss') is not None:
            train_items['train/loss'] = logs['loss']
        for k, v in class_encoded.items():
            train_items['train/dist_mean/' + v], train_items['train/dist_std/' + v] = train_stats[k]
        self.tb_writer(train_items, wtype='train')

        valid_items = {}
        # val_loss is only available when validation data was passed to fit().
        if logs.get('val_loss') is not None:
            valid_items['valid/loss'] = logs['val_loss']
        for k, v in class_encoded.items():
            valid_items['valid/precision/' + v] = valid_precision[k]
            valid_items['valid/recall/' + v] = valid_recall[k]
            # Validation statistics (the training ones were logged here by mistake before).
            valid_items['valid/dist_mean/' + v], valid_items['valid/dist_std/' + v] = val_stats[k]
        self.tb_writer(valid_items, wtype='valid')

        self.step_number += 1

In [24]:
model_name = 'model_2'

# Save training configuration.
# The directory is created first so the cell also works on a fresh checkout.
os.makedirs('configs', exist_ok=True)
with open(f'configs/{model_name}.json', 'w') as f:
    json.dump(train_config, f)

logdir = os.path.join('logs', model_name)
ckpt_dir = os.path.join('checkpoints', model_name)
# exist_ok=True keeps the cell re-runnable (Restart & Run All) once the directory exists.
os.makedirs(ckpt_dir, exist_ok=True)
# NOTE(review): tensorboard_callback is created but not passed to model.fit below.
tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)
# Weights-only checkpoints; the epoch number and validation loss are part of the file name.
ckpt_callback = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(ckpt_dir, 'weights.{epoch:02d}-{val_loss:.2f}.hdf5'),
    save_weights_only=True,
    period=10
)
metric_callback = MetricCallback(model_name)



In [None]:
# Train the Siamese model: the two elements of every pair are fed to the two
# inputs, the labels are the targets of the contrastive loss.
# The callbacks write the TensorBoard metrics, save checkpoints and apply the
# learning rate schedule.
model.fit([train_pairs[:, 0],
           train_pairs[:, 1]],
           train_labels,
           batch_size=batch_size,
           epochs=epochs,
           validation_data=([val_pairs[:, 0], val_pairs[:, 1]], val_labels),
           callbacks=[metric_callback, ckpt_callback, lr_callback]
         )

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000

# Submission

In [None]:
# Load submission pairs: the loop below reads the 'img_pair' column and
# fills in the 'is_related' column.
submission_path = 'data/sample_submission.csv'
submission_df = pd.read_csv(submission_path)

In [None]:
# Load models: trained Siamese weights plus the FaceNet embedder for the test images.
# NOTE(review): the checkpoint comes from 'model_1' while model_name above is
# 'model_2' - confirm the checkpoint matches the architecture built in this notebook.
ckpt_path = 'checkpoints/model_1/weights.200-0.12.hdf5'
model.load_weights(ckpt_path)
embedder = FaceNet()

In [None]:
# Iterate over submission pairs
# A pair is predicted as related when the model distance is below this
# threshold (a fraction of the contrastive-loss margin).
relation_threshold = MARGIN / 1.5

for idx, row in submission_df.iterrows():
    # Load images; 'img_pair' holds the two file names joined by '-'.
    img_pair = row['img_pair']
    img1_name, img2_name = img_pair.split('-')
    img1_path = os.path.join('data/test', img1_name)
    img2_path = os.path.join('data/test', img2_name)
    img1 = np.array(image.load_img(img1_path)).astype('float32')
    img2 = np.array(image.load_img(img2_path)).astype('float32')

    # Get FaceNet embeddings
    embedding1 = embedder.embeddings([img1])
    embedding2 = embedder.embeddings([img2])

    # Do an inference; rows that are not predicted as related keep the value
    # from the sample submission file.
    y_pred = model.predict([embedding1, embedding2])
    if y_pred.squeeze() < relation_threshold:
        # Write through .loc on the frame itself: assigning into a column
        # Series taken from the frame is chained assignment and is not
        # guaranteed to update submission_df.
        submission_df.loc[idx, 'is_related'] = 1

    # Print step
    if idx % 100 == 0:
        print(f'Processed rows: {idx}')
submission_df.to_csv(f'submission_{model_name}.csv', index=False)

In [None]:
# Save the same predictions under the name of the model whose checkpoint was loaded.
# NOTE(review): the previous cell already writes submission_{model_name}.csv.
submission_df.to_csv(f'submission_model_1.csv', index=False)