In [None]:
import os
import pickle
import keras
import pandas as pd
import numpy as np
import tensorflow as tf

from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from keras.optimizers import RMSprop, SGD, Adam
from keras.regularizers import l2
from keras import backend as K
from keras.models import Model
from keras.callbacks import Callback
from keras.layers import Input, Flatten, Dense, Dropout, Lambda, Conv1D, Attention, GlobalAveragePooling1D

# Seed every source of randomness the notebook uses: NumPy drives the negative
# pair sampling, TensorFlow drives weight initialisation and batch shuffling.
np.random.seed(123)
tf.random.set_seed(123)

In [None]:
# Enable memory growth on every visible GPU before any model is built.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)

# The input embeddings

The input pickle file stores a dictionary that maps each person to the list of that person's image embeddings:
```
{
    'FAMILY_ID/PERSON_ID': [EMB_1, EMB_2, ..., EMB_N],
    .
    .
    .
}
```

In [None]:
# NOTE: pickle can execute arbitrary code while loading; only open files from a trusted source.
with open('data/train_img_embeddings.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)
print(f'The keys examples: {list(train_embeddings)[:5]}')

# Peek at the first embedding of the first person without copying every value into a list.
embedding_shape = next(iter(train_embeddings.values()))[0].shape
print(f'Embeddings shape: {embedding_shape}')

# Loss and metrics functions

In [None]:
def euclidean_distance(vects):
    '''
    Euclidean distance between two batches of vectors.

    Arguments:
    vects -- pair (x, y) of tensors

    Returns:
    tensor with the distances; the squared differences are summed over axis 1
    and that axis is kept with size 1
    '''
    left, right = vects
    squared_distance = K.sum(K.square(left - right), axis=1, keepdims=True)
    # Clamp to epsilon so the square root is never taken of a value below it.
    clamped = K.maximum(squared_distance, K.epsilon())
    return K.sqrt(clamped)

def eucl_dist_output_shape(shapes):
    '''
    Output shape of the distance layer.

    Arguments:
    shapes -- pair of input shapes

    Returns:
    tuple made of the first dimension of the first shape and 1
    '''
    first_shape, _ = shapes
    return first_shape[0], 1

def contrastive_loss(y_true, y_pred):
    '''
    Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Arguments:
    y_true -- labels, multiplied directly into the two loss terms
    y_pred -- predicted distances

    Returns:
    mean over all elements of
    y_true * y_pred^2 + (1 - y_true) * max(margin - y_pred, 0)^2, with margin = 1
    '''
    margin = 1.0
    # Term weighted by y_true: the squared distance itself.
    similar_term = y_true * K.square(y_pred)
    # Term weighted by (1 - y_true): squared shortfall of the distance below the margin.
    dissimilar_term = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
    return K.mean(similar_term + dissimilar_term)

def accuracy(y_true, y_pred):
    '''
    Classification accuracy with a fixed threshold on distances.

    A distance below 0.5 is counted as a prediction of label 1.
    '''
    predicted_labels = K.cast(y_pred < 0.5, y_true.dtype)
    return K.mean(K.equal(y_true, predicted_labels))

# Generating training pairs

Positive pairs are taken from the input csv file, and negative pairs are generated by matching each person with a randomly chosen person they have no listed relationship with.

In [None]:
def pairs_set(input_pairs):
    '''
    Build labelled embedding pairs from a table of related persons.

    Positive pairs are created according to the rows of the input .csv file.
    Negative pairs: for each person of a positive pair, one negative pair is
    created by picking a random *other* person with whom they have no
    relationship listed in `input_pairs`.

    Rows for which building a pair raises KeyError are counted and reported;
    pairs already added for such a row before the error are kept.

    Arguments:
    input_pairs -- pandas DataFrame with columns 'p1' and 'p2' holding the
                   persons of each positive pair

    Returns:
    train_pairs -- array of pairs of embeddings
    labels -- labels for each pair, 1 for positive(in blood relation) and 0 for negative
    '''
    p1_persons = input_pairs['p1'].tolist()
    p2_persons = input_pairs['p2'].tolist()

    # Symmetric set of known relationships: one O(1) lookup replaces a scan of
    # the whole DataFrame for every candidate.
    related = set(zip(p1_persons, p2_persons)) | set(zip(p2_persons, p1_persons))

    # Every person once (order preserved), so people listed in both columns
    # are not over-represented among the negative candidates.
    all_persons = list(dict.fromkeys(p1_persons + p2_persons))
    n = len(all_persons)
    # Bound the rejection sampling so an anchor related to everybody cannot hang the loop.
    max_attempts = 100 * max(n, 1)

    def sample_negative(anchor):
        '''
        Draw a random person that is neither `anchor` nor related to it.
        Returns None when no such person was found within `max_attempts` draws.
        '''
        for _ in range(max_attempts):
            candidate = all_persons[np.random.randint(n)]
            # A person must never be paired with themselves as a negative:
            # identical embeddings would be labelled "not related".
            if candidate != anchor and (anchor, candidate) not in related:
                return candidate
        return None

    train_pairs = []
    labels = []
    errors = 0
    for p1, p2 in zip(p1_persons, p2_persons):
        try:
            # Add positive pair
            new_pairs = make_pairs(p1, p2)
            train_pairs += new_pairs
            labels += [1.] * len(new_pairs)

            # Add one negative pair for each person of the positive pair
            for anchor in (p1, p2):
                negative_sample = sample_negative(anchor)
                if negative_sample is None:
                    continue
                new_pairs = make_pairs(anchor, negative_sample)
                train_pairs += new_pairs
                labels += [0.] * len(new_pairs)

        except KeyError:
            errors += 1
    print(f'\nThere are {errors} key errors of relationships.')
    return np.array(train_pairs), np.array(labels)

In [None]:
def make_pairs(p1, p2):
    '''
    Create pairs of embeddings for two persons.

    Arguments:
    p1, p2 -- paths to persons' images directories (familyID/personID)

    Returns:
    pairs -- list of [embedding_1, embedding_2] pairs; pairing is aligned to the
             smaller number of images, each embedding gets a trailing axis of size 1

    Raises:
    KeyError -- if a person has no entry in `train_embeddings`
    '''
    def lookup(person):
        # The embeddings are looked up with Windows-style separators first
        # (the original behaviour); fall back to the key exactly as given so a
        # pickle keyed with forward slashes works as well.
        windows_key = person.replace('/', '\\')
        if windows_key in train_embeddings:
            return train_embeddings[windows_key]
        return train_embeddings[person]

    dir1 = np.expand_dims(lookup(p1), axis=-1)
    dir2 = np.expand_dims(lookup(p2), axis=-1)

    # zip stops at the shorter sequence, i.e. at min(len(dir1), len(dir2)).
    return [[emb1, emb2] for emb1, emb2 in zip(dir1, dir2)]

In [None]:
# Read relatives' pairs (the 'p1' and 'p2' columns are used later to build the pairs)
train_rltshps = pd.read_csv("data/train_relationships.csv")
train_rltshps.head()

In [None]:
# Shuffle rows in pandas DataFrame (fixed random_state keeps the split reproducible)
train_rltshps = train_rltshps.sample(frac=1, random_state=123).reset_index(drop=True)
# `info` is a method: without the call only the bound-method repr is displayed.
train_rltshps.info()

In [None]:
# Create training and validation sets
# NOTE: this cell rebinds `train_rltshps`, so it is not idempotent. Running it a
# second time without re-running the shuffle cell would leave no rows for validation.
TRAIN_SIZE = 3300  # number of shuffled relationship rows kept for training
val_rltshps = train_rltshps.iloc[TRAIN_SIZE:]
if val_rltshps.empty:
    raise ValueError(
        f'No rows left for validation after the first {TRAIN_SIZE} rows; '
        're-run the cells that load and shuffle the relationships.'
    )
train_rltshps = train_rltshps.iloc[:TRAIN_SIZE]
print(train_rltshps.shape)
print(val_rltshps.shape)

In [None]:
# Build the embedding pairs and their 1/0 labels for the training and validation splits.
# Negative samples are drawn only from the persons of the split that is passed in.
train_pairs, train_labels = pairs_set(train_rltshps)
val_pairs, val_labels = pairs_set(val_rltshps)

In [None]:
# Sanity check of the pair arrays; the last two dimensions are later used as the network input shape.
print(train_pairs.shape)
print(val_pairs.shape)

# Siamese network

In [None]:
def conv1D_model(input_shape, l2_value, dropout):
    '''
    Create deep Keras model: a stack of Conv1D layers followed by Dense layers.

    Layer widths are derived from the second dimension of the input tensor
    (the first dimension of `input_shape`) by integer division.

    Arguments:
    input_shape -- shape of the input layer
    l2_value -- L2 regularization factor applied to every Conv1D and Dense kernel
    dropout -- dropout rate applied after every layer except the last Dense

    Returns:
    Model -- Keras model
    '''
    inputs = Input(shape=input_shape)
    base_width = inputs.shape[1]

    # (width divisor, kernel size) of each convolution, in order.
    conv_specs = [(64, 7), (64, 11), (32, 17), (32, 17), (16, 19), (16, 19), (8, 19)]
    x = inputs
    for divisor, kernel_size in conv_specs:
        x = Conv1D(base_width // divisor, kernel_size,
                   kernel_regularizer=l2(l2_value), activation='relu')(x)
        x = Dropout(dropout)(x)

    x = Flatten()(x)

    # Two Dense + Dropout stages, then a final Dense without dropout.
    for divisor in (16, 32):
        x = Dense(base_width // divisor, kernel_regularizer=l2(l2_value), activation='relu')(x)
        x = Dropout(dropout)(x)
    x = Dense(base_width // 32, kernel_regularizer=l2(l2_value), activation='relu')(x)
    return Model(inputs, x)

def attention_model(input_shape, train_mode=True):
    '''
    Alternative base network built around an attention layer.

    Inspired by code example:
    https://www.tensorflow.org/api_docs/python/tf/keras/layers/Attention

    Arguments:
    input_shape -- shape of the input layer; the trailing axis is squeezed away
    train_mode -- value passed as `training` to the attention layer

    Returns:
    Model -- Keras model whose output concatenates the pooled query encoding
             and the pooled attention output

    NOTE(review): not called anywhere in the visible notebook. The input is
    declared as int32 and fed to an Embedding layer whose `input_dim` is
    `input_shape[1]` - presumably 1 for a (512, 1) input - confirm this is intended.
    '''
    inputs = Input(shape=input_shape, dtype='int32')
    # The same tensor serves as both query and value.
    tokens = K.squeeze(inputs, -1)

    # Embedding lookup, shared by query and value.
    embed = tf.keras.layers.Embedding(input_dim=input_shape[1], output_dim=64)
    # Query embeddings of shape [batch_size, Tq, dimension].
    query_embeddings = embed(tokens)
    # Value embeddings of shape [batch_size, Tv, dimension].
    value_embeddings = embed(tokens)

    # Query and value each get their own convolution (weights are not shared).
    n_filters = inputs.shape[1] // 4
    query_seq_encoding = Conv1D(n_filters, 5, activation='relu', padding='same')(query_embeddings)
    value_seq_encoding = Conv1D(n_filters, 5, activation='relu', padding='same')(value_embeddings)

    attention_seq = tf.keras.layers.Attention()(
        [query_seq_encoding, value_seq_encoding], training=train_mode)

    # Reduce over the sequence axis to produce encodings of shape
    # [batch_size, filters].
    pooled_query = tf.keras.layers.GlobalAveragePooling1D()(query_seq_encoding)
    pooled_attention = tf.keras.layers.GlobalAveragePooling1D()(attention_seq)

    # Concatenate query and document encodings to produce a DNN input layer.
    merged = tf.keras.layers.Concatenate()([pooled_query, pooled_attention])
    return Model(inputs, merged)

# Run training

In [None]:
# Training hyper-parameters
learning_rate = 1e-5  # passed to the Adam optimizer
l2_value = 0          # L2 regularization factor of the base network (0 disables it)
dropout = 0           # dropout rate of the base network (0 disables it)
epochs = 2000
batch_size = 8

In [None]:
# The base network takes a single embedding, i.e. the last two dimensions of
# `train_pairs` (presumably (512, 1) - confirm against the shapes printed above).
base_network = conv1D_model(train_pairs.shape[-2:], l2_value, dropout)

In [None]:
# Show the layers and parameter counts of the shared branch.
base_network.summary()

In [None]:
# Creation of Siamese network
# Both inputs use exactly the per-sample shape the base network was built with,
# so they match the rank of the base network input and of the pair arrays fed to `fit`
# (a bare integer shape would declare inputs without the trailing axis).
input1 = Input(shape=base_network.input_shape[1:])
input2 = Input(shape=base_network.input_shape[1:])
# The same `base_network` instance processes both inputs, so the branches share weights.
processed1 = base_network(input1)
processed2 = base_network(input2)

In [None]:
# The model output is the Euclidean distance between the two branch outputs.
distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)([processed1, processed2])

model = Model([input1, input2], distance)
optimizer = Adam(learning_rate=learning_rate)
# No `metrics` are passed here; accuracy/precision/recall are computed by MetricCallback.
model.compile(loss=contrastive_loss, optimizer=optimizer)

## Run the TensorBoard plugin to track training progress

In [None]:
# Load the TensorBoard notebook extension (reload_ext also works when it is already loaded)
%reload_ext tensorboard

In [None]:
# Start TensorBoard on port 7007, reading the ./logs directory that the callbacks write to.
%tensorboard --logdir=./logs --port=7007

## Training Callbacks

In [None]:
class MetricCallback(keras.callbacks.Callback):
    '''
    Write loss, accuracy and per-class precision/recall to TensorBoard after every epoch.

    Predictions are made on the notebook-level `train_pairs`/`train_labels` and
    `val_pairs`/`val_labels` arrays. A pair is predicted as related (label 1)
    when the predicted distance is below `threshold`.

    Arguments:
    model_name -- sub-directory of `logs_dir` that receives the event files
    logs_dir -- root directory of the TensorBoard logs
    threshold -- distance below which a pair is classified as related
    '''
    # Label value -> name used in the TensorBoard tags.
    CLASS_NAMES = {0: 'not_related', 1: 'related'}

    def __init__(self, model_name, logs_dir='./logs', threshold=1.0):
        # super(Callback, self) would skip Callback.__init__ itself.
        super().__init__()
        logdir = os.path.join(logs_dir, model_name)
        os.makedirs(logdir, exist_ok=True)
        self.train_writer = tf.summary.create_file_writer(logdir + '/train')
        self.valid_writer = tf.summary.create_file_writer(logdir + '/valid')
        self.threshold = threshold
        self.step_number = 0

    def tb_writer(self, items_to_write, wtype):
        '''
        Write a dict of {tag: scalar} at the current step.
        `wtype` selects the writer: 'train' for the train writer, anything else for the valid writer.
        '''
        writer = self.train_writer if wtype == 'train' else self.valid_writer

        with writer.as_default():
            for name, value in items_to_write.items():
                tf.summary.scalar(name, value, self.step_number)
            writer.flush()

    def _evaluate(self, pairs, labels):
        '''Return (accuracy, per-class precision, per-class recall) of the model on `pairs`.'''
        distances = self.model.predict([pairs[:, 0], pairs[:, 1]])
        predicted = (distances.squeeze() < self.threshold).astype(int)
        expected = labels.astype(int)
        precision, recall, _, _ = precision_recall_fscore_support(expected, predicted, labels=[0, 1])
        return accuracy_score(expected, predicted), precision, recall

    def _write_split(self, split, loss, pairs, labels):
        '''Evaluate one split ('train' or 'valid') and write its scalars.'''
        split_accuracy, precision, recall = self._evaluate(pairs, labels)

        scalars = {}
        # The loss is absent when Keras did not report it (e.g. no validation data).
        if loss is not None:
            scalars[f'{split}/loss'] = loss
        scalars[f'{split}/accuracy'] = split_accuracy
        for label, name in self.CLASS_NAMES.items():
            scalars[f'{split}/precision/{name}'] = precision[label]
            scalars[f'{split}/recall/{name}'] = recall[label]

        self.tb_writer(scalars, wtype=split)

    def on_epoch_end(self, epoch, logs=None):
        '''Compute and log the metrics of both splits, then advance the step counter.'''
        logs = logs or {}
        self._write_split('train', logs.get('loss'), train_pairs, train_labels)
        self._write_split('valid', logs.get('val_loss'), val_pairs, val_labels)
        self.step_number += 1

In [None]:
model_name = 'model_1'
logdir = os.path.join('logs', model_name)
ckpt_dir = os.path.join('checkpoints', model_name)
# exist_ok=True so that re-running the cell (or the whole notebook) does not
# fail with FileExistsError once the directory has been created.
os.makedirs(ckpt_dir, exist_ok=True)
# NOTE: this callback is created but is not in the `callbacks` list passed to `model.fit`.
tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)
# Save the weights only, every 10 epochs; the file name carries the epoch and validation loss.
# NOTE(review): `period` is reported as deprecated in favour of `save_freq` in
# recent Keras releases - confirm against the installed version.
chkpt_callback = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(ckpt_dir, 'weights.{epoch:02d}-{val_loss:.2f}.hdf5'),
    save_weights_only=True,
    period=10
)
metric_callback = MetricCallback(model_name)

In [None]:
# Train the Siamese model: the two inputs are the first and second element of
# every pair, the target is the 1/0 relationship label.
model.fit([train_pairs[:, 0],
           train_pairs[:, 1]],
           train_labels,
           batch_size=batch_size,
           epochs=epochs,
           validation_data=([val_pairs[:, 0], val_pairs[:, 1]], val_labels),
           callbacks=[metric_callback, chkpt_callback]
         )