In [None]:
import os
import pickle
import keras
import pandas as pd
import numpy as np

from datetime import datetime
from matplotlib import pyplot as plt
from keras.optimizers import RMSprop, SGD, Adam
from keras.regularizers import l2
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Lambda

# The input embeddings

The data in the input pickle file is stored in a dictionary structure:
```
{
    'FAMILY_ID/PERSON_ID': [EMB_1, EMB_2, ..., EMB_N],
    .
    .
    .
}
```

In [None]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# embedding files that come from a trusted source.
with open('data/train_img_embeddings.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)
print(f'The keys examples: {list(train_embeddings)[:5]}')

# Shape of a single embedding, taken from the first embedding of the first entry.
embedding_shape = list(train_embeddings.values())[0][0].shape
print(f'Embeddings shape: {embedding_shape}')

# Loss and metrics functions

In [None]:
def euclidean_distance(vects):
    '''
    Euclidean distance between two batches of vectors.

    Arguments:
    vects -- pair (x, y) of backend tensors; the squared differences are summed over axis 1

    Returns:
    tensor of distances with the summed axis kept as size 1
    '''
    left, right = vects
    squared_dist = K.sum(K.square(left - right), axis=1, keepdims=True)
    # Clamp to epsilon before the square root so the result never comes from sqrt(0).
    clamped = K.maximum(squared_dist, K.epsilon())
    return K.sqrt(clamped)

def eucl_dist_output_shape(shapes):
    '''
    Output shape of the distance layer: one value per sample.

    Arguments:
    shapes -- pair of input shapes; only the leading dimension of the first is used

    Returns:
    tuple (leading dimension of the first shape, 1)
    '''
    first_shape, _ = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

def contrastive_loss(y_true, y_pred):
    '''
    Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Arguments:
    y_true -- labels; entries equal to 1 contribute the squared distance,
              entries equal to 0 contribute the squared margin shortfall
    y_pred -- predicted distances

    Returns:
    mean loss over all entries
    '''
    margin = 1
    # Label 1: penalise any distance between the two embeddings.
    similar_term = y_true * K.square(y_pred)
    # Label 0: penalise only distances that fall inside the margin.
    shortfall = K.maximum(margin - y_pred, 0)
    dissimilar_term = (1 - y_true) * K.square(shortfall)
    return K.mean(similar_term + dissimilar_term)

def compute_accuracy(y_true, y_pred, threshold=0.5):
    '''
    Compute classification accuracy with a fixed threshold on distances.

    Arguments:
    y_true -- labels (1 for a positive pair, 0 for a negative pair), any shape
    y_pred -- raw predicted distances, any shape with the same number of elements.
              Pass the distances themselves, not a boolean mask: the threshold
              is applied here, so a mask would be thresholded a second time.
    threshold -- distances strictly below this value are predicted positive

    Returns:
    fraction of predictions that match the labels
    '''
    # Flatten both sides so an (N, 1) array is compared element-wise with an
    # (N,) array instead of broadcasting to an (N, N) matrix.
    labels = np.asarray(y_true).ravel()
    pred = np.asarray(y_pred).ravel() < threshold
    return np.mean(pred == labels)

def accuracy(y_true, y_pred):
    '''
    Backend-tensor accuracy with a fixed 0.5 threshold on distances.

    Arguments:
    y_true -- label tensor
    y_pred -- predicted distance tensor

    Returns:
    mean of the element-wise agreement between labels and thresholded distances
    '''
    # Distances below 0.5 become 1 and the rest 0, cast to the label dtype for comparison.
    predicted_label = K.cast(y_pred < 0.5, y_true.dtype)
    matches = K.equal(y_true, predicted_label)
    return K.mean(matches)

# Generating training pairs

Positive pairs are generated according to the input CSV file, and negative pairs are generated by matching persons from randomly chosen different families.

In [None]:
def pairs_set(input_pairs):
    '''
    Create positive pairs according to input .csv file.
    Negative pairs are generated by randomly picking people from different families.

    Arguments:
    input_pairs -- pandas DataFrame with positive pairs paths; column 'p1' and
                   the second column (read as 'p2' for positive pairs) hold
                   the person paths

    Returns:
    train_pairs -- array of pairs of embeddings
    labels -- labels for each pair, 1 for positive(in blood relation) and 0 for negative

    Rows whose persons have no stored embeddings raise KeyError inside
    make_pairs; they are counted and reported rather than aborting the run.
    '''
    n = len(input_pairs.index)
    train_pairs = []
    labels = []
    errors = 0
    for idx, row in input_pairs.iterrows():
        try:
            # Add positive pair
            new_pairs = make_pairs(row['p1'], row['p2'])
            train_pairs += new_pairs
            labels += [1.] * len(new_pairs)

            # Add negative pair: redraw until the candidate's family prefix
            # (first 5 characters of the path) differs from this row's.
            # DataFrame.iloc[row, col] is used because positional integer
            # indexing on the row Series (iloc[row][1]) is deprecated, and a
            # KeyError from it would be silently swallowed by the handler below.
            rnd_idx = np.random.randint(n)
            while row['p1'][:5] == input_pairs.iloc[rnd_idx, 1][:5]:
                rnd_idx = np.random.randint(n)

            new_pairs = make_pairs(row['p1'], input_pairs.iloc[rnd_idx, 1])
            train_pairs += new_pairs
            labels += [0.] * len(new_pairs)

        except KeyError:
            errors += 1
    # Report against the frame that was actually processed, not a notebook global.
    print(f'\nThere are {errors} key errors of {n} relationships.')
    return np.array(train_pairs), np.array(labels)

In [None]:
def make_pairs(p1, p2):
    '''
    Pair up the embeddings of two persons.

    Arguments:
    p1, p2 -- paths to persons' images directories (familyID/personID)

    Returns:
    pairs -- list of [embedding_1, embedding_2] pairs, matched by position and
             truncated to the person with fewer embeddings

    Raises KeyError when either person has no entry in train_embeddings.
    '''
    # The lookup replaces '/' with '\\' in the path before indexing train_embeddings.
    key1 = p1.replace('/', '\\')
    key2 = p2.replace('/', '\\')

    embeddings1 = train_embeddings[key1]
    embeddings2 = train_embeddings[key2]

    # zip stops at the shorter sequence, which aligns the pairing to the smaller count.
    return [[first, second] for first, second in zip(embeddings1, embeddings2)]

In [None]:
# Read relatives' pairs: each row lists two related persons.
# The 'p1' and 'p2' columns are read later when building training pairs.
train_rltshps = pd.read_csv("data/train_relationships.csv")
train_rltshps.head()

In [None]:
# Shuffle rows in pandas DataFrame and renumber the index from 0.
# No random_state is passed, so the row order (and the later split) differs between runs.
train_rltshps = train_rltshps.sample(frac=1).reset_index(drop=True)
# info must be called; the bare attribute only shows the bound-method repr.
train_rltshps.info()

In [None]:
# Create training and validation sets: the first 3300 shuffled rows are used
# for training and the remaining rows for validation.
# NOTE(review): train_rltshps is overwritten here, so running this cell a second
# time without re-running the cells above leaves val_rltshps empty.
val_rltshps = train_rltshps.iloc[3300:]
train_rltshps = train_rltshps.iloc[:3300]
print(train_rltshps.shape)
print(val_rltshps.shape)

In [None]:
# Build embedding pairs and labels for both splits. Negative pairs are drawn at
# random inside pairs_set, so the result changes from run to run.
train_pairs, train_labels = pairs_set(train_rltshps)
val_pairs, val_labels = pairs_set(val_rltshps)

In [None]:
# Sanity check on the generated pair arrays; axis 1 is indexed with 0 and 1
# during training to select the two members of each pair.
print(train_pairs.shape)
print(val_pairs.shape)

# Siamese network

In [None]:
def create_base_network(input_shape):
    '''
    Create deep Keras model.

    Three Dense/ReLU layers whose widths are 1/2, 1/4 and 1/8 of the input
    width, with 10% dropout after the first two.

    Arguments:
    input_shape -- shape of the input layer, excluding the batch dimension;
                   either a shape tuple such as (512,) or a bare feature count

    Returns:
    Model -- Keras model
    '''
    # Keras documents `shape` as a tuple; wrap a bare integer so both call
    # styles work.
    if isinstance(input_shape, (int, np.integer)):
        input_shape = (int(input_shape),)

    # Named `inputs` rather than `input` to avoid shadowing the builtin.
    inputs = Input(shape=input_shape)
    n_features = inputs.shape[1]

    x = Dense(n_features // 2, activation='relu')(inputs)
    x = Dropout(0.1)(x)
    x = Dense(n_features // 4, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(n_features // 8, activation='relu')(x)
    return Model(inputs, x)

In [None]:
# Each input is a single embedding vector (512 values according to the original note).
# Pass the full shape tuple, which is the form Keras' Input documents for `shape`.
base_network = create_base_network(embedding_shape)

In [None]:
# Print the layer-by-layer summary of the shared base network.
base_network.summary()

In [None]:
# Creation of Siamese network
# Input takes a shape tuple, so the embedding shape is passed as is rather than
# as a bare integer.
input1 = Input(shape=embedding_shape)
input2 = Input(shape=embedding_shape)
# The same base_network instance processes both inputs, so its weights are shared.
processed1 = base_network(input1)
processed2 = base_network(input2)

In [None]:
# The model output is the Euclidean distance between the two processed embeddings.
distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)([processed1, processed2])

# Two-input model: a pair of embeddings in, one distance per pair out.
model = Model([input1, input2], distance)

# Run training

In [None]:
# Training hyper-parameters; `epochs` is read later by model.fit.
learning_rate = 1e-3
epochs = 2000
optimizer = Adam(learning_rate=learning_rate)
# Only the loss is tracked; no metrics are passed to compile.
model.compile(loss=contrastive_loss, optimizer=optimizer)

## Run the TensorBoard plugin to track training progress

In [None]:
# Load (or reload, if already loaded) the TensorBoard notebook extension
%reload_ext tensorboard

In [None]:
# Logs for this run go to logs/model_2, a subdirectory of the ./logs
# directory that TensorBoard is pointed at.
logdir = os.path.join("logs", 'model_2')
tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)

In [None]:
# Start TensorBoard inside the notebook, reading ./logs and serving on port 7007.
%tensorboard --logdir=./logs --port=7007

In [None]:
# Train on the two halves of each pair (index 0 and 1 along axis 1), validate on
# the held-out pairs, and log to TensorBoard through the callback.
model.fit([train_pairs[:, 0],
           train_pairs[:, 1]],
           train_labels,
           batch_size=128,
           epochs=epochs,
           validation_data=([val_pairs[:, 0], val_pairs[:, 1]], val_labels),
           callbacks=[tensorboard_callback]
         )

In [None]:
# Compute accuracy on the held-out validation pairs (reported as "test" below).
# compute_accuracy applies its own distance threshold, so it must be given the
# raw predicted distances. A pre-thresholded boolean mask would be compared
# against that threshold a second time, which inverts every prediction.
y_pred = model.predict([val_pairs[:, 0], val_pairs[:, 1]])
te_acc = compute_accuracy(val_labels, y_pred)
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))