In [3]:
import os
import pickle
import keras
import pandas as pd
import numpy as np

from datetime import datetime
from matplotlib import pyplot as plt
from keras.optimizers import RMSprop, SGD, Adam
from keras.regularizers import l2
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Lambda, Conv1D

# The input embeddings

The data in the input pickle file is stored in a dictionary structure:
```
{
    [
        'FAMILY_ID/PERSON_ID': [EMB_1, EMB_2...EMB_N],
        .
        .
        .
    ]
}
```

In [4]:
with open('data/train_img_embeddings.pkl', 'rb') as f:
       train_embeddings = pickle.load(f)
print(f'The keys examples: {list(train_embeddings.keys())[:5]}')

embedding_shape = list(train_embeddings.values())[0][0].shape
print(f'Embeddings shape: {embedding_shape}')

The keys examples: ['F0002\\MID1', 'F0002\\MID2', 'F0002\\MID3', 'F0005\\MID1', 'F0005\\MID2']
Embeddings shape: (512,)


# Loss and metrics functions

In [5]:
def euclidean_distance(vects):
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred):
    '''
    Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1.0
    square_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * square_pred + (1 - y_true) * margin_square)

def compute_accuracy(y_true, y_pred):
    # Compute classification accuracy with a fixed threshold on distances.
    pred = y_pred.ravel() < 0.5
    return np.mean(pred == y_true)

def accuracy(y_true, y_pred):
    # Compute classification accuracy with a fixed threshold on distances.
    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))

# Training pairs generating

Positive pairs are generated according to the input csv file, and negative pairs are generated mathcing persons from random differnet families.

In [6]:
def pairs_set(input_pairs):
    '''
    Create positive pairs according to input .csv file.
    Negative pairs are generated by randomly picking people from different families.
    
    Arguments:
    input_pairs -- pandas DataFrame with positive pairs paths
    
    Returns:
    train_pairs -- array of pairs of embeddings
    labels -- labels for each pair, 1 for positive(in blood relation) and 0 for negative
    '''
    n = len(input_pairs.index)
    train_pairs = []
    labels = []
    errors = 0
    for idx, row in input_pairs.iterrows():
        try:
            #Add positive pair
            new_pairs = make_pairs(row['p1'], row['p2'])
            train_pairs += new_pairs
            labels += [1.] * len(new_pairs)
            
            #Add negative pair
            rnd_idx = np.random.randint(n)
            while(row['p1'][:5] == input_pairs.iloc[rnd_idx][1][:5]):
                rnd_idx = np.random.randint(n)
            
            new_pairs = make_pairs(row['p1'], input_pairs.iloc[rnd_idx][1])
            train_pairs += new_pairs
            labels += [0.] * len(new_pairs)
            
        except KeyError:
            errors += 1
    print(f'\nThere are {errors} key errors of {len(train_rltshps)} relationships.')
    return np.array(train_pairs), np.array(labels)

In [7]:
def make_pairs(p1, p2):
    '''
    Create pair of embeddings.
    
    Arguments:
    p1, p2 -- paths to persons' images directories (familyID/personID)
    
    Returns:
    pairs -- array of image pairs, pairing is alligned to smaller number of images
    '''
    pairs = []
    img_path1 = p1.replace('/', '\\')
    img_path2 = p2.replace('/', '\\')
    
    dir1 = np.expand_dims(train_embeddings[img_path1], axis=-1)
    dir2 = np.expand_dims(train_embeddings[img_path2], axis=-1)
    n = min(len(dir1), len(dir2))
    
    for i in range(n):
        pairs.append([dir1[i], dir2[i]])
    return pairs

In [8]:
# Read relatives' pairs
train_rltshps = pd.read_csv("data/train_relationships.csv")
train_rltshps.head()

Unnamed: 0,p1,p2
0,F0002/MID1,F0002/MID3
1,F0002/MID2,F0002/MID3
2,F0005/MID1,F0005/MID2
3,F0005/MID3,F0005/MID2
4,F0009/MID1,F0009/MID4


In [9]:
# Shuffle rows in pandas DataFrame
train_rltshps = train_rltshps.sample(frac=1).reset_index(drop=True)
train_rltshps.info

<bound method DataFrame.info of               p1           p2
0     F0020/MID5  F0020/MID10
1     F0123/MID3   F0123/MID4
2     F0993/MID6   F0993/MID4
3     F0101/MID8  F0101/MID12
4     F0538/MID5   F0538/MID2
...          ...          ...
3593  F0198/MID5   F0198/MID2
3594  F0579/MID1   F0579/MID7
3595  F0064/MID2   F0064/MID5
3596  F0916/MID1   F0916/MID2
3597  F0818/MID2   F0818/MID6

[3598 rows x 2 columns]>

In [10]:
# Create training and validation sets
val_rltshps = train_rltshps.iloc[3300:]
train_rltshps = train_rltshps.iloc[:3300]
print(train_rltshps.shape)
print(val_rltshps.shape)

(3300, 2)
(298, 2)


In [11]:
train_pairs, train_labels = pairs_set(train_rltshps)
val_pairs, val_labels = pairs_set(val_rltshps)


There are 334 key errors of 3300 relationships.

There are 25 key errors of 3300 relationships.


In [12]:
print(train_pairs.shape)
print(val_pairs.shape)

(19972, 2, 512, 1)
(1768, 2, 512, 1)


# Siamise network

In [13]:
def create_base_network(input_shape):
    '''
    Create deep Keras model.
    
    Arguments:
    input_shape -- shape of the input layer
    
    Returns:
    Model -- Keras model
    '''
    input = Input(shape=input_shape)
    x = Conv1D(input.shape[1] // 2, 3, activation='relu')(input)
    x = Dropout(0.1)(x)
    x = Conv1D(input.shape[1] // 4, 5, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Conv1D(input.shape[1] // 8, 5, activation='relu')(x)
    x = Dropout(0.1)(x)
    
    x = Flatten()(x)
    
    x = Dense(input.shape[1] // 4, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(input.shape[1] // 4, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(input.shape[1] // 4, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(input.shape[1] // 8, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(input.shape[1] // 8, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(input.shape[1] // 8, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(input.shape[1] // 8, activation='relu')(x)
    return Model(input, x)

In [14]:
# Input has 512 embeddings
base_network = create_base_network(train_pairs.shape[-2:])

In [15]:
base_network.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 512, 1)]          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 510, 256)          1024      
_________________________________________________________________
dropout (Dropout)            (None, 510, 256)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 506, 128)          163968    
_________________________________________________________________
dropout_1 (Dropout)          (None, 506, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 502, 64)           41024     
_________________________________________________________________
dropout_2 (Dropout)          (None, 502, 64)          

In [16]:
# Creation of Siamese network
input1 = Input(shape=embedding_shape[0])
input2 = Input(shape=embedding_shape[0])
processed1 = base_network(input1)
processed2 = base_network(input2)

In [17]:
distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)([processed1, processed2])

model = Model([input1, input2], distance)

# Run training

In [18]:
learning_rate = 1e-3
epochs = 2000
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss=contrastive_loss, optimizer=optimizer)

## Run tensorboard plugin in order to track changes of training

In [19]:
# Load the TensorBoard notebook extension
%reload_ext tensorboard

In [20]:
logdir = os.path.join('logs', 'model_1')
ckpt_dir = os.path.join('checkpoints', 'model_1')
os.makedirs(ckpt_dir)
tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)
chkpt_callback = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(ckpt_dir, 'weights.{epoch:02d}-{val_loss:.2f}.hdf5'),
    save_weights_only=True
)

In [21]:
%tensorboard --logdir=./logs --port=7007

In [22]:
model.fit([train_pairs[:, 0],
           train_pairs[:, 1]],
           train_labels,
           batch_size=16,
           epochs=epochs,
           validation_data=([val_pairs[:, 0], val_pairs[:, 1]], val_labels),
           callbacks=[tensorboard_callback, chkpt_callback]
         )

Epoch 1/2000
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
 108/1249 [=>............................] - ETA: 2:05 - loss: 0.2522

KeyboardInterrupt: 

In [None]:
# Compute test accuracy
model_ckpt = ''
model.load_weigths(model_ckpt)
y_pred = model.predict([train_pairs[:, 0], train_pairs[:, 1]])
te_acc = compute_accuracy(train_labels, y_pred < 1.0)
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

In [1]:
!rm -r logs/model_1

In [2]:
!rm -r checkpoints/*

rm: cannot remove 'checkpoints/*': No such file or directory


In [None]:
from matplotlib import pyplot as plt
plt.hist(y_pred, 200)
plt.show()