This is proof-of-concept code for training FaceNet with pure Keras. It has not been tested in practice, and the Labeled Faces in the Wild dataset is quite probably too small to train it anyway.

In [1]:
# download Labeled Faces in the Wild dataset
!wget http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz

--2019-04-14 15:39:37--  http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz
Resolving vis-www.cs.umass.edu (vis-www.cs.umass.edu)... 128.119.244.95
Connecting to vis-www.cs.umass.edu (vis-www.cs.umass.edu)|128.119.244.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 108761145 (104M) [application/x-gzip]
Saving to: ‘lfw-deepfunneled.tgz.1’


2019-04-14 15:42:17 (664 KB/s) - ‘lfw-deepfunneled.tgz.1’ saved [108761145/108761145]



In [2]:
# uncompress the dataset
!tar xzf lfw-deepfunneled.tgz

In [3]:
from keras.applications.resnet50 import ResNet50
from keras.layers import Input, Dense, Flatten, Lambda
from keras.models import Model, Sequential
import keras.backend as K

Using TensorFlow backend.


In [4]:
# Backbone: ResNet50 with ImageNet weights and without its classification head.
# The input tensor could also be the output of a different Keras model or layer.
# The (height, width, channels) layout assumes K.image_data_format() == 'channels_last'.
# NOTE(review): the image batches built elsewhere in this notebook are passed on exactly
# as read from disk; ImageNet weights are normally paired with
# keras.applications.resnet50.preprocess_input - confirm whether skipping it is intended.
input_tensor = Input(shape=(250, 250, 3))

resnet50 = ResNet50(include_top=False, weights='imagenet', input_tensor=input_tensor)
# freeze the backbone so that its pretrained weights are not updated during training
resnet50.trainable = False
resnet50.summary()

Instructions for updating:
Colocations handled automatically by placer.




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250, 250, 3)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 256, 256, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 125, 125, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 125, 125, 64) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

In [5]:
# Embedding network:
#   ResNet50 feature maps -> flatten -> 128-unit linear projection -> L2 normalisation,
# so every image is mapped to a unit-length 128-dimensional vector.
model_embed = Sequential([
    resnet50,
    Flatten(),
    Dense(128),
    Lambda(lambda x: K.l2_normalize(x, axis=-1)),
])
model_embed.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Model)             (None, 8, 8, 2048)        23587712  
_________________________________________________________________
flatten_1 (Flatten)          (None, 131072)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16777344  
_________________________________________________________________
lambda_1 (Lambda)            (None, 128)               0         
Total params: 40,365,056
Trainable params: 16,777,344
Non-trainable params: 23,587,712
_________________________________________________________________


In [6]:
# create training network with three branches; all of them go through the same
# model_embed instance, so the embedding weights are shared
xa = Input(shape=(250, 250, 3))  # anchor image
ea = model_embed(xa)

xp = Input(shape=(250, 250, 3))  # positive image (same person as the anchor)
ep = model_embed(xp)

xn = Input(shape=(250, 250, 3))  # negative image (different person)
en = model_embed(xn)

# margin of the triplet loss (module-level because the evaluation cell reuses it
# as the distance threshold)
alpha = 1.
def triplet_loss(x):
    """Per-sample triplet loss max(d(a, p) - d(a, n) + alpha, 0).

    Args:
        x: list of three embedding tensors [anchor, positive, negative].

    Returns:
        Tensor of shape (batch, 1), where d is the squared Euclidean distance.
    """
    anchor_embed, pos_embed, neg_embed = x
    # keepdims=True keeps a trailing axis of size 1 so that the tensor really has the
    # (batch, 1) shape announced via output_shape=(1, ) on the Lambda layer below;
    # without it the tensor is (batch,) and disagrees with the declared shape.
    dists_pos = K.sum((anchor_embed - pos_embed)**2, axis=-1, keepdims=True)
    dists_neg = K.sum((anchor_embed - neg_embed)**2, axis=-1, keepdims=True)
    return K.maximum(dists_pos - dists_neg + alpha, 0.)

loss = Lambda(triplet_loss, output_shape=(1, ))([ea, ep, en])

model_train = Model(inputs=[xa, xp, xn], outputs=[loss])
# the network output already is the loss value, so the Keras loss function simply
# passes the prediction through and ignores the targets
model_train.compile(loss=lambda y_true, y_pred: y_pred, optimizer='adam')
model_train.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 250, 250, 3)  0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 250, 250, 3)  0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 250, 250, 3)  0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 128)          40365056    input_2[0][0]                    
                                                                 input_3[0][0]                    
          

In [7]:
from skimage.io import imread
import os
import random
import numpy as np

lfw_dir = 'lfw-deepfunneled'

# generator that samples anchors and positive-negative samples
def triplet_generator(batch_size=10):
    """Endlessly yield batches of random (anchor, positive, negative) image triplets.

    Anchor and positive are two different photos of one person, the negative is a
    photo of a different person. Sub-directories of ``lfw_dir`` are treated as
    persons and the files inside them as that person's photos.

    Args:
        batch_size: number of triplets per yielded batch.

    Yields:
        ``([anchor_images, pos_images, neg_images], targets)`` where each image entry
        is a numpy array stacking ``batch_size`` images as returned by ``imread`` and
        ``targets`` is a zero vector with one entry per triplet (a dummy target; it
        carries no label information).

    Raises:
        ValueError: if the dataset cannot provide a triplet (fewer than two persons,
            or nobody with at least two photos). Raised on the first ``next()``.
    """
    # List the directory tree once up front. Sampling previously hit the disk with
    # os.listdir() for every attempt, and most attempts were thrown away because the
    # randomly drawn person had only a single photo.
    people = []
    photos_by_person = {}
    for entry in os.listdir(lfw_dir):
        entry_dir = os.path.join(lfw_dir, entry)
        if not os.path.isdir(entry_dir):
            continue  # stray file next to the person folders
        photos = os.listdir(entry_dir)
        if photos:  # a person without photos can be neither positive nor negative
            people.append(entry)
            photos_by_person[entry] = photos
    print("Number of people:", len(people))

    # only persons with at least two photos can supply an anchor/positive pair
    pos_candidates = [person for person in people if len(photos_by_person[person]) >= 2]
    if len(people) < 2 or not pos_candidates:
        raise ValueError("cannot sample triplets from %r: need at least two people, "
                         "one of them with at least two photos" % lfw_dir)

    while True:
        anchor_images = []
        pos_images = []
        neg_images = []

        for _ in range(batch_size):
            # same distribution as drawing a random pair of distinct people and
            # rejecting it when the first one has fewer than two photos
            pos_person = random.choice(pos_candidates)
            neg_person = random.choice(people)
            while neg_person == pos_person:
                neg_person = random.choice(people)

            anchor_file, pos_file = random.sample(photos_by_person[pos_person], 2)
            neg_file = random.choice(photos_by_person[neg_person])

            anchor_images.append(imread(os.path.join(lfw_dir, pos_person, anchor_file)))
            pos_images.append(imread(os.path.join(lfw_dir, pos_person, pos_file)))
            neg_images.append(imread(os.path.join(lfw_dir, neg_person, neg_file)))

        n = len(anchor_images)
        anchor_images = np.array(anchor_images)
        pos_images = np.array(pos_images)
        neg_images = np.array(neg_images)

        yield [anchor_images, pos_images, neg_images], np.zeros((n, ))

In [8]:
# Train the triplet network on randomly sampled triplets:
# 10 epochs of 100 batches each, with the generator's default batch size.
triplet_batches = triplet_generator()
model_train.fit_generator(triplet_batches, epochs=10, steps_per_epoch=100)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Number of people: 5749
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fef6ca71438>

In [9]:
# download validation pairs
!wget http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt

--2019-04-14 15:49:23--  http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt
Resolving vis-www.cs.umass.edu (vis-www.cs.umass.edu)... 128.119.244.95
Connecting to vis-www.cs.umass.edu (vis-www.cs.umass.edu)|128.119.244.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26002 (25K) [text/plain]
Saving to: ‘pairsDevTest.txt.1’


2019-04-14 15:49:24 (186 KB/s) - ‘pairsDevTest.txt.1’ saved [26002/26002]



In [10]:
# read in validation pairs
def load_lfw_image(person, image_number):
    """Read image <person>_<NNNN>.jpg from that person's folder under lfw_dir.

    Args:
        person: name of the person (also the name of the sub-directory).
        image_number: image index as int or numeric string; zero-padded to 4 digits.

    Returns:
        The image as returned by imread.
    """
    file_name = person + '_%04d.jpg' % int(image_number)
    return imread(os.path.join(lfw_dir, person, file_name))

left_images = []
right_images = []
targets = []

with open('pairsDevTest.txt', 'r') as f:
    # the first line is discarded (presumably a header/count line - confirm)
    f.readline()
    for line in f:
        # strip only the line terminator; slicing off the last character would eat a
        # digit when the final line of the file has no trailing newline
        data = line.rstrip('\r\n').split('\t')
        if data == ['']:
            continue  # tolerate blank lines
        if len(data) == 3:
            # "<person> <n1> <n2>": two images of the same person -> target 1
            person, left_number, right_number = data
            left_images.append(load_lfw_image(person, left_number))
            right_images.append(load_lfw_image(person, right_number))
            targets.append(1)
        elif len(data) == 4:
            # "<person1> <n1> <person2> <n2>": two different people -> target 0
            left_person, left_number, right_person, right_number = data
            left_images.append(load_lfw_image(left_person, left_number))
            right_images.append(load_lfw_image(right_person, right_number))
            targets.append(0)
        else:
            # explicit error instead of `assert False`, which is removed under python -O
            raise ValueError('unexpected line in pairsDevTest.txt: %r' % line)

left_images = np.array(left_images)
right_images = np.array(right_images)
targets = np.array(targets)
print(left_images.shape, right_images.shape, targets.shape)

(1000, 250, 250, 3) (1000, 250, 250, 3) (1000,)


In [11]:
# Prediction model: outputs the squared Euclidean distance between the embeddings of
# two images. It is built on the xp/xn inputs and their embeddings ep/en from the
# training graph, so it uses the same embedding network.
def squared_distance(embeds):
    """Squared Euclidean distance between two embedding tensors, summed over the last axis."""
    first, second = embeds
    return K.sum((first - second)**2, axis=-1)

dist = Lambda(squared_distance)([ep, en])

model_predict = Model(inputs=[xp, xn], outputs=dist)
model_predict.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 250, 250, 3)  0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 250, 250, 3)  0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 128)          40365056    input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None,)              0           sequential_1[2][0]               
          

In [12]:
# Run every validation pair through the prediction model to get one distance per pair,
# then show the shape and the min / max / mean of those distances.
pair_inputs = [left_images, right_images]
dists = model_predict.predict(pair_inputs)
(dists.shape, dists.min(), dists.max(), dists.mean())

((1000,), 0.057256132, 2.9857984, 0.9309368)

In [13]:
# Accuracy for the threshold alpha: a pair is predicted "same person" when its
# distance is below alpha, and the prediction is compared with the 0/1 targets.
predicted_same = dists < alpha
np.mean(predicted_same == targets)

0.788