In [None]:
!pip install dlib
!pip install face_recognition
!pip install opencv-python
!pip install imutils

# Encode the faces using OpenCV

Before being able to recognise faces in images and videos, I first need to encode (or quantify) the faces in my training set. Keep in mind that I am not actually training a network here - the network (in the library 'face_recognition') has already been trained to create 128-d embeddings from a dataset of ~3 million images.

I could alternatively train a network from scratch, or fine-tune the weights of an existing model, but that is more effort than most projects warrant. Furthermore, I would need a lot of images to train a network from scratch. Instead, it is easier to use the pre-trained network to construct 128-d embeddings for each of the 30 faces in my dataset.

For classification, I use a simple k-NN-style vote: every known encoding that matches the candidate face casts a vote for its name, and the name with the most votes becomes the final label. Other traditional machine learning models could be used here as well.

In [None]:
#import the necessary packages
from imutils import paths
import face_recognition
import pickle
import cv2
import os
import time
import dlib
# import argparse
# #construct the argument parser and parse the arguments
# ap = argparse.ArgumentParser()
# ap.add_argument('-i', '--dataset', required=True,
# 	help='path to input directory of faces + images')
# ap.add_argument('-e', '--encodings', required=True,
# 	help='path to serialized db of facial encodings')
# ap.add_argument('-d', '--detection-method', type=str, default='cnn',
# 	help='face detection model to use: either `hog` or `cnn`')
# args = vars(ap.parse_args())
#notebook configuration (stands in for the command-line arguments commented out above)
#paths are built with os.path.join so they resolve on any OS, not only on Windows
args = {}
args['dataset'] = os.path.join(os.getcwd(), 'dataset')               #path to input directory of faces and images
args['encodings'] = os.path.join(os.getcwd(), 'encodings.pickle')    #path to serialized db of facial encodings
args['detection_method'] = 'cnn'                                     #face detection model to use: CNN method is more accurate but slower. HOG is faster but less accurate.

#show the working directory that the paths above are relative to
os.getcwd()

# Create facial embeddings

In [2]:
def create_facial_embeddings(args):
    """Encode every face found in the dataset images and pickle the result.

    Parameters
    ----------
    args : dict
        'dataset'          - directory that is searched for images; the name of
                             each image's parent folder is used as the person's name
        'encodings'        - path of the pickle file to write
        'detection_method' - face detection model passed to
                             face_recognition.face_locations ('hog' or 'cnn')

    Returns
    -------
    None. Writes {'encodings': [...], 'names': [...]} (two parallel lists, one
    entry per detected face) to args['encodings'].
    """
    #grab the paths to the input images in our dataset
    print('[INFO] quantifying faces...')
    imagePaths = list(paths.list_images(args['dataset']))
    #initialize the list of known encodings and known names
    knownEncodings = []
    knownNames = []
    for (i, imagePath) in enumerate(imagePaths):
        print(i, imagePath)

    ti = time.time()
    print('[INFO] processing image...')
    #loop over the image paths
    for (i, imagePath) in enumerate(imagePaths):
        print('{}/{}'.format(i+1, len(imagePaths)), end=', ')
        #extract the person name from the image path (name of the parent folder)
        name = imagePath.split(os.path.sep)[-2]
        #load the input image; cv2.imread returns None (instead of raising) when the file cannot be decoded
        image = cv2.imread(imagePath)
        if image is None:
            print('[WARNING] could not read image, skipping: {}'.format(imagePath))
            continue
        #OpenCV orders color channels in BGR, but dlib (used by face_recognition) expects RGB
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        #detect the (x,y)-coordinates of the bounding boxes corresponding to each face in the input image
        boxes = face_recognition.face_locations(rgb, model=args['detection_method'])
        #compute the facial embedding for each bounding box
        encodings = face_recognition.face_encodings(rgb, boxes)
        #add each encoding + name to our set of known names and encodings
        for encoding in encodings:
            knownEncodings.append(encoding)
            knownNames.append(name)
    print('Done!')
    print('Time taken: {:.1f} minutes'.format((time.time() - ti)/60))

    #dump the names and encodings to disk for future recall
    print('[INFO] serializing encodings...')
    data = {'encodings': knownEncodings, 'names': knownNames}
    #the with-block closes the file even if serialization fails
    with open(args['encodings'], 'wb') as f:
        pickle.dump(data, f)
    print('Done!')

In [None]:
#using CPU only, encoding 30 images required ~10min !!
#paths are built with os.path.join so they resolve on any OS, not only on Windows
args = {}
args['dataset'] = os.path.join(os.getcwd(), 'dataset')               #path to input directory of faces and images
args['encodings'] = os.path.join(os.getcwd(), 'encodings.pickle')    #path to serialized db of facial encodings
args['detection_method'] = 'cnn'                                     #face detection model to use: CNN method is more accurate but slower. HOG is faster but less accurate.

create_facial_embeddings(args)

# Recognise faces in images

In [4]:
#import the necessary packages
import face_recognition
import pickle
import cv2
import os
import time
from collections import Counter

#paths are built with os.path.join so they resolve on any OS, not only on Windows
args = {}
args['encodings'] = os.path.join(os.getcwd(), 'encodings.pickle')          #path to serialized db of facial encodings
args['image'] = os.path.join(os.getcwd(), 'image_test', 'test (1).jpg')    #path to input image
args['detection_method'] = 'cnn'                                           #face detection model to use: CNN method is more accurate but slower. HOG is faster but less accurate.

In [5]:
def recognise_faces(args):
    """Label the known faces in a single image, show the result and save it.

    Parameters
    ----------
    args : dict
        'encodings'        - path to the pickle holding {'encodings': [...], 'names': [...]}
        'image'            - path to the input image
        'detection_method' - face detection model passed to
                             face_recognition.face_locations ('hog' or 'cnn')

    Returns
    -------
    None. The annotated image is shown in a window until 'q' is pressed and is
    then written next to the input as '<input name>_output.jpg'.

    Raises
    ------
    OSError
        If the input image cannot be read.
    """
    ti = time.time()
    #load the known faces and embeddings
    print('[INFO] loading encodings...')
    #NOTE: unpickling can execute arbitrary code - only load encodings files you created yourself
    with open(args['encodings'], 'rb') as f:
        data = pickle.load(f)
    #load the input image; cv2.imread returns None (instead of raising) when the file cannot be read
    image = cv2.imread(args['image'])
    if image is None:
        raise OSError('Could not read input image: {}'.format(args['image']))
    #convert from BGR (OpenCV ordering) to RGB
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    #detect the bounding boxes of each face in the input image, then compute the facial embedding for each face
    print('[INFO] recognising faces...')
    boxes = face_recognition.face_locations(rgb, model=args['detection_method'])
    encodings = face_recognition.face_encodings(rgb, boxes)
    #names of the faces detected, in the same order as boxes
    names = []

    #loop over the facial embeddings
    for encoding in encodings:
        #compare_faces returns one True/False value per known encoding
        votes = face_recognition.compare_faces(data['encodings'], encoding)
        #names of all known encodings that matched this face
        matches = [known for known, vote in zip(data['names'], votes) if vote]
        if matches:
            #most frequently occurring name wins (on a tie, the name that was seen first)
            name = Counter(matches).most_common(1)[0][0]
        else:
            name = 'Unknown'
        names.append(name)

    print([' '.join([e.title() for e in name.split('_')]) for name in names])
    print('Time taken: {:.1f} seconds'.format(time.time() - ti))

    #visualise with bounding boxes and labeled names, loop over the recognised faces
    for ((top, right, bottom, left), name) in zip(boxes, names):
        cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 2)
        #put the label above the box unless that would fall off the top of the image
        y = top - 15 if top - 15 > 15 else top + 15
        cv2.putText(image, name, (left, y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 0), 2)

    #display the resulting image, press 'q' to exit
    window_text = os.path.basename(args['image'])
    cv2.imshow(window_text, image)
    while True:
        #if the `q` key is pressed, break from the loop
        if cv2.waitKey(1) & 0xFF == ord('q'):
            cv2.destroyAllWindows()
            break
    #save output image; splitext only strips the file extension, never a dot in a folder name
    cv2.imwrite(os.path.splitext(args['image'])[0] + '_output.jpg', image)

In [None]:
#os.path.join keeps the path valid on any OS, not only on Windows
args['image'] = os.path.join(os.getcwd(), 'image_test', 'test (1).jpg')
recognise_faces(args)

In [None]:
#os.path.join keeps the path valid on any OS, not only on Windows
args['image'] = os.path.join(os.getcwd(), 'image_test', 'test (2).jpg')
recognise_faces(args)

In [None]:
#os.path.join keeps the path valid on any OS, not only on Windows
args['image'] = os.path.join(os.getcwd(), 'image_test', 'test (3).jpg')
recognise_faces(args)

In [None]:
#os.path.join keeps the path valid on any OS, not only on Windows
args['image'] = os.path.join(os.getcwd(), 'image_test', 'test (4).jpg')
recognise_faces(args)

In [None]:
#os.path.join keeps the path valid on any OS, not only on Windows
args['image'] = os.path.join(os.getcwd(), 'image_test', 'test (5).jpg')
recognise_faces(args)

In [None]:
#os.path.join keeps the path valid on any OS, not only on Windows
args['image'] = os.path.join(os.getcwd(), 'image_test', 'test (6).jpg')
recognise_faces(args)

# Recognise faces in video files

In [13]:
#import the necessary packages
import face_recognition
import imutils
import pickle
import cv2
import os
import time
from collections import Counter

#paths are built with os.path.join so they resolve on any OS, not only on Windows
args = {}
args['encodings'] = os.path.join(os.getcwd(), 'encodings.pickle')         #path to serialized db of facial encodings
args['input'] = os.path.join(os.getcwd(), 'video_test', 'trailer.mp4')    #path to input video
args['output'] = os.path.splitext(args['input'])[0] + '_output.avi'       #path to output video
args['display'] = 1                                                       #display output frame to screen: yes or no
args['detection_method'] = 'hog'                                          #face detection model to use: CNN method is more accurate but slower. HOG is faster but less accurate.
#Choose 'hog' if using only CPU (no GPU)

In [14]:
def recognise_faces_video(args):
    """Label the known faces in every frame of a video file.

    Parameters
    ----------
    args : dict
        'encodings'        - path to the pickle holding {'encodings': [...], 'names': [...]}
        'input'            - path to the input video
        'output'           - path of the annotated video to write, or None to skip writing
        'display'          - 1 to show each annotated frame in a window (press 'q' to stop early)
        'detection_method' - face detection model passed to
                             face_recognition.face_locations ('hog' or 'cnn')

    Returns
    -------
    None.

    Raises
    ------
    OSError
        If the input video cannot be opened.
    """
    ti = time.time()
    #load the known faces and embeddings
    print('[INFO] loading encodings...')
    #NOTE: unpickling can execute arbitrary code - only load encodings files you created yourself
    with open(args['encodings'], 'rb') as f:
        data = pickle.load(f)
    #initialize pointer to vid file and vid writer
    print('[INFO] processing video...')
    stream = cv2.VideoCapture(args['input'])
    writer = None    #created lazily once the first frame (and therefore the frame size) is known

    try:
        #fail loudly instead of silently producing no output
        if not stream.isOpened():
            raise OSError('Could not open input video: {}'.format(args['input']))

        #loop over frames from the video file stream
        while True:
            #grab next frame
            (grabbed, frame) = stream.read()
            #if frame was not grabbed, then we have reached the end of stream
            if not grabbed:
                break
            #convert the input frame from BGR to RGB, then resize the RGB copy to a width of 750px (to speedup processing)
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            rgb = imutils.resize(rgb, width=750)
            #scale factor to map boxes found on the resized copy back onto the full-size frame
            r = frame.shape[1] / float(rgb.shape[1])
            #detect the bounding boxes of each face in the frame, then compute the facial embedding for each face
            boxes = face_recognition.face_locations(rgb, model=args['detection_method'])
            encodings = face_recognition.face_encodings(rgb, boxes)
            names = []

            #loop over the facial embeddings
            for encoding in encodings:
                #compare_faces returns one True/False value per known encoding
                votes = face_recognition.compare_faces(data['encodings'], encoding)
                #names of all known encodings that matched this face
                matches = [known for known, vote in zip(data['names'], votes) if vote]
                if matches:
                    #most frequently occurring name wins (on a tie, the name that was seen first)
                    name = Counter(matches).most_common(1)[0][0]
                else:
                    name = 'Unknown'
                names.append(name)

            #visualise with bounding boxes and labeled names, loop over the recognised faces
            for ((top, right, bottom, left), name) in zip(boxes, names):
                #rescale the face coordinates
                top = int(top * r)
                right = int(right * r)
                bottom = int(bottom * r)
                left = int(left * r)
                #draw the predicted face name on the image
                cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
                y = top - 15 if top - 15 > 15 else top + 15
                cv2.putText(frame, name, (left, y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 0), 2)

            #if the video writer is None *AND* output path is provided (to write the frame to disk)
            if writer is None and args['output'] is not None:
                fourcc = cv2.VideoWriter_fourcc(*'MJPG')    #to use the "MJPG" 4-character code
                writer = cv2.VideoWriter(args['output'], fourcc, 24, (frame.shape[1], frame.shape[0]), True)    #output file path, fourcc, frames per second target, and frame dimensions
            #if the writer is not None, write the frame with recognised faces to disk
            if writer is not None:
                writer.write(frame)

            #check if displaying output frame to screen
            if args['display'] == 1:
                cv2.imshow('Video file', frame)
                #if the `q` key is pressed, break from the loop
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        #always clean up, even when an error interrupts processing
        cv2.destroyAllWindows()
        stream.release()    #close video file pointers
        if writer is not None:
            writer.release()
    print('Time taken: {:.1f} minutes'.format((time.time() - ti)/60))

In [None]:
#os.path.join / os.path.splitext keep the paths valid on any OS, not only on Windows
args['input'] = os.path.join(os.getcwd(), 'video_test', 'trailer.mp4')
args['output'] = os.path.splitext(args['input'])[0] + '_output.avi'
recognise_faces_video(args)

In [None]:
#os.path.join / os.path.splitext keep the paths valid on any OS, not only on Windows
args['input'] = os.path.join(os.getcwd(), 'video_test', 'lunch_scene.mp4')
args['output'] = os.path.splitext(args['input'])[0] + '_output.avi'
recognise_faces_video(args)

The output videos are: 

trailer_output.avi https://youtu.be/BxfdMrhsEnw

lunch_scene_output.avi https://youtu.be/MtBklF6ivmg

# Recognise faces in webcam

In [1]:
#import the necessary packages
import imutils
from imutils import paths
from imutils.video import VideoStream
import face_recognition
import pickle
import cv2
import os
import time
from collections import Counter

In [None]:
#create facial embeddings
#paths are built with os.path.join so they resolve on any OS, not only on Windows
args = {}
args['dataset'] = os.path.join(os.getcwd(), 'dataset_webcam')               #path to input directory of faces and images
args['encodings'] = os.path.join(os.getcwd(), 'encodings_webcam.pickle')    #path to serialized db of facial encodings
args['detection_method'] = 'cnn'                                            #face detection model to use: CNN method is more accurate but slower. HOG is faster but less accurate.

create_facial_embeddings(args)

In [4]:
#turn on webcam
#paths are built with os.path.join so they resolve on any OS, not only on Windows
args = {}
args['encodings'] = os.path.join(os.getcwd(), 'encodings_webcam.pickle')    #path to serialized db of facial encodings
args['output'] = os.path.join(os.getcwd(), 'webcam_test', 'output.avi')     #path to output video
args['display'] = 1                                                         #display output frame to screen: yes or no
args['detection_method'] = 'hog'                                            #face detection model to use: CNN method is more accurate but slower. HOG is faster but less accurate.
#Choose 'hog' if using only CPU (no GPU)

In [None]:
#Recognise faces in the live webcam stream until 'q' is pressed.
#Reads its settings from the `args` dict: 'encodings', 'output', 'display' and 'detection_method'.
ti = time.time()
#load the known faces and embeddings
print('[INFO] loading encodings...')
#NOTE: unpickling can execute arbitrary code - only load encodings files you created yourself
with open(args['encodings'], 'rb') as f:
    data = pickle.load(f)
#initialize the video stream and pointer to output video file, then allow the camera sensor to warm up
print('[INFO] starting video stream...')
vs = VideoStream(src=0).start()    #use VideoStream to access webcam, use src=1 for second webcam
time.sleep(2.0)    #time.sleep with 2 seconds to warm up webcam
writer = None    #created lazily once the first frame (and therefore the frame size) is known

try:
    #loop over frames from the video stream
    while True:
        #grab a frame from the threaded video stream
        frame = vs.read()
        #no frame available (e.g. the webcam could not be opened): stop instead of crashing in cvtColor
        if frame is None:
            print('[WARNING] no frame received from webcam, stopping...')
            break
        #convert the input frame from BGR to RGB, then resize the RGB copy to a width of 750px (to speedup processing)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        rgb = imutils.resize(rgb, width=750)
        #scale factor to map boxes found on the resized copy back onto the full-size frame
        r = frame.shape[1] / float(rgb.shape[1])
        #detect the bounding boxes of each face in the frame, then compute the facial embedding for each face
        boxes = face_recognition.face_locations(rgb, model=args['detection_method'])
        encodings = face_recognition.face_encodings(rgb, boxes)
        names = []
        #loop over the facial embeddings
        for encoding in encodings:
            #compare_faces returns one True/False value per known encoding
            votes = face_recognition.compare_faces(data['encodings'], encoding)
            #names of all known encodings that matched this face
            matches = [known for known, vote in zip(data['names'], votes) if vote]
            if matches:
                #most frequently occurring name wins (on a tie, the name that was seen first)
                name = Counter(matches).most_common(1)[0][0]
            else:
                name = 'Unknown'
            #update the list of names
            names.append(name)

        #visualise with bounding boxes and labeled names, loop over the recognised faces
        for ((top, right, bottom, left), name) in zip(boxes, names):
            #rescale the face coordinates
            top = int(top * r)
            right = int(right * r)
            bottom = int(bottom * r)
            left = int(left * r)
            #draw the predicted face name on the image
            cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
            y = top - 15 if top - 15 > 15 else top + 15
            cv2.putText(frame, name, (left, y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 0), 2)

        #if the video writer is None *AND* output path is provided (to write the frame to disk)
        if writer is None and args['output'] is not None:
            fourcc = cv2.VideoWriter_fourcc(*'MJPG')    #to use the "MJPG" 4-character code
            writer = cv2.VideoWriter(args['output'], fourcc, 20, (frame.shape[1], frame.shape[0]), True)    #output file path, fourcc, frames per second target, and frame dimensions
        #if the writer is not None, write the frame with recognised faces to disk
        if writer is not None:
            writer.write(frame)

        #check if displaying output frame to screen
        if args['display'] == 1:
            cv2.imshow('Webcam', frame)
            #if the `q` key is pressed, break from the loop
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    #always release the webcam and the writer, even if an error or interrupt stops the loop
    cv2.destroyAllWindows()
    vs.stop()
    #check if the video writer point needs to be released
    if writer is not None:
        writer.release()
print('Done! \nTime taken: {:.1f} minutes'.format((time.time() - ti)/60))