## Face search demo using Amazon Elasticsearch Service (Amazon ES) k-NN and machine learning

High level steps:

1. Detect faces and extract a feature vector for each face
2. Index the feature vectors in ES
3. Run a 1:N vector search (one query face against all indexed faces)

## Detect faces and extract features

In [None]:
import math
import os
import os.path
import face_recognition
from face_recognition.face_recognition_cli import image_files_in_folder
from PIL import Image

In [None]:
def extract_features(train_dir):
    """Collect one face encoding per usable image found under ``train_dir``.

    Every sub-directory of ``train_dir`` is treated as one person and its
    directory name is used as the label. Entries of ``train_dir`` that are
    not directories are ignored. An image contributes an encoding only when
    exactly one face is detected in it; any other image is reported on
    stdout and skipped.

    Args:
        train_dir: Path of the directory holding one sub-directory per person.

    Returns:
        A tuple ``(encodings, labels, paths)`` of three parallel lists: the
        face encoding, the person's directory name, and the image path.
    """
    encodings, labels, paths = [], [], []

    for person in os.listdir(train_dir):
        person_dir = os.path.join(train_dir, person)
        # Anything that is not a directory cannot be a person folder.
        if not os.path.isdir(person_dir):
            continue

        for img_path in image_files_in_folder(person_dir):
            image = face_recognition.load_image_file(img_path)
            boxes = face_recognition.face_locations(image)
            face_count = len(boxes)

            if face_count == 1:
                # Exactly one face: keep its encoding with label and path.
                encodings.append(face_recognition.face_encodings(image, known_face_locations=boxes)[0])
                labels.append(person)
                paths.append(img_path)
                continue

            # Zero faces or several faces: report the reason and move on.
            reason = "Didn't find a face" if face_count < 1 else "Found more than one face"
            print("Image {} not suitable for training: {}".format(img_path, reason))

    return encodings, labels, paths

In [None]:
# setting up the Elasticsearch connection
import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
# Region and service name passed to AWS4Auth for request signing.
region = 'ap-southeast-1'
service = 'es'
es_host = 'search-winston-elasti-2lknspawyv4c-ftmwkbcikgrs7kkah7grjwvoay.ap-southeast-1.es.amazonaws.com'

# Resolve AWS credentials through the default boto3 session.
credentials = boto3.Session().get_credentials()
if credentials is None:
    # get_credentials() yields None when no credentials are configured; fail
    # here with a clear message instead of an AttributeError on the next line.
    raise RuntimeError(
        "No AWS credentials found; configure credentials before connecting to Amazon ES."
    )
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)

# HTTPS client on port 443 with certificate verification; requests are
# authenticated with the AWS4Auth object built above.
es = Elasticsearch(
    hosts = [{'host': es_host, 'port': 443}],
    http_auth = awsauth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)

In [None]:
# Define the k-NN Elasticsearch index mapping: the index sets "index.knn" to
# True and maps "face_img_vector" as a knn_vector field with dimension 128.
face_vector_field = {"type": "knn_vector", "dimension": 128}
knn_index = {
    "settings": {"index.knn": True},
    "mappings": {"properties": {"face_img_vector": face_vector_field}},
}

# Create the index. ignore=400 tells the client not to raise on an HTTP 400
# response (presumably the "index already exists" case when this cell is
# re-run -- confirm against the client documentation).
es.indices.create(index="idx_faces", body=knn_index, ignore=400)

# Kept as the last bare expression so the notebook displays the index
# definition returned by the cluster.
es.indices.get(index="idx_faces")

## Build the k-NN index in ES

In [None]:
# Directory scanned by extract_features for training images.
TRAIN_DIR = "train"
vectors, names, img_paths = extract_features(TRAIN_DIR)
print(names)
print(img_paths)

# Show the length of one encoding as a sanity check. `vectors` is empty when
# no usable image was found, and indexing it would raise IndexError.
if vectors:
    print(len(vectors[0]))
else:
    print("No usable face images found in {!r}".format(TRAIN_DIR))

In [None]:
def es_import(i):
    """Index one face record into the ``idx_faces`` index.

    Args:
        i: Indexable record laid out as ``[vector, name, img_path]``; the
            three positions are stored under the keys ``face_img_vector``,
            ``name`` and ``img_path`` respectively.
    """
    document = {
        "face_img_vector": i[0],
        "name": i[1],
        "img_path": i[2],
    }
    es.index(index='idx_faces', body=document)


# Send every extracted face to the index, one document per image.
for vector, name, img_path in zip(vectors, names, img_paths):
    es_import([vector, name, img_path])

In [None]:
# res = es.delete_by_query(index="idx_faces", body={"query": {"match_all": {}}})

## 1:N face search

In [None]:
import matplotlib.pyplot as plt 
import matplotlib.image as mpimg 

def display_img(img_path):
    """Draw the image stored at ``img_path`` on a 16x8 figure with axes hidden.

    Args:
        img_path: Path of the image file to read and display.
    """
    # Read first, so a bad path fails before any figure is created.
    pixels = plt.imread(img_path)

    _figure, axes = plt.subplots(figsize=(16, 8))
    axes.imshow(pixels)
    axes.axis('off')
    
# Query image for the 1:N search.
test_img_path = 'test/obama2.jpg'
display_img(test_img_path)

image = face_recognition.load_image_file(test_img_path)
face_bounding_boxes = face_recognition.face_locations(image)

# Without a detected face there is nothing to encode, and indexing the
# encodings below would raise IndexError; fail with a clear message instead.
if not face_bounding_boxes:
    raise ValueError("No face detected in {}".format(test_img_path))

# Only the first detected face is used as the query vector.
face_feature = face_recognition.face_encodings(image, known_face_locations=face_bounding_boxes)[0]

In [None]:
# Number of nearest neighbours requested; also used as the result size.
k = 1
idx_name = 'idx_faces'
res = es.search(request_timeout=30, index=idx_name,
                body={'size': k,
                      'query': {'knn': {'face_img_vector': {'vector': face_feature, 'k': k}}}})
hits = res['hits']['hits']
print("Return top 1 with score: %s" % res['hits']['max_score'])

# print(res)
# NOTE(review): despite its name, this constant is compared against the hit's
# _score rather than a distance; presumably a higher score means a closer
# match -- confirm against the k-NN scoring documentation.
DISTANCE_THRESHHOLD = 0.8

# `hits` is empty when the search returns no hits (for example on an empty
# index); treat that as "no match" instead of raising IndexError on hits[0].
if hits and hits[0]['_score'] >= DISTANCE_THRESHHOLD:
    best_hit = hits[0]
    print("Found %s" % best_hit['_source']['name'])
    display_img(best_hit['_source']['img_path'])
else:
    print("No faces found.")

In [None]:
# Run a match_all query against idx_faces and report the total hit count
# given in the response.
res = es.search(index="idx_faces", body={"query": {"match_all": {}}})
total_hits = res['hits']['total']['value']
print("Got %d Hits:" % total_hits)
# print(res)