In [1]:
import sys
import os
import dlib
import glob
import yaml
import pandas as pd

In [2]:
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
with open('../config.yaml', 'r') as file:
    config = yaml.safe_load(file)

In [4]:
detector = dlib.get_frontal_face_detector()
sp = dlib.shape_predictor("../" + config['shape_predictor'])
facerec = dlib.face_recognition_model_v1("../" + config['face_recognition_model_v1'])

In [5]:
descriptors = []
images = []

face_clustering_result = []
face_embedding_result = []

In [7]:
# for f in glob.glob(os.path.join("lfw/**/*.jpg")):
# for f in glob.glob(os.path.join("raw/*.*")):
# for f in glob.glob(os.path.join("dataset/COCO_Group2a/*.jpg")):
face_id = 0
for f in glob.glob("../" + config['image_input_folder']):
    logger.info("Processing file: {}".format(f))
    img = dlib.load_rgb_image(f)

    # Ask the detector to find the bounding boxes of each face. The 1 in the
    # second argument indicates that we should upsample the image 1 time. This
    # will make everything bigger and allow us to detect more faces.
    dets = detector(img, 1)
    logger.info("Number of faces detected: {}".format(len(dets)))

    # Now process each face we found.
    fcr = {}
    fcr["img_file"] = f
    fcr["no_of_faces"] = len(dets)
    for k, d in enumerate(dets):
        fcr_ed = {}
        fcr_ed['face'] = ''.join(['face_id_', str(face_id)])
        
        # Get the landmarks/parts for the face in box d.
        shape = sp(img, d)

        # Compute the 128D vector that describes the face in img identified by
        # shape.  
        face_descriptor = facerec.compute_face_descriptor(img, shape)
        fcr_ed['embedding'] = face_descriptor
        fcr_ed['cluster'] = ''

        descriptors.append(face_descriptor)
        images.append((img, shape))

        # d: (d.left(), d.top(), d.right(), d.bottom())
        # fcr["face_rec"] = d
        fcr['face_cluster_in_image'] = []

        face_embedding_result.append(fcr_ed)

        face_id = face_id + 1

    face_clustering_result.append(fcr)

2023-11-15 02:11:08,651 - __main__ - INFO - Processing file: ../dataset/raw/Diana_0.jpg
2023-11-15 02:11:08,826 - __main__ - INFO - Number of faces detected: 7
2023-11-15 02:11:09,489 - __main__ - INFO - Processing file: ../dataset/raw/Diana_1.jpg
2023-11-15 02:11:09,789 - __main__ - INFO - Number of faces detected: 2
2023-11-15 02:11:09,978 - __main__ - INFO - Processing file: ../dataset/raw/Diana_2.jpg
2023-11-15 02:11:10,492 - __main__ - INFO - Number of faces detected: 3
2023-11-15 02:11:10,758 - __main__ - INFO - Processing file: ../dataset/raw/Diana_3.jpg
2023-11-15 02:11:10,857 - __main__ - INFO - Number of faces detected: 1
2023-11-15 02:11:10,946 - __main__ - INFO - Processing file: ../dataset/raw/Diana_4.jpg
2023-11-15 02:11:15,421 - __main__ - INFO - Number of faces detected: 3
2023-11-15 02:11:15,684 - __main__ - INFO - Processing file: ../dataset/raw/Diana_5.png
2023-11-15 02:11:15,918 - __main__ - INFO - Number of faces detected: 6
2023-11-15 02:11:16,460 - __main__ - INF

In [8]:
#Now let's cluster the faces.  
labels = dlib.chinese_whispers_clustering(descriptors, config['cw_clustering_threshold'])
num_classes = len(set(labels))

face_sum = 0
pic_no = 0

logger.info("Number of clusters: {}".format(num_classes))

for i in range(len(labels)):
    face_embedding_result[i]['cluster'] = labels[i]

    if i + 1 <= face_sum + face_clustering_result[pic_no]['no_of_faces']:
       face_clustering_result[pic_no]['face_cluster_in_image'].append(labels[i])
    else:
        pic_no = pic_no + 1
        face_sum = face_sum + face_clustering_result[pic_no - 1]['no_of_faces']
        face_clustering_result[pic_no]['face_cluster_in_image'].append(labels[i])

    # print("pic_no {}".format(pic_no))
    # print("face_sum {}".format(face_sum))

2023-11-15 02:12:47,607 - __main__ - INFO - Number of clusters: 53


In [56]:
clusters = [[] for _ in range(num_classes)]
for i, pair in enumerate(images):
    clusters[labels[i]].append(pair)

for i, cluster in enumerate(clusters):
    if len(cluster) > config['cluster_component_number_threshold']:
        cluster_folder_path = os.path.join("output", str(i))
        if not os.path.isdir(cluster_folder_path):
            os.makedirs(cluster_folder_path)
        for j, pair in enumerate(cluster):
            img, shape = pair
            dlib.save_face_chip(img, shape, os.path.join(cluster_folder_path, 'face_{}'.format(j)), size=150, padding=0.25)

In [9]:

face_clustering_result_df = pd.DataFrame(face_clustering_result)
face_embedding_result_df = pd.DataFrame(face_embedding_result)

In [63]:
face_embedding_result_df.head()

Unnamed: 0,face,embedding,cluster
0,face_id_0,"[-0.17780616879463196, 0.11140817403793335, 0....",0
1,face_id_1,"[-0.14510276913642883, 0.11045663803815842, 0....",1
2,face_id_2,"[-0.18592892587184906, 0.09384222328662872, 0....",1
3,face_id_3,"[-0.21630069613456726, 0.07564140111207962, 0....",1
4,face_id_4,"[-0.17144173383712769, 0.05064402520656586, 0....",2


In [69]:
face_typical_embedding_df = face_embedding_result_df.groupby('cluster').head(1).reset_index(drop=True)

In [70]:
face_typical_embedding_df.head()

Unnamed: 0,face,embedding,cluster
0,face_id_0,"[-0.17780616879463196, 0.11140817403793335, 0....",0
1,face_id_1,"[-0.14510276913642883, 0.11045663803815842, 0....",1
2,face_id_4,"[-0.17144173383712769, 0.05064402520656586, 0....",2
3,face_id_7,"[-0.04093189537525177, 0.1407700479030609, 0.0...",3
4,face_id_10,"[-0.025122130289673805, 0.11117591708898544, 0...",4


In [57]:
img = dlib.load_rgb_image("../image_input.png")

In [58]:
import numpy as np

detector = dlib.get_frontal_face_detector()
sp = dlib.shape_predictor("../" + config['shape_predictor'])
facerec = dlib.face_recognition_model_v1("../" + config['face_recognition_model_v1'])
dets = detector(img, 1)
for k, d in enumerate(dets):    
    # Get the landmarks/parts for the face in box d.
    shape = sp(img, d)

    # Compute the 128D vector that describes the face in img identified by
    # shape.  
    face_descriptor = facerec.compute_face_descriptor(img, shape)

In [59]:
from math import sqrt

def euclidean_dist(vector_x):
    if len(vector_x) != len(face_descriptor):
        raise Exception('Vectors must be same dimensions')

    x = np.array(vector_x)
    y = np.array(face_descriptor)
    return sum((x[dim] - y[dim]) ** 2 for dim in range(len(x)))

In [71]:
face_typical_embedding_df['distance'] = face_typical_embedding_df['embedding'].map(euclidean_dist)

In [72]:
face_typical_embedding_df.head()

Unnamed: 0,face,embedding,cluster,distance
0,face_id_0,"[-0.17780616879463196, 0.11140817403793335, 0....",0,0.920756
1,face_id_1,"[-0.14510276913642883, 0.11045663803815842, 0....",1,0.710481
2,face_id_4,"[-0.17144173383712769, 0.05064402520656586, 0....",2,0.239845
3,face_id_7,"[-0.04093189537525177, 0.1407700479030609, 0.0...",3,0.642192
4,face_id_10,"[-0.025122130289673805, 0.11117591708898544, 0...",4,0.569327


In [79]:
output_cluster = face_typical_embedding_df.iloc[face_typical_embedding_df['distance'].idxmin()]['cluster']

In [90]:
face_clustering_result_df.head()

Unnamed: 0,img_file,no_of_faces,face_cluster_in_image
0,../dataset/raw/Diana_0.jpg,7,"[0, 1, 1, 1, 2, 1, 1]"
1,../dataset/raw/Diana_1.jpg,2,"[3, 2]"
2,../dataset/raw/Diana_2.jpg,3,"[2, 4, 5]"
3,../dataset/raw/Diana_3.jpg,1,[2]
4,../dataset/raw/Diana_4.jpg,3,"[3, 6, 2]"


In [100]:
face_clustering_result_df[[2 in i for i in face_clustering_result_df['face_cluster_in_image']]].img_file.tolist()


['../dataset/raw/Diana_0.jpg',
 '../dataset/raw/Diana_1.jpg',
 '../dataset/raw/Diana_2.jpg',
 '../dataset/raw/Diana_3.jpg',
 '../dataset/raw/Diana_4.jpg',
 '../dataset/raw/Diana_5.png',
 '../dataset/raw/Diana_6.jpeg',
 '../dataset/raw/Diana_7.jpg',
 '../dataset/raw/Obama_1.png',
 '../dataset/raw/Obama_2.png',
 '../dataset/raw/Diana_0.jpg',
 '../dataset/raw/Diana_1.jpg',
 '../dataset/raw/Diana_2.jpg',
 '../dataset/raw/Diana_3.jpg',
 '../dataset/raw/Diana_4.jpg',
 '../dataset/raw/Diana_5.png',
 '../dataset/raw/Diana_6.jpeg',
 '../dataset/raw/Diana_7.jpg',
 '../dataset/raw/Obama_1.png',
 '../dataset/raw/Obama_2.png']