In [1]:
from deepface import DeepFace
import cv2 as cv
import numpy as np
from tqdm.notebook import tqdm
import os
import shutil
import json
import uuid
import traceback
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
from collections import Counter




In [2]:
# Short random tag for this run: the first 4 hex characters of a fresh UUID4.
# process_frame uses it as the prefix of every file name it writes.
session_id = uuid.uuid4().hex[:4]

In [3]:
# Passed to DeepFace.analyze / DeepFace.represent as `detector_backend`.
FACE_DETECTION_BACKEND = "ssd"
# Passed to DeepFace.represent as `model_name`.
FACE_EMBEDDINGS_BACKEND = "ArcFace"

In [4]:
# session_id = f"Buffy_{FACE_DETECTION_BACKEND}_{FACE_EMBEDDINGS_BACKEND}"

# Clear out files and folders in data 

In [5]:
def setup_experiment():
    """Reset the on-disk working folders used by the experiment.

    Each of ``data/frames``, ``data/faces``, ``data/embeddings`` and
    ``data/demography`` is removed together with its contents if it exists,
    then recreated empty.
    """
    for folder in ("frames", "faces", "embeddings", "demography"):
        target = f"data/{folder}"
        if os.path.exists(target):
            shutil.rmtree(target)
        # makedirs (not mkdir) so a missing parent "data" folder is created
        # too instead of raising FileNotFoundError on a fresh checkout.
        os.makedirs(target)

# Capture the frames and display them

In [6]:
def run_experiment(path, frame_interval=15):
    """Read a video and hand every ``frame_interval``-th frame to process_frame.

    Parameters
    ----------
    path
        Video source, passed unchanged to ``cv.VideoCapture``.
    frame_interval : int, default 15
        Only frames whose running index is a multiple of this value are
        processed; the first frame (index 0) is always processed.

    Raises
    ------
    ValueError
        If ``frame_interval`` is smaller than 1.

    The loop ends when a read fails or when the 'q' key is reported by
    ``cv.waitKey``. The capture is released and all OpenCV windows are closed
    on every exit path, including when an exception escapes the loop.
    """
    if frame_interval < 1:
        raise ValueError("frame_interval must be >= 1")

    cap = cv.VideoCapture(path)
    frame_idx = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            # ret is only true when the frame was read properly
            if not ret:
                print("Cannot read frame.")
                break
            if frame is None:
                continue
            if frame_idx % frame_interval == 0:
                process_frame(frame, frame_idx)
            if cv.waitKey(1) == ord('q'):
                break
            frame_idx += 1
    finally:
        # Release even if processing raised, so the video handle and any
        # "face" windows do not stay open in the kernel.
        cap.release()
        cv.destroyAllWindows()

In [7]:
def process_frame(frame, frame_id):
    """Detect the faces in one frame and store per-face artifacts under ``data/``.

    For every face, using the name ``{session_id}_{frame_id}_{idx}``:

    * ``data/frames/<name>.png``      - the full frame
    * ``data/faces/<name>.png``       - the cropped face (also shown in the
      "face" window)
    * ``data/embeddings/<name>.npy``  - the ``"embedding"`` entry returned by
      ``DeepFace.represent``
    * ``data/demography/<name>.json`` - the record returned by
      ``DeepFace.analyze``

    Parameters
    ----------
    frame
        Image array; passed to DeepFace as ``img_path`` and sliced as
        (rows, columns, channels).
    frame_id
        Value embedded in the output file names.

    No exception propagates to the caller: ``ValueError`` is ignored and any
    other exception is printed with its traceback, after which the rest of
    the frame is abandoned.
    """
    try:
        # Generate demography data
        frame_data = DeepFace.analyze(img_path=frame, detector_backend=FACE_DETECTION_BACKEND)
        # Generate embeddings using deepface
        embedding_data = DeepFace.represent(
            img_path=frame,
            model_name=FACE_EMBEDDINGS_BACKEND,
            detector_backend=FACE_DETECTION_BACKEND,
        )
        # For every face detected, store the image, face, embeddings, and dump the demography json
        for idx, (f_data, e_data) in enumerate(zip(frame_data, embedding_data)):
            file_name = f"{session_id}_{frame_id}_{idx}"
            region = f_data["region"]
            facial_area = e_data["facial_area"]

            # The two DeepFace calls are paired by position only, so make sure
            # both entries describe the same box. An explicit raise (instead
            # of assert) keeps the check alive under ``python -O``.
            if any(region[key] != facial_area[key] for key in ("x", "y", "w", "h")):
                raise RuntimeError(
                    f"Face {idx} of frame {frame_id}: analyze region {region} "
                    f"does not match represent facial_area {facial_area}"
                )

            x, y, w, h = (int(region[key]) for key in ("x", "y", "w", "h"))
            # Clamp the box to the frame. The far edge is computed from the
            # unclamped origin so a box starting off-frame is shortened rather
            # than shifted/widened.
            left, top = max(x, 0), max(y, 0)
            right, bottom = max(x + w, left), max(y + h, top)
            face_region = frame[top:bottom, left:right, :]
            if face_region.size == 0:
                # Nothing of the box lies inside the frame; skip this face
                # instead of letting the image write fail for the whole frame.
                continue

            # Store the image
            cv.imwrite(f"data/frames/{file_name}.png", frame)
            # Store/show the face
            cv.imwrite(f"data/faces/{file_name}.png", face_region)
            cv.imshow("face", face_region)
            # Store the embeddings
            np.save(f"data/embeddings/{file_name}.npy", e_data["embedding"])
            # Dump the dict as a json
            with open(f"data/demography/{file_name}.json", 'w') as f:
                json.dump(f_data, f)

    except ValueError:
        # Deliberately ignored. NOTE(review): presumably this is what DeepFace
        # raises when no face is found in the frame - confirm.
        pass
    except Exception as e:
        # Report and move on to the next frame; no interactive debugger here
        # so unattended runs cannot hang.
        print(str(e))
        print(traceback.format_exc())

In [8]:
def cluster_embeddings_k_means(n_clusters=15):
    """Cluster the embeddings stored in ``data/embeddings`` with K-Means.

    Only ``*.npy`` entries of the folder are loaded, in sorted file-name
    order. The embedding matrix is passed through sklearn's ``normalize``
    before clustering, and the silhouette score of the result is printed.

    Parameters
    ----------
    n_clusters : int, default 15
        Number of clusters requested from ``KMeans``.

    Returns
    -------
    tuple
        ``(cluster_assignments, cluster_centers)`` where ``cluster_assignments``
        maps each file name without its extension to its cluster label and
        ``cluster_centers`` is ``KMeans.cluster_centers_`` (computed on the
        normalized embeddings).

    Raises
    ------
    ValueError
        If the folder contains no ``.npy`` file.
    """
    embed_dir = "data/embeddings"
    # Skip anything that is not an embedding (e.g. .DS_Store or
    # .ipynb_checkpoints) and sort, because os.listdir order is arbitrary.
    embed_files = sorted(f_name for f_name in os.listdir(embed_dir) if f_name.endswith(".npy"))
    if not embed_files:
        raise ValueError(f"No .npy embeddings found in {embed_dir}")

    # splitext keeps the full stem even if a name contains extra dots
    embeddings = {
        os.path.splitext(f_name)[0]: np.load(f"{embed_dir}/{f_name}")
        for f_name in embed_files
    }

    # Normalize once and reuse for fitting and scoring
    embed_vals = normalize(np.array(list(embeddings.values())))
    k_means = KMeans(n_clusters=n_clusters, random_state=42, n_init=5).fit(embed_vals)

    # labels_ already holds the training-set assignment; no second predict pass needed
    sil_score = silhouette_score(embed_vals, k_means.labels_)
    print(f"Silhouette score for K-Means : {sil_score:.4f}")

    # Dict preserves insertion order, so keys line up with the rows of embed_vals
    cluster_assignments = dict(zip(embeddings.keys(), k_means.labels_))
    # Return file prefix - cluster number pairs and centroids
    return (cluster_assignments, k_means.cluster_centers_)

In [9]:
def cluster_embeddings_agglo(n_clusters=15):
    """Cluster the embeddings stored in ``data/embeddings`` agglomeratively.

    Only ``*.npy`` entries of the folder are loaded, in sorted file-name
    order. The embedding matrix is passed through sklearn's ``normalize``
    before clustering, and the silhouette score of the result is printed.

    Parameters
    ----------
    n_clusters : int, default 15
        Number of clusters requested from ``AgglomerativeClustering``.

    Returns
    -------
    tuple
        ``(cluster_assignments, centroids)`` where ``cluster_assignments`` maps
        each file name without its extension to its cluster label and
        ``centroids`` is a list with one entry per cluster id: the mean of the
        cluster's normalized embeddings, or a zero vector for a cluster with
        no members.

    Raises
    ------
    ValueError
        If the folder contains no ``.npy`` file.
    """
    embed_dir = "data/embeddings"
    # Skip anything that is not an embedding (e.g. .DS_Store or
    # .ipynb_checkpoints) and sort, because os.listdir order is arbitrary.
    embed_files = sorted(f_name for f_name in os.listdir(embed_dir) if f_name.endswith(".npy"))
    if not embed_files:
        raise ValueError(f"No .npy embeddings found in {embed_dir}")

    # splitext keeps the full stem even if a name contains extra dots
    embeddings = {
        os.path.splitext(f_name)[0]: np.load(f"{embed_dir}/{f_name}")
        for f_name in embed_files
    }

    # Cluster the embeddings using sklearn agglo
    embed_vals = np.array(list(embeddings.values()))
    normalized_embed_vals = normalize(embed_vals)
    agglo_results = AgglomerativeClustering(n_clusters=n_clusters).fit(normalized_embed_vals)
    labels = agglo_results.labels_

    # Get silhouette score
    sil_score = silhouette_score(normalized_embed_vals, labels)
    print(f"Silhouette score for agglomerative clustering : {sil_score:.4f}")

    # Dict preserves insertion order, so keys line up with the rows of embed_vals
    cluster_assignments = dict(zip(embeddings.keys(), labels))

    # AgglomerativeClustering exposes no centres, so derive one per cluster id
    centroids = []
    for cluster_id in range(n_clusters):
        cluster_points = normalized_embed_vals[labels == cluster_id]
        if len(cluster_points) > 0:
            centroids.append(np.mean(cluster_points, axis=0))
        else:
            centroids.append(np.zeros_like(embed_vals[0]))

    # Return file prefix - cluster number pairs and centroids
    return cluster_assignments, centroids

In [10]:
def ensemble_demography_assignment(cluster_assignments, centroids):
    """Assign one race and one gender to every cluster by majority vote.

    For each cluster, the ``data/demography/<name>.json`` record of every
    member is read. The most frequent ``dominant_race`` and ``dominant_gender``
    win; the reported confidence is the median, over all members, of the
    value stored for the winning class in the record's ``race`` / ``gender``
    mapping.

    Parameters
    ----------
    cluster_assignments : dict
        Maps a file name (without extension) to its cluster label.
    centroids
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{cluster: {"race", "gender", "race_confidence", "gender_confidence"}}``
        with one entry per cluster that appears in ``cluster_assignments``.
    """
    # Group file names by cluster label
    clustered_files = {}
    for f_name, cluster in cluster_assignments.items():
        clustered_files.setdefault(cluster, []).append(f_name)

    demography = {}
    for cluster, f_names in clustered_files.items():
        races, genders = [], []
        race_confidences, gender_confidences = [], []
        for f_name in f_names:
            with open(f"data/demography/{f_name}.json") as f:
                demo_data = json.load(f)
            races.append(demo_data.get("dominant_race"))
            genders.append(demo_data.get("dominant_gender"))
            race_confidences.append(demo_data.get("race"))
            gender_confidences.append(demo_data.get("gender"))

        # Vote once per cluster, after all members have been read, rather
        # than re-counting after every file.
        top_race = Counter(races).most_common(1)[0][0]
        top_gender = Counter(genders).most_common(1)[0][0]
        demography[cluster] = {
            "race": top_race,
            "gender": top_gender,
            # For the assigned race and gender, the median confidence for those classes
            "race_confidence": np.median([rc[top_race] for rc in race_confidences]),
            "gender_confidence": np.median([gc[top_gender] for gc in gender_confidences]),
        }
    # Return the cluster labels, and the race and gender with the median confidence
    return demography

In [11]:
def get_representative_samples(cluster_assignments, centers, top_k=5):
    """Pick, for every cluster, the members whose embeddings are closest to its centre.

    Each member's embedding is loaded from ``data/embeddings/<name>.npy``;
    both the embedding and the cluster centre are passed through sklearn's
    ``normalize`` before their Euclidean distance is measured.

    Parameters
    ----------
    cluster_assignments : dict
        Maps a file name (without extension) to its cluster label.
    centers
        Indexable by cluster label; each entry must support ``reshape``.
    top_k : int, default 5
        Maximum number of names returned per cluster.

    Returns
    -------
    dict
        ``{cluster: [names...]}`` ordered from nearest to farthest, at most
        ``top_k`` names per cluster.
    """
    # Make a dict of cluster_no - files
    clustered_files = {}
    for f_name, cluster in cluster_assignments.items():
        clustered_files.setdefault(cluster, []).append(f_name)

    representative_samples = {}
    for cluster, f_names in clustered_files.items():
        center = normalize(centers[cluster].reshape(1, -1))
        distances = {}
        for f_name in f_names:
            # Load the corresponding embedding
            embedding = normalize(np.load(f"data/embeddings/{f_name}.npy").reshape(1, -1))
            distances[f_name] = np.linalg.norm(embedding - center, ord=2)
        # Rank once per cluster after all distances are known; sorted() is
        # stable, so equal distances keep their insertion order.
        representative_samples[cluster] = sorted(distances, key=distances.get)[:top_k]
    return representative_samples

In [12]:
# The two calls below wipe data/ and re-extract faces from the video. They are
# commented out, so this cell clusters whatever embeddings are already on disk.
# setup_experiment()
# run_experiment('data/movie/sample.mp4')
# K-Means with 7 clusters, then per-cluster demography and nearest-to-centre samples.
assignments, centers = cluster_embeddings_k_means(n_clusters=7)
demography_data = ensemble_demography_assignment(assignments, centers)
representative_samples = get_representative_samples(assignments, centers)

Silhouette score for K-Means : 0.2474


In [13]:
# Show each cluster's representative faces one at a time; any key press advances.
for c, f_names in representative_samples.items():
    for f_name in f_names:
        face_path = f"data/faces/{f_name}.png"
        frame = cv.imread(face_path)
        if frame is None:
            # cv.imread reports a missing/unreadable file by returning None
            # rather than raising; cv.imshow would then fail.
            print(f"Could not read {face_path}; skipping.")
            continue
        cv.imshow(f"Person-{c}", frame)
        cv.waitKey(0)
        cv.destroyAllWindows()

In [14]:
# Repeat the pipeline with agglomerative clustering (7 clusters). This rebinds
# assignments, centers, demography_data and representative_samples.
assignments, centers = cluster_embeddings_agglo(n_clusters=7)
demography_data = ensemble_demography_assignment(assignments, centers)
representative_samples = get_representative_samples(assignments, centers)

Silhouette score for agglomerative clustering : 0.2566


In [15]:
# Show each cluster's representative faces one at a time; any key press advances.
for c, f_names in representative_samples.items():
    for f_name in f_names:
        face_path = f"data/faces/{f_name}.png"
        frame = cv.imread(face_path)
        if frame is None:
            # cv.imread reports a missing/unreadable file by returning None
            # rather than raising; cv.imshow would then fail.
            print(f"Could not read {face_path}; skipping.")
            continue
        cv.imshow(f"Person-{c}", frame)
        cv.waitKey(0)
        cv.destroyAllWindows()
        


In [16]:
# # View all files with face #5
# import time
# relevant_files = [f"data/faces/{k}.png" for k,v in assignments.items() if v == 1]
# for f in relevant_files:
#     frame = cv.imread(f)
#     cv.imshow("Person-5", frame)
#     # waits for user to press any key 
#     # (this is necessary to avoid the Python kernel from crashing) 
#     cv.waitKey(0) 

#     # closing all open windows 
#     cv.destroyAllWindows() 
    