Testing the celeb recognition package

https://pypi.org/project/celeb-detector/


https://www.kaggle.com/code/vinayakshanawad/celebrity-face-recognition-vggface-model/notebook


## Before running this code, install the following libraries / dependencies:

pip install numpy

pip install face-recognition

pip install imutils

pip install argparse

pip install pandas

pip install opencv-python

### Installing the libGL system package (the command below uses apt-get; installation may vary by operating system):

sudo apt-get update && sudo apt-get install -y libgl1-mesa-glx



## Creating Functions

In [None]:
#
import os
import random
import shutil
import face_recognition
import numpy as np

# Where the raw celebrity images live and where the generated
# training / test splits are written
source_dir = "/workspace/DS4002Project3/DATA/celebrities"
train_dir = "/workspace/DS4002Project3/DATA/celebrities_train"
test_dir = "/workspace/DS4002Project3/DATA/celebrities_test"

# Per-celebrity split sizes: (training images, test images)
train_count, test_count = 80, 20

# create a function to split the data into test and training sets (which will be saved
# into folders)
def split_data():
    if not os.path.exists(train_dir): # create training folder if it doesn't exist
        os.makedirs(train_dir)
    if not os.path.exists(test_dir): # create test folder if it doesn't exist
        os.makedirs(test_dir)

    # Iterate through each celebrity folder
    for celeb_folder in os.listdir(source_dir): # "for each celeb folder in celebrities folder"
        celeb_path = os.path.join(source_dir, celeb_folder) # create path to specific celeb folder

        if os.path.isdir(celeb_path):
            # List all image files
            images = [img for img in os.listdir(celeb_path) if img.endswith(('.jpg', '.jpeg'))]
            random.shuffle(images) # shuffle the images to ensure randomization

            # Ensure we have enough images
            if len(images) >= (train_count + test_count):
                # Select the first 80 images for training and the next 20 for testing
                train_images = images[:train_count]
                test_images = images[train_count:train_count + test_count]

                train_celeb_folder = os.path.join(train_dir, celeb_folder)
                test_celeb_folder = os.path.join(test_dir, celeb_folder)
                os.makedirs(train_celeb_folder, exist_ok=True)
                os.makedirs(test_celeb_folder, exist_ok=True)

                # Copy images to train and test folders without overlap
                for img in train_images:
                    shutil.copy(os.path.join(celeb_path, img), train_celeb_folder)
                for img in test_images:
                    shutil.copy(os.path.join(celeb_path, img), test_celeb_folder)
            else:
                print(f"Not enough images in {celeb_folder} for splitting.")

# Encode faces in the training set
def encode_training_faces():
    """Build one reference face encoding per celebrity from the training folders.

    Every sub-folder of ``train_dir`` is treated as one celebrity. For each
    .jpg/.jpeg image in it, the first encoding returned by
    ``face_recognition.face_encodings`` is kept (one face per image is
    assumed); images in which no face is found are ignored.

    Returns:
        dict mapping the celebrity folder name to the element-wise mean of
        that celebrity's encodings. Celebrities with no usable image are
        left out.
    """
    encodings = {}

    # Sorted so the dict order is the same on every run; callers that index
    # into list(encodings) would otherwise depend on filesystem listing order.
    for celeb_folder in sorted(os.listdir(train_dir)):
        celeb_path = os.path.join(train_dir, celeb_folder)
        if not os.path.isdir(celeb_path):
            continue

        celeb_encodings = []
        for img_name in sorted(os.listdir(celeb_path)):
            # Skip stray non-image entries (e.g. .DS_Store, .ipynb_checkpoints)
            # that would make load_image_file raise.
            if not img_name.lower().endswith(('.jpg', '.jpeg')):
                continue

            img = face_recognition.load_image_file(os.path.join(celeb_path, img_name))

            # Get encodings (assuming one face per image)
            found = face_recognition.face_encodings(img)
            if found:
                celeb_encodings.append(found[0])

        # Save the averaged encoding for each celebrity
        if celeb_encodings:
            encodings[celeb_folder] = np.mean(celeb_encodings, axis=0)

    return encodings


## Creating celeb-image-data dataframe

Create dataframe to record the following characteristics for each picture:

1. Celebrity folder (ex: Pedro Pascal)
2. Photo Number (ex: 08)
3. Link to location in repo (ex: /celebrities/Pedro-Pascal/08.jpg)
4. Gender of individual in photo (ex: Male)
5. Brightness of photo (ex: 97.4595) SOURCE: https://stackoverflow.com/questions/3490727/what-are-some-methods-to-analyze-image-brightness-using-python
6. Image resolution (ex: 50x50) https://www.geeksforgeeks.org/finding-the-size-resolution-of-image-in-python/
7. Race of individual photographed
8. Saturation of image (stored in the `colorfulness` column): https://stackoverflow.com/questions/58831690/how-to-measure-the-saturation-of-an-image

In [None]:
import pandas as pd
from PIL import Image, ImageStat
import os

# Calculate the brightness of an image

# make a function that will calculate the average brightness of an image
def brightness(im_file):
    """Return the mean pixel value of the image after conversion to grayscale.

    Args:
        im_file: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The mean of the single 'L' (grayscale) band as reported by
        ``ImageStat.Stat``.
    """
    # Open through a context manager, as get_resolution does, so the file is
    # released deterministically even if the conversion fails.
    with Image.open(im_file) as im:
        grayscale = im.convert('L')  # Convert to grayscale
    return ImageStat.Stat(grayscale).mean[0]  # Mean brightness

# make a function that will take the filename of an image (01-100) and convert to integer
def extract_number_from_filename(filename):
    """Return the integer encoded in the part of *filename* before its first dot.

    Example: "08.jpg" -> 8. Raises ValueError when that part is not a valid
    integer literal.
    """
    stem, _dot, _remainder = filename.partition('.')
    return int(stem)

# make a function to list the race of the celebrity photographed
# Lookup table: celebrity folder name -> race label
_RACE_BY_CELEBRITY = {
    "Angelina-Jolie": "White",
    "America-Ferrera": "White-Latina",
    "Ayo-Edebiri": "Black",
    "Conan-OBrien": "White",
    "Danny-Pudi": "Asian",
    "David-Bowie": "White",
    "Donald-Glover": "Black",
    "Elizabeth-Olsen": "White",
    "Jackie-Chan": "Asian",
    "Jim-Carrey": "White",
    "John-Lennon": "White",
    "John-Mulaney": "White",
    "Lucy-Liu": "Asian",
    "Margot-Robbie": "White",
    "Mariah-Carey": "Black-White",
    "Matt-Damon": "White",
    "Maya-Rudolph": "Black-White",
    "Morgan-Freeman": "Black",
    "Olivia-Rodrigo": "Asian-White",
    "Pedro-Pascal": "White",
    "Priyanka-Chopra": "Asian",
    "Ryan-Gosling": "White",
    "SZA": "Black",
    "Salma-Hayek": "White-Latina",
    "Will-Smith": "Black",
}


def get_race(celebrity_name):
    """Return the race label recorded for *celebrity_name*.

    Names are the celebrity folder names (e.g. "Pedro-Pascal"); any name not
    in the table yields "Unknown".
    """
    return _RACE_BY_CELEBRITY.get(celebrity_name, "Unknown")

from PIL import Image

# make a function that states the resolution of a photo
def get_resolution(img_path):
    """Return the image size as a "<width>x<height>" string (e.g. "50x50").

    Any failure while opening or reading the image is swallowed and reported
    as None, so one bad file does not stop the surrounding loop.
    """
    try:
        with Image.open(img_path) as picture:
            return "{}x{}".format(*picture.size)
    except Exception:
        return None

# Imports for the saturation (colorfulness) calculation, which is adapted from https://pyimagesearch.com/2017/06/05/computing-image-colorfulness-with-opencv-and-python/
from imutils import build_montages
from imutils import paths
import argparse
import imutils
import cv2

# Create a DataFrame including brightness information
def create_image_dataframe_from_source():
    """Build one row per image found under ``source_dir``.

    Every sub-folder of ``source_dir`` is treated as one celebrity. Hidden
    entries and nested folders inside it are skipped; every remaining file
    is recorded.

    Returns:
        DataFrame with the columns celebrity_name (folder name),
        picture_number (integer parsed from the file name), file_path,
        brightness, resolution and race. The columns exist even when no
        image is found.
    """
    columns = ['celebrity_name', 'picture_number', 'file_path', 'brightness', 'resolution']
    data = []

    # Process each celebrity folder in the source directory
    for celeb_folder in sorted(os.listdir(source_dir)):
        celeb_path = os.path.join(source_dir, celeb_folder)
        if not os.path.isdir(celeb_path):
            continue

        for img_name in sorted(os.listdir(celeb_path)):
            img_path = os.path.join(celeb_path, img_name)

            # Hidden files (.DS_Store) and folders (.ipynb_checkpoints) are not
            # photos; they would crash brightness() and the file-name parsing.
            if img_name.startswith('.') or not os.path.isfile(img_path):
                continue

            data.append({
                'celebrity_name': celeb_folder,
                'picture_number': extract_number_from_filename(img_name),
                'file_path': img_path,
                'brightness': brightness(img_path),
                'resolution': get_resolution(img_path),
            })

    # Create DataFrame -- first add original columns
    frame = pd.DataFrame(data, columns=columns)
    # add race column
    frame['race'] = frame['celebrity_name'].apply(get_race)
    return frame

# Build the table, order it by celebrity and then by picture number,
# and renumber the rows 0..n-1 in that order
df = (
    create_image_dataframe_from_source()
    .sort_values(by=['celebrity_name', 'picture_number'], ascending=[True, True])
    .reset_index(drop=True)
)

# add gender column:
# Celebrity folder names grouped by the gender recorded for them
_FEMALE_CELEBRITIES = frozenset({
    'Angelina-Jolie', 'America-Ferrera', 'Ayo-Edebiri', 'Elizabeth-Olsen',
    'Lucy-Liu', 'Margot-Robbie', 'Mariah-Carey', 'Maya-Rudolph',
    'Olivia-Rodrigo', 'Priyanka-Chopra', 'Salma-Hayek', 'SZA',
})
_MALE_CELEBRITIES = frozenset({
    'Conan-OBrien', 'Danny-Pudi', 'David-Bowie', 'Donald-Glover',
    'Jackie-Chan', 'Jim-Carrey', 'John-Lennon', 'John-Mulaney',
    'Matt-Damon', 'Morgan-Freeman', 'Pedro-Pascal', 'Ryan-Gosling',
    'Will-Smith',
})


def gender_specification(dataframe):
    """Add a 'gender' column to *dataframe* based on its 'celebrity_name' column.

    The frame passed in is modified in place and also returned. Names in
    neither list get None.

    Args:
        dataframe: DataFrame with a 'celebrity_name' column of folder names.

    Returns:
        The same DataFrame object, now with a 'gender' column holding
        'female', 'male' or None.
    """
    def _gender_of(name):
        if name in _FEMALE_CELEBRITIES:
            return 'female'
        if name in _MALE_CELEBRITIES:
            return 'male'
        return None

    # Operate on the argument, not on the notebook-level `df`, so the function
    # works for whichever frame the caller passes.
    dataframe['gender'] = dataframe['celebrity_name'].apply(_gender_of)
    return dataframe

# Attach the gender column; the function hands back the frame it filled in
df = gender_specification(df)

# add saturation column
import cv2

def image_colorfulness(image):
    """Return a single colorfulness score for a three-channel image array.

    The channels are unpacked in (B, G, R) order. The score combines the
    spread and the mean level of two opponent-color signals, |R - G| and
    |0.5 * (R + G) - B|, following the PyImageSearch colorfulness tutorial
    referenced in this notebook.
    """
    # Work in floating point and separate the three channels (B, G, R order)
    blue, green, red = cv2.split(image.astype("float"))

    # Opponent-color signals: red-green and yellow-blue
    red_green = np.abs(red - green)
    yellow_blue = np.abs(0.5 * (red + green) - blue)

    # Combine the two standard deviations, and the two means, by root-sum-of-squares
    spread = np.sqrt((red_green.std() ** 2) + (yellow_blue.std() ** 2))
    level = np.sqrt((red_green.mean() ** 2) + (yellow_blue.mean() ** 2))

    # Final metric: spread plus a 0.3-weighted mean level
    return spread + (0.3 * level)

# Colorfulness score for every image listed in the DataFrame, in row order
colorfulness_values = []

for filepath in df['file_path']:
    image = cv2.imread(filepath)

    if image is None:
        # The file could not be loaded: warn and record a missing value so the
        # list stays aligned with the DataFrame rows
        print(f"Warning: Could not load image at {filepath}")
        colorfulness_values.append(None)
        continue

    colorfulness_values.append(image_colorfulness(image))

# Store the scores as a new column and write the finished table to disk
df['colorfulness'] = colorfulness_values

df.to_csv('/workspace/DS4002Project3/DATA/celeb_image_data.csv', index=False)

In [None]:


# create function to identify random celeb from the test set
def identify_random_test_image(encodings, tolerance=0.6):
    """Pick one random test image and print which celebrity it matches best.

    Args:
        encodings: dict mapping celebrity name to a reference face encoding
            (as returned by encode_training_faces).
        tolerance: Largest face distance still accepted as a match.

    The image is matched to the *nearest* reference encoding, and only if
    that distance is within ``tolerance``. Taking the first encoding within
    tolerance instead would label a face with whichever celebrity comes
    first in the dict, even when a later one is closer.
    """
    celeb_folder = random.choice(os.listdir(test_dir))
    celeb_path = os.path.join(test_dir, celeb_folder)
    test_image = random.choice(os.listdir(celeb_path))
    test_image_path = os.path.join(celeb_path, test_image)

    # Load and encode test image
    test_img = face_recognition.load_image_file(test_image_path)
    test_encoding = face_recognition.face_encodings(test_img)

    if not test_encoding:
        print(f"No face detected in test image {test_image_path}")
        return

    # Distance from the test face to every known celebrity encoding
    known_names = list(encodings.keys())
    distances = face_recognition.face_distance(
        list(encodings.values()), test_encoding[0]
    )

    # An empty encodings dict gives no distances, so there is nothing to match
    best_index = int(np.argmin(distances)) if len(distances) else None

    if best_index is not None and distances[best_index] <= tolerance:
        matched_celebrity = known_names[best_index]
        print(f"Identified as: {matched_celebrity} from test image: {test_image_path}")
    else:
        print(f"No match found for {test_image_path}.")


In [None]:

# Run steps-- split data and perform encodings
# Order matters: split_data() fills the training folders that
# encode_training_faces() then reads to build one encoding per celebrity.
split_data()
celebrity_encodings = encode_training_faces()


Not enough images in David-Bowie for splitting.


In [None]:
# define function which tests for accuracy
def calculate_accuracy(encodings, tolerance=0.6):
    """Classify every image in the test folders and print the accuracy rate.

    Args:
        encodings: dict mapping celebrity name to a reference face encoding
            (as returned by encode_training_faces).
        tolerance: Largest face distance still accepted as a match.

    Each test image is matched to the *nearest* reference encoding, provided
    that distance is within ``tolerance``. It counts as correct when the
    matched name equals the name of the folder it sits in. Images with no
    detected face or no match within tolerance still count towards the
    total, so they lower the accuracy. Misses are printed as they occur,
    followed by the summary.
    """
    total_images = 0
    correct_identifications = 0

    # The known names/encodings do not change per image, so build them once
    known_names = list(encodings.keys())
    known_encodings = list(encodings.values())

    # Iterate through each celebrity folder in the test set
    for celeb_folder in sorted(os.listdir(test_dir)):
        celeb_path = os.path.join(test_dir, celeb_folder)

        # Ensure it's a directory
        if not os.path.isdir(celeb_path):
            continue

        for test_image in sorted(os.listdir(celeb_path)):
            test_image_path = os.path.join(celeb_path, test_image)

            # Load and encode test image
            test_img = face_recognition.load_image_file(test_image_path)
            test_encoding = face_recognition.face_encodings(test_img)

            # Every test image counts, whether or not a face was found in it
            total_images += 1
            if not test_encoding:
                continue

            # Nearest known celebrity; picking the first one within tolerance
            # would favour names that merely come earlier in the dict.
            distances = face_recognition.face_distance(known_encodings, test_encoding[0])
            best_index = int(np.argmin(distances)) if len(distances) else None

            if best_index is None or distances[best_index] > tolerance:
                print(f"No match found for: {test_image_path}")
                continue

            matched_celebrity = known_names[best_index]

            # Check if the identified celebrity matches the folder name
            if matched_celebrity == celeb_folder:
                correct_identifications += 1
            else:
                print(f"Incorrect: {test_image_path} identified as {matched_celebrity}")

    # Calculate accuracy
    accuracy = (correct_identifications / total_images) * 100 if total_images > 0 else 0
    print(f"Accuracy Rate: {accuracy:.2f}%")
    print(f"Total Images: {total_images}, Correct Identifications: {correct_identifications}")

# Run accuracy calculation
# Prints one line per misidentified or unmatched test image, then the
# overall accuracy rate and the image counts.
calculate_accuracy(celebrity_encodings)


Incorrect: celebrities_test/Ayo-Edebiri/93.jpg identified as Matt-Damon
Incorrect: celebrities_test/Lucy-Liu/40.jpg identified as Ayo-Edebiri
Incorrect: celebrities_test/Margot-Robbie/09.jpeg identified as John-Lennon
Incorrect: celebrities_test/Matt-Damon/09.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/23.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/40.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/45.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/46.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/50.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/70.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/73.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/74.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/97.jpg identified as John-Mulaney
Incorrect: celebrities_test/Olivia-Rodrigo/22.jpg id

## Adding columns for race and saturation

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import numpy as np
import matplotlib.pyplot as plt

# Load the image table from the project's GitHub repository.
# NOTE(review): this replaces any `df` built earlier in the notebook; the
# hosted CSV is presumably the file exported by the dataframe cell -- confirm.
url = 'https://raw.githubusercontent.com/oak50/DS4002Project3/main/celeb_image_data.csv'

df = pd.read_csv(url)

def get_race(celebrity_name):
    """Return the race label ("White", "Black" or "Asian") for *celebrity_name*.

    Names are the celebrity folder names; a name listed under none of the
    labels yields "Unknown".

    NOTE(review): this table is coarser than the get_race defined earlier in
    the notebook (no mixed labels, and America-Ferrera is not listed, so she
    maps to "Unknown") -- confirm which version is intended.
    """
    celebrities_by_race = {
        "White": (
            "Angelina-Jolie", "Conan-OBrien", "David-Bowie", "Elizabeth-Olsen",
            "Jim-Carrey", "John-Lennon", "John-Mulaney", "Margot-Robbie",
            "Matt-Damon", "Pedro-Pascal", "Ryan-Gosling", "Salma-Hayek",
        ),
        "Black": (
            "Ayo-Edebiri", "Donald-Glover", "Mariah-Carey", "Maya-Rudolph",
            "Morgan-Freeman", "SZA", "Will-Smith",
        ),
        "Asian": (
            "Danny-Pudi", "Jackie-Chan", "Lucy-Liu", "Olivia-Rodrigo",
            "Priyanka-Chopra",
        ),
    }

    for race, celebrities in celebrities_by_race.items():
        if celebrity_name in celebrities:
            return race
    return "Unknown"

# Label every row with the celebrity's race, then save the table as a new CSV
df = df.assign(race=df['celebrity_name'].apply(get_race))

df.to_csv('celeb_image_data_race.csv', index=False)

In [None]:
# Download the CSV saved by the previous cell.
# NOTE(review): google.colab is presumably only importable inside Google
# Colab -- confirm this cell is skipped when the notebook runs elsewhere.
from google.colab import files

files.download('celeb_image_data_race.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>