Testing the celeb recognition package

https://pypi.org/project/celeb-detector/


https://www.kaggle.com/code/vinayakshanawad/celebrity-face-recognition-vggface-model/notebook


## Before running this code, install the following libraries / dependencies. Note that the face-recognition library will take several minutes to install.

pip install numpy

pip install face-recognition

pip install imutils

pip install pandas

pip install opencv-python

### For installing the Libgl package. installation might vary based on operating system:

sudo apt-get update && sudo apt-get install -y libgl1-mesa-glx



## Creating Functions

In [13]:
import os
import random
import shutil
import face_recognition
import numpy as np
import pandas as pd

# Define paths for source, training, and test directories
source_dir = "/workspace/DS4002Project3/DATA/celebrities"
train_dir = "/workspace/DS4002Project3/DATA/celebrities_train"
test_dir = "/workspace/DS4002Project3/DATA/celebrities_test"

# Number of images for training and test sets
train_count = 80
test_count = 20

# Create a function to split the data into test and training sets
def split_data():
    # Clear existing contents of training and test folders
    if os.path.exists(train_dir):
        shutil.rmtree(train_dir)
    if os.path.exists(test_dir):
        shutil.rmtree(test_dir)

    # Re-create the empty directories
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # Iterate through each celebrity folder
    for celeb_folder in os.listdir(source_dir):
        celeb_path = os.path.join(source_dir, celeb_folder)

        if os.path.isdir(celeb_path):
            # List all image files
            images = [img for img in os.listdir(celeb_path) if img.endswith(('.jpg', '.jpeg'))]
            random.shuffle(images)  # Shuffle the images to ensure randomization

            # Ensure we have enough images
            if len(images) >= (train_count + test_count):
                # Select the first 80 images for training and the next 20 for testing
                train_images = images[:train_count]
                test_images = images[train_count:train_count + test_count]

                train_celeb_folder = os.path.join(train_dir, celeb_folder)
                test_celeb_folder = os.path.join(test_dir, celeb_folder)
                os.makedirs(train_celeb_folder, exist_ok=True)
                os.makedirs(test_celeb_folder, exist_ok=True)

                # Copy images to train and test folders without overlap
                for img in train_images:
                    shutil.copy(os.path.join(celeb_path, img), train_celeb_folder)
                for img in test_images:
                    shutil.copy(os.path.join(celeb_path, img), test_celeb_folder)
            else:
                print(f"Not enough images in {celeb_folder} for splitting.")

# Encode faces in the training set
def encode_training_faces():
    encodings = {}

    for celeb_folder in os.listdir(train_dir):
        celeb_path = os.path.join(train_dir, celeb_folder)
        if os.path.isdir(celeb_path):
            celeb_encodings = []

            # Process each image
            for img_name in os.listdir(celeb_path):
                img_path = os.path.join(celeb_path, img_name)
                img = face_recognition.load_image_file(img_path)

                # Get encodings (assuming one face per image)
                encoding = face_recognition.face_encodings(img)
                if encoding:
                    celeb_encodings.append(encoding[0])

            # Save encodings for each celebrity
            if celeb_encodings:
                encodings[celeb_folder] = np.mean(celeb_encodings, axis=0)

    return encodings

split_data()
encodings = encode_training_faces()
print("Face encodings generated for training set.")


KeyboardInterrupt: 

In [10]:
# Function to count the number of images in a specified directory
def count_images_in_folder(folder_path):
    if os.path.exists(folder_path):
        return len([img for img in os.listdir(folder_path) if img.endswith(('.jpg', '.jpeg'))])
    return 0

# Count images in Jim Carrey's training and testing folders
jim_carrey_train_folder = os.path.join("/workspace/DS4002Project3/DATA/celebrities_train", "Jim-Carrey")
jim_carrey_test_folder = os.path.join("/workspace/DS4002Project3/DATA/celebrities_test", "Jim-Carrey")

num_train_images = count_images_in_folder(jim_carrey_train_folder)
num_test_images = count_images_in_folder(jim_carrey_test_folder)

print(f"Number of images in Jim Carrey's training folder: {num_train_images}")
print(f"Number of images in Jim Carrey's testing folder: {num_test_images}")


# Function to count the number of images in a specified directory
def count_images_in_folder(folder_path):
    if os.path.exists(folder_path):
        return len([img for img in os.listdir(folder_path) if img.endswith(('.jpg', '.jpeg'))])
    return 0

# Count images in Jim Carrey's training and testing folders
jim_carrey_train_folder = os.path.join("/workspace/DS4002Project3/DATA/celebrities_train", "Jackie-Chan")
jim_carrey_test_folder = os.path.join("/workspace/DS4002Project3/DATA/celebrities_test", "Jackie-Chan")

num_train_images = count_images_in_folder(jim_carrey_train_folder)
num_test_images = count_images_in_folder(jim_carrey_test_folder)

print(f"Number of images in Jackie Chan's training folder: {num_train_images}")
print(f"Number of images in Jackie Chan's testing folder: {num_test_images}")



Number of images in Jim Carrey's training folder: 80
Number of images in Jim Carrey's testing folder: 20
Number of images in Jackie Chan's training folder: 80
Number of images in Jackie Chan's testing folder: 20


In [11]:
# define function which tests for accuracy
def calculate_accuracy(encodings):
    total_images = 0
    correct_identifications = 0

    # Iterate through each celebrity folder in the test set
    for celeb_folder in os.listdir(test_dir):
        celeb_path = os.path.join(test_dir, celeb_folder)

        # Ensure it's a directory
        if os.path.isdir(celeb_path):
            # Process each image in the folder
            for test_image in os.listdir(celeb_path):
                test_image_path = os.path.join(celeb_path, test_image)

                # Load and encode test image
                test_img = face_recognition.load_image_file(test_image_path)
                test_encoding = face_recognition.face_encodings(test_img)

                if test_encoding:
                    test_encoding = test_encoding[0]

                    # Compare with known encodings
                    results = face_recognition.compare_faces(
                        list(encodings.values()), test_encoding, tolerance=0.6
                    )

                    # Find match
                    if True in results:
                        match_index = results.index(True)
                        matched_celebrity = list(encodings.keys())[match_index]

                        # Check if the identified celebrity matches the folder name
                        if matched_celebrity == celeb_folder:
                            correct_identifications += 1
                        else:
                            print(f"Incorrect: {test_image_path} identified as {matched_celebrity}")
                    else:
                        print(f"No match found for: {test_image_path}")

                # Increment total image count
                total_images += 1

    # Calculate accuracy
    accuracy = (correct_identifications / total_images) * 100 if total_images > 0 else 0
    print(f"Accuracy Rate: {accuracy:.2f}%")
    print(f"Total Images: {total_images}, Correct Identifications: {correct_identifications}")

# Run accuracy calculation
calculate_accuracy(celebrity_encodings)


Incorrect: /workspace/DS4002Project3/DATA/celebrities_test/David-Bowie/37.jpg identified as Conan-OBrien
Incorrect: /workspace/DS4002Project3/DATA/celebrities_test/Elizabeth-Olsen/100.jpg identified as Angelina-Jolie
Incorrect: /workspace/DS4002Project3/DATA/celebrities_test/Elizabeth-Olsen/09.jpg identified as Angelina-Jolie
Incorrect: /workspace/DS4002Project3/DATA/celebrities_test/Elizabeth-Olsen/11.jpg identified as Angelina-Jolie
Incorrect: /workspace/DS4002Project3/DATA/celebrities_test/Elizabeth-Olsen/25.jpg identified as Angelina-Jolie
Incorrect: /workspace/DS4002Project3/DATA/celebrities_test/Elizabeth-Olsen/28.jpg identified as Angelina-Jolie
Incorrect: /workspace/DS4002Project3/DATA/celebrities_test/Elizabeth-Olsen/14.jpg identified as Angelina-Jolie
Incorrect: /workspace/DS4002Project3/DATA/celebrities_test/Jim-Carrey/30.jpg identified as Jackie-Chan
Incorrect: /workspace/DS4002Project3/DATA/celebrities_test/Jim-Carrey/59.jpg identified as Danny-Pudi
Incorrect: /workspace/D

In [21]:
from multiprocessing import Pool
import face_recognition
import pandas as pd
import numpy as np

# Function to encode faces
def encode_face(img_path):
    try:
        img = face_recognition.load_image_file(img_path)
        encoding = face_recognition.face_encodings(img, model="cnn")  # Using cnn model for better accuracy
        if encoding:
            return encoding[0]
        else:
            return None
    except Exception as e:
        return None

# Function to encode faces in parallel using multiprocessing
def encode_faces_in_parallel(image_paths):
    with Pool() as pool:
        encodings = pool.map(encode_face, image_paths)
    return [encoding for encoding in encodings if encoding is not None]

# Function to split data into training and testing sets
def split_data(df):
    train_data = df.sample(frac=0.8, random_state=np.random.randint(0, 10000))  # Random split for each round
    test_data = df.drop(train_data.index)  # Remaining 20% for testing
    return train_data, test_data

# Function to calculate accuracy over multiple rounds
def calculate_accuracy(df, rounds=10):
    correct_matches = 0
    total_matches = len(df)
    
    # List to store the accuracy for each round
    accuracy_per_round = []

    # Loop over multiple rounds for testing
    for round_number in range(rounds):
        # Step 1: Randomly split data into training and testing sets for this round
        train_df, test_df = split_data(df)

        # Step 2: Encode faces in the training data
        train_encodings = encode_faces_in_parallel(train_df['file_path'].tolist())
        
        round_correct = 0  # Correct match counter for this round

        # Step 3: Test the images in the test set
        for i, row in test_df.iterrows():
            test_encoding = encode_face(row['file_path'])
            if test_encoding is not None:
                # Compare the face in the test set with all the faces in the training set
                matches = face_recognition.compare_faces(train_encodings, test_encoding)
                
                # Calculate the face distance to find the best match
                best_match_index = np.argmin([face_recognition.face_distance([train_encodings[i]], test_encoding) for i in range(len(train_encodings))])
                
                # If the best match corresponds to the correct celebrity, increment the correct match counter
                if matches[best_match_index]:
                    round_correct += 1

        # Step 4: Calculate the accuracy for this round
        round_accuracy = round_correct / len(test_df)
        accuracy_per_round.append(round_accuracy)

        # Optional: Store the round accuracy back into the dataframe (if needed for further analysis)
        test_df[f"round_{round_number + 1}_accuracy"] = round_correct / len(test_df)
    
    # Calculate the average accuracy rate across all 10 rounds
    average_accuracy = np.mean(accuracy_per_round)
    
    return average_accuracy

# Main process
if __name__ == "__main__":
    # Step 1: Load DataFrame with image data (make sure this path is correct)
    df = pd.read_csv("/workspace/DS4002Project3/DATA/celeb_image_data.csv")

    # Step 2: Calculate the accuracy over multiple rounds with random splits
    accuracy_rate = calculate_accuracy(df, rounds=10)

    # Output the results
    print(f"Average Accuracy Rate over 10 rounds: {accuracy_rate * 100:.2f}%")

Average Accuracy Rate over 10 rounds: 93.46%


In [22]:
# Save the DataFrame with accuracy results to a CSV file
test_df.to_csv("/workspace/DS4002Project3/DATA/celeb_image_data_testing.csv", index=False)


In [24]:
import pandas as pd

# Read the dataframe from the CSV file
df = pd.read_csv("/workspace/DS4002Project3/DATA/celeb_image_data_testing.csv")

# Calculate the accuracy rate for each celebrity
celebrity_accuracy = df.groupby('celebrity_name')['accuracy_rate'].mean()

# Print the accuracy rate for each celebrity
print("Celebrity Accuracy Rates:")
for celeb, accuracy in celebrity_accuracy.items():
    print(f"{celeb}: {accuracy * 100:.2f}%")


Celebrity Accuracy Rates:
America-Ferrera: 94.12%
Angelina-Jolie: 100.00%
Ayo-Edebiri: 92.86%
Conan-OBrien: 93.33%
Danny-Pudi: 100.00%
David-Bowie: 83.33%
Donald-Glover: 100.00%
Elizabeth-Olsen: 90.48%
Jackie-Chan: 100.00%
Jim-Carrey: 100.00%
John-Lennon: 92.00%
John-Mulaney: 89.29%
Lucy-Liu: 85.00%
Margot-Robbie: 83.33%
Mariah-Carey: 85.00%
Matt-Damon: 100.00%
Maya-Rudolph: 88.00%
Morgan-Freeman: 95.24%
Olivia-Rodrigo: 88.89%
Pedro-Pascal: 92.86%
Priyanka-Chopra: 100.00%
Ryan-Gosling: 92.86%
SZA: 84.21%
Salma-Hayek: 96.00%
Will-Smith: 100.00%
