Testing the celeb recognition package

https://pypi.org/project/celeb-detector/


https://www.kaggle.com/code/vinayakshanawad/celebrity-face-recognition-vggface-model/notebook 

## Initializing Functions

In [7]:
#
import os
import random
import shutil
import face_recognition
import numpy as np

# Define paths for source, training, and test directories
source_dir = "celebrities"
train_dir = "celebrities_train"
test_dir = "celebrities_test"

# Number of images for training and test sets
train_count = 80
test_count = 20

# create a function to split the data into test and training sets (which will be saved
# into folders)
def split_data():
    if not os.path.exists(train_dir): # create training folder if it doesn't exist
        os.makedirs(train_dir)
    if not os.path.exists(test_dir): # create test folder if it doesn't exist
        os.makedirs(test_dir)
    
    # Iterate through each celebrity folder
    for celeb_folder in os.listdir(source_dir): # "for each celeb folder in celebrities folder"
        celeb_path = os.path.join(source_dir, celeb_folder) # create path to specific celeb folder
        
        if os.path.isdir(celeb_path):
            # List all image files
            images = [img for img in os.listdir(celeb_path) if img.endswith(('.jpg', '.jpeg'))]
            random.shuffle(images) # shuffle the images to ensure randomization
            
            # Ensure we have enough images
            if len(images) >= (train_count + test_count):
                # Select the first 80 images for training and the next 20 for testing
                train_images = images[:train_count]
                test_images = images[train_count:train_count + test_count]

                train_celeb_folder = os.path.join(train_dir, celeb_folder)
                test_celeb_folder = os.path.join(test_dir, celeb_folder)
                os.makedirs(train_celeb_folder, exist_ok=True)
                os.makedirs(test_celeb_folder, exist_ok=True)
                
                # Copy images to train and test folders without overlap
                for img in train_images:
                    shutil.copy(os.path.join(celeb_path, img), train_celeb_folder)
                for img in test_images:
                    shutil.copy(os.path.join(celeb_path, img), test_celeb_folder)
            else:
                print(f"Not enough images in {celeb_folder} for splitting.")

# Encode faces in the training set
def encode_training_faces():
    encodings = {}
    
    for celeb_folder in os.listdir(train_dir):
        celeb_path = os.path.join(train_dir, celeb_folder)
        if os.path.isdir(celeb_path):
            celeb_encodings = []
            
            # Process each image
            for img_name in os.listdir(celeb_path):
                img_path = os.path.join(celeb_path, img_name)
                img = face_recognition.load_image_file(img_path)
                
                # Get encodings (assuming one face per image)
                encoding = face_recognition.face_encodings(img)
                if encoding:
                    celeb_encodings.append(encoding[0])
            
            # Save encodings for each celebrity
            if celeb_encodings:
                encodings[celeb_folder] = np.mean(celeb_encodings, axis=0)
    
    return encodings


## EDA

Create dataframe to record the following characteristics for each picture:

1. Celebrity folder (ex: Pedro Pascal)
2. Photo Number (ex: 08)
3. Link to location in repo (ex: /celebrities/Pedro-Pascal/08.jpg)
4. Gender of individual in photo (ex: Male)
5. Lighting degree in photo (ex: _________) SOURCE: https://stackoverflow.com/questions/3490727/what-are-some-methods-to-analyze-image-brightness-using-python
6. Image resolution (ex: _________) 

In [11]:
import pandas as pd
from PIL import Image, ImageStat
import os

# Calculate the brightness of an image
def brightness(im_file):
    im = Image.open(im_file).convert('L')  # Convert to grayscale
    stat = ImageStat.Stat(im)
    return stat.mean[0]  # Mean brightness

def extract_number_from_filename(filename):
    # Split at the dot and take the part before it
    number_str = filename.split('.')[0]
    # Convert the resulting string to an integer
    return int(number_str)

# Create a DataFrame including brightness information
def create_image_dataframe_from_source():
    data = []

    # Process each celebrity folder in the source directory
    for celeb_folder in os.listdir(source_dir):
        celeb_path = os.path.join(source_dir, celeb_folder)
        
        if os.path.isdir(celeb_path):
            for i, img_name in enumerate(os.listdir(celeb_path), start=1):
                img_path = os.path.join(celeb_path, img_name)
                
                # Calculate brightness for the image
                img_brightness = brightness(img_path)
                
                # Append the data to the list
                data.append({
                    'celebrity_name': celeb_folder,
                    'picture_number': extract_number_from_filename(img_name),
                    'file_path': img_path,
                    'brightness': img_brightness
                })
    
    # Create DataFrame
    df = pd.DataFrame(data, columns=['celebrity_name', 'picture_number', 'file_path', 'brightness'])
    return df

# Example usage
df = create_image_dataframe_from_source()
print(df.head())

df.to_csv('celeb_image_data.csv', index=False)

   celebrity_name  picture_number                           file_path  \
0  Angelina-Jolie               1    celebrities/Angelina-Jolie/1.jpg   
1  Angelina-Jolie              10   celebrities/Angelina-Jolie/10.jpg   
2  Angelina-Jolie             100  celebrities/Angelina-Jolie/100.jpg   
3  Angelina-Jolie              11   celebrities/Angelina-Jolie/11.jpg   
4  Angelina-Jolie              12   celebrities/Angelina-Jolie/12.jpg   

   brightness  
0  145.551626  
1  126.765005  
2  125.344680  
3  118.658889  
4   58.537125  


In [3]:


# create function to identify random celeb from the test set
def identify_random_test_image(encodings):
    celeb_folder = random.choice(os.listdir(test_dir))
    celeb_path = os.path.join(test_dir, celeb_folder)
    test_image = random.choice(os.listdir(celeb_path))
    test_image_path = os.path.join(celeb_path, test_image)
    
    # Load and encode test image
    test_img = face_recognition.load_image_file(test_image_path)
    test_encoding = face_recognition.face_encodings(test_img)
    
    if test_encoding:
        test_encoding = test_encoding[0]
        
        # Compare with known encodings
        results = face_recognition.compare_faces(
            list(encodings.values()), test_encoding, tolerance=0.6
        )
        
        # Find match
        if True in results:
            match_index = results.index(True)
            matched_celebrity = list(encodings.keys())[match_index]
            print(f"Identified as: {matched_celebrity} from test image: {test_image_path}")
        else:
            print(f"No match found for {test_image_path}.")
    else:
        print(f"No face detected in test image {test_image_path}")


In [4]:

# Run steps-- split data and perform encodings
split_data()
celebrity_encodings = encode_training_faces()


Not enough images in David-Bowie for splitting.


In [5]:
# define function which tests for accuracy
def calculate_accuracy(encodings):
    total_images = 0
    correct_identifications = 0
    
    # Iterate through each celebrity folder in the test set
    for celeb_folder in os.listdir(test_dir):
        celeb_path = os.path.join(test_dir, celeb_folder)
        
        # Ensure it's a directory
        if os.path.isdir(celeb_path):
            # Process each image in the folder
            for test_image in os.listdir(celeb_path):
                test_image_path = os.path.join(celeb_path, test_image)
                
                # Load and encode test image
                test_img = face_recognition.load_image_file(test_image_path)
                test_encoding = face_recognition.face_encodings(test_img)
                
                if test_encoding:
                    test_encoding = test_encoding[0]
                    
                    # Compare with known encodings
                    results = face_recognition.compare_faces(
                        list(encodings.values()), test_encoding, tolerance=0.6
                    )
                    
                    # Find match
                    if True in results:
                        match_index = results.index(True)
                        matched_celebrity = list(encodings.keys())[match_index]
                        
                        # Check if the identified celebrity matches the folder name
                        if matched_celebrity == celeb_folder:
                            correct_identifications += 1
                        else:
                            print(f"Incorrect: {test_image_path} identified as {matched_celebrity}")
                    else:
                        print(f"No match found for: {test_image_path}")
                
                # Increment total image count
                total_images += 1
    
    # Calculate accuracy
    accuracy = (correct_identifications / total_images) * 100 if total_images > 0 else 0
    print(f"Accuracy Rate: {accuracy:.2f}%")
    print(f"Total Images: {total_images}, Correct Identifications: {correct_identifications}")

# Run accuracy calculation
calculate_accuracy(celebrity_encodings)


Incorrect: celebrities_test/Ayo-Edebiri/93.jpg identified as Matt-Damon
Incorrect: celebrities_test/Lucy-Liu/40.jpg identified as Ayo-Edebiri
Incorrect: celebrities_test/Margot-Robbie/09.jpeg identified as John-Lennon
Incorrect: celebrities_test/Matt-Damon/09.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/23.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/40.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/45.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/46.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/50.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/70.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/73.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/74.jpg identified as John-Mulaney
Incorrect: celebrities_test/Matt-Damon/97.jpg identified as John-Mulaney
Incorrect: celebrities_test/Olivia-Rodrigo/22.jpg id