In [1]:
# import the appropriate libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
from scipy import stats
from collections import Counter
import re
from datetime import datetime
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import requests
from io import BytesIO
import ast
from scipy.stats import pearsonr

#image analysis: first install dlib in your ada environment (1: conda activate ada , 2: install dlib)
import dlib 
#face recognition library, first install it in your ada environment (1: conda activate ada, 2: install face_recognition)
import face_recognition
from PIL import Image, ImageDraw

In [None]:
#Loads "actor_images.csv" file
actor_images = pd.read_csv('data/our_datasets/actor_images.csv')
actor_images.head()

In [None]:
#Finding missing values 
missing = actor_images == '-'
missing_images = actor_images[missing.any(axis=1)]
print(missing_images)

In [None]:
#actor_images without the missing urls
actor_cleaned = actor_images[~missing.any(axis=1)]

In [None]:
#face_encodings = {'Actor': [], 'Landmarks': [], 'Encodings':[]}
def landmarks (row):
    actor_name = row['Actor']
    image_url = row['Image URL']

    try:
        # Retrieve image URL
        img = requests.get(image_url)
        image_content = BytesIO(img.content)

        # Transform the image into an array
        img_array = np.asarray(Image.open(image_content))

        # Extract facial encodings (coordinates)
        face_landmarks_list = face_recognition.face_landmarks(img_array)
        
        if face_landmarks_list:
            face_encodings_list = face_recognition.face_encodings(img_array)
                                                                  
        else:
            face_encodings_list = np.nan

    except Exception as e:
        face_landmarks_list = np.nan
        face_encodings_list = np.nan
        
    return pd.Series({'Actor': actor_name, 'Landmarks': face_landmarks_list, 'Encodings':face_encodings_list})
    
#face_encodings = actor_cleaned.apply(lambda row: landmarks(row), axis=1)

In [4]:
face_encodings = pd.read_csv("encodings2.csv")

In [6]:
face_encodings_cleaned = face_encodings.dropna()
missing =(len(face_encodings)-len(face_encodings_cleaned))/len(face_encodings)
print('Number of missing encodings is {:%}'.format(missing))

Number of missing encodings is 12.103746%


In [7]:
def face_proportions(row):

    facial_prop = pd.Series({
        'Actor': row['Actor'],
        'Eye Distance': np.nan,
        'Eye Position': np.nan,
        'Nose Length': np.nan,
        'Nose Width': np.nan,
        'Eyebrow Length': np.nan,
        'Face Shape': np.nan,
        'Cheek Bones': np.nan})
 
    try:
        landmarks = row['Encodings'][0] 
    except (json.JSONDecodeError, IndexError):
        # Handle cases where 'Encodings' is not a valid JSON or the list is empty
        return facial_prop

    chin_landmarks = landmarks['chin']
    nose_landmarks = landmarks['nose_bridge']
    nose_width_landmarks = landmarks['nose_tip']
    left_eyebrow_landmarks = landmarks['left_eyebrow']
    right_eyebrow_landmakrs = landmarks['right_eyebrow']
    left_eye_landmarks = landmarks['left_eye']
    right_eye_landmarks = landmarks['right_eye']
    top_lip_landmarks = landmarks['top_lip']

    if not chin_landmarks or not nose_landmarks:
        return facial_prop

    #Get the maximum distance for x and y => we will standardize the distances by dividing them by the max values 
    x1 = chin_landmarks[0][0]
    x2 = chin_landmarks[-1][0]
    y1 = nose_landmarks[0][1]
    y2 = chin_landmarks[8][1]

    x_max = np.abs(x2 - x1)
    y_max = np.abs(y2 - y1)

    #Get Eye Distance
    #Eye Position: distance between the eyes and the edge of the chin (in y distance)
    if not left_eye_landmarks or not right_eye_landmarks:
        eye_dist = np.nan
        eye_position = np.nan
    else: 
        x_r = right_eye_landmarks[0][0]
        x_l = left_eye_landmarks[3][0]
        eye_dist = abs(x_r - x_l) / x_max

        y_r = right_eye_landmarks[0][1]
        y_l = left_eye_landmarks[3][1]
        chin_r = chin_landmarks[7][1]
        chin_l = chin_landmarks[9][1]

        eye_position = (abs(y_r - chin_r)/y_max + abs(y_l - chin_l)/y_max)/2


    facial_prop['Eye Distance'] = eye_dist
    facial_prop['Eye Position'] = eye_position 



    # Get nose withd and nose length
    if not nose_landmarks or not nose_width_landmarks:
        nose_length  = np.nan
        nose_width  = np.nan
    else:
        y_nose1 = nose_landmarks[0][1]
        y_nose2 = nose_width_landmarks[-1][1]
        x_nose1 = nose_width_landmarks[0][0]
        x_nose2 = nose_width_landmarks[-1][0]

        nose_length = abs(y_nose2 - y_nose1) / y_max
        nose_width = abs(x_nose2 - x_nose1) / x_max

    facial_prop['Nose Length'] = nose_length
    facial_prop['Nose Width'] = nose_width


    #Eye brow length: do for both eye brows and take the mean
    if not left_eyebrow_landmarks or not right_eyebrow_landmakrs:
        eyebrow_length = np.nan
    else: 
        x1_eyebrowl = left_eyebrow_landmarks[0][0]
        x2_eyebrowl = left_eyebrow_landmarks[-1][0]
        x1_eyebrowr = right_eyebrow_landmakrs[0][0]
        x2_eyebrowr = right_eyebrow_landmakrs[-1][0]
        eyebrow_length = (abs(x2_eyebrowl - x1_eyebrowl)/x_max + abs(x2_eyebrowr - x1_eyebrowr)/x_max)/2

    facial_prop['Eyebrow Length'] = eyebrow_length

    #Face shape: for the face shape, we will do face width on face length ratio . if close to 1 the face is square, if close to zero the face is long and narrow and if bigger than 1 the face is short and broad.
    face_shape = x_max / y_max
    facial_prop['Face Shape'] = face_shape

    # Cheek bones: x distance from chin[0] to chin[5]
    chin5 = chin_landmarks[5][0]
    chin11 = chin_landmarks[11][0]
    cheek_bones = (abs(chin5 - x1)/x_max + abs(chin11 - x2)/x_max)/2

    facial_prop['Cheek Bones'] = cheek_bones

    return facial_prop

In [10]:
facial_proportions = face_encodings_cleaned.apply(face_proportions, axis=1)

TypeError: string indices must be integers

In [None]:
feature_columns = facial_proportions.drop('Actor', axis =1)
feature_columns = feature_columns.dropna()

def reg_coef(x,y,label=None,color=None,**kwargs):
    ax = plt.gca()
    r,p = pearsonr(x,y)
    ax.annotate('r = {:.2f}'.format(r), xy=(0.5,0.5), xycoords='axes fraction', ha='center')
    ax.set_axis_off()
# Create the pairplot
g = sns.PairGrid(feature_columns)
g.map_diag(sns.distplot)
g.map_lower(sns.regplot)
g.map_upper(reg_coef)

plt.suptitle('Feature visualisation', y=1.02, size=16)

plt.show()

In [None]:
#Creates a boxplot for each face proportion
plt.figure(figsize=(10, 6))
sns.boxplot(data=feature_columns, palette='Set2')
plt.title('Box Plots for Facial Proportions')
plt.ylabel('Facial Proportions {%}')
plt.legend()
plt.show()

In [None]:
#merge tropes_characters_ethnicity_df with facial_proportions

facial_proportions= facial_proportions.rename(columns={'Actor': 'ActorName'})
tropes = tropes_characters_ethnicity_df[['Trope','ActorName']]
tropes_facial_features = tropes.merge(facial_proportions, on=['ActorName'], how='outer')

In [None]:
tropes_facial_features_plot = tropes_facial_features.drop('ActorName',axis=1)

In [None]:
df_melted = pd.melt(tropes_facial_features_plot, id_vars='Trope', var_name='Feature', value_name='Values')

In [None]:
plt.figure(figsize=(20, 6))
sns.barplot(x='Trope', y='Values', hue='Feature', data=df_melted)
plt.xlabel('Trope')
plt.ylabel('Values')
plt.title('Bar Plot for facial features grouped by character tropes')
plt.legend(title='Features', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()

In [None]:
faces_euclidean = face_encodings_cleaned.drop([face_encodings_cleaned.index[97],face_encodings_cleaned.index[131],face_encodings_cleaned.index[156]])

In [None]:
for i in range(0,302):
    if len(faces_euclidean['Encodings'].iloc[i]) > 1:
        length = len(faces_euclidean['Encodings'].iloc[i])
        print('Row number {} has {} encoding vectors'.format(i,length))

In [None]:
def euclidean_dist(encoding):

In [None]:
faces_euclidean['Dist'] = faces_euclidean['Encodings'].apply(euclidean_dist)

In [None]:
faces_euclidean.reset_index(drop=True, inplace=True)
faces_euclidean['ActorID'] = faces_euclidean.index

In [None]:
actor_attributes = tropes_characters_ethnicity_df[['Trope','ActorName','ActorGender','ActorHeight','ActorDOB']]
faces_euclidean_trope = actor_attributes.merge(faces_euclidean, on=['ActorName'], how='outer')

In [None]:
faces_euclidean_trope.to_csv('actor_features.csv')

In [None]:
matrix_values = faces_euclidean['Dist'].values

# Convert the list of lists to a NumPy array
matrix_array = np.array([x for x in matrix_values])