# Validation Processes Throughout Our Analysis

# Face Recognition

## Drawing functions

In [None]:
def get_actor_image(actor_id, actors_df):
    """Open the image of the actor identified by ``actor_id``.

    Args:
        actor_id: Value compared against the ``imdb_id`` column.
        actors_df: DataFrame with at least ``imdb_id`` and ``img_path`` columns.

    Returns:
        The image opened from the ``img_path`` of the first matching row.

    Raises:
        IndexError: If no row of ``actors_df`` has this ``imdb_id``.
    """
    matching_rows = actors_df[actors_df["imdb_id"] == actor_id]
    # Several rows may match; only the first image path is used.
    first_image_path = matching_rows["img_path"].iloc[0]
    return Image.open(first_image_path)

def get_poster_image(movie_title, poster_path):
    """Open a poster image stored under the global ``posters_folder``.

    The location is ``posters_folder`` joined with the plain concatenation of
    ``movie_title`` and ``poster_path`` (no separator is inserted here).

    Args:
        movie_title: First part of the relative file location.
        poster_path: Second part of the relative file location.
            NOTE(review): the sample output in this notebook shows values such
            as '/abc.jpg' (leading slash), which would make ``movie_title`` a
            sub-folder name - confirm against the data on disk.

    Returns:
        The opened poster image.
    """
    relative_location = f"{movie_title}{poster_path}"
    return Image.open(posters_folder / relative_location)

In [None]:
from PIL import Image, ImageDraw 

def get_image_with_rect(img, box):
    """Draw a red rectangle on ``img`` and return the same image object.

    The image is modified in place; no copy is made.

    Args:
        img: Image to draw on.
        box: Indexable holding ``[left, top, right, bottom]`` in its first
            four positions.

    Returns:
        ``img`` itself, now carrying the 5-pixel-wide red outline.
    """
    top_left = (box[0], box[1])
    bottom_right = (box[2], box[3])

    ImageDraw.Draw(img).rectangle([top_left, bottom_right], outline="red", width=5)
    return img

In [None]:
def save_as_compared_images(img1, img2, output_name):
    """Save two images side by side as ``<output_name>.png``.

    Each image is shrunk to fit inside 500x500 (aspect ratio kept), then both
    are pasted left to right onto a new RGB canvas that is as wide as the two
    thumbnails together and as tall as the taller one.

    Args:
        img1: Image placed on the left.
        img2: Image placed on the right.
        output_name: Output path without extension; ``.png`` is appended.

    Returns:
        The combined image that was written to disk.
    """
    size = (500, 500)

    # thumbnail() resizes in place, so shrink copies to leave the caller's
    # images at their original resolution.
    thumbnails = []
    for source in (img1, img2):
        thumb = source.copy()
        thumb.thumbnail(size)
        thumbnails.append(thumb)

    widths, heights = zip(*(thumb.size for thumb in thumbnails))
    combined = Image.new('RGB', (sum(widths), max(heights)))

    x_offset = 0
    for thumb in thumbnails:
        combined.paste(thumb, (x_offset, 0))
        x_offset += thumb.size[0]

    combined.save(f'{output_name}.png')
    return combined

## Manually validate a random sample

In [None]:
# Load the two pickled tables used by the manual validation loop:
# `sample` is iterated row by row, `sample_actors` is passed to get_actor_image.
# NOTE(review): unpickling can execute arbitrary code - only load files from a
# trusted source.
sample = pd.read_pickle("match_poster_actor_cast_all.pkl")
sample_actors = pd.read_pickle("actors_face_encodings.pkl")


In [None]:
from pathlib import Path

validation_dir_path = 'validation'
# Image.save() does not create missing folders, so make sure the output
# directory exists before the first comparison image is written.
Path(validation_dir_path).mkdir(parents=True, exist_ok=True)

# One (movie, file_path, actor_id, face_location) tuple per matched face.
result = []

for i, row in sample.iterrows():
    # The first element of row["face_actor"] is walked in lockstep with
    # row["boxes"]: one actor id (falsy when unmatched) per detected face box.
    for count, (actor_id, face_location) in enumerate(zip(row["face_actor"][0], row["boxes"])):
        if not actor_id:
            continue

        result.append((row["movie"], row["file_path"], actor_id, face_location))

        # Build "actor photo | poster with the matched face outlined" and save
        # it for visual inspection.
        actor_img = get_actor_image(actor_id, sample_actors)
        poster_img = get_poster_image(row["movie"], row["file_path"])
        poster_img = get_image_with_rect(poster_img, face_location)
        save_as_compared_images(
            actor_img,
            poster_img,
            f'{validation_dir_path}/{row["movie"]}-{i}-{actor_id}-{count}',
        )

result

[('Desperate Hours',
  '/t8iykcGwaH8iK3Zc85VaRg6vlqd.jpg',
  '0000620',
  [687.72, 549.78, 1020.21, 1011.04]),
 ('Britt-Marie Was Here',
  '/u12KThfDZpBGQ98Qg7ahWvMV9gq.jpg',
  '0000278',
  [909.25, 175.29, 1132.61, 482.14])]

## Validate Poster Center Calculation

In [None]:
from scipy.spatial import distance

def draw_center_line(row):
    """Draw a red line from the poster centre to the centre of every face box.

    Args:
        row: Mapping-like record providing ``width``, ``height``, ``boxes``
            (iterable of ``(left, top, right, bottom)``), ``movie`` and
            ``file_path``.

    Returns:
        The poster image with one 5-pixel-wide red line per face box.
    """
    poster_center = (row['width'] / 2, row['height'] / 2)
    face_boxes = row['boxes']

    poster_img = get_poster_image(row["movie"], row["file_path"])
    canvas = ImageDraw.Draw(poster_img)

    for l, t, r, d in face_boxes:
        # Midpoint of the box on each axis.
        face_center = (r - (r - l) / 2, d - (d - t) / 2)
        canvas.line([poster_center, face_center], fill="red", width=5)

    return poster_img
    
# DataFrame.sample(1) returns a one-row DataFrame; .iloc[0] extracts that row
# as a Series so draw_center_line receives scalar values (width, height,
# movie, file_path) and the list of boxes rather than one-element Series.
draw_center_line(posters_races4_cast_all.sample(1).iloc[0])

In [None]:
def get_box_size(box):
    """Return the area of a ``(left, top, right, bottom)`` box.

    Args:
        box: Iterable of exactly four numbers: left, top, right, bottom.

    Returns:
        ``(right - left) * (bottom - top)``.
    """
    left, top, right, bottom = box
    width = right - left
    height = bottom - top
    return width * height
    
def draw_path_to_largest(img, boxes):
    """Draw a red line from every face box centre to the largest box's centre.

    The image is modified in place. The largest box is the one with the
    greatest ``get_box_size``; on ties the first such box wins.

    Args:
        img: Image to draw on.
        boxes: Sequence of ``(left, top, right, bottom)`` boxes. May be empty,
            in which case nothing is drawn.

    Returns:
        ``img`` itself.
    """
    # Without any box there is no "largest" face to connect to; return the
    # image untouched instead of failing on an empty maximum.
    if len(boxes) == 0:
        return img

    def _center(box):
        l, t, r, d = box
        return (r - (r - l) / 2, d - (d - t) / 2)

    canvas = ImageDraw.Draw(img)
    largest_face_center = _center(max(boxes, key=get_box_size))

    for box in boxes:
        canvas.line((_center(box), largest_face_center), fill="red", width=5)

    return img

## Prepare Experiment Of Crime Genre

In [None]:
# Crime validation

# read_pickle opens the file itself; its second positional parameter is
# `compression`, so a file mode such as 'rb' must not be passed there.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
tmdb_data = pd.read_pickle(metadata_dir / 'tmdb_data.pkl')

# imdb ids of the movies whose original language is English.
en_movies = tmdb_data[tmdb_data['original_language'] == 'en']['imdb_id']

def save_images_with_rect(df, output_path):
    """Save every poster in ``df`` with its box outlined in red.

    For each row the poster is opened from the global ``posters_folder``, the
    rectangle in ``row["boxes"]`` is drawn on it, and the result is written to
    ``output_path``. The file name is the row's ``Decade``, its position after
    ``reset_index()`` and ``file_path`` with surrounding slashes stripped,
    concatenated without separators.

    Args:
        df: DataFrame with ``movie``, ``file_path``, ``boxes`` and ``Decade``
            columns. ``boxes`` is passed as a single box to
            ``get_image_with_rect`` (presumably the frame was exploded
            beforehand - verify against the caller).
        output_path: Existing directory to write the images into.
    """
    renumbered = df.reset_index()
    for i, row in tqdm(renumbered.iterrows()):
        poster_file = posters_folder / f'{row["movie"]}{row["file_path"]}'
        outlined = get_image_with_rect(Image.open(poster_file), row["boxes"])
        outlined.save(f'{output_path}/{row["Decade"]}{i}{row["file_path"].strip("/")}')

In [None]:
from pathlib import Path

# Decades are kept when LOWER_YEAR_BOUND <= decade < UPPER_YEAR_BOUND.
LOWER_YEAR_BOUND = 1960
UPPER_YEAR_BOUND = 2000
# Maximum number of faces sampled per (race, decade) group.
NUMBER_OF_SAMPLES = 50

# One row per detected face: the three list columns are exploded together.
sampling_df = posters_new_races4_cast_all.explode(["boxes", "faces_races", "face_percentage"])
# .copy() so the column assignment below works on an independent frame rather
# than on a filtered view of the exploded one (avoids SettingWithCopyWarning).
sampling_df = sampling_df[sampling_df["iso_639_1"] == "en"].copy()
sampling_df["face_percentage"] = sampling_df["face_percentage"].astype(float)
sampling_df = sampling_df[sampling_df["faces_races"] != ""]

# Keep crime posters whose face is larger than the median face of all
# remaining English-language rows (not only the crime ones).
median_face_percentage = sampling_df["face_percentage"].median()
is_crime = sampling_df["genres"].apply(lambda genres: "Crime" in genres)
crime_df = sampling_df[is_crime]
crime_df = crime_df[crime_df["face_percentage"] > median_face_percentage].copy()
crime_df["startYear"] = crime_df["startYear"].astype(float)
crime_df["Decade"] = crime_df["startYear"] // 10 * 10
crime_df = crime_df[crime_df["tconst"].isin(en_movies)]

in_year_range = (crime_df["Decade"] >= LOWER_YEAR_BOUND) & (crime_df["Decade"] < UPPER_YEAR_BOUND)
for (race, decade), data in crime_df[in_year_range].groupby(["faces_races", "Decade"]):
    if race in ['Asian', 'Indian', 'White', 'Black']:  # this line is here for optional sampling from specific races
        # Fixed seed keeps the sample reproducible between runs.
        sampled_posters = data.sample(min(NUMBER_OF_SAMPLES, len(data)), random_state=42)
        # Folder name follows the bounds, so it stays correct if they change.
        out_path = Path(f"{data_path}/crime/{LOWER_YEAR_BOUND}_to_{UPPER_YEAR_BOUND}/{race}")
        out_path.mkdir(exist_ok=True, parents=True)
        save_images_with_rect(sampled_posters, out_path)

# Ethnicity Validation

In [None]:
from pathlib import Path
import pandas as pd

base_path = Path('/content/drive/MyDrive/models/Ethnicity/Results/')
races_df = pd.read_csv(base_path / 'races_new4.csv')
# Keep only rows whose 'mse' exceeds 30 (NOTE(review): meaning of this
# threshold is not visible here - confirm). .copy() makes the column
# assignment below operate on an independent frame.
races_df = races_df[races_df['mse'] > 30].copy()

# Derive the actor name from the aligned-face path: take what follows
# "actors/", up to the next "/", and drop everything from the last "-" on.
races_df["name"] = races_df["face_name_align"].apply(
    lambda x: x.partition("actors/")[2].partition('/')[0].rpartition("-")[0]
)

# Average only the four score columns per actor. Calling mean() on the whole
# frame raises TypeError on pandas >= 2.0 because of the string columns.
race4_score_columns = ["White", "Black", "Asian", "Indian"]
ethni4_df = races_df.groupby("actor_id")[race4_score_columns].mean().reset_index()
# The predicted race is the column with the highest mean score.
ethni4_df["race"] = ethni4_df[race4_score_columns].idxmax(axis=1)
ethni4_df = ethni4_df[["actor_id", "race"]]
ethni4_df = ethni4_df.merge(races_df[['name', 'actor_id']], on='actor_id').drop_duplicates('actor_id')
ethni4_df.index = ethni4_df.actor_id

In [None]:
race7_score_columns = [
    'White', 'Black', 'Latino_Hispanic', 'East Asian',
    'Southeast Asian', 'Indian', 'Middle Eastern',
]

# .copy() so the assignments below do not write into a slice of races_df.
races7_df = races_df[['race', 'race_scores_fair', 'actor_id', 'name']].copy()

# 'race_scores_fair' holds a bracketed, whitespace-separated list of scores
# that may span several lines. Strip the brackets and collapse every run of
# whitespace (spaces or newlines) into a single space.
races7_df["race_scores_fair"] = (
    races7_df["race_scores_fair"]
    .str.strip("[] \n")
    .str.split()
    .str.join(" ")
)

# One float column per race, in the order the scores appear in the string.
races7_df[race7_score_columns] = (
    races7_df["race_scores_fair"].str.split(" ", expand=True).astype(float)
)

# Average only the score columns per actor. Calling mean() on the whole frame
# raises TypeError on pandas >= 2.0 because of the string columns.
ethni7_df = races7_df.groupby("actor_id")[race7_score_columns].mean().reset_index()
# The predicted race is the column with the highest mean score.
ethni7_df["race"] = ethni7_df[race7_score_columns].idxmax(axis=1)
ethni7_df = ethni7_df[["actor_id", "race"]]
ethni7_df = ethni7_df.merge(races7_df[['name', 'actor_id']], on='actor_id').drop_duplicates('actor_id')
ethni7_df.index = ethni7_df.actor_id

In [None]:
# 7 races: hand-labelled validation set, ten actors per race.
# Each frame pairs actor ids with names and tags every row with the expected
# race in the 'actual' column.
# NOTE(review): 'Jennifer Lopez' appears twice in latino_hispanic with two
# different ids (182 and 4641207) - possibly a copy-paste slip; verify.
black = pd.DataFrame({'actor_id':[151,226,291,932,1807,1845,205626,1013003,1569276,2255973],'name': ['Morgan Freeman','Will Smith','Angela Bassett','Halle Berry','Cicely Tyson','Forest Whitaker','Viola Davis','Michael Ealy','Chadwick Boseman','Donald Glover'], 'actual': 'Black'})
white = pd.DataFrame({'actor_id':[204,1401,262635,376716,424060,564215,695435,1086543,1165110,1659221],'name': ['Natalie Portman','Angelina Jolie','Chris Evans','Christina Hendricks','Scarlett Johansson','James McAvoy','Chris Pratt','Amanda Seyfried','Chris Hemsworth','Sebastian Stan'], 'actual': 'White'})
indian = pd.DataFrame({'actor_id':[821,4626,352032,438463,451234,474774,1229940,1231899,1799038,2138653],'name': ['Amitabh Bachchan','Kareena Kapoor','Kamal Haasan','Anil Kapoor','Irrfan Khan','Akshay Kumar','Katrina Kaif','Priyanka Chopra Jonas','Vidya Balan','Deepika Padukone'], 'actual': 'Indian'})
east_asian = pd.DataFrame({'actor_id':[1030205,1402449,2098603,2201753,2347861,2425074,2976916,4081467,4206125,4947538],'name': ['Tôma Ikuta','Sayaka Isoyama','Hikari Mitsushima','Yui Aragaki','Masaki Okada','Ryôsuke Yamada','Kôji Seto','Kento Yamazaki','Nanao','Minami Hamabe'], 'actual': 'East Asian'})
southeast_asian = pd.DataFrame({'actor_id':[706,498046,814259,1388074,1787887,1977856,3299397,5377144,6525901,7093076],'name': ['Michelle Yeoh','Reggie Lee','Brenda Song','Tony Jaa','Veronica Ngo','Chin Han','Iko Uwais','Awkwafina','Henry Golding','Lana Condor'], 'actual': 'Southeast Asian'})
latino_hispanic = pd.DataFrame({'actor_id':[182,491,973,1507,4851,5527,520064,622897,2201555,4641207],'name': ['Jennifer Lopez','John Leguizamo','Benjamin Bratt','Cheech Marin','Penélope Cruz','Sofía Vergara','George Lopez','Guillermo Navarro','Aubrey Plaza','Jennifer Lopez'], 'actual': 'Latino_Hispanic'})
middle_eastern = pd.DataFrame({'actor_id':[1725,267042,452102,869467,1267552,1587232,1840164,1840659,1896736,2316907],'name': ['Omar Sharif','Golshifteh Farahani','Abbas Kiarostami','Shaun Toub','Taraneh Alidoosti','Bahar Soomekh','Ali Saam','Nasim Pedrad','Haaz Sleiman','Yasmine Al Massri'], 'actual': 'Middle Eastern'})
actors7_validate = pd.concat([black, white, indian, east_asian, southeast_asian, latino_hispanic, middle_eastern], ignore_index=True)
# Attach the predicted 'race'. ethni7_df[['race']] keeps only its index (set
# to actor_id earlier), which is what on='actor_id' matches on the right side.
# merge() defaults to an inner join, so actors missing from ethni7_df are
# dropped from the validation set.
actors7_validate = actors7_validate.merge(ethni7_df[['race']], on='actor_id')

In [None]:
# Per-race precision and recall of the 7-race prediction on the hand-labelled
# validation set ('race' = predicted label, 'actual' = expected label).
races7 = ['White','Black','Latino_Hispanic','East Asian','Southeast Asian','Indian','Middle Eastern']
for race in races7:
    predicted_as_race = actors7_validate[actors7_validate['race'] == race]
    actually_race = actors7_validate[actors7_validate['actual'] == race]
    # Rows that are both predicted as and labelled with this race.
    true_positives = int((predicted_as_race['actual'] == race).sum())

    # A race that is never predicted (or has no labelled actor left after the
    # merge) has an empty denominator; report nan instead of raising
    # ZeroDivisionError.
    precision = true_positives / len(predicted_as_race) * 100 if len(predicted_as_race) else float('nan')
    recall = true_positives / len(actually_race) * 100 if len(actually_race) else float('nan')

    print(f"{race} Precision - {precision}%")
    print(f"{race} Recall - {recall}%")

In [None]:
# 4 races: hand-labelled validation set, ten actors per race.
# Each frame pairs actor ids with names and tags every row with the expected
# race in the 'actual' column.
# NOTE(review): 'Ian Chen' appears twice in asian with two different ids
# (6373728 and 7573742) - possibly a copy-paste slip; verify.
black = pd.DataFrame({'actor_id':[151,226,291,932,1807,1845,205626,1013003,1569276,2255973],'name': ['Morgan Freeman','Will Smith','Angela Bassett','Halle Berry','Cicely Tyson','Forest Whitaker','Viola Davis','Michael Ealy','Chadwick Boseman','Donald Glover'], 'actual': 'Black'})
asian = pd.DataFrame({'actor_id':[2263791,3036914,3254274,3859624,4555391,5377144,6373728,6525901,7093076,7573742],'name': ['Claudia Kim','Arden Cho','Manny Jacinto','Ki Hong Lee','Lyrica Okano','Awkwafina','Ian Chen','Henry Golding','Lana Condor','Ian Chen'], 'actual': 'Asian'})
white = pd.DataFrame({'actor_id':[204,1401,262635,376716,424060,564215,695435,1086543,1165110,1659221],'name': ['Natalie Portman','Angelina Jolie','Chris Evans','Christina Hendricks','Scarlett Johansson','James McAvoy','Chris Pratt','Amanda Seyfried','Chris Hemsworth','Sebastian Stan'], 'actual': 'White'})
indian = pd.DataFrame({'actor_id':[821,4626,352032,438463,451234,474774,1229940,1231899,1799038,2138653],'name': ['Amitabh Bachchan','Kareena Kapoor','Kamal Haasan','Anil Kapoor','Irrfan Khan','Akshay Kumar','Katrina Kaif','Priyanka Chopra Jonas','Vidya Balan','Deepika Padukone'], 'actual': 'Indian'})
actors4_validate = pd.concat([black, asian, white, indian], ignore_index=True)
# Attach the predicted 'race'. ethni4_df[['race']] keeps only its index (set
# to actor_id earlier), which is what on='actor_id' matches on the right side.
# merge() defaults to an inner join, so actors missing from ethni4_df are
# dropped from the validation set.
actors4_validate = actors4_validate.merge(ethni4_df[['race']], on='actor_id')

In [None]:
# Per-race precision and recall of the 4-race prediction on the hand-labelled
# validation set ('race' = predicted label, 'actual' = expected label).
races4 = ["White", "Black", "Asian", "Indian"]
for race in races4:
    predicted_as_race = actors4_validate[actors4_validate['race'] == race]
    actually_race = actors4_validate[actors4_validate['actual'] == race]
    # Rows that are both predicted as and labelled with this race.
    true_positives = int((predicted_as_race['actual'] == race).sum())

    # A race that is never predicted (or has no labelled actor left after the
    # merge) has an empty denominator; report nan instead of raising
    # ZeroDivisionError.
    precision = true_positives / len(predicted_as_race) * 100 if len(predicted_as_race) else float('nan')
    recall = true_positives / len(actually_race) * 100 if len(actually_race) else float('nan')

    print(f"{race} Precision - {precision}%")
    print(f"{race} Recall - {recall}%")