# Gender Predictions Exploratory Data Analysis

## Imports

In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

%matplotlib inline

In [3]:
# fface_df = pd.read_csv("../data/fface_train.csv")
# Load the synonym-based gender predictions; later cells read its
# `gender`, `race`, `age` and `gender_preds` columns.
synms_gender_preds_df = pd.read_csv(
    filepath_or_buffer="../data/synms_gender_preds_fface.csv"
)

## Preprocess

In [4]:
# Read the labels JSON: a dict whose keys are class names and whose values
# are the synonym prompts for each class.
with open('../data/synms_gender_labels.json', encoding='utf-8') as json_data:
    data = json.load(json_data)

# Split the mapping into parallel lists once the file has been closed.
fface_classes = list(data)
fface_prompts = [*data.values()]

In [5]:
# Sanity check: is 'gentleman' listed among the Female prompts?
'gentleman' in set(data['Female'])



False

In [6]:
# Show the class names and, on the next line, their synonym prompt lists.
print(fface_classes, fface_prompts, sep='\n')

['Male', 'Female']
[['young man', 'adult male', 'male', 'man', 'guy', 'boy', 'middle-aged man', 'old man', 'grandfather', 'grandpa'], ['young woman', 'adult female', 'female', 'woman', 'lady', 'girl', 'madam', 'old woman', 'grandmother', 'grandma']]


## Initial EDA

In [7]:
# (rows, columns) of the predictions frame, followed by its first five rows.
print(synms_gender_preds_df.shape)
synms_gender_preds_df.head(n=5)

(10954, 6)


Unnamed: 0,file,age,gender,race,service_test,gender_preds
0,val/1.jpg,3-9,Male,East Asian,False,boy
1,val/2.jpg,50-59,Female,East Asian,True,grandmother
2,val/3.jpg,30-39,Male,White,True,middle-aged man
3,val/4.jpg,20-29,Female,Latino_Hispanic,True,young woman
4,val/5.jpg,20-29,Male,Southeast Asian,False,man


In [8]:
# Share of each race label across the whole frame, as percentage strings
# rounded to one decimal place.
print('Whole dataset race proportions')
(
    synms_gender_preds_df['race']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

Hole dataset race proportions


race
White              19.0%
Latino_Hispanic    14.8%
Black              14.2%
East Asian         14.2%
Indian             13.8%
Southeast Asian    12.9%
Middle Eastern     11.0%
Name: proportion, dtype: object

In [9]:
# Absolute number of rows per race label.
synms_gender_preds_df['race'].value_counts()

race
White              2085
Latino_Hispanic    1623
Black              1556
East Asian         1550
Indian             1516
Southeast Asian    1415
Middle Eastern     1209
Name: count, dtype: int64

In [10]:
# Share of each ground-truth gender label across the whole frame, as
# percentage strings rounded to one decimal place.
print('Whole dataset gender proportions')
(
    synms_gender_preds_df['gender']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

Hole dataset gender proportions


gender
Male      52.9%
Female    47.1%
Name: proportion, dtype: object

In [11]:
# Absolute number of rows per ground-truth gender label.
synms_gender_preds_df['gender'].value_counts()

gender
Male      5792
Female    5162
Name: count, dtype: int64

### Binary Gender Predictions

In [12]:
# Share of each predicted synonym prompt, as percentage strings rounded to
# one decimal place.
print('Gender predictions proportions')
(
    synms_gender_preds_df['gender_preds']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

Gender predictions proportions


gender_preds
middle-aged man    26.5%
woman              13.1%
young woman        13.0%
boy                10.8%
girl                6.9%
grandmother         6.3%
young man           6.1%
guy                 4.5%
madam               3.8%
man                 2.4%
grandfather         2.3%
male                1.7%
old woman           1.5%
female              0.4%
old man             0.4%
grandma             0.2%
grandpa             0.2%
lady                0.0%
adult male          0.0%
adult female        0.0%
Name: proportion, dtype: object

In [13]:
# Absolute number of rows per predicted synonym prompt.
synms_gender_preds_df['gender_preds'].value_counts()

gender_preds
middle-aged man    2904
woman              1433
young woman        1422
boy                1182
girl                758
grandmother         689
young man           663
guy                 489
madam               418
man                 262
grandfather         251
male                185
old woman           165
female               40
old man              39
grandma              25
grandpa              25
lady                  2
adult male            1
adult female          1
Name: count, dtype: int64

In [14]:
def synms_to_gender(synm, labels=None):
    """Map a predicted synonym prompt to its binary gender class.

    Parameters
    ----------
    synm : str
        Predicted prompt, e.g. ``'young man'``.
    labels : dict, optional
        Mapping with ``'Male'`` and ``'Female'`` keys, each holding the list
        of synonym prompts for that class. Defaults to the notebook-level
        ``data`` dict loaded from the labels JSON.

    Returns
    -------
    str
        ``'Male'`` or ``'Female'``.

    Raises
    ------
    ValueError
        If ``synm`` appears in neither list (unknown prompt, typo, NaN).
        Previously such values were silently labelled ``'Female'``, which
        would distort the accuracy figures computed from the result.
    """
    if labels is None:
        labels = data

    # 'Male' is checked first, matching the original precedence.
    if synm in labels['Male']:
        return 'Male'
    if synm in labels['Female']:
        return 'Female'
    raise ValueError(
        f"Unknown gender synonym {synm!r}: not listed under 'Male' or 'Female'"
    )

In [15]:
# New frame with the same columns, where each predicted synonym prompt in
# `gender_preds` is replaced by the class returned by synms_to_gender.
binary_synms_gender_preds_df = synms_gender_preds_df.assign(
    gender_preds=synms_gender_preds_df['gender_preds'].map(synms_to_gender)
)

In [16]:
# Inspect the frame whose `gender_preds` column now holds the labels produced
# by synms_to_gender instead of the raw synonym prompts.
binary_synms_gender_preds_df

Unnamed: 0,file,age,gender,race,service_test,gender_preds
0,val/1.jpg,3-9,Male,East Asian,False,Male
1,val/2.jpg,50-59,Female,East Asian,True,Female
2,val/3.jpg,30-39,Male,White,True,Male
3,val/4.jpg,20-29,Female,Latino_Hispanic,True,Female
4,val/5.jpg,20-29,Male,Southeast Asian,False,Male
...,...,...,...,...,...,...
10949,val/10950.jpg,30-39,Male,White,True,Male
10950,val/10951.jpg,50-59,Male,White,False,Male
10951,val/10952.jpg,60-69,Male,Latino_Hispanic,False,Male
10952,val/10953.jpg,20-29,Female,East Asian,False,Female


In [17]:
# Rows where the mapped prediction disagrees with the ground-truth gender.
gender_preds_misses = binary_synms_gender_preds_df.loc[
    binary_synms_gender_preds_df['gender'].ne(
        binary_synms_gender_preds_df['gender_preds']
    )
]

In [18]:
# Overall accuracy of the mapped predictions against the ground-truth gender,
# reported alongside the number of mismatching rows.
gender_preds_acc = accuracy_score(
    y_true=binary_synms_gender_preds_df['gender'],
    y_pred=binary_synms_gender_preds_df['gender_preds'],
)
print(f"gender_preds misses count: {gender_preds_misses.shape[0]}")
print(f"gender_preds accuracy score: {round(gender_preds_acc, 2)}")

gender_preds misses count: 579
gender_preds accuracy score: 0.95


In [19]:
# Accuracy restricted to rows whose ground-truth gender is 'Male'.
male_rows = binary_synms_gender_preds_df.loc[
    binary_synms_gender_preds_df['gender'] == 'Male'
]
male_preds_acc = accuracy_score(male_rows['gender'], male_rows['gender_preds'])
print(f"male predictions accuracy score: {round(male_preds_acc, 2)}")

male predictions accuracy score: 0.97


In [20]:
# Accuracy restricted to rows whose ground-truth gender is 'Female'.
female_rows = binary_synms_gender_preds_df.loc[
    binary_synms_gender_preds_df['gender'] == 'Female'
]
female_preds_acc = accuracy_score(female_rows['gender'], female_rows['gender_preds'])
print(f"female predictions accuracy score: {round(female_preds_acc, 2)}")

female predictions accuracy score: 0.92


In [21]:
# Accuracy restricted to rows whose race label is 'White'.
# (A discarded mid-cell expression that filtered the mismatching rows was
# removed: its result was neither displayed nor assigned.)
white_df = binary_synms_gender_preds_df[binary_synms_gender_preds_df['race'] == 'White']
white_preds_acc = accuracy_score(white_df['gender'], white_df['gender_preds'])

print(f"White predictions accuracy score: {round(white_preds_acc, 2)}")

White predictions accuracy score: 0.95


In [22]:
# Accuracy per race label, visiting labels in the order `unique()` yields them.
for race in binary_synms_gender_preds_df['race'].unique():
    race_df = binary_synms_gender_preds_df.loc[
        binary_synms_gender_preds_df['race'].eq(race)
    ]
    race_preds_acc = accuracy_score(
        y_true=race_df['gender'], y_pred=race_df['gender_preds']
    )
    print(f"{race} predictions accuracy: {round(race_preds_acc, 2)}")

East Asian predictions accuracy: 0.95
White predictions accuracy: 0.95
Latino_Hispanic predictions accuracy: 0.95
Southeast Asian predictions accuracy: 0.95
Black predictions accuracy: 0.91
Indian predictions accuracy: 0.95
Middle Eastern predictions accuracy: 0.97


In [23]:
# Accuracy per age bucket, visiting buckets in the order `unique()` yields them.
for age in binary_synms_gender_preds_df['age'].unique():
    age_df = binary_synms_gender_preds_df.loc[
        binary_synms_gender_preds_df['age'].eq(age)
    ]
    age_preds_acc = accuracy_score(
        y_true=age_df['gender'], y_pred=age_df['gender_preds']
    )
    print(f"Ages {age} predictions accuracy: {round(age_preds_acc, 2)}")

Ages 3-9 predictions accuracy: 0.86
Ages 50-59 predictions accuracy: 0.97
Ages 30-39 predictions accuracy: 0.98
Ages 20-29 predictions accuracy: 0.97
Ages more than 70 predictions accuracy: 0.97
Ages 40-49 predictions accuracy: 0.98
Ages 10-19 predictions accuracy: 0.89
Ages 60-69 predictions accuracy: 0.97
Ages 0-2 predictions accuracy: 0.81
