# Gender Predictions Exploratory Data Analysis

## Imports

In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score

In [3]:
# Load the gender-prediction CSV; the fface_train.csv load stays disabled.
# fface_df = pd.read_csv("../data/fface_train.csv")
predictions_path = "../data/fairface-gender-predictions.csv"
gender_preds_df = pd.read_csv(predictions_path)

## Preprocess

In [4]:
def prepare_age(df):
    """Return a copy of ``df`` whose ``age`` column holds numeric midpoints.

    Each age label is converted as follows:

    * ``'more than 70'`` becomes ``75.0``;
    * a ``'low-high'`` range becomes the mean of its two bounds
      (``'20-29'`` -> ``24.5``);
    * missing values become ``NaN``;
    * values that are already numeric are kept as floats, so calling the
      function again on its own output does not fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``age`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``age`` stored as ``float64``.

    Raises
    ------
    ValueError
        If a string label is neither ``'more than 70'`` nor a
        dash-separated list of integers.
    """
    def _midpoint(label):
        # Missing labels have no range to average.
        if pd.isna(label):
            return np.nan
        # Already-converted values pass through unchanged.
        if not isinstance(label, str):
            return float(label)
        if label == 'more than 70':
            return 75.0
        bounds = [int(num) for num in label.split('-')]
        return float(np.mean(bounds))

    new_df = df.copy()
    # One vectorised pass; the explicit cast gives a single numeric dtype
    # instead of an object column mixing ints and floats.
    new_df['age'] = new_df['age'].map(_midpoint).astype(float)
    return new_df

In [5]:
# Read the 'synms' entry of the man-synonyms JSON file.
with open('../data/man-synms.json', encoding='utf-8') as man_file:
    man_payload = json.load(man_file)
man_synms = man_payload['synms']

In [6]:
# Read the 'synms' entry of the woman-synonyms JSON file.
with open('../data/woman-synms.json', encoding='utf-8') as woman_file:
    woman_payload = json.load(woman_file)
woman_synms = woman_payload['synms']

In [7]:
# Labels that appear in both synonym lists. Sorted so the list order is the
# same on every kernel run (plain set iteration order is arbitrary).
ambiguous_synms = sorted(set(man_synms) & set(woman_synms))

In [8]:
def synm_to_gender(synm):
    """Collapse a synonym label into a binary gender label.

    Returns ``'Female'`` when ``synm`` is found in ``woman_synms``. Every
    other value, including labels found in neither synonym list, falls back
    to ``'Male'``.
    """
    return 'Female' if synm in woman_synms else 'Male'

In [9]:
# fface_df = prepare_age(fface_df)
# Replace the age-range labels with their numeric midpoints.
gender_preds_df = gender_preds_df.pipe(prepare_age)

Dropping the ambiguous label 'adult', which is present in both the man and woman synonym lists. It has only 22 occurrences, not enough to have an impact.

In [28]:
# Rows whose synonym prediction is the ambiguous label 'adult'.
# Once the drop cell below has run, this selection is empty.
adult_rows = gender_preds_df[gender_preds_df['synms_gender_preds'] == 'adult']
print(adult_rows.count())
print(adult_rows['race'].value_counts())

file                  0
age                   0
race                  0
gender                0
gender_preds          0
synms_gender_preds    0
dtype: int64
Series([], Name: count, dtype: int64)


In [17]:
# Keep only the rows whose synonym prediction is not the label 'adult'.
is_adult = gender_preds_df['synms_gender_preds'] == 'adult'
gender_preds_df = gender_preds_df[~is_adult]

## Initial EDA

In [18]:
# Print the (rows, columns) shape, then preview the first rows.
frame_shape = gender_preds_df.shape
print(frame_shape)
gender_preds_df.head(5)

(86722, 6)


Unnamed: 0,file,age,race,gender,gender_preds,synms_gender_preds
0,train/1.jpg,54.5,East Asian,Male,Male,middle-aged man
1,train/2.jpg,34.5,Indian,Female,Female,shikse
2,train/3.jpg,6.0,Black,Female,Female,young woman
3,train/4.jpg,24.5,Indian,Female,Female,mestiza
4,train/5.jpg,24.5,Indian,Female,Female,mestiza


In [19]:
# Share of each race value across all rows, shown as percentage strings.
print('Whole dataset race proportions')
gender_preds_df.race.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

Hole dataset race proportions


race
White              19.1%
Latino_Hispanic    15.4%
Indian             14.2%
East Asian         14.2%
Black              14.1%
Southeast Asian    12.4%
Middle Eastern     10.6%
Name: proportion, dtype: object

In [20]:
# Share of each gender value across all rows, shown as percentage strings.
print('Whole dataset gender proportions')
gender_preds_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

Hole dataset gender proportions


gender
Male      53.0%
Female    47.0%
Name: proportion, dtype: object

### Binary Gender Predictions

In [21]:
# Share of each binary prediction value, shown as percentage strings.
print('Gender predictions proportions')
binary_pred_share = gender_preds_df['gender_preds'].value_counts(normalize=True)
(binary_pred_share * 100).round(1).astype(str) + '%'

Gender predictions proportions


gender_preds
Male      51.0%
Female    49.0%
Name: proportion, dtype: object

In [47]:
# Rows where the binary prediction differs from the gender column.
binary_is_miss = gender_preds_df['gender'].ne(gender_preds_df['gender_preds'])
gender_preds_misses = gender_preds_df[binary_is_miss]
# Hide the synonym column, which is not needed for this preview.
gender_preds_misses.drop(columns='synms_gender_preds').head()

Unnamed: 0,file,age,race,gender,gender_preds
5,train/6.jpg,24.5,White,Male,Female
7,train/8.jpg,34.5,Indian,Female,Male
15,train/16.jpg,34.5,White,Female,Male
18,train/19.jpg,1.0,Black,Female,Male
29,train/30.jpg,24.5,Black,Female,Male


In [23]:
# Accuracy of the binary predictions against the gender column.
true_gender = gender_preds_df['gender']
predicted_gender = gender_preds_df['gender_preds']
gender_preds_acc = accuracy_score(true_gender, predicted_gender)
print(f"gender_preds misses count: {len(gender_preds_misses)}")
print(f"gender_preds accuracy score: {round(gender_preds_acc, 2)}")

gender_preds misses count: 8466
gender_preds accuracy score: 0.9


In [24]:
# Race breakdown of the rows the binary prediction got wrong.
print('gender_preds misses by race')
binary_miss_race_share = gender_preds_misses['race'].value_counts(normalize=True)
(binary_miss_race_share * 100).round(1).astype(str) + '%'

gender_preds misses by race


race
Black              22.3%
Southeast Asian    15.4%
East Asian         14.3%
White              14.2%
Indian             14.2%
Latino_Hispanic    12.3%
Middle Eastern      7.2%
Name: proportion, dtype: object

In [25]:
# Gender breakdown of the rows the binary prediction got wrong.
print('gender_preds misses by gender')
binary_miss_gender_share = gender_preds_misses['gender'].value_counts(normalize=True)
(binary_miss_gender_share * 100).round(1).astype(str) + '%'

gender_preds misses by gender


gender
Male      60.5%
Female    39.5%
Name: proportion, dtype: object

### Synonyms Gender Predictions

In [43]:
# Map each synonym prediction to 'Female'/'Male' via synm_to_gender.
synms_preds_series = gender_preds_df.synms_gender_preds
binary_synms_preds_series = synms_preds_series.map(synm_to_gender)
gender_series = gender_preds_df.gender

In [48]:
# Rows where the binarised synonym prediction differs from the gender column.
synms_is_miss = gender_preds_df['gender'].ne(binary_synms_preds_series)
synms_gender_preds_misses = gender_preds_df[synms_is_miss]
# Hide the binary prediction column, which is not needed for this preview.
synms_gender_preds_misses.drop(columns='gender_preds').head()

Unnamed: 0,file,age,race,gender,synms_gender_preds
5,train/6.jpg,24.5,White,Male,mestiza
8,train/9.jpg,14.5,White,Male,bas bleu
18,train/19.jpg,1.0,Black,Female,babu
21,train/22.jpg,34.5,Middle Eastern,Female,staff
26,train/27.jpg,54.5,White,Female,occupier


In [45]:
# Accuracy of the binarised synonym predictions against the gender column.
synms_true_gender = gender_preds_df['gender']
synms_gender_preds_acc = accuracy_score(
    y_true=synms_true_gender, y_pred=binary_synms_preds_series)
print(f"synms_gender_preds misses count: {len(synms_gender_preds_misses)}")
print(f"synms_gender_preds accuracy score: {round(synms_gender_preds_acc, 2)}")

synms_gender_preds misses count: 17752
synms_gender_preds accuracy score: 0.8


In [46]:
# Share of each binarised synonym prediction, shown as percentage strings.
print('Synonyms gender predictions proportions')
synms_pred_share = binary_synms_preds_series.value_counts(normalize=True)
(synms_pred_share * 100).round(1).astype(str) + '%'

Synonyms gender predictions proportions


synms_gender_preds
Male      55.0%
Female    45.0%
Name: proportion, dtype: object

In [49]:
# Race breakdown of the rows the synonym prediction got wrong.
print('synms_gender_preds misses by race')
synms_miss_race_share = synms_gender_preds_misses['race'].value_counts(normalize=True)
(synms_miss_race_share * 100).round(1).astype(str) + '%'

synms_gender_preds misses by race


race
White              18.7%
Black              17.3%
Indian             16.7%
Latino_Hispanic    13.8%
East Asian         12.5%
Southeast Asian    12.3%
Middle Eastern      8.6%
Name: proportion, dtype: object

In [50]:
# Gender breakdown of the rows the synonym prediction got wrong.
print('synms_gender_preds misses by gender')
synms_miss_gender_share = synms_gender_preds_misses['gender'].value_counts(normalize=True)
(synms_miss_gender_share * 100).round(1).astype(str) + '%'

synms_gender_preds misses by gender


gender
Female    54.8%
Male      45.2%
Name: proportion, dtype: object

## Age evaluation

In [51]:
# Share of each age value across all rows, shown as percentage strings.
print('Whole dataset age proportions')
gender_preds_df.age.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

Hole dataset age proportions


age
24.5    29.5%
34.5    22.2%
44.5    12.4%
6.0     12.0%
14.5    10.5%
54.5     7.2%
64.5     3.2%
1.0      2.1%
75       1.0%
Name: proportion, dtype: object

In [52]:
# Age breakdown of the rows the binary prediction got wrong.
print('gender_preds misses by age')
binary_miss_age_share = gender_preds_misses['age'].value_counts(normalize=True)
(binary_miss_age_share * 100).round(1).astype(str) + '%'

gender_preds misses by age


age
6.0     30.9%
24.5    18.0%
14.5    17.0%
34.5    11.4%
1.0      7.8%
44.5     6.9%
54.5     3.8%
64.5     2.8%
75       1.3%
Name: proportion, dtype: object

In [54]:
# Age breakdown of the rows the synonym prediction got wrong.
print('synms_gender_preds misses by age')
synms_miss_age_share = synms_gender_preds_misses['age'].value_counts(normalize=True)
(synms_miss_age_share * 100).round(1).astype(str) + '%'

synms_gender_preds misses by age


age
24.5    24.6%
34.5    19.0%
6.0     16.6%
14.5    12.6%
44.5    11.8%
54.5     7.0%
64.5     3.8%
1.0      3.3%
75       1.3%
Name: proportion, dtype: object

In [16]:
# TODO: make some plots of distributions