# Exploratory Data Analysis

## Imports

In [95]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [96]:
# Root folder of the FairFace dataset. Set the FAIRFACE_DIR environment
# variable to run this notebook on another machine; the default is the
# original author's local checkout.
path = os.environ.get(
    "FAIRFACE_DIR", "/home/lazye/Documents/ufrgs/mcs/datasets/FairFace/"
)
# os.path.join copes with a root that does or does not end in a slash,
# so the resulting path never contains a doubled separator.
fface_df = pd.read_csv(os.path.join(path, "train", "fairface_label_train.csv"))

In [97]:
# Peek at the first five label rows to check the columns that were loaded.
fface_df.head(n=5)

Unnamed: 0,file,age,gender,race,service_test
0,train/1.jpg,50-59,Male,East Asian,True
1,train/2.jpg,30-39,Female,Indian,False
2,train/3.jpg,3-9,Female,Black,False
3,train/4.jpg,20-29,Female,Indian,True
4,train/5.jpg,20-29,Female,Indian,True


In [98]:
# Load the two "misses" tables. pandas labels a header-less CSV column
# 'Unnamed: 0'; it is renamed to 'index' right after reading.
# NOTE(review): that column looks like the row position in the full label
# file (e.g. 5 -> train/6.jpg) -- confirm against whatever wrote these CSVs.
woman_miss_df = pd.read_csv('woman_misses.csv').rename(
    columns={"Unnamed: 0": "index"}
)
man_miss_df = pd.read_csv('man_misses.csv').rename(
    columns={'Unnamed: 0': 'index'}
)

## Preprocess

In [99]:
# List the distinct raw age labels before converting them to numbers.
man_miss_df['age'].unique()

array(['20-29', '30-39', '50-59', '0-2', '3-9', '10-19', '40-49', '60-69',
       'more than 70'], dtype=object)

In [100]:
def prepare_age(df):
    """Return a copy of ``df`` whose ``age`` column holds numeric midpoints.

    Each age-range label such as ``'20-29'`` is replaced by the mean of its
    bounds (``24.5``). The open-ended label ``'more than 70'`` becomes ``75``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``age`` column of range labels. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``age`` stored as float64. Missing ages stay NaN.
        Values that are already numeric are kept as they are, so calling the
        function again on its own output is safe.

    Raises
    ------
    ValueError
        If a string label is neither ``'more than 70'`` nor dash-separated
        integers.
    """
    def _midpoint(label):
        # Already-converted values pass through (makes the function idempotent).
        if not isinstance(label, str):
            return float(label)
        # The open-ended bucket has no upper bound; 75 is its stand-in value.
        if label == 'more than 70':
            return 75.0
        return float(np.mean([int(bound) for bound in label.split('-')]))

    new_df = df.copy()

    # Convert each distinct label once and map it over the column. This does
    # not depend on the index, so duplicate index labels are handled correctly.
    midpoints = {
        label: _midpoint(label) for label in new_df['age'].dropna().unique()
    }
    # astype(float) gives a true numeric column instead of an object column
    # mixing Python ints and numpy floats.
    new_df['age'] = new_df['age'].map(midpoints).astype(float)

    return new_df

In [101]:
# Swap the age-range labels for numeric values in all three frames.
woman_miss_df, man_miss_df, fface_df = (
    prepare_age(frame) for frame in (woman_miss_df, man_miss_df, fface_df)
)

## Initial EDA

In [102]:
# Print (rows, columns) of man_miss_df, then display its first rows.
man_miss_shape = man_miss_df.shape
print(man_miss_shape)
man_miss_df.head(n=5)

(5120, 6)


Unnamed: 0,index,file,age,gender,race,service_test
0,5,train/6.jpg,24.5,Male,White,True
1,37,train/38.jpg,34.5,Male,Latino_Hispanic,True
2,94,train/95.jpg,24.5,Male,Black,False
3,104,train/105.jpg,54.5,Male,East Asian,False
4,123,train/124.jpg,1.0,Male,Indian,False


In [103]:
# Print (rows, columns) of woman_miss_df, then display its first rows.
woman_miss_shape = woman_miss_df.shape
print(woman_miss_shape)
woman_miss_df.head(n=5)

(3351, 6)


Unnamed: 0,index,file,age,gender,race,service_test
0,7,train/8.jpg,34.5,Female,Indian,True
1,15,train/16.jpg,34.5,Female,White,False
2,18,train/19.jpg,1.0,Female,Black,False
3,29,train/30.jpg,24.5,Female,Black,False
4,53,train/54.jpg,34.5,Female,East Asian,False


In [104]:
# Share of each race label in the full training label table, shown as a
# percentage string rounded to one decimal place.
print('Whole dataset race proportions')
fface_df.race.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

Hole dataset race proportions


race
White              19.1%
Latino_Hispanic    15.4%
Indian             14.2%
East Asian         14.2%
Black              14.1%
Southeast Asian    12.4%
Middle Eastern     10.6%
Name: proportion, dtype: object

In [105]:
# Percentage of man_miss_df rows per race label, one decimal place.
print('Man misses by race')
man_race_share = man_miss_df['race'].value_counts(normalize=True)
(man_race_share * 100).round(1).astype(str) + '%'

Man misses by race


race
Black              21.7%
Southeast Asian    18.7%
East Asian         15.9%
Indian             15.3%
White              11.4%
Latino_Hispanic    10.8%
Middle Eastern      6.2%
Name: proportion, dtype: object

In [106]:
# Percentage of woman_miss_df rows per race label, one decimal place.
print('Woman misses by race')
woman_race_share = woman_miss_df['race'].value_counts(normalize=True)
(woman_race_share * 100).round(1).astype(str) + '%'

Woman misses by race


race
Black              23.3%
White              18.4%
Latino_Hispanic    14.7%
Indian             12.7%
East Asian         11.9%
Southeast Asian    10.2%
Middle Eastern      8.8%
Name: proportion, dtype: object

In [107]:
# Share of each age value in the full training label table, shown as a
# percentage string rounded to one decimal place.
print('Whole dataset age proportions')
fface_df.age.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

Hole dataset age proportions


age
24.5    29.5%
34.5    22.2%
44.5    12.4%
6.0     12.0%
14.5    10.5%
54.5     7.2%
64.5     3.2%
1.0      2.1%
75       1.0%
Name: proportion, dtype: object

In [108]:
# Percentage of man_miss_df rows per age value, one decimal place.
print('Man misses by age')

man_age_share = man_miss_df['age'].value_counts(normalize=True)
(man_age_share * 100).round(1).astype(str) + '%'

Man misses by age


age
6.0     40.5%
14.5    17.4%
24.5    13.5%
1.0     11.4%
34.5     7.4%
44.5     4.3%
54.5     2.4%
64.5     1.9%
75       1.2%
Name: proportion, dtype: object

In [109]:
# Percentage of woman_miss_df rows per age value, one decimal place.
print('Woman misses by age')

woman_age_share = woman_miss_df['age'].value_counts(normalize=True)
(woman_age_share * 100).round(1).astype(str) + '%'

Woman misses by age


age
24.5    25.2%
34.5    17.3%
14.5    16.6%
6.0     16.2%
44.5    10.9%
54.5     5.9%
64.5     4.1%
1.0      2.3%
75       1.4%
Name: proportion, dtype: object

In [110]:
# TODO: make some plots of distributions