In [179]:
import numpy as np 
import pandas as pd 
import os
import glob                      # Unix style pathname pattern expansion
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [180]:
DATA_DIR = "./UTKface_Aligned_cropped/testing"
TRAIN_TEST_SPLIT = 0.7
IM_WIDTH = IM_HEIGHT = 224
ID_GENDER_MAP = {0: 'male', 1: 'female'}
GENDER_ID_MAP = dict((g, i) for i, g in ID_GENDER_MAP.items())
ID_RACE_MAP = {0: 'white', 1: 'black', 2: 'asian', 3: 'indian', 4: 'others'}
RACE_ID_MAP = dict((r, i) for i, r in ID_RACE_MAP.items())

ID_GENDER_MAP, GENDER_ID_MAP, ID_RACE_MAP, RACE_ID_MAP

({0: 'male', 1: 'female'},
 {'male': 0, 'female': 1},
 {0: 'white', 1: 'black', 2: 'asian', 3: 'indian', 4: 'others'},
 {'white': 0, 'black': 1, 'asian': 2, 'indian': 3, 'others': 4})

In [181]:
DATA_DIR = "./UTKface_Aligned_cropped/UTKFace"
TRAIN_TEST_SPLIT = 0.7
IM_WIDTH = IM_HEIGHT = 198
ID_GENDER_MAP = {0: 'male', 1: 'female'}
GENDER_ID_MAP = dict((g, i) for i, g in ID_GENDER_MAP.items())
ID_RACE_MAP = {0: 'white', 1: 'black', 2: 'asian', 3: 'indian', 4: 'others'}
RACE_ID_MAP = dict((r, i) for i, r in ID_RACE_MAP.items())

ID_GENDER_MAP, GENDER_ID_MAP, ID_RACE_MAP, RACE_ID_MAP

({0: 'male', 1: 'female'},
 {'male': 0, 'female': 1},
 {0: 'white', 1: 'black', 2: 'asian', 3: 'indian', 4: 'others'},
 {'white': 0, 'black': 1, 'asian': 2, 'indian': 3, 'others': 4})

In [182]:
def parse_filepath(filepath):
    try:
        path, filename = os.path.split(filepath)
        filename, ext = os.path.splitext(filename)
        age, gender, race, _ = filename.split("_")
        return int(age), ID_GENDER_MAP[int(gender)], ID_RACE_MAP[int(race)]
    except Exception as e:
        print(filepath)
        return None, None, None

In [183]:
# create a pandas data frame of images, age, gender and race
files = glob.glob(os.path.join(DATA_DIR, "*.jpg"))
# getting attributes
attributes = list(map(parse_filepath, files))

In [184]:
df = pd.DataFrame(attributes)
df['file'] = files
df.columns = ['age', 'gender', 'race', 'file']
df = df.dropna()
age_class = []
for i in df['age']:
    i = int(i)
    if i < 18:
        age_class.append(0)
    elif (18<=i<=24):
        age_class.append(1)
    elif (25<=i<=34):
        age_class.append(2)
    elif (35<=i<=44):
        age_class.append(3)
    elif (45<=i<=54):
        age_class.append(4)
    elif (55<=i<=64):
        age_class.append(5)
    elif (65<=i<=75):
        age_class.append(6)
    else:
        age_class.append(7)
        
# creating age class
# Age groups = [‘-18',’18-24', '25-34', '35-44','45-54', '55-64', '65-75', '75+’]
df['age_class'] = age_class
# changing column orders
df = df[['age_class','gender', 'race', 'file', 'age']]

df.sample(10)

Unnamed: 0,age_class,gender,race,file,age
12347,2,female,black,./UTKface_Aligned_cropped/UTKFace\33_1_1_20170...,33
15886,3,male,white,./UTKface_Aligned_cropped/UTKFace\41_0_0_20170...,41
18526,4,male,indian,./UTKface_Aligned_cropped/UTKFace\52_0_3_20170...,52
19463,5,female,indian,./UTKface_Aligned_cropped/UTKFace\55_1_3_20170...,55
15821,3,female,asian,./UTKface_Aligned_cropped/UTKFace\40_1_2_20170...,40
23085,7,female,white,./UTKface_Aligned_cropped/UTKFace\88_1_0_20170...,88
17902,4,male,white,./UTKface_Aligned_cropped/UTKFace\50_0_0_20170...,50
2414,0,female,asian,./UTKface_Aligned_cropped/UTKFace\1_1_2_201612...,1
16494,3,female,white,./UTKface_Aligned_cropped/UTKFace\44_1_0_20170...,44
2068,0,male,indian,./UTKface_Aligned_cropped/UTKFace\1_0_3_201612...,1


In [185]:
df.groupby(['race','gender']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,age_class,file,age
race,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
asian,female,1861,1861,1861
asian,male,1575,1575,1575
black,female,2208,2208,2208
black,male,2318,2318,2318
indian,female,1715,1715,1715
indian,male,2261,2261,2261
others,female,932,932,932
others,male,760,760,760
white,female,4599,4599,4599
white,male,5475,5475,5475


In [235]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.13, random_state=42)

In [236]:
for train_index, test_index in split.split(df, df['gender']):
    print("TRAIN:", train_index, "TEST:", test_index)
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

TRAIN: [  755 18581  5857 ... 16906  8883 19574] TEST: [22051  8596 19003 ...  1851  6514 16079]


In [239]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
for train_index, valid_index in split.split(strat_train_set, strat_train_set['gender']):
    print("TRAIN:", train_index, "TEST:", valid_index)
    strat_train_set = df.loc[train_index]
    strat_valid_set = df.loc[valid_index]

TRAIN: [ 6944  5722 12581 ...  1204  9294 16704] TEST: [ 3767  7005  3371 ... 18321  9823 18490]


In [279]:
print("Train dataset: %.2f percent" %(strat_train_set.shape[0]/(strat_train_set.shape[0]+strat_valid_set.shape[0] + strat_test_set.shape[0])*100))
strat_train_set.groupby(['gender']).count()

Train dataset: 73.95 percent


Unnamed: 0_level_0,age_class,race,file,age
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,8388,8388,8388,8388
male,9140,9140,9140,9140


In [280]:
print("Validation dataset: %.2f percent" %(strat_valid_set.shape[0]/(strat_train_set.shape[0]+strat_valid_set.shape[0] + strat_test_set.shape[0])*100))
strat_valid_set.groupby(['gender']).count()

Validation dataset: 13.05 percent


Unnamed: 0_level_0,age_class,race,file,age
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,1510,1510,1510,1510
male,1584,1584,1584,1584


In [281]:
print("Test dataset: %.2f percent" %(strat_test_set.shape[0]/(strat_train_set.shape[0]+strat_valid_set.shape[0] + strat_test_set.shape[0])*100))
strat_test_set.groupby(['gender']).count()

Test dataset: 13.00 percent


Unnamed: 0_level_0,age_class,race,file,age
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,1471,1471,1471,1471
male,1611,1611,1611,1611
