## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import PIL
from PIL import Image
from sklearn.model_selection import train_test_split
import random
import pickle
from ast import literal_eval
from operator import itemgetter
import matplotlib.pyplot as plt
import os

## Data Augmentation


In [3]:
# Root folders whose immediate sub directories hold the face images.
main_dir_names = ['Adience Gender Dataset/aligned']

# Prefixes given to the augmented copies written below. Files that already
# carry one of them are skipped, so re-running this cell never augments its
# own output (which would create e.g. 'grayscale_grayscale_<name>.jpg').
augmented_prefixes = ('grayscale_', 'hori_flip_', 'vert_flip_', 'rotated_')

for main_dir_name in main_dir_names:
    sub_dir_names = os.listdir(main_dir_name)
    count = 1
    for sub_dir in sub_dir_names:
        # Deliberately not called `dir_name`: that global is the output folder
        # save_pickle() writes to, and must not be overwritten by this loop.
        sub_dir_path = main_dir_name + '/' + sub_dir
        if not os.path.isdir(sub_dir_path):
            # A stray file next to the sub directories would make listdir raise.
            continue

        for filename in os.listdir(sub_dir_path):
            # because there are txt files in 'faces'
            if not filename.endswith('.jpg'):
                continue
            if filename.startswith(augmented_prefixes):
                continue

            # The context manager releases the file handle for every image.
            with Image.open(sub_dir_path + '/' + filename) as image:
                # Grayscale img
                gs_img = image.convert(mode = 'L')
                gs_img.save(sub_dir_path + '/grayscale_' + filename)

                # Flip img
                hori_img = image.transpose(Image.FLIP_LEFT_RIGHT)
                vert_img = image.transpose(Image.FLIP_TOP_BOTTOM)
                hori_img.save(sub_dir_path + '/hori_flip_' + filename)
                vert_img.save(sub_dir_path + '/vert_flip_' + filename)

                # Rotate img
                rotate_img = image.rotate(90)
                rotate_img.save(sub_dir_path + '/rotated_' + filename)

        # Progress tracker - 168 sub directories in total
        print(main_dir_name.split('/')[-1] + " - Sub directory " 
              + str(count) + " '" + sub_dir + "': Done" )
        count += 1

aligned - Sub directory 1 '100003415@N08': Done
aligned - Sub directory 2 '10001312@N04': Done
aligned - Sub directory 3 '100014826@N03': Done
aligned - Sub directory 4 '10008401@N05': Done
aligned - Sub directory 5 '100346410@N05': Done
aligned - Sub directory 6 '10044155@N06': Done
aligned - Sub directory 7 '10058630@N06': Done
aligned - Sub directory 8 '10062073@N07': Done
aligned - Sub directory 9 '10069023@N00': Done
aligned - Sub directory 10 '101071073@N04': Done
aligned - Sub directory 11 '10113099@N03': Done
aligned - Sub directory 12 '10123180@N04': Done
aligned - Sub directory 13 '101295462@N02': Done
aligned - Sub directory 14 '10129575@N03': Done
aligned - Sub directory 15 '10148140@N07': Done
aligned - Sub directory 16 '101515718@N03': Done
aligned - Sub directory 17 '101532586@N07': Done
aligned - Sub directory 18 '101560979@N02': Done
aligned - Sub directory 19 '101591466@N03': Done
aligned - Sub directory 20 '101636677@N08': Done
aligned - Sub directory 21 '10171175@N0

## Initialisations

In [4]:
# Output folder for the pickled datasets; save_pickle() prepends this path,
# so it keeps its trailing slash. Created on first run, left alone afterwards.
dir_name = './processed/'
os.makedirs(dir_name, exist_ok=True)

## Pickle file conversions

In [5]:
def load_pickle(filename):
  """Read back the object stored in ``filename + '.pkl'``.

  ``filename`` is used as given (no folder is prepended, unlike save_pickle),
  with the '.pkl' extension appended.

  NOTE: unpickling can execute arbitrary code - only load files you created
  yourself or otherwise trust.
  """
  pkl_path = filename + '.pkl'
  with open(pkl_path, 'rb') as handle:
    contents = pickle.load(handle)
  return contents

In [6]:
def save_pickle(pkl, filename, directory=None):
  """Pickle ``pkl`` to ``<directory>/<filename>.pkl``.

  Args:
    pkl: The object to serialise.
    filename: File name without the '.pkl' extension.
    directory: Folder to write into. When omitted, the notebook-level
      ``dir_name`` is used, which is the original behaviour. Passing it
      explicitly avoids depending on whatever ``dir_name`` currently holds.
  """
  if directory is None:
    directory = dir_name
  # os.path.join copes with folders given with or without a trailing slash.
  target = os.path.join(directory, filename + '.pkl')
  with open(target, 'wb') as f:
    pickle.dump(pkl, f, pickle.HIGHEST_PROTOCOL)

## Age range conversion

In [16]:
def get_age(age_tuple):
  """Return the index of the age bucket that best matches ``age_tuple``.

  Each bucket is compared with ``age_tuple`` through
  ``abs(age_tuple - bucket)``; only the second component of that difference
  (the distance to the bucket's upper bound) decides the winner. When two
  buckets are equally close, the later one in the list is chosen.

  Returns -1 if every bucket's upper-bound distance exceeds 100.
  """
  age_ranges = [(0,2), (4,6), (8,13), (15,20), (25,32), (38,43), (48,53), (60,100)]
  best_index = -1
  best_upper_gap = 100
  for index, bounds in enumerate(age_ranges):
    gaps = np.absolute(np.subtract(age_tuple, bounds))
    upper_gap = gaps[1]
    # Keep the current best only when it is strictly closer; ties move on
    # to the later bucket.
    if best_upper_gap < upper_gap:
      continue
    best_upper_gap = upper_gap
    best_index = index
  return best_index

## Train-test split

In [17]:
# Folder that receives every CSV produced in this cell.
train_test_dir = './train_test_processed/'
os.makedirs(train_test_dir, exist_ok=True)

# Train, test
for fold in range(5):
  df = pd.read_csv("./Adience Gender Dataset/fold_{}_data.txt".format(fold), sep = "\t")

  # Cleaning: keep rows with a usable age label and a known gender
  has_age = (df['age'] != 'None') & (df['age'] != ' ')
  has_gender = df['gender'].notnull() & (df['gender'] != ' ') & (df['gender'] != 'u')
  df = df[has_age & has_gender]

  # Split 80/20 (fixed seed, so the split is reproducible)
  train_df, test_df = train_test_split(df, test_size = 0.2, shuffle = True, random_state = 10)
  train_df.to_csv(train_test_dir + "fold_{}_train.csv".format(fold), index=False)
  test_df.to_csv(train_test_dir + "fold_{}_test.csv".format(fold), index=False)

# Combine the per-fold files into one overall train set and one overall test set
fold_train_frames = [pd.read_csv(train_test_dir + "fold_{}_train.csv".format(fold)) for fold in range(5)]
fold_test_frames = [pd.read_csv(train_test_dir + "fold_{}_test.csv".format(fold)) for fold in range(5)]
train_df = pd.concat(fold_train_frames)
test_df = pd.concat(fold_test_frames)
train_df.to_csv(train_test_dir + "train.csv", index=False)
test_df.to_csv(train_test_dir + "test.csv", index=False)

# Train, validation
for fold in range(5):
  # The train part of this fold becomes the validation set; the train parts
  # of the four other folds are merged into the matching training set.
  train_sub_folds = list(range(fold)) + list(range(fold + 1, 5))
  other_fold_frames = [pd.read_csv(train_test_dir + "fold_{}_train.csv".format(train_sub)) for train_sub in train_sub_folds]
  train_sub_df = pd.concat(other_fold_frames)
  val_df = pd.read_csv(train_test_dir + "fold_{}_train.csv".format(fold))
  train_sub_df.to_csv(train_test_dir + "fold_{}_train_sub.csv".format(fold), index=False)
  val_df.to_csv(train_test_dir + "fold_{}_val.csv".format(fold), index=False)

## Pickle all datasets

In [19]:
# Base names (no extension) of every CSV in train_test_dir to convert.
to_pickle = ['train', 'test']
# Prefix put in front of each pickle's file name.
counter= "_NEW_"
for i in range(5):
  to_pickle += ["fold_%s_train_sub"%i, "fold_%s_val"%i]

for one_csv in to_pickle:
  df = pd.read_csv(train_test_dir + one_csv + '.csv')
  images, genders, ages = [], [], []
  for i, row in df.iterrows():
    # The age column holds a Python literal as text; get_age turns the parsed
    # value into a bucket index.
    ages.append(get_age(literal_eval(row['age'])))

    # "m" is encoded as 0, every other remaining value as 1.
    genders.append(0 if row['gender'] == "m" else 1)

    img_path = ('./Adience Gender Dataset/aligned/' + row['user_id']
                + '/landmark_aligned_face.' + str(row['face_id'])
                + '.' + row['original_image'])
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name. The with-block closes the file as soon
    # as the resized copy exists.
    with Image.open(img_path) as img:
      processed_img = img.resize((64, 64), Image.LANCZOS)
    images.append(np.array(processed_img))

  csv_dict = {'dataset_name': one_csv, 'images': images, 'genders': genders, 'ages': ages}
  save_pickle(csv_dict, counter+one_csv)
  print(one_csv + ": Done")

train: Done
test: Done
fold_0_train_sub: Done
fold_0_val: Done
fold_1_train_sub: Done
fold_1_val: Done
fold_2_train_sub: Done
fold_2_val: Done
fold_3_train_sub: Done
fold_3_val: Done
fold_4_train_sub: Done
fold_4_val: Done
