# METADATA CREATION

Download dataset UTK - Aligned&Cropped Faces (107 MB) from https://susanqq.github.io/UTKFace/
Copy the downloaded directory crop_part1 in the project folder dataset/utk, then run this notebook.

This notebook creates a pickle file containing images metadata, so they can be used for training models.

The ML models for classification must inherit from src/models/Model, the DataManager class reads the images and extracts the labels.

In [1]:
import os

import pandas as pd
from tqdm import tqdm

from src.DataManager import reorder_columns, remove_invalid_rows, remove_invalid_images

In [2]:
METADATA_DESTINATION = '../dataset/utk'
SMALLEST_DIMENSION = 124

In [3]:
df = pd.DataFrame()

with tqdm(total=len(os.listdir(METADATA_DESTINATION + '/crop_part1'))) as pbar:
    for file in os.listdir(METADATA_DESTINATION + '/crop_part1'):
        d = {"full_path": 'crop_part1/' + file}
        attr = file.split('_')
        d["age"] = int(attr[0])
        # 0 male, 1 female
        d["gender"] = int(attr[1])
        df = df.append(d, ignore_index=True)
        pbar.update(1)

df.head()

100%|██████████| 9780/9780 [00:14<00:00, 673.30it/s]


Unnamed: 0,full_path,age,gender
0,crop_part1/100_1_0_20170110183726390.jpg.chip.jpg,100.0,1.0
1,crop_part1/100_1_2_20170105174847679.jpg.chip.jpg,100.0,1.0
2,crop_part1/101_1_2_20170105174739309.jpg.chip.jpg,101.0,1.0
3,crop_part1/10_0_0_20161220222308131.jpg.chip.jpg,10.0,0.0
4,crop_part1/10_0_0_20170103200329407.jpg.chip.jpg,10.0,0.0


In [4]:
# Reorder columns
head = ['age', 'gender', 'full_path']
df = reorder_columns(dataset=df, head=head)
df.head()

Unnamed: 0,age,gender,full_path
0,100.0,1.0,crop_part1/100_1_0_20170110183726390.jpg.chip.jpg
1,100.0,1.0,crop_part1/100_1_2_20170105174847679.jpg.chip.jpg
2,101.0,1.0,crop_part1/101_1_2_20170105174739309.jpg.chip.jpg
3,10.0,0.0,crop_part1/10_0_0_20161220222308131.jpg.chip.jpg
4,10.0,0.0,crop_part1/10_0_0_20170103200329407.jpg.chip.jpg


In [5]:
# Remove invalid rows
df = remove_invalid_rows(dataset=df)

Len before:  9780
Len after:  6831
Invalid rows: 30.153%


In [6]:
# Remove invalid images
df = remove_invalid_images(dataset=df, path=METADATA_DESTINATION + '/', smallest_dim=SMALLEST_DIMENSION)

Len before:  6831


100%|██████████| 6831/6831 [00:14<00:00, 461.43it/s]

Len after:  6721
Invalid rows: 1.610%





In [7]:
path_serialized = METADATA_DESTINATION + '/utk.pickle'
df.to_pickle(path_serialized)

In [8]:
# Test read
df = pd.read_pickle(path_serialized)
df.head()

Unnamed: 0,age,gender,full_path
0,100.0,1.0,crop_part1/100_1_0_20170110183726390.jpg.chip.jpg
1,100.0,1.0,crop_part1/100_1_2_20170105174847679.jpg.chip.jpg
3,10.0,0.0,crop_part1/10_0_0_20161220222308131.jpg.chip.jpg
4,10.0,0.0,crop_part1/10_0_0_20170103200329407.jpg.chip.jpg
5,10.0,0.0,crop_part1/10_0_0_20170103200522151.jpg.chip.jpg


In [9]:
df.describe()

Unnamed: 0,age,gender
count,6721.0,6721.0
mean,40.554679,0.565987
std,21.215792,0.496563
min,10.0,0.0
25%,24.0,0.0
50%,36.0,1.0
75%,56.0,1.0
max,100.0,3.0
