# METADATA CREATION

Download the UTKFace "Aligned & Cropped Faces" dataset (107 MB) from https://susanqq.github.io/UTKFace/
Copy the downloaded `crop_part1` directory into the project folder `dataset/utk`, then run this notebook.

This notebook creates a pickle file containing the image metadata (age, gender and file name), so that the images can be used for training models.

The ML models for classification must inherit from `src/models/Model`; the `DataManager` class reads the images and extracts the labels.

In [1]:
import os

import pandas as pd
from tqdm import tqdm

from notebooks.MetadataUtils import MetadataUtils

In [3]:
# Root folder of the UTK dataset (relative path, resolved against the kernel's
# working directory). The images are read from its crop_part1 sub-folder and the
# metadata pickle is written into this folder as well.
METADATA_DESTINATION = '../dataset/utk'
# Project helper used below to reorder columns and drop invalid rows/images.
utils = MetadataUtils()

In [4]:
# Build one metadata record per image, then create the DataFrame in a single
# call. Growing a frame with DataFrame.append inside the loop copies the whole
# frame on every iteration, and the method was removed in pandas 2.0.
images_dir = METADATA_DESTINATION + '/crop_part1'

# Sorted so the row order (and therefore the index) does not depend on the
# arbitrary order in which os.listdir returns directory entries.
file_names = sorted(os.listdir(images_dir))

records = []
for file in tqdm(file_names):
    # File names are parsed as "<age>_<gender>_...": only the two leading
    # underscore-separated fields are used here.
    attr = file.split('_')
    records.append({
        # Only the bare file name is stored, not the directory.
        "full_path": file,
        "age": int(attr[0]),
        # 0 male, 1 female
        "gender": int(attr[1]),
    })

# Explicit columns keep the schema stable even for an empty directory. The
# float cast reproduces the float values the previous append-based loop
# produced, so consumers of the pickle keep seeing the same column types.
df = pd.DataFrame(records, columns=["full_path", "age", "gender"]).astype(
    {"age": "float64", "gender": "float64"}
)

df.head()

100%|██████████| 9780/9780 [00:19<00:00, 494.02it/s]


Unnamed: 0,full_path,age,gender
0,100_1_0_20170110183726390.jpg.chip.jpg,100.0,1.0
1,100_1_2_20170105174847679.jpg.chip.jpg,100.0,1.0
2,101_1_2_20170105174739309.jpg.chip.jpg,101.0,1.0
3,10_0_0_20161220222308131.jpg.chip.jpg,10.0,0.0
4,10_0_0_20170103200329407.jpg.chip.jpg,10.0,0.0


In [5]:
# Put the two label columns first and the image file name last.
head = ["age", "gender", "full_path"]
df = utils.reorder_columns(head=head, dataset=df)
df.head()

Unnamed: 0,age,gender,full_path
0,100.0,1.0,100_1_0_20170110183726390.jpg.chip.jpg
1,100.0,1.0,100_1_2_20170105174847679.jpg.chip.jpg
2,101.0,1.0,101_1_2_20170105174739309.jpg.chip.jpg
3,10.0,0.0,10_0_0_20161220222308131.jpg.chip.jpg
4,10.0,0.0,10_0_0_20170103200329407.jpg.chip.jpg


In [6]:
# Remove invalid rows
# NOTE(review): the validity criteria are defined in
# MetadataUtils.remove_invalid_rows, which is not shown in this notebook. Per
# the recorded output, the helper prints the row count before/after and the
# returned frame replaces `df`.
df = utils.remove_invalid_rows(dataset=df)

Len before:  9780
Lean after:  9777
Invalid rows: 0.031%


In [9]:
# Remove invalid images
# `path` is the image folder (with trailing slash) that the file names stored in
# `full_path` belong to.
# NOTE(review): what makes an image "invalid" is decided inside
# MetadataUtils.remove_invalid_images (not shown here) — presumably each file is
# opened/inspected, given the per-row progress bar in the output; confirm.
df = utils.remove_invalid_images(dataset=df, path=METADATA_DESTINATION + '/crop_part1/')

Len before:  9777


100%|██████████| 9777/9777 [00:11<00:00, 834.60it/s]

Lean after:  9624
Invalid rows: 1.565%





In [10]:
# Persist the cleaned metadata as a pickle inside the dataset folder.
path_serialized = f"{METADATA_DESTINATION}/utk.pickle"
df.to_pickle(path_serialized)

In [12]:
# Test read: load the pickle into a separate name and compare it with the
# in-memory frame, so a broken or stale file fails loudly instead of silently
# replacing `df`.
df_loaded = pd.read_pickle(path_serialized)
pd.testing.assert_frame_equal(df_loaded, df)
df_loaded.head()

Unnamed: 0,age,gender,full_path
0,100.0,1.0,100_1_0_20170110183726390.jpg.chip.jpg
1,100.0,1.0,100_1_2_20170105174847679.jpg.chip.jpg
3,10.0,0.0,10_0_0_20161220222308131.jpg.chip.jpg
4,10.0,0.0,10_0_0_20170103200329407.jpg.chip.jpg
6,10.0,0.0,10_0_0_20170103233459275.jpg.chip.jpg


In [13]:
# Summary statistics of the numeric columns as a final sanity check.
# NOTE(review): in the last recorded run the gender column has max = 3, i.e. at
# least one row carries a code outside the 0/1 (male/female) encoding assumed
# when parsing the file names — confirm whether such rows should be dropped.
df.describe()

Unnamed: 0,age,gender
count,9624.0,9624.0
mean,29.473088,0.554032
std,24.733725,0.497725
min,1.0,0.0
25%,7.0,0.0
50%,25.0,1.0
75%,49.0,1.0
max,100.0,3.0
