# METADATA CONVERSION

Download dataset IMDB - faces only (7 GB) from https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/
Copy the downloaded directory imdb_crop in the project folder dataset/, then run this notebook.

This notebook converts the imdb_crop images metadata from Matlab format into Pandas format, so they can be used with Python.

The ML models for classification must inherit from src/models/Model, the DataManager class reads the images and extracts the labels

In [10]:
import cv2
import numpy as np
import pandas as pd
import scipy.io as sio
from tqdm import tqdm

from notebooks.MetadataUtils import MetadataUtils

In [11]:
# Matlab file path
METADATA_DESTINATION = '../dataset/imdb_crop'
METADATA_CROPPED_FILE_MATLAB = METADATA_DESTINATION + '/imdb.mat'
utils = MetadataUtils()

In [12]:
mat_obj = sio.loadmat(METADATA_CROPPED_FILE_MATLAB)
mat_dta = mat_obj['imdb']
data_dict = {n: mat_dta[n][0, 0][0] for n in mat_dta.dtype.names}
# Extract celebrities names
celeb_names_dict = {k: v for k, v in enumerate(data_dict['celeb_names'], 1)}
# Remove key from dict
_ = data_dict.pop('celeb_names', None)

In [13]:
df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


In [14]:
from datetime import datetime, timedelta

matlab2datetime = lambda matlab_datenum: (
            datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum % 1)).year

# Extract full path
df['full_path'] = df['full_path'].apply(lambda x: x[0]).astype('str')
# Extract face location
df['face_location'] = df['face_location'].apply(lambda x: x[0])
# Extract face location
df['name'] = df['name'].apply(lambda x: x[0]).astype('str')
# Convert dab to standard datetime
df['standard_dob'] = df['dob'].apply(matlab2datetime).astype('uint16')
# Compute age
df['age'] = df.apply(lambda x: x['photo_taken'] - x['standard_dob'], axis=1).astype('uint16')

In [15]:
# Reorder columns
head = ['age', 'gender', 'full_path', 'name', 'face_location']
df = utils.reorder_columns(dataset=df, head=head)
df.head()

Unnamed: 0,age,gender,full_path,name,face_location,celeb_id,standard_dob,dob,photo_taken,second_face_score,face_score
0,68,1.0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,"[1072.926, 161.838, 1214.7839999999999, 303.69...",6488,1900,693726,1968,1.118973,1.459693
1,70,1.0,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,"[477.184, 100.352, 622.592, 245.76]",6488,1900,693726,1970,1.852008,2.543198
2,68,1.0,01/nm0000001_rm577153792_1899-5-10_1968.jpg,Fred Astaire,"[114.96964308962852, 114.96964308962852, 451.6...",6488,1900,693726,1968,2.98566,3.455579
3,68,1.0,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,"[622.8855056426588, 424.21750383700805, 844.33...",6488,1900,693726,1968,,1.872117
4,68,1.0,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,"[1013.8590023603723, 233.8820422075853, 1201.5...",6488,1900,693726,1968,,1.158766


In [16]:
# Remove invalid rows
df = utils.remove_invalid_rows(dataset=df)

Len before:  460723
Len after:  451571
Invalid rows: 1.986%


In [17]:
# Remove invalid images
df = utils.remove_invalid_images(dataset=df, path=METADATA_DESTINATION + '/')


Len before:  451571


  0%|          | 321/451571 [00:10<4:13:58, 29.61it/s]


KeyboardInterrupt: 

In [5]:
from os import path

# Dump dataframe
head, tail = path.split(METADATA_CROPPED_FILE_MATLAB)

path_serialized = path.join(head, 'imdb.pickle')
df.to_pickle(path_serialized)

NameError: name 'df' is not defined

In [6]:
# Test read
df = pd.read_pickle(path_serialized)
df.head()

Unnamed: 0,age,gender,full_path,name,face_location,second_face_score,standard_dob,celeb_id,face_score,photo_taken,dob
0,68,1.0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,"[1072.926, 161.838, 1214.7839999999999, 303.69...",1.118973,1900,6488,1.459693,1968,693726
1,70,1.0,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,"[477.184, 100.352, 622.592, 245.76]",1.852008,1900,6488,2.543198,1970,693726
3,68,1.0,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,"[622.8855056426588, 424.21750383700805, 844.33...",,1900,6488,1.872117,1968,693726
4,68,1.0,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,"[1013.8590023603723, 233.8820422075853, 1201.5...",,1900,6488,1.158766,1968,693726
5,66,0.0,02/nm0000002_rm1075631616_1924-9-16_1991.jpg,Lauren Bacall,"[686.6768205940973, 458.4672137293982, 1198.10...",1.345335,1925,11516,3.490563,1991,702986


In [7]:
df.describe()

Unnamed: 0,age,gender,second_face_score,standard_dob,celeb_id,face_score,photo_taken,dob
count,280844.0,280844.0,146906.0,280844.0,280844.0,280844.0,280844.0,280844.0
mean,36.164258,0.577128,2.485057,1969.365993,10119.464236,3.362345,2005.530252,719113.598813
std,13.148274,0.494016,1.092884,15.691095,5739.357341,1.238619,9.915925,5730.534448
min,0.0,0.0,0.730926,1884.0,1.0,0.730919,1961.0,687776.0
25%,27.0,0.0,1.593553,1962.0,5355.0,2.435448,2004.0,716241.0
50%,35.0,1.0,2.373423,1972.0,10068.0,3.382505,2009.0,719928.0
75%,44.0,1.0,3.277556,1980.0,14925.0,4.313906,2012.0,723127.0
max,100.0,1.0,6.395435,2009.0,20284.0,7.381689,2015.0,733668.0
