# METADATA CONVERSION

Download dataset IMDB - faces only (7 GB) from https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/
Copy the downloaded directory imdb_crop in the project folder dataset/, then run this notebook.

This notebook converts the imdb_crop images metadata from Matlab format into Pandas format, so they can be used with Python.

The ML models for classification must inherit from src/models/Model, the DataManager class reads the images and extracts the labels

In [62]:
import numpy as np
import scipy.io as sio
import pandas as pd

In [63]:
# Matlab file path
METADATA_CROPPED_FILE_MATLAB = '../dataset/imdb_crop/imdb.mat'

In [64]:
mat_obj = sio.loadmat(METADATA_CROPPED_FILE_MATLAB)
mat_dta = mat_obj['imdb']
data_dict = {n: mat_dta[n][0, 0][0] for n in mat_dta.dtype.names}
# Extract celebrities names
celeb_names_dict = {k: v for k, v in enumerate(data_dict['celeb_names'], 1)}
# Remove key from dict
_ = data_dict.pop('celeb_names', None)

In [65]:
df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


In [66]:
from datetime import datetime, timedelta
matlab2datetime = lambda matlab_datenum: (datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum%1)).year

# Extract full path
df['full_path'] = df['full_path'].apply(lambda x: x[0]).astype('str')
# Extract face location
df['face_location'] = df['face_location'].apply(lambda x: x[0])
# Extract face location
df['name'] = df['name'].apply(lambda x: x[0]).astype('str')
# Convert dab to standard datetime
df['standard_dob'] = df['dob'].apply(matlab2datetime).astype('uint16')
# Compute age
df['age'] = df.apply(lambda x:x['photo_taken']-x['standard_dob'], axis=1).astype('uint16')

In [67]:
# Reorder columns
columns = df.columns.tolist()
head = ['age', 'gender', 'full_path', 'name', 'face_location']
tail = set(columns) - set(head)
ordered_columns = head + list(tail)
# Ordered columns
df = df[ordered_columns]
df.head()

Unnamed: 0,age,gender,full_path,name,face_location,dob,photo_taken,face_score,second_face_score,standard_dob,celeb_id
0,68,1.0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,"[1072.926, 161.838, 1214.7839999999999, 303.69...",693726,1968,1.459693,1.118973,1900,6488
1,70,1.0,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,"[477.184, 100.352, 622.592, 245.76]",693726,1970,2.543198,1.852008,1900,6488
2,68,1.0,01/nm0000001_rm577153792_1899-5-10_1968.jpg,Fred Astaire,"[114.96964308962852, 114.96964308962852, 451.6...",693726,1968,3.455579,2.98566,1900,6488
3,68,1.0,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,"[622.8855056426588, 424.21750383700805, 844.33...",693726,1968,1.872117,,1900,6488
4,68,1.0,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,"[1013.8590023603723, 233.8820422075853, 1201.5...",693726,1968,1.158766,,1900,6488


In [68]:
# Remove invalid rows
len_before = len(df)
print('Len before: ', len_before)
df = df.query('age<=100')
df = df[df.gender.notna()]
df = df[df.age.notna()]
len_after = len(df)
print('Lean after: ', len_after)
print(f'Invalid rows: {(1 - len_after/len_before)*100:.3f}%')

Len before:  460723
Lean after:  451571
Invalid rows: 1.986%


In [69]:
from os import path

# Dump dataframe
head, tail = path.split(METADATA_CROPPED_FILE_MATLAB)

path_serialized = path.join(head, 'imdb.pickle')
df.to_pickle(path_serialized)

In [70]:
# Test read
df = pd.read_pickle(path_serialized)
df.head()

Unnamed: 0,age,gender,full_path,name,face_location,dob,photo_taken,face_score,second_face_score,standard_dob,celeb_id
0,68,1.0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,"[1072.926, 161.838, 1214.7839999999999, 303.69...",693726,1968,1.459693,1.118973,1900,6488
1,70,1.0,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,"[477.184, 100.352, 622.592, 245.76]",693726,1970,2.543198,1.852008,1900,6488
2,68,1.0,01/nm0000001_rm577153792_1899-5-10_1968.jpg,Fred Astaire,"[114.96964308962852, 114.96964308962852, 451.6...",693726,1968,3.455579,2.98566,1900,6488
3,68,1.0,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,"[622.8855056426588, 424.21750383700805, 844.33...",693726,1968,1.872117,,1900,6488
4,68,1.0,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,"[1013.8590023603723, 233.8820422075853, 1201.5...",693726,1968,1.158766,,1900,6488


In [71]:
df.describe()

Unnamed: 0,age,gender,dob,photo_taken,face_score,second_face_score,standard_dob,celeb_id
count,451571.0,451571.0,451571.0,451571.0,451571.0,209257.0,451571.0,451571.0
mean,35.862938,0.582041,719202.393409,2005.470832,-inf,2.445858,1969.607893,10114.143942
std,13.06445,0.493224,5561.603999,9.036479,,1.062202,15.228378,5744.821988
min,0.0,0.0,687776.0,1961.0,-inf,0.730926,1884.0,1.0
25%,27.0,0.0,716392.0,2004.0,1.757156,1.578818,1962.0,5295.0
50%,34.0,1.0,719959.0,2008.0,2.974744,2.346619,1972.0,10062.0
75%,43.0,1.0,723084.0,2011.0,4.001054,3.216948,1980.0,14920.0
max,100.0,1.0,734181.0,2015.0,7.381689,6.395435,2011.0,20284.0
