# METADATA CONVERSION

Download dataset IMDB - faces only (7 GB) from https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/
Copy the downloaded directory imdb_crop in the project folder dataset/, then run this notebook.

This notebook converts the imdb_crop images metadata from Matlab format into Pandas format, so they can be used with Python.

The ML models for classification must inherit from src/models/Model, the DataManager class reads the images and extracts the labels

In [12]:
import cv2
import pandas as pd
import scipy.io as sio
from tqdm import tqdm

from src.DataManager import reorder_columns, remove_invalid_rows, remove_invalid_images

In [13]:
# Matlab file path
METADATA_DESTINATION = '../dataset/imdb_crop/'
METADATA_CROPPED_FILE_MATLAB = METADATA_DESTINATION + 'imdb.mat'
SMALLEST_DIMENSION = 224

In [14]:
mat_obj = sio.loadmat(METADATA_CROPPED_FILE_MATLAB)
mat_dta = mat_obj['imdb']
data_dict = {n: mat_dta[n][0, 0][0] for n in mat_dta.dtype.names}
# Extract celebrities names
celeb_names_dict = {k: v for k, v in enumerate(data_dict['celeb_names'], 1)}
# Remove key from dict
_ = data_dict.pop('celeb_names', None)

In [15]:
df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


In [16]:
from datetime import datetime, timedelta

matlab2datetime = lambda matlab_datenum: (
        datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum % 1)).year

# Extract full path
df['full_path'] = df['full_path'].apply(lambda x: x[0]).astype('str')
# Extract face location
df['face_location'] = df['face_location'].apply(lambda x: x[0])
# Extract face location
df['name'] = df['name'].apply(lambda x: x[0]).astype('str')
# Convert dab to standard datetime
df['standard_dob'] = df['dob'].apply(matlab2datetime).astype('uint16')
# Compute age
df['age'] = df.apply(lambda x: x['photo_taken'] - x['standard_dob'], axis=1).astype('uint16')

In [17]:
# Reorder columns
head = ['age', 'gender', 'full_path', 'name', 'face_location']
df = reorder_columns(dataset=df, head=head)
df.head()

Unnamed: 0,age,gender,full_path,name,face_location,face_score,dob,celeb_id,photo_taken,second_face_score,standard_dob
0,68,1.0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,"[1072.926, 161.838, 1214.7839999999999, 303.69...",1.459693,693726,6488,1968,1.118973,1900
1,70,1.0,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,"[477.184, 100.352, 622.592, 245.76]",2.543198,693726,6488,1970,1.852008,1900
2,68,1.0,01/nm0000001_rm577153792_1899-5-10_1968.jpg,Fred Astaire,"[114.96964308962852, 114.96964308962852, 451.6...",3.455579,693726,6488,1968,2.98566,1900
3,68,1.0,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,"[622.8855056426588, 424.21750383700805, 844.33...",1.872117,693726,6488,1968,,1900
4,68,1.0,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,"[1013.8590023603723, 233.8820422075853, 1201.5...",1.158766,693726,6488,1968,,1900


In [18]:
# Remove invalid rows
df = remove_invalid_rows(dataset=df)

Len before:  460723
Len after:  451571
Invalid rows: 1.986%


In [19]:
# Remove invalid images
df = remove_invalid_images(dataset=df, path=METADATA_DESTINATION, smallest_dim=SMALLEST_DIMENSION)

Len before:  451571


  0%|          | 235/451571 [00:10<5:50:05, 21.49it/s]


KeyboardInterrupt: 

In [9]:
from os import path

# Dump dataframe
head, tail = path.split(METADATA_CROPPED_FILE_MATLAB)

path_serialized = path.join(head, 'imdb.pickle')
df.to_pickle(path_serialized)

NameError: name 'df' is not defined

In [10]:
# Test read
df = pd.read_pickle(path_serialized)
df.head()

Unnamed: 0,age,gender,full_path,name,face_location,face_score,photo_taken,celeb_id,second_face_score,dob,standard_dob
0,68,1.0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,"[1072.926, 161.838, 1214.7839999999999, 303.69...",1.459693,1968,6488,1.118973,693726,1900
1,70,1.0,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,"[477.184, 100.352, 622.592, 245.76]",2.543198,1970,6488,1.852008,693726,1900
3,68,1.0,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,"[622.8855056426588, 424.21750383700805, 844.33...",1.872117,1968,6488,,693726,1900
4,68,1.0,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,"[1013.8590023603723, 233.8820422075853, 1201.5...",1.158766,1968,6488,,693726,1900
5,66,0.0,02/nm0000002_rm1075631616_1924-9-16_1991.jpg,Lauren Bacall,"[686.6768205940973, 458.4672137293982, 1198.10...",3.490563,1991,11516,1.345335,702986,1925


In [11]:
df.describe()

Unnamed: 0,age,gender,face_score,photo_taken,celeb_id,second_face_score,dob,standard_dob
count,178650.0,178650.0,178650.0,178650.0,178650.0,97640.0,178650.0,178650.0
mean,36.457246,0.588995,3.353406,2005.244489,10143.653764,2.531453,718901.303297,1968.787243
std,13.356509,0.492017,1.234366,11.174091,5740.38131,1.111198,5983.199815,16.383994
min,0.0,0.0,0.730919,1961.0,1.0,0.730926,687776.0,1884.0
25%,27.0,0.0,2.435324,2003.0,5377.0,1.623601,715833.0,1960.0
50%,35.0,1.0,3.369182,2010.0,10078.0,2.419551,719783.0,1971.0
75%,44.0,1.0,4.299459,2012.0,14988.0,3.349847,723144.0,1980.0
max,99.0,1.0,7.381689,2015.0,20284.0,6.395435,733282.0,2008.0
