# METADATA CONVERSION

Download dataset IMDB - faces only (7 GB) from https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/
Copy the downloaded directory imdb_crop in the project folder dataset/, then run this notebook.

This notebook converts the imdb_crop images metadata from Matlab format into Pandas format, so they can be used with Python.

The ML models for classification must inherit from src/models/Model, the DataManager class reads the images and extracts the labels

In [1]:
import numpy as np
import scipy.io as sio
import cv2
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm

In [2]:
# Matlab file path
METADATA_CROPPED_FILE_MATLAB = '../dataset/imdb_crop/imdb.mat'

In [3]:
mat_obj = sio.loadmat(METADATA_CROPPED_FILE_MATLAB)
mat_dta = mat_obj['imdb']
data_dict = {n: mat_dta[n][0, 0][0] for n in mat_dta.dtype.names}
# Extract celebrities names
celeb_names_dict = {k: v for k, v in enumerate(data_dict['celeb_names'], 1)}
# Remove key from dict
_ = data_dict.pop('celeb_names', None)

In [4]:
df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


In [5]:
from datetime import datetime, timedelta
matlab2datetime = lambda matlab_datenum: (datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum%1)).year

# Extract full path
df['full_path'] = df['full_path'].apply(lambda x: x[0]).astype('str')
# Extract face location
df['face_location'] = df['face_location'].apply(lambda x: x[0])
# Extract face location
df['name'] = df['name'].apply(lambda x: x[0]).astype('str')
# Convert dab to standard datetime
df['standard_dob'] = df['dob'].apply(matlab2datetime).astype('uint16')
# Compute age
df['age'] = df.apply(lambda x:x['photo_taken']-x['standard_dob'], axis=1).astype('uint16')

In [6]:
# Reorder columns
columns = df.columns.tolist()
head = ['age', 'gender', 'full_path', 'name', 'face_location']
tail = set(columns) - set(head)
ordered_columns = head + list(tail)
# Ordered columns
df = df[ordered_columns]
df.head()

Unnamed: 0,age,gender,full_path,name,face_location,second_face_score,dob,celeb_id,photo_taken,face_score,standard_dob
0,68,1.0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,"[1072.926, 161.838, 1214.7839999999999, 303.69...",1.118973,693726,6488,1968,1.459693,1900
1,70,1.0,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,"[477.184, 100.352, 622.592, 245.76]",1.852008,693726,6488,1970,2.543198,1900
2,68,1.0,01/nm0000001_rm577153792_1899-5-10_1968.jpg,Fred Astaire,"[114.96964308962852, 114.96964308962852, 451.6...",2.98566,693726,6488,1968,3.455579,1900
3,68,1.0,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,"[622.8855056426588, 424.21750383700805, 844.33...",,693726,6488,1968,1.872117,1900
4,68,1.0,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,"[1013.8590023603723, 233.8820422075853, 1201.5...",,693726,6488,1968,1.158766,1900


In [7]:
# Remove invalid rows
len_before = len(df)
print('Len before: ', len_before)
df = df.query('age<=100')
df = df[df.gender.notna()]
df = df[df.age.notna()]
len_after = len(df)
print('Lean after: ', len_after)
print(f'Invalid rows: {(1 - len_after/len_before)*100:.3f}%')

Len before:  460723
Lean after:  451571
Invalid rows: 1.986%


In [8]:
# CHECK FUNCTIONS: if the image is too little then we discard it; if the image has padding, we discard it

def areRowsEqual(rows):
    for i in range(1, len(rows)):
        if (rows[i-1] == rows[i]).all():
            return True
    return False

def isImagePadded(img, number_equal_rows=5):
    nr = number_equal_rows
    return any([
        areRowsEqual(img[:nr,:,:]), areRowsEqual(img[-nr:,:,:]),
        areRowsEqual(img.T[:nr,:,:]), areRowsEqual(img.T[-nr:,:,:])
    ])

def isImageTooLittle(img, smallest_dim=224):
    return img.shape[0] <= smallest_dim or img.shape[1] <= smallest_dim

In [None]:
# Remove invalid images
len_before = len(df)
n_invalid = 0

with tqdm(total=df.shape[0]) as pbar:
    for index,row in df.iterrows():
        img = cv2.imread('../dataset/imdb_crop/' + row.full_path)
        if isImageTooLittle(img) or isImagePadded(img):
            df.drop(index, inplace=True)
            n_invalid+=1

        pbar.update(1)
        pbar.set_description(f'Invalid rows so far: {(n_invalid/len_before)*100:.3f}%')

len_after = len(df)

Invalid rows so far: 33.704%:  55%|█████▍    | 247928/451571 [6:57:36<8:54:15,  6.35it/s] 

In [None]:
print('Lean before: ', len_before)
print('Lean after: ', len_after)
print(f'Invalid rows: {(1 - len_after/len_before)*100:.3f}%')

In [None]:
from os import path

# Dump dataframe
head, tail = path.split(METADATA_CROPPED_FILE_MATLAB)

path_serialized = path.join(head, 'imdb.pickle')
df.to_pickle(path_serialized)

In [None]:
# Test read
df = pd.read_pickle(path_serialized)
df.head()

In [None]:
df.describe()