In [1]:
import scipy.io as sio
import pandas as pd

In [2]:
# Matlab file path
METADATA_CROPPED_FILE_MATLAB = '../dataset/imdb_crop/imdb.mat'

In [3]:
mat_obj = sio.loadmat(METADATA_CROPPED_FILE_MATLAB)
mat_dta = mat_obj['imdb']
data_dict = {n: mat_dta[n][0, 0][0] for n in mat_dta.dtype.names}
# Extract celebrities names
celeb_names_dict = {k: v for k, v in enumerate(data_dict['celeb_names'], 1)}
# Remove key from dict
_ = data_dict.pop('celeb_names', None)

In [4]:
df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


In [5]:
from datetime import datetime, timedelta
matlab2datetime = lambda matlab_datenum: (datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum%1)).year

# Extract full path
df['full_path'] = df['full_path'].apply(lambda x: x[0]).astype('str')
# Extract face location
df['face_location'] = df['face_location'].apply(lambda x: x[0])
# Extract face location
df['name'] = df['name'].apply(lambda x: x[0]).astype('str')
# Convert dab to standard datetime
df['standard_dob'] = df['dob'].apply(matlab2datetime).astype('uint16')
# Compute age
df['age'] = df.apply(lambda x:x['photo_taken']-x['standard_dob'], axis=1).astype('uint16')


In [6]:
# Reorder columns
columns = df.columns.tolist()
head = ['age', 'gender', 'full_path', 'name', 'face_location']
tail = set(columns) - set(head)
ordered_columns = head + list(tail)
# Ordered columns
df = df[ordered_columns]
df.head()

Unnamed: 0,age,gender,full_path,name,face_location,dob,photo_taken,second_face_score,face_score,celeb_id,standard_dob
0,68,1.0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,"[1072.926, 161.838, 1214.7839999999999, 303.69...",693726,1968,1.118973,1.459693,6488,1900
1,70,1.0,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,"[477.184, 100.352, 622.592, 245.76]",693726,1970,1.852008,2.543198,6488,1900
2,68,1.0,01/nm0000001_rm577153792_1899-5-10_1968.jpg,Fred Astaire,"[114.96964308962852, 114.96964308962852, 451.6...",693726,1968,2.98566,3.455579,6488,1900
3,68,1.0,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,"[622.8855056426588, 424.21750383700805, 844.33...",693726,1968,,1.872117,6488,1900
4,68,1.0,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,"[1013.8590023603723, 233.8820422075853, 1201.5...",693726,1968,,1.158766,6488,1900


In [7]:
from os import path

# Dump dataframe
head, tail = path.split(METADATA_CROPPED_FILE_MATLAB)

path_serialized = path.join(head, 'imdb.pickle')
df.to_pickle(path_serialized)

In [8]:
# Test read
df = pd.read_pickle(path_serialized)
df.head()

Unnamed: 0,age,gender,full_path,name,face_location,dob,photo_taken,second_face_score,face_score,celeb_id,standard_dob
0,68,1.0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,"[1072.926, 161.838, 1214.7839999999999, 303.69...",693726,1968,1.118973,1.459693,6488,1900
1,70,1.0,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,"[477.184, 100.352, 622.592, 245.76]",693726,1970,1.852008,2.543198,6488,1900
2,68,1.0,01/nm0000001_rm577153792_1899-5-10_1968.jpg,Fred Astaire,"[114.96964308962852, 114.96964308962852, 451.6...",693726,1968,2.98566,3.455579,6488,1900
3,68,1.0,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,"[622.8855056426588, 424.21750383700805, 844.33...",693726,1968,,1.872117,6488,1900
4,68,1.0,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,"[1013.8590023603723, 233.8820422075853, 1201.5...",693726,1968,,1.158766,6488,1900
