### Import

In [26]:
import os
import sys
import numpy as np
import pandas as pd
import imageio
import pickle
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image

### Parameters

In [33]:
#Edge length, in pixels, of the square images produced by the resize step
image_size = 128
#Length of one flattened single-channel (black and white) image
vector_size_bw = image_size ** 2
#Length of one flattened three-channel (color) image
vector_size = 3 * vector_size_bw

### Find all files

In [34]:
#Recursively collect the path of every .jpg file below the image root.
#The extension is matched as a suffix rather than a substring so that
#names which merely contain '.jpg' (e.g. 'face.jpg.bak') are not picked up.
imagefiles = []
for root, _dirs, files in os.walk("../../data/lfw/images/", topdown=False):
    for name in files:
        if name.endswith('.jpg'):
            imagefiles.append(os.path.join(root, name))

### Extract all names and store in pandas dataframe

In [35]:
#Init
people = {}    #lower-cased person name -> integer id, assigned in order of first appearance
count = 0      #next free id; equals the number of distinct people seen so far
metadata = []  #one (name, id, first_letter_code, filename) tuple per image

#Loop over all found image files and extract name, and first letter
#Also create an id for each individual (people dictionary)
for imagefile in imagefiles:
    #The name is the directory that directly contains the image. os.path is
    #used instead of splitting on '/' because the paths were built with
    #os.path.join and may therefore contain the platform's own separator.
    name = os.path.basename(os.path.dirname(imagefile)).lower()
    if name not in people:
        people[name] = count
        count = count + 1
    #first_letter_code is 0..25 for names starting with 'a'..'z'
    #(97 == ord('a')); any other leading character falls outside that range
    metadata.append((name, people[name], ord(name[0]) - 97, imagefile))

#Turn into a pandas dataframe and store
#Passing the column names to the constructor keeps this working when no
#images were found (assigning .columns on an empty frame raises ValueError)
metadata = pd.DataFrame(
    metadata,
    columns=['name', 'id', 'first_letter_code', 'filename'],
)
#Make sure the target directory exists before writing
os.makedirs('../../data/lfw/meta', exist_ok=True)
metadata.to_csv('../../data/lfw/meta/metadata.csv', index=False)

### Read all files and store as pickle objects

In [36]:
#Extract necessary info from the dataframe
first_letter_codes = metadata['first_letter_code'].values
person_ids = metadata['id'].values
filenames = metadata['filename'].values

#Make sure the output directory exists before writing any pickle file
os.makedirs('../../data/lfw/pickle/', exist_ok=True)

#Loop over the letters (code 0..25 corresponds to 'a'..'z')
for i in tqdm(range(26)):

    #Get All files matching the current first letter
    indices = np.where(first_letter_codes == i)[0]
    subset_filenames = filenames[indices]
    subset_person_ids = person_ids[indices]

    #Initialize the matrix. Speeds up processing quite a bit
    #Each column holds one flattened image
    X = np.zeros((vector_size, len(subset_filenames)))
    Xbw = np.zeros((vector_size_bw, len(subset_filenames)))

    #Load the image, resize it and flatten it into a vector
    #Keep both the original color but also transform into black and white
    for f, filename in enumerate(subset_filenames):
        #The context manager closes the underlying file handle once the
        #pixel data has been copied into a numpy array
        with Image.open(filename) as raw:
            #Force three channels so that grayscale/palette/alpha images
            #neither break the channel average below nor mismatch vector_size
            rgb = raw.convert('RGB')
            rgb = rgb.crop((50, 50, 200, 200))
            rgb = rgb.resize((image_size, image_size))
            im = np.array(rgb)
        #Black and white version: average over the channel axis
        imbw = np.mean(im, axis=2)
        X[:, f] = im.flatten()
        Xbw[:, f] = imbw.flatten()

    #Store Data as a serialized pickle file
    data = {}
    data['X'] = X
    data['Xbw'] = Xbw
    data['y'] = subset_person_ids
    #File name pattern: <letter>_<size>x<size>.p, e.g. a_128x128.p
    pickle_path = '../../data/lfw/pickle/' + chr(i + 97) + '_' + str(image_size) + 'x' + str(image_size) + '.p'
    #Use a context manager so the file is flushed and closed deterministically
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:47<00:00,  1.83s/it]
