In [1]:
# This notebook prepares the image data in a format that a classifier
# can easily consume. It also dumps a pickled dict of image parameters
# into the server's data directory so that we don't have to change the
# server every time a parameter is changed.

from sklearn.datasets import fetch_lfw_people
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import PIL
from resizeimage import resizeimage
import os
from pydataset import data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle

# Seed NumPy's global random number generator
seed = 42
np.random.seed(seed)

# Image parameters: target size used when resizing, the channel count, and
# the resulting number of values in one flattened image
width, height, channels = 100, 100, 3
features = width * height * channels

# Load the LFW people dataset, using sourced_data/faces as its data directory
faces = fetch_lfw_people(
    data_home='sourced_data/faces',
    min_faces_per_person=1,
    resize=0.4,
)

In [2]:
def load_image(infilename):
    """Open the image file at `infilename` with PIL and return the Image object."""
    image = Image.open(infilename)
    return image
    
def convert_image_to_1d(file_name, width, height):
    """Load an image, resize it, and flatten it into a single-row array.

    Parameters
    ----------
    file_name : str
        Path of the image file to read.
    width, height : int
        Target size in pixels passed to ``Image.resize``.

    Returns
    -------
    numpy.ndarray
        int32 array of shape ``(1, N)``, where N is the number of values in
        the resized image. N depends on the image's channel layout, so
        callers that need a fixed length must check it themselves.
    """
    # Use the image as a context manager so the underlying file is closed
    # once the pixels have been read by resize().
    with load_image(file_name) as img:
        # LANCZOS is the filter formerly also exposed as ANTIALIAS; the
        # ANTIALIAS alias was removed in Pillow 10.
        resized = img.resize((width, height), PIL.Image.LANCZOS)
    image_array = np.asarray(resized, dtype="int32")
    return image_array.reshape((1, -1))

In [3]:
# Sort sourced places
sourced_places_root = "sourced_data/places"

# Collect the rows in a list and stack them once at the end; growing the
# array with np.concatenate on every iteration re-copies all earlier rows.
sourced_places_rows = []
for file in os.listdir(sourced_places_root):
    if not file.endswith('.jpg'):
        continue
    one_d_array = convert_image_to_1d(sourced_places_root + '/' + file, width, height)
    # Skip images whose flattened length differs from `features`
    if one_d_array.shape[1] != features:
        continue
    sourced_places_rows.append(one_d_array)

# The empty (0, features) float base keeps the result float64 and 2-D,
# even when no image qualified.
sourced_places_array = np.concatenate([np.zeros((0, features))] + sourced_places_rows)

# Close the pickle file deterministically instead of leaking the handle
with open("sourced_places_array.p", "wb") as places_file:
    pickle.dump(sourced_places_array, places_file)

In [4]:
# Sort sourced faces
sourced_faces_root = 'sourced_data/faces/lfw_home/lfw_funneled'

# Collect the rows in a list and stack them once at the end; growing the
# array with np.concatenate on every iteration re-copies all earlier rows.
sourced_faces_rows = []
for root, dirs, files in os.walk(sourced_faces_root):
    # The limit is only checked between directories, so the final count can
    # exceed 500 by however many images the last visited directory holds.
    if len(sourced_faces_rows) > 500:
        break
    for file in files:
        if file.endswith('.jpg'):
            image_path = os.path.join(root, file)
            sourced_faces_rows.append(convert_image_to_1d(image_path, width, height))

# The empty (0, features) float base keeps the result float64 and 2-D,
# even when no image was found.
sourced_faces_array = np.concatenate([np.zeros((0, features))] + sourced_faces_rows)

print(np.shape(sourced_faces_array))

# Close the pickle file deterministically instead of leaking the handle
with open("sourced_faces_array.p", "wb") as faces_file:
    pickle.dump(sourced_faces_array, faces_file)

(502, 30000)


In [5]:
# Reload the arrays written by the earlier cells.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook produced itself.
with open("sourced_faces_array.p", "rb") as faces_file:
    sourced_faces_array = pickle.load(faces_file)
with open("sourced_places_array.p", "rb") as places_file:
    sourced_places_array = pickle.load(places_file)

# Faces first, then places: the label construction relies on this order
X = np.vstack((sourced_faces_array, sourced_places_array))

# Row counts for the faces alone and for the combined matrix; `features`
# is rebound to the column count of the loaded data.
(sourced_face_count, features) = np.shape(sourced_faces_array)
(sourced_image_count, features) = np.shape(X)

In [7]:
# Label vector: 1 for the first `sourced_face_count` rows, 0 for the rest.
# A slice assignment replaces the per-row Python loop.
y = np.zeros(sourced_image_count)
y[:sourced_face_count] = 1

# Close the pickle files deterministically instead of leaking the handles
with open('converted_data/X.p', "wb") as x_file:
    pickle.dump(X, x_file)
with open('converted_data/y.p', "wb") as y_file:
    pickle.dump(y, y_file)

In [8]:
# Convert places from onfido images
places_root = "onfido_images/places"

# Collect the rows in a list and stack them once at the end; growing the
# array with np.concatenate on every iteration re-copies all earlier rows.
places_rows = []
for file in os.listdir(places_root):
    one_d_array = convert_image_to_1d(places_root + '/' + file, width, height)
    places_rows.append(one_d_array)

# Start from an empty (0, features) float array instead of a zero-filled
# seed row: previously that all-zero row was never removed and was saved
# as if it were an image.
places_array = np.concatenate([np.zeros((0, features))] + places_rows)

# Close the pickle file deterministically instead of leaking the handle
with open("converted_data/onfido_places.p", "wb") as places_file:
    pickle.dump(places_array, places_file)

In [9]:
# Convert faces from onfido images
faces_root = "onfido_images/faces"

# Collect the rows in a list and stack them once at the end; growing the
# array with np.concatenate on every iteration re-copies all earlier rows.
faces_rows = []
for file in os.listdir(faces_root):
    one_d_array = convert_image_to_1d(faces_root + '/' + file, width, height)
    faces_rows.append(one_d_array)

# Start from an empty (0, features) float array instead of a zero-filled
# seed row: previously that all-zero row was never removed and was saved
# as if it were an image.
faces_array = np.concatenate([np.zeros((0, features))] + faces_rows)

# Close the pickle file deterministically instead of leaking the handle
with open("converted_data/onfido_faces.p", "wb") as faces_file:
    pickle.dump(faces_array, faces_file)

In [18]:
# Save params for server file: the resize dimensions used above, so the
# server can read them instead of hard-coding them
trained_params = {'width':width, 'height':height}

# Close the pickle file deterministically instead of leaking the handle
with open("../server/data/image_params.p", "wb") as params_file:
    pickle.dump(trained_params, params_file)