**DIY FACIAL RECOGNITION FILE CREATION**<BR>
This file creates files to be used for DIY facial recognition.<br>
In order to run, this script requires a folder full of files which the "1_FinalProject_ImageProcessing.ipynb" script should have placed into a folder

In [40]:
###################################################################################################
# PACKAGES - there are some crazy installation requirements. Please read all comments in this block
###################################################################################################

# First, if you did not open this script from the zip file, you will want to get that zip file, too. 
#       It contains the folder structure and supporting xml and .dat files you will need to run this
#       script. It also contains a copy of this file, but it may be an older copy. Compare them to 
#       make sure you are using the latest copy.  You can find and download the "FaceRecog.zip" file 
#       in this folder: https://drive.google.com/drive/folders/16TnIGZDvEkgtxH_HzDRiz_PhW2xkUUW6?usp=sharing

# if you haven't already installed imutils, uncomment the next line:
#!pip install imutils 
import imutils

# Follow the advice here to install cmake on your machine: https://cmake.org/download/
# Then install cmake into Python using !pip
# uncomment the next line if you haven't already installed cmake
#!pip install cmake 

# Next make sure that the path to the cmake bin folder is in your environment (system) variables
# then download the dlib tar.gz file from: https://pypi.org/project/dlib/#files. Put the file into the same folder as this python script, and install it directly from that tar.gz file 
# uncomment the next line if you haven't already installed dlib
#!pip install "dlib-19.23.0.tar.gz" 

# AFTER INSTALLING dlib ***YOU MUST*** RESTART YOUR MACHINE - OR MAYBE JUST COLAB !!!!!!!!!!!!!!
# if that doesn't work, try installing dlib using conda
# and know that I feel your pain. 
# Feel free to call Lei. He's pretty sharp. He'll know what to do! ;)
import dlib 
import pandas as pd
import numpy as np
import cv2
from cv2 import imread
import sys
import os
from shutil import move
from scipy.spatial.distance import euclidean

# predictive modeling
from sklearn.ensemble import RandomForestClassifier
# randomize the data
import random

import time
import pathlib

In [41]:
###################################################################################################
# SET LOCAL PATH VARIABLES
# These are variables that you will probably need to modify so that they work on your local machine
###################################################################################################

# because I use dark mode, I use a contrasting color when printing text against the background. You can change this here
textColor = "#cccccc"

# optional, True if you want to save a copy of the b&w frame, with the points on each face, into the "Points" folder
saveCopyOfSpottedImage = False
# optional, True if you want a textual list of images that fail. The end up in the "Failed" folder, regardless.
reportFailedImages = False

# the number of images, minimum, we need per person for model testing
targetImagesPerPerson = 110

# percent of data set to use as train
trainSplit = .80

# where you stored the the classifier xml file. Download a copy from here: https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml
classifierPath = r"D:\Karl\1 Syracuse Masters Program\2022 - Winter\IST 718\Final Project\haarcascade_frontalface_default.xml.txt"

# where you stored the shape predictor data file - different predictors located here: https://github.com/davisking/dlib-models
# Predictor 1 from: https://github.com/davisking/dlib-models/blob/master/shape_predictor_68_face_landmarks.dat.bz2
predictorPath = r"D:\Karl\1 Syracuse Masters Program\2022 - Winter\IST 718\Final Project\shape_predictor_68_face_landmarks.dat"
# Predictor 2 (81% effective for random forest, ~47% Bayes, ~74% KNN) from: https://github.com/davisking/dlib-models/blob/master/shape_predictor_68_face_landmarks_GTX.dat.bz2
# predictorPath = r"D:\Karl\1 Syracuse Masters Program\2022 - Winter\IST 718\Final Project\shape_predictor_68_face_landmarks_GTX.dat"

# path where everything happens
srcFolder = pathlib.Path("FaceRecog_FromCSV.ipynb").parent.resolve()

# starting point for random numbers
seed = 121717874

In [42]:
####################################################################################
# MORE LOCAL VARIABLES - you should probably leave these alone
####################################################################################

# path from which images get loaded
toProcessPath = str(srcFolder) + "\\ToProcess"

# paths to folders in which images get saved
processedPath = str(srcFolder) + "\\Processed"
failedPath = str(srcFolder) + "\\Failed"
pointedImgPath = str(srcFolder) + "\\Points"

# paths to files that get saved
distancesSavePath = str(srcFolder) + "\\Data\\distances.csv"
pointsSavePath = str(srcFolder) + "\\Data\\points.csv"
trainSavePath = str(srcFolder) + "\\Data\\train.csv"
testSavePath = str(srcFolder) + "\\Data\\test.csv"

# files from which data may be pre-loaded (optional)
pathToLoadDistances = str(srcFolder) + "\\Data\\distances.csv"
preloadDistances = True


In [43]:
########################
# DATA ACQUISITION TOOLS
########################

# completely customized but informed by https://livecodestream.dev/post/detecting-face-features-with-python/
# shape predictor helps by locating important points on a face
predictor = dlib.shape_predictor(predictorPath) 

# detects top, bottom, left, and right points on a face - the frame in which the other points sit
detector = dlib.get_frontal_face_detector() # Load the detector

In [44]:
#################
# HANDY FUNCTIONS
#################

# Extract the name of the person from the original file name
def getPersonNameFromImgName(imgName):
    fileNameSplit = imgName.split("_")
    return(fileNameSplit[0].replace(" ", ""))

# turns a landmarks point into a list of 2 points
def cleanPoint(p):
    return([p.x,p.y])

# gets a list of all point lists
# usage: getPoints(landmarks.parts())
def getPoints(lParts):
    lp = list(lParts)
    xy = list(map(cleanPoint, lp))
    return xy

# if the filename passed in is a jpg, simply return the file name, otherwise, return None
def getImgOrNone(fileName):
    l = len(fileName)
    if l > 4 and (fileName[l - 4:l]).lower() == ".jpg":
        return(fileName)
    else:
        return(None)

def prependPath(fileName, path):
    return path + "\\" + fileName

# gets all jpg files from a folder
def getImagesAndImageNames(folderPath):
    # get all file names from the folder
    allFiles = os.listdir(folderPath)

    # remove all non-jpg files from the list
    allFiles = list(map(getImgOrNone, allFiles))
    allFileNames = os.listdir(toProcessPath)
    processedFileNames = []

    jpgFiles = []

    # for every file in files
    for i in range(0,len(allFiles)):
        # get the name of the file
        fName = allFileNames[i]
        # get the image, if it's an image, otherwise, ignore the file or folder
        if getImgOrNone(fName) != None:            
            fileWithPath = prependPath(allFiles[i], folderPath)
            thisFile = imread(fileWithPath)     # read the file
            jpgFiles.append(thisFile)           # add the file to the list of files we want to store
            processedFileNames.append(fName)    # add the name of the file to the list of file we want to store
    
    return [jpgFiles, processedFileNames]


# returns points on the face. 
# The first 4 points are top, bottom, left, and right edges (not points at all)
# The remaining 68 points are actually turned into slopes for 
def getFaceData(img, imgName):

    showLeft = False
    showRight = True
    leftPoint = []
    rightPoint = []

    try:
        # Convert into grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Detect faces
        faces = face_cascade.detectMultiScale(gray, 1.1, 4)

        # crop the image
        x = faces[0][0]
        y = faces[0][1]
        w = faces[0][2]
        h = faces[0][3]

        # crop both the colored and gray images
        cropped_color = img[y:y+h, x:x+w]
        cropped_gray = gray[y:y+h, x:x+w]

        # resize the image - https://www.tutorialkart.com/opencv/python/opencv-python-resize-image/
        cropped_color = cv2.resize(cropped_color, (330,330), interpolation = cv2.INTER_AREA)
        cropped_gray = cv2.resize(cropped_gray, (330,330), interpolation = cv2.INTER_AREA)
        frame = detector(cropped_gray) # get the edges of the face
    
        tst = frame[0]
    except:
        move(prependPath(imgName, toProcessPath), prependPath(imgName, failedPath))

    for f in frame:

        # mark the landmarks
        landmarks = predictor(image=cropped_gray, box=f)
        x = landmarks.part(30).x
        y = landmarks.part(30).y        
        centerPoint = [x, y]  
        if showRight:
            x = landmarks.part(45).x
            y = landmarks.part(45).y
            rightPoint = [x, y]   
        if showLeft:
            x = landmarks.part(48).x
            y = landmarks.part(48).y
            leftPoint = [x, y]       

        # gather x/y coords for each landmark point on image
        for n in range(0, 68):

            # get the x/y coords of each point
            x = landmarks.part(n).x
            y = landmarks.part(n).y

            # informational commentary if we want to print each euclidean distance from nose
            # print("Euclidean distance from nose: {}".format(euclidean(centerPoint, [x,y])))

            # Draw a circle
            cv2.circle(img=cropped_gray, center=(x, y), radius=3, color=(0, 255, 0), thickness=-1)

        
        ####################################################################################
        ###### optional, save spotted image ################################################
        # change save/not save option in settings
        if saveCopyOfSpottedImage:

            bwFileName = "z_"+ imgName

            # write spotted image to the source folder for inspection
            cv2.imwrite(prependPath(fileName=bwFileName, path=pointedImgPath), cropped_gray)
        ###### end optional ################################################################
        ####################################################################################

        # move file to processed
        try:            
            move(src=prependPath(fileName=imgName, path=toProcessPath), dst=processedPath)
        except:
            raise Exception("Failed to move {} to {}.".format(imgName, processedPath))

        points = getPoints(landmarks.parts())

        # adds Euclidean distances from central point on face
        distancesFromCenter = list(map(euclidean, points, centerPoint * (len(points)-1)))
        distances = distancesFromCenter
        
        if showRight: # adds distances from point on right side of face
            distancesFromRight = list(map(euclidean, points, rightPoint * (len(points)-1)))
            distances = list(distances + distancesFromRight)
        if showLeft: # adds distances from point on left side of face
            distancesFromLeft = list(map(euclidean, points, leftPoint * (len(points)-1)))
            distances = list(distances + distancesFromLeft)
        

        return points, distances


**Images that Fail to Load**<br>
~34% of the images that we collected failed to load. The reasons for this ranges from dark images, obstructed, or partly obstructed, images and chaotic backgrounds that confuse CV2's algorithms. The project was designed to compensate for this by collecting more images than necessary. Some people were harder to read than others. It is clear why many of Joshua's images were difficult to read, as his face was half-obscured by the bottom of the screen. It is less obvious why the system had such a difficult time capturing, and then recognizing, Katie. 

In [45]:
##########################################################################################################
# TRANSFORMATION:   GET A TABLE OF ALL POINTS ON EACH FACE. ALSO...
#                   GET A TABLE OF EUCLIDEAN DISTANCES FOR ALL POINTS ON EACH FACE TO THE TIP OF EACH NOSE
##########################################################################################################

if preloadDistances == False:
    # code to detect where face is on the image
    # https://towardsdatascience.com/face-detection-in-2-minutes-using-opencv-python-90f89d7c0f81
    # Load the cascade classifier xml file
    face_cascade = ""
    try:
        face_cascade = cv2.CascadeClassifier(classifierPath)
    except:
        # if it fails to load locally, try to get it from GitHub
        face_cascade = "https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml"
        print("Local classifier not found. Loading from GitHub.")

    # get all of the images we're processing, along with their file names
    imgs, imgNames = getImagesAndImageNames(toProcessPath)

    # a few variables we'll need below
    savedImgNames = []
    numberOfImagesToProcess = len(imgs)
    allDistances = {}
    allPoints = {}
    isFirst = True
    i = 0

    # get points and euclidean distances for points on every image we're processing
    for img in imgs: 

        # use the index number to get the name of the file we're saving, since the img, itself, doesn't carry a name
        currentFileName = imgNames[i] 

        # try to get all point coordinates and euclidean distances for each file
        try:
            points, distances = getFaceData(img, currentFileName)

            # if this is the first image we're processing, create the data structures
            if isFirst:
                allDistances = ([distances])
                allPoints = ([points])
                isFirst = False
            else: # otherwise, just append the next data to the existing structures
                allDistances.append(distances)
                allPoints.append(points)

            # make a note of the name of the image we just saved    
            savedImgNames.append(currentFileName)
        except:
            if reportFailedImages:
                print("getFaceData() failed for {}".format(currentFileName))

        i = i + 1

    # save the points and Euclidean distances in dataframes
    allDistances = pd.DataFrame(allDistances)
    allPoints = pd.DataFrame(allPoints)

    # rename distance columns
    allDistances.columns = list(map(lambda oldColName: "dist_" + str(oldColName), allDistances.columns[0:len(allDistances.columns)])) # https://stackoverflow.com/questions/2050637/appending-the-same-string-to-a-list-of-strings-in-python
    allDistances["img"] = savedImgNames
    allDistances["person"] = allDistances["img"].apply(getPersonNameFromImgName)
    # rearrange distance columns
    cols = ["person", "img"] + list(allDistances.columns[0:len(allDistances.columns)-2])
    allDistances = allDistances[cols]

    # rename points columns
    allPoints.columns = list(map(lambda oldColName: "dist_" + str(oldColName), allPoints.columns[0:len(allPoints.columns)]))
    allPoints["img"] = savedImgNames
    allPoints["person"] = allPoints["img"].apply(getPersonNameFromImgName)
    # rearrange points columns
    cols = ["person", "img"] + list(allPoints.columns[0:len(allPoints.columns)-2])
    allPoints = allPoints[cols]

    if numberOfImagesToProcess == 0:
        raise Exception("Error: no files in the source folder: {}. Did you forget to put files into that folder?".format(srcFolder))

    print("{} images processed successfully! {} images failed to load. Please check `failed` folder to see failed images.".format(len(savedImgNames), numberOfImagesToProcess - len(savedImgNames)))
else:
    print("Distances being preloaded...")


Distances being preloaded...


In [46]:
##############################################################################################
# OPTIONAL: CAN BE SET AT TOP OF THIS SCRIPT SO THAT ALL DATA FROM EVERY PERSON CAN BE LOADED
#           BECAUSE THIS SCRIPT IS BEING PASSED IN, WE ARE NOT INCLUDING ALL OF THE ORIGINAL
#           IMAGE FILES. THAT WOULD BE FAR TO LARGE TO SUBMIT. INSTEAD, THIS FILE CAN BE USED
#           TO SIMULATE THE LOADING OF ALL OF THE IMAGES
##############################################################################################

# get all data
if preloadDistances:
    allDistances = pd.read_csv(pathToLoadDistances)
    newCols = allDistances.columns[1:len(allDistances.columns)] # remember the columns we want to keep
    allDistances.set_index("Unnamed: 0", inplace=False) # reset the index to the first column's values
    allDistances = allDistances[newCols] # remove the old index column

In [47]:
########################################
# SCRUB: CLEAN UP DIFFERENT PERSON NAMES
########################################

# determine which columns we really need
colsToKeep = ["person"] + list(allDistances.columns[2:len(allDistances.columns)])

ad = allDistances[colsToKeep]                       # get a local copy of allDistances df
ad["person"] = ad["person"].apply(str.lower)        # change case of person to lower

#print(ad["person"].unique())                        # check out person names before
# fix person names where they are duplicates
ad["person"][ad["person"] == "sandras"] = "sandys"  
ad["person"][ad["person"] == "andrew"] = "andrewd"  
ad["person"][ad["person"] == "anniet"] = "annieti"  
ad["person"][ad["person"] == "davidl"] = "davidla"  
ad["person"][ad["person"] == "garenm"] = "garenmo"  
ad["person"][ad["person"] == "jiebin"] = "jiebinz"  
ad["person"][ad["person"] == "joshua"] = "joshuab"  
ad["person"][ad["person"] == "karltr"] = "karltre"  
ad["person"][ad["person"] == "katieh"] = "katieha"  
ad["person"][ad["person"] == "leiche"] = "leichen"  
ad["person"][ad["person"] == "nickwa"] = "nickwai"
ad["person"][ad["person"] == "noahla"] = "noahlar"
ad["person"][ad["person"] == "noefer"] = "noefern"
ad["person"][ad["person"] == "sethri"] = "sethris"
#print(ad["person"].unique())                        # check out person names after

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ad["person"] = ad["person"].apply(str.lower)        # change case of person to lower
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ad["person"][ad["person"] == "sandras"] = "sandys"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stab

In [48]:
#####################################################
# TRANSFORM:  TURN person COLUMN INTO personID COLUMN
#####################################################

ad = ad.sort_values(by="person")                    # sort by person
p = ad["person"].unique()                           # get one copy of each person
i = list(range(0,len(p)))                           # get an index for the table that will translate people into numbers
pID = pd.DataFrame(data=p, index=i)
pID.columns = ["person"]
pID['personID'] = pID.index

ad = pd.merge(ad, pID, how="inner", on="person")
ad = ad[["personID"] + list(ad.columns[1:len(ad.columns)-1])]

# take a look at the transformed data frame
ad.head()

Unnamed: 0,personID,dist_0,dist_1,dist_2,dist_3,dist_4,dist_5,dist_6,dist_7,dist_8,...,dist_126,dist_127,dist_128,dist_129,dist_130,dist_131,dist_132,dist_133,dist_134,dist_135
0,0,173.994253,187.856328,158.050625,178.241409,165.529454,170.81569,165.0,155.080624,155.913438,...,111.305885,121.494856,142.284925,109.224539,90.022219,117.341382,50.089919,120.332872,92.195445,114.109596
1,0,154.945152,181.231896,140.698259,170.064694,157.0,169.446157,167.585799,157.292721,168.023808,...,111.198022,110.860272,129.003876,97.416631,91.197588,103.097042,62.096699,104.809351,94.429868,100.603181
2,0,177.132719,189.76301,155.116086,175.26266,163.975608,171.723615,167.523133,156.064089,166.207701,...,101.833197,113.534136,129.034879,105.385008,86.005814,112.929181,52.345009,112.641023,87.005747,105.304321
3,0,184.390889,195.601636,164.109719,181.727818,171.432202,179.72479,179.178682,166.709328,168.576392,...,105.990566,131.244047,128.097619,113.74533,87.051709,120.166551,55.901699,128.444541,88.814413,122.588743
4,0,171.592541,195.058965,152.069063,176.638048,162.542302,171.420536,169.685002,153.495928,165.511329,...,95.603347,120.415946,117.613775,108.295891,79.762146,113.053085,50.596443,114.982607,80.062476,109.293184


In [49]:
#####################################################################
# SCRUB: REMOVE ANY PERSONS WHO DON'T HAVE A MINIMUM NUMBER OF IMAGES
#####################################################################

personsRemoved = pID.copy(deep=True)

# for each person, make sure there are a minimum number of images
for i in pID["personID"]:
    # get the number of images we have for this person
    c = len(ad[ad["personID"] == i])
    # if we have fewer than the minimum number, remove that person from the data
    if c < targetImagesPerPerson: 
        ad = ad[ad["personID"] != i]
    else: 
        # otherwise, note down the fact that they won't have their image removed
        personsRemoved = personsRemoved[personsRemoved["personID"] != i]
        # get ids we'll cull (and leave as many as we need to build a good model)
        imagesToCull = random.sample(list(ad.index[ad["personID"] == i]), c-targetImagesPerPerson)
        
        prsn = pID["person"][i]

        print("We will remove {} images from {}. Previous total was {}".format(len(imagesToCull), prsn, c))
        # remove the extra images
        ad.drop(imagesToCull, inplace=True)


print("\n\nAll images were removed for the following individuals due to insufficient number of images:\n{}".format(personsRemoved))



We will remove 16 images from andrewd. Previous total was 126
We will remove 31 images from annieti. Previous total was 141
We will remove 4 images from christi. Previous total was 114
We will remove 23 images from davidla. Previous total was 133
We will remove 29 images from farahin. Previous total was 139
We will remove 9 images from garenmo. Previous total was 119
We will remove 121 images from jessika. Previous total was 231
We will remove 151 images from jiebinz. Previous total was 261
We will remove 143 images from jonfox. Previous total was 253
We will remove 16 images from joshuab. Previous total was 126
We will remove 426 images from karltre. Previous total was 536
We will remove 263 images from katieha. Previous total was 373
We will remove 114 images from leichen. Previous total was 224
We will remove 88 images from matthew. Previous total was 198
We will remove 59 images from nickwai. Previous total was 169
We will remove 154 images from noahlar. Previous total was 264
We w

In [50]:
########################################
# GET TRAIN AND TEST SETS FOR THE MODELS
########################################

random.seed(seed)

try:
    ad = ad.reset_index(drop=True)
except:
    print("No need to reset index. Already exists.")

results = []
predictions = []

c = len(ad) # get the length of the dataframe

#############################################################
# get train/test split with equal number of images per person
#############################################################
trainDist = pd.DataFrame()
testDist = pd.DataFrame()


# for each person in the dataframe, get 75% of their images
for i in pID.index:

    personSample = ad.loc[ad["personID"] == i] # first, get all the images of that person
    
    # assuming there are some images
    if(len(personSample)) > 0:     
        # get a random list of indexes for our train set
        randList = random.sample(range(personSample.index[0], (personSample.index[0] + targetImagesPerPerson)), int(targetImagesPerPerson * trainSplit))

        # get the train and test tables
        trainDist = trainDist.append(ad.iloc[randList])
        testDist = testDist.append(ad.iloc[list(set(personSample.index) - set(randList))]) # https://stackoverflow.com/questions/3462143/get-difference-between-two-lists

# fix the indexes on the train and test tables
trainDist - trainDist.reset_index(drop=True)
testDist - testDist.reset_index(drop=True)

# separate train into x and y variables 
y_train = trainDist["personID"]
X_train = trainDist[trainDist.columns[2:len(trainDist.columns)]]

# separate test into x and y variables 
y_test = testDist["personID"]
X_test = testDist[testDist.columns[2:len(testDist.columns)]]

In [51]:
# save all distances to file
if preloadDistances == False: # if we didn't preload the distances from a file, save new distances
    allDistances.to_csv(distancesSavePath)
# save all points to file - NOT WORKABLE YET. NEEDS POINTS TO BE CHANGED TO SPARSE ARRAY
#allPoints.to_csv(pointsSavePath)

# save train/test data to csv file 
# NOTE: IF YOU DO THIS, YOU WILL NEED TO CHANGE THE bestRandomForestSeed VARIABLE IN THE latest FaceRecog_FromCSV.jpynb file
trainDist.to_csv(trainSavePath)
testDist.to_csv(testSavePath)