In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
%pip install opencv-python
%pip install cmake
%pip install dlib
%pip install face_recognition
%pip install --user insightface
%pip install --user onnxruntime-gpu
%pip install keras_facenet
%pip install tensorflow
%pip install imgaug
%pip install xgboost


import io # Input/Output Module
import os # OS interfaces
import pathlib
import cv2 # OpenCV package
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm

import face_recognition as fr
import insightface
from insightface.app import FaceAnalysis

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV, LeaveOneOut 
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from skimage.transform import resize
from skimage.feature import hog
from sklearn.cluster import KMeans
from scipy.spatial import distance
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

import imgaug as ia

from xgboost import XGBClassifier

from math import ceil
from urllib import request # module for opening HTTP requests
from matplotlib import pyplot as plt # Plotting library
from matplotlib.offsetbox import OffsetImage, AnnotationBbox # PCA Feature space plot

import warnings
warnings.filterwarnings("ignore")

<div style="width:100%; height:140px">
    <img src="https://www.kuleuven.be/internationaal/thinktank/fotos-en-logos/ku-leuven-logo.png/image_preview" width="300" style="height:auto" align="left">
</div>


KUL H02A5a Computer Vision: Group Assignment 1
---------------------------------------------------------------
Student numbers: <span style="color:red">r0843426, r0670841, r0673254, r0870074, r5</span>.

The goal of this assignment is to explore more advanced techniques for constructing features that better describe objects of interest and to perform face recognition using these features. This assignment will be delivered in groups of 5 (either composed by you or randomly assigned by your TA's).

In this assignment you are a group of computer vision experts that have been invited to ECCV 2021 to do a tutorial about  "Feature representations, then and now". To prepare the tutorial you are asked to participate in a kaggle competition and to release a notebook that can be easily studied by the tutorial participants. Your target audience is: (master) students who want to get a first hands-on introduction to the techniques that you apply.

---------------------------------------------------------------
This notebook is structured as follows:
0. Data loading & Preprocessing
1. Feature Representations
2. Evaluation Metrics 
3. Classifiers
4. Experiments
5. Publishing best results
6. Discussion

Make sure that your notebook is **self-contained** and **fully documented**. Walk us through all steps of your code. Treat your notebook as a tutorial for students who need to get a first hands-on introduction to the techniques that you apply. Provide strong arguments for the design choices that you made and what insights you got from your experiments. Make use of the *Group assignment* forum/discussion board on Toledo if you have any questions.

Fill in your student numbers above and get to it! Good luck! 


<div class="alert alert-block alert-info">
<b>NOTE:</b> This notebook is just an example/template, feel free to adjust it in any way you please! Just keep things organised and document accordingly!
</div>

<div class="alert alert-block alert-info">
<b>NOTE:</b> Clearly indicate the improvements that you make!!! You can for instance use titles like: <i>3.1. Improvement: Non-linear SVM with RBF Kernel.</i>
</div>
    


---------------------------------------------------------------
# 0. Data loading & Preprocessing

## 0.1. Loading data
The training set is many times smaller than the test set and this might strike you as odd, however, this is close to a real world scenario where your system might be put through daily use! In this session we will try to do the best we can with the data that we've got! 

In [None]:
# The competition files are mounted in the read-only "../input/" directory.

# Metadata tables: one row per image, with the index named "id".
train = pd.read_csv(
    '/kaggle/input/kul-h02a5a-computer-vision-ga1-2022/train_set.csv', index_col = 0)
train.index.name = 'id'

test = pd.read_csv(
    '/kaggle/input/kul-h02a5a-computer-vision-ga1-2022/test_set.csv', index_col = 0)
test.index.name = 'id'

# Every row has a matching "<split>_<id>.npy" array. Load it (pickling disabled)
# and run it through the BGR->RGB conversion before storing it in the "img" column.
train['img'] = [
    cv2.cvtColor(
        np.load('/kaggle/input/kul-h02a5a-computer-vision-ga1-2022/train/train_{}.npy'.format(image_id),
                allow_pickle=False),
        cv2.COLOR_BGR2RGB)
    for image_id in train.index
]

test['img'] = [
    cv2.cvtColor(
        np.load('/kaggle/input/kul-h02a5a-computer-vision-ga1-2022/test/test_{}.npy'.format(image_id),
                allow_pickle=False),
        cv2.COLOR_BGR2RGB)
    for image_id in test.index
]

train_size = len(train)
test_size = len(test)

# Last expression of the cell: rendered as the cell output.
"The training set contains {} examples, the test set contains {} examples.".format(train_size, test_size)

## 0.2. Downloading prep_data

In [None]:
!conda install -y gdown
!gdown 1TdJCUfysjTe59l49QMZmqAk_xXDUOK5f

import tarfile

file = tarfile.open('/kaggle/working/prep.tar.gz', "r:gz")
file.extractall()
file.close()

*Note: this dataset is a subset of the* [*VGG face dataset*](https://www.robots.ox.ac.uk/~vgg/data/vgg_face/).

## 0.3. A first look
Let's have a look at the data columns and class distribution.

In [None]:
# Show the first training row to inspect the available columns
# (identifier, name, image information and class label).
train.head(1)

In [None]:
# Show the first test row: the test set only carries an identifier and the
# corresponding image information (no name or class label).
test.head(1)

In [None]:
# The class distribution in the training set: per name, the number of images
# ('img' count) and the class label attached to that name ('class' max).
train.groupby('name').agg({'img':'count', 'class': 'max'})

Note that **Jesse is assigned the classification label 1**, and **Mila is assigned the classification label 2**. The dataset also contains 20 images of **look alikes (assigned classification label 0)** and the raw images. 

### 0.3.1 Visualizing the original dataset

In [None]:
FACE_SIZE = (100,100)

def plot_image_sequence(data, n, imgs_per_row=10, cmap=None):
    """Show the first ``n`` images of ``data`` in a grid.

    Parameters
    ----------
    data : sequence of images
        Anything indexable whose items ``imshow`` can draw.
    n : int
        Number of images to show; clamped to ``len(data)``.
    imgs_per_row : int, default 10
        Maximum number of images per grid row.
    cmap : str or None, default None
        Colormap forwarded to ``imshow`` (``None`` keeps matplotlib's default).
    """
    n = min(n, len(data))
    if n <= 0:
        # Nothing to draw; plt.subplots cannot build an empty grid.
        return

    n_rows = ceil(n / imgs_per_row)
    n_cols = min(imgs_per_row, n)

    # squeeze=False always yields a 2-D array of axes, so one indexing scheme
    # covers a single image, a single row, a single column and a full grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(10*n_cols, 10*n_rows), squeeze=False)
    for i, ax in enumerate(axes.flat):
        if i < n:
            ax.imshow(data[i], cmap=cmap)
        else:
            # Hide the unused cells of a partially filled last row.
            ax.axis('off')
    plt.show()

def plot_gray_image_sequence(data, n, imgs_per_row=10):
    """Same as ``plot_image_sequence`` but drawn with the 'gray' colormap."""
    plot_image_sequence(data, n, imgs_per_row=imgs_per_row, cmap='gray')

In [None]:
# Collect the raw training images in an array and show images 60..79
# of the original (not yet preprocessed) dataset.
train_images = train['img'].to_numpy()
plot_image_sequence(train_images[60:80], n=20)

By looking at the original training dataset, we can spot some issues which might occur when trying to classify faces.<br>

**1)** Some images contain multiple faces. We will have to make a decision on how to approach this problem. One possible solution would be to detect only the first face that is found in the image. Another possible solution would be to split the image for every face. This would mean that the training dataset would increase in size. However, it would also result in some false positives as each face would be given the same label that the original image had.<br>  
**2)** There is one image which is unable to load. When classifying the image it might be a good idea to leave this particular case out of the training dataset.

## 0.4. Preprocess data
### 0.4.1 HAAR face detector
In this example we use the [HAAR feature based cascade classifiers](https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_objdetect/py_face_detection/py_face_detection.html) to detect faces, then the faces are resized so that they all have the same shape. If there are multiple faces in an image, we only take the first one. 

<div class="alert alert-block alert-info"> <b>NOTE:</b> You can write temporary files to <code>/kaggle/temp/</code> or <code>../../tmp</code>, but they won't be saved outside of the current session
</div>


In [None]:
class HAARPreprocessor():
    """Preprocessing pipeline built around HAAR feature based cascade classifiers.

    Detects faces with OpenCV's frontal-face HAAR cascade, crops them, resizes
    every crop to ``face_size`` and caches the result as ``.npy`` files.

    Parameters
    ----------
    path : str
        Directory holding (or receiving) ``haarcascade_frontalface_default.xml``.
    face_size : tuple of int
        ``(width, height)`` handed to ``cv2.resize`` for every cropped face.
    faces : int, default -1
        ``1`` keeps only the first detected face per image; any other value
        keeps every detected face.
    prep_root : str, optional
        Directory under which the cache folder (named after ``self.name``) is
        created. Defaults to ``DEFAULT_PREP_ROOT``.
    """

    DEFAULT_PREP_ROOT = "/kaggle/working/prepped_data/1. Detection/"

    def __init__(self, path, face_size, faces=-1, prep_root=None):
        self.face_size = face_size
        self.faces = faces
        self.name = 'HAAR_FIRST' if faces == 1 else 'HAAR_MULTI'
        self.prep_root = self.DEFAULT_PREP_ROOT if prep_root is None else prep_root

        file_path = os.path.join(path, "haarcascade_frontalface_default.xml")
        if not os.path.exists(file_path):
            # makedirs also creates missing parent directories.
            os.makedirs(path, exist_ok=True)
            self.download_model(file_path)

        self.classifier = cv2.CascadeClassifier(file_path)

    def download_model(self, path):
        """Download the frontal-face cascade definition and write it to ``path``."""
        url = "https://raw.githubusercontent.com/opencv/opencv/master/data/"\
            "haarcascades/haarcascade_frontalface_default.xml"

        with request.urlopen(url) as r, open(path, 'wb') as f:
            f.write(r.read())

    def detect_faces(self, img):
        """Detect all faces in an image and return their ``(x, y, w, h)`` boxes.

        The notebook converts every image to RGB when loading it, so the
        grayscale conversion has to use ``COLOR_RGB2GRAY``.
        """
        img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        return self.classifier.detectMultiScale(
            img_gray,
            scaleFactor=1.2,
            minNeighbors=5,
            minSize=(30, 30),
            flags=cv2.CASCADE_SCALE_IMAGE
        )

    def extract_faces(self, img):
        """Returns all faces (cropped) in an image."""
        faces = self.detect_faces(img)

        return [img[y:y+h, x:x+w] for (x, y, w, h) in faces]

    def _blank_face(self):
        """All-zero placeholder with the same shape as a resized face."""
        # cv2.resize takes (width, height) but returns (height, width, channels).
        width, height = self.face_size
        return np.zeros((height, width, 3))

    def preprocess(self, data_row):
        """Return the resized face(s) found in ``data_row['img']``.

        With ``faces == 1`` a single ``(h, w, 3)`` image is returned, otherwise
        an array of such images. When nothing is detected an all-zero
        placeholder takes the place of the face, so every input row still
        yields at least one output image.
        """
        faces = self.extract_faces(data_row['img'])

        if len(faces) == 0:
            blank = self._blank_face()
            return blank if self.faces == 1 else np.array([blank])

        if self.faces == 1:
            return cv2.resize(faces[0], self.face_size, interpolation = cv2.INTER_AREA)
        return np.array([cv2.resize(face, self.face_size, interpolation = cv2.INTER_AREA) for face in faces])

    def _cache_file(self, label, kind, prep_path):
        """Path of the cached ``kind`` ('X', 'y' or 'ids') array for ``label``."""
        return os.path.join(prep_path, '{}_{}_{}.npy'.format(label, kind, self.name))

    def check_prep(self, label, prep_path):
        """Create ``prep_path`` if needed and report whether all cache files exist."""
        pathlib.Path(prep_path).mkdir(parents=True, exist_ok=True)

        return all(pathlib.Path(self._cache_file(label, kind, prep_path)).exists()
                   for kind in ('X', 'y', 'ids'))

    def __call__(self, data, values=None, label='train'):
        """Preprocess every row of ``data`` (or load the cached result).

        Parameters
        ----------
        data : pandas.DataFrame
            Must have an ``'img'`` column.
        values : array-like, optional
            Class labels aligned with the rows of ``data`` by position.
        label : str, default 'train'
            Prefix of the cache file names.

        Returns
        -------
        X : ndarray of int
            Stacked face images.
        y : ndarray
            Label of every face (empty when ``values`` is None).
        ids : ndarray
            Index value of the source row of every face.
        """
        prep_path = os.path.join(self.prep_root, self.name)
        if self.check_prep(label, prep_path):
            print('Loading prepped data...')
            X, y, ids = (np.load(self._cache_file(label, kind, prep_path))
                         for kind in ('X', 'y', 'ids'))
        else:
            labels = None if values is None else np.asarray(values)
            face_arr = []
            class_values = []
            ids = []
            for pos, (i, row) in enumerate(data.iterrows()):
                found = self.preprocess(row)
                if self.faces == 1:
                    found = [found]
                for face in found:
                    # ids are recorded even without labels (e.g. the test set).
                    ids.append(i)
                    if labels is not None:
                        # Positional lookup: correct for any DataFrame index.
                        class_values.append(labels[pos])
                    face_arr.append(face)

            X = np.stack(face_arr).astype(int)
            y = np.array(class_values)
            ids = np.array(ids)

            np.save(self._cache_file(label, 'X', prep_path), X)
            np.save(self._cache_file(label, 'y', prep_path), y)
            np.save(self._cache_file(label, 'ids', prep_path), ids)
        return X, y, ids

#### 0.4.1.1 HAAR face detector (First face detection)

In [None]:
%%time
# HAAR detector that keeps only the first detected face per image (faces=1).
preprocessor = HAARPreprocessor(path = '../../tmp', face_size=FACE_SIZE, faces=1)

# Returns the face images, their class labels and the ids of the source rows.
train_X_HAAR, train_y_HAAR, HAAR_ids = preprocessor(train, train['class'].values, label='train')
#test_X_HAAR = preprocessor(test)
print(f'Preprocessing resulted in {train_X_HAAR.shape[0]} images with {train_y_HAAR.shape[0]} labels.')

**Visualise**

Let's plot a few examples.

In [None]:
# plot the faces labelled class 0 (the look-alikes; the original comment
# names them Michael and Sarah)
plot_image_sequence(train_X_HAAR[train_y_HAAR == 0], n=20, imgs_per_row=10)

In [None]:
# plot faces of Jesse (class label 1)
plot_image_sequence(train_X_HAAR[train_y_HAAR == 1], n=30, imgs_per_row=10)

In [None]:
# plot faces of Mila (class label 2)
plot_image_sequence(train_X_HAAR[train_y_HAAR == 2], n=30, imgs_per_row=10)

These images clearly indicate that the preprocessing is not perfect. It seems that in three images it was unable to recognize a face and as such it has returned a blank image.<br>
Furthermore, there are five images where it has "detected" a face, however it seems to have gotten a false positive as they are focused on either the collar of a shirt or on the hair.<br>
Finally there also seems to be a problem by recognizing the wrong face. This means that, in this case, it will train on data that is not completely reliable. This problem can be partially mitigated by allowing the preprocessing to recognize more than one face.

#### 0.4.1.2 HAAR face detector (Multi face detection)

In [None]:
%%time
# HAAR detector with the default faces=-1: every detected face is kept.
preprocessor = HAARPreprocessor(path = '../../tmp', face_size=FACE_SIZE)

# NOTE: this overwrites the first-face results assigned to the same names
# by the first-face cell earlier in the notebook.
train_X_HAAR, train_y_HAAR, HAAR_ids = preprocessor(train, train['class'].values, label='train')
#test_X_HAAR = preprocessor(test)
print(f'Preprocessing resulted in {train_X_HAAR.shape[0]} images with {train_y_HAAR.shape[0]} labels.')

**Visualise**

Let's plot a few examples.

In [None]:
# Show every face returned by the multi-face HAAR preprocessing, 13 per row.
plot_image_sequence(train_X_HAAR, n=train_X_HAAR.shape[0], imgs_per_row=13)

The total train_X dataset has been expanded from 80 images to 104 images by allowing the preprocessor to recognize more than one face. This means that the dataset should be complete and for every one of the 80 labels, there should exist one picture denoting the right face (With the exception of the pictures where no face or the wrong area was detected). This can be followed up on during the feature extraction by plotting the similarities between images and dismissing all images that are too dissimilar.

### 0.4.2: FaceRecognition
This preprocessor will make use of the face_recognition package (which can be found [here](https://pypi.org/project/face-recognition/)). <br>
For the use of this project this package will only be used in a face detection setting. It is able to detect faces within an image based on two different models: HoG or deep learning (CNN). Whilst the second model should provide better results, it also uses a lot more computation power and is usually paired with GPU acceleration. <br>
As such, for this project, the face detection will be applied within a HoG model. 

In [None]:
class FaceRec_Preprocessor():
    """Preprocessing pipeline built around the face-recognition package built on dlib.

    Parameters
    ----------
    face_size : tuple of int
        ``(width, height)`` handed to ``cv2.resize`` for every cropped face.
    model : str, default 'hog'
        Detection model name forwarded to ``face_recognition.face_locations``.
    faces : int, default 1
        ``1`` keeps only the first detected face per image; any other value
        keeps every detected face.
    prep_root : str, optional
        Directory under which the cache folder (named after ``self.name``) is
        created. Defaults to ``DEFAULT_PREP_ROOT``.
    """

    DEFAULT_PREP_ROOT = "/kaggle/working/prepped_data/1. Detection/"

    def __init__(self, face_size, model='hog', faces=1, prep_root=None):
        self.name = 'FACEREC'
        self.face_size = face_size
        self.model = model
        self.faces = faces
        self.prep_root = self.DEFAULT_PREP_ROOT if prep_root is None else prep_root

    def detect_faces(self, img):
        """Detects all faces in an image; boxes are ``(top, right, bottom, left)``."""
        return fr.face_locations(img, model=self.model)

    def extract_faces(self, img):
        """Returns cropped faces"""
        faces = self.detect_faces(img)

        # return the cropped images
        return [img[top:bottom, left:right] for (top, right, bottom, left) in faces]

    def _blank_face(self):
        """All-zero placeholder with the same shape as a resized face."""
        # cv2.resize takes (width, height) but returns (height, width, channels).
        width, height = self.face_size
        return np.zeros((height, width, 3))

    def preprocess(self, data_row):
        """Returns the cropped images resized to prespecified face_size.

        With ``faces == 1`` a single image is returned, otherwise an array of
        images. When nothing is detected an all-zero placeholder is returned
        in place of the face.
        """
        faces = self.extract_faces(data_row['img'])
        if len(faces) == 0:
            blank = self._blank_face()
            return blank if self.faces == 1 else np.array([blank])

        if self.faces == 1:
            return cv2.resize(faces[0], self.face_size, interpolation = cv2.INTER_AREA)
        return np.array([cv2.resize(face, self.face_size, interpolation = cv2.INTER_AREA) for face in faces])

    def _cache_file(self, label, kind, prep_path):
        """Path of the cached ``kind`` ('X', 'y' or 'ids') array for ``label``."""
        return os.path.join(prep_path, '{}_{}_{}.npy'.format(label, kind, self.name))

    def check_prep(self, label, prep_path):
        """Check whether pre-existing data exists (creates ``prep_path`` if missing)."""
        pathlib.Path(prep_path).mkdir(parents=True, exist_ok=True)

        return all(pathlib.Path(self._cache_file(label, kind, prep_path)).exists()
                   for kind in ('X', 'y', 'ids'))

    def __call__(self, data, values=None, label='train'):
        """Preprocess every row of ``data`` (or load the cached result).

        Parameters
        ----------
        data : pandas.DataFrame
            Must have an ``'img'`` column.
        values : array-like, optional
            Class labels aligned with the rows of ``data`` by position.
        label : str, default 'train'
            Prefix of the cache file names.

        Returns
        -------
        X : ndarray of int
            Stacked face images.
        y : ndarray
            Label of every face (empty when ``values`` is None).
        ids : ndarray
            Index value of the source row of every face.
        """
        prep_path = os.path.join(self.prep_root, self.name)
        if self.check_prep(label, prep_path):
            print('Loading prepped data...')
            X, y, ids = (np.load(self._cache_file(label, kind, prep_path))
                         for kind in ('X', 'y', 'ids'))
        else:
            labels = None if values is None else np.asarray(values)
            face_arr = []
            class_values = []
            ids = []
            # total lets the progress bar show completion, not just a counter.
            for pos, (i, row) in enumerate(tqdm(data.iterrows(), total=len(data))):
                found = self.preprocess(row)
                if self.faces == 1:
                    found = [found]
                for face in found:
                    # ids are recorded in both modes, with or without labels.
                    ids.append(i)
                    if labels is not None:
                        # Positional lookup: correct for any DataFrame index.
                        class_values.append(labels[pos])
                    face_arr.append(face)

            X = np.stack(face_arr).astype(int)
            y = np.array(class_values)
            ids = np.array(ids)

            np.save(self._cache_file(label, 'X', prep_path), X)
            np.save(self._cache_file(label, 'y', prep_path), y)
            np.save(self._cache_file(label, 'ids', prep_path), ids)
        return X, y, ids

In [None]:
%%time
# face_recognition based detector; faces=-1 keeps every detected face.
preprocessor = FaceRec_Preprocessor(FACE_SIZE, faces=-1)

# Returns the face images, their class labels and the ids of the source rows.
train_X_FACEREC, train_y_FACEREC, FACEREC_ids = preprocessor(train, train['class'].values, label='train')
print(f'Preprocessing resulted in {train_X_FACEREC.shape[0]} images with {train_y_FACEREC.shape[0]} labels.')

In [None]:
# Show the first 20 faces found by the face_recognition preprocessor.
plot_image_sequence(train_X_FACEREC, n = 20, imgs_per_row=10)

We can see that by using this face detection package, we are already able to get much better results. By detecting multiple faces, the dataset has been expanded to 97 images. <br>
In 96 of these images we can verify that the dataset was indeed able to recognize a face. In one image it has even managed to identify a face which is really blurry.

### 0.4.3: InsightFace
Another preprocessing technique that can be used is the InsightFace package (found [here](https://pypi.org/project/insightface/)). <br>According to the developers “InsightFace is an integrated Python library for 2D&3D face analysis.” \[[1](https://insightface.ai/)]. <br>It is able to perform multiple tasks: face detection, face alignment and face recognition. Again, for this project, we will focus on the face detection.

In [None]:
class InsightFace_Preprocessor():
    """Preprocessing pipeline built around the InsightFace face detector.

    Parameters
    ----------
    face_size : tuple of int
        ``(width, height)`` handed to ``cv2.resize`` for every cropped face.
    faces : int, default -1
        ``1`` keeps only the first detected face per image; any other value
        keeps every detected face.
    prep_root : str, optional
        Directory under which the cache folder (named after ``self.name``) is
        created. Defaults to ``DEFAULT_PREP_ROOT``.
    """

    DEFAULT_PREP_ROOT = "/kaggle/working/prepped_data/1. Detection/"

    def __init__(self, face_size, faces=-1, prep_root=None):
        self.name = 'INSIGHT'
        self.face_size = face_size
        self.faces = faces
        self.prep_root = self.DEFAULT_PREP_ROOT if prep_root is None else prep_root
        # The detector is built lazily: it is only needed on a cache miss.
        self.app = None

    def _ensure_app(self):
        """Create the detection-only FaceAnalysis app on first use."""
        if getattr(self, 'app', None) is None:
            self.app = FaceAnalysis(allowed_modules=['detection'],providers=['CPUExecutionProvider'])
            self.app.prepare(ctx_id=0)
        return self.app

    def detect_faces(self, img):
        """Return one ``[left, top, right, bottom]`` box per detected face.

        Coordinates are truncated to integers and negative values are clamped
        to 0 so they can be used directly as slice bounds.
        """
        detections = self._ensure_app().get(img)
        return [[max(0, int(coord)) for coord in face['bbox'].astype(int)]
                for face in detections]

    def extract_faces(self, img):
        """Returns faces (cropped) in an image"""
        crops = [img[top:bottom, left:right]
                 for (left, top, right, bottom) in self.detect_faces(img)]
        # A box that collapses to zero area after clamping cannot be resized.
        return [crop for crop in crops if crop.size > 0]

    def _blank_face(self):
        """All-zero placeholder with the same shape as a resized face."""
        # cv2.resize takes (width, height) but returns (height, width, channels).
        width, height = self.face_size
        return np.zeros((height, width, 3))

    def preprocess(self, data_row):
        """Return the resized face(s) found in ``data_row['img']``.

        With ``faces == 1`` a single image is returned, otherwise an array of
        images. When nothing is detected an all-zero placeholder is returned
        in place of the face.
        """
        faces = self.extract_faces(data_row['img'])
        if len(faces) == 0:
            blank = self._blank_face()
            return blank if self.faces == 1 else np.array([blank])

        if self.faces == 1:
            return cv2.resize(faces[0], self.face_size, interpolation = cv2.INTER_AREA)
        return np.array([cv2.resize(face, self.face_size, interpolation = cv2.INTER_AREA) for face in faces])

    def _cache_file(self, label, kind, prep_path):
        """Path of the cached ``kind`` ('X', 'y' or 'ids') array for ``label``."""
        return os.path.join(prep_path, '{}_{}_{}.npy'.format(label, kind, self.name))

    def check_prep(self, label, prep_path):
        """Create ``prep_path`` if needed and report whether all cache files exist."""
        pathlib.Path(prep_path).mkdir(parents=True, exist_ok=True)

        return all(pathlib.Path(self._cache_file(label, kind, prep_path)).exists()
                   for kind in ('X', 'y', 'ids'))

    def __call__(self, data, values=None, label='train'):
        """Preprocess every row of ``data`` (or load the cached result).

        Parameters
        ----------
        data : pandas.DataFrame
            Must have an ``'img'`` column.
        values : array-like, optional
            Class labels aligned with the rows of ``data`` by position.
        label : str, default 'train'
            Prefix of the cache file names.

        Returns
        -------
        X : ndarray of int
            Stacked face images.
        y : ndarray
            Label of every face (empty when ``values`` is None).
        ids : ndarray
            Index value of the source row of every face.
        """
        prep_path = os.path.join(self.prep_root, self.name)
        if self.check_prep(label, prep_path):
            print('Loading prepped data...')
            X, y, ids = (np.load(self._cache_file(label, kind, prep_path))
                         for kind in ('X', 'y', 'ids'))
        else:
            labels = None if values is None else np.asarray(values)
            face_arr = []
            class_values = []
            ids = []
            for pos, (i, row) in enumerate(data.iterrows()):
                found = self.preprocess(row)
                if self.faces == 1:
                    found = [found]
                for face in found:
                    # ids are recorded in both modes, with or without labels.
                    ids.append(i)
                    if labels is not None:
                        # Positional lookup: correct for any DataFrame index.
                        class_values.append(labels[pos])
                    face_arr.append(face)

            X = np.stack(face_arr).astype(int)
            y = np.array(class_values)
            ids = np.array(ids)

            np.save(self._cache_file(label, 'X', prep_path), X)
            np.save(self._cache_file(label, 'y', prep_path), y)
            np.save(self._cache_file(label, 'ids', prep_path), ids)
        return X, y, ids

In [None]:
%%time
preprocessor = InsightFace_Preprocessor(FACE_SIZE, faces=-1)                                                 # create the InsightFace preprocessor (keeps every detected face)

# Returns the face images, their class labels and the ids of the source rows.
train_X_INSIGHT, train_y_INSIGHT, INSIGHT_ids = preprocessor(train, train['class'].values, label='train')
# test_X_INSIGHT, _, test_INSIGHT_ids = preprocessor(test)
print(f'Preprocessing resulted in {train_X_INSIGHT.shape[0]} images with {train_y_INSIGHT.shape[0]} labels.')

In [None]:
# Show the first 20 faces found by the InsightFace preprocessor.
plot_image_sequence(train_X_INSIGHT, n = 20)

This preprocessor sees mixed success. It was able to expand the dataset to 110 images, meaning it was able to detect a lot more faces within the image. <br>
In one instance, however, it wasn't able to recognize a face at all, while `face_recognition` and even `HAAR` was able to find this face. <br>
For this dataset it might be a good idea to fallback on the face_recognition dataset when no face is found within the image. This would allow us to identify a lot more faces, whilst also eliminating the problematic images.

## 0.5. Store Preprocessed data (optional)
<div class="alert alert-block alert-info">
<b>NOTE:</b> You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All". Feel free to use this to store intermediary results.
</div>

In [None]:
# save preprocessed data
# prep_path = './kaggle/working/prepped_data/'
# if not os.path.exists(prep_path):
#     os.mkdir(prep_path)
    
# np.save(os.path.join(prep_path, 'train_X.npy'), train_X)
# np.save(os.path.join(prep_path, 'train_y.npy'), train_y)
# np.save(os.path.join(prep_path, 'test_X.npy'), test_X)

# load preprocessed data
# prep_path = './kaggle/working/prepped_data/'
# if not os.path.exists(prep_path):
#     os.mkdir(prep_path)
# train_X = np.load(os.path.join(prep_path, 'train_X.npy'))
# train_y = np.load(os.path.join(prep_path, 'train_y.npy'))
# test_X = np.load(os.path.join(prep_path, 'test_X.npy'))

Now we are ready to rock!

# 1. Feature Representations
## 1.0. Identify feature extractor
Our example feature extractor doesn't actually do anything... It just returns the input:
$$
\forall x : f(x) = x.
$$

It does make for a good placeholder and baseclass ;).

In [None]:
class IdentityFeatureExtractor:
    """Placeholder extractor and base class: its features are the input itself."""

    def transform(self, X):
        """Return ``X`` unchanged; subclasses override this with real features."""
        return X

    def __call__(self, X, label='train', preproc='HAAR'):
        """Record ``label`` and ``preproc`` on the instance, then transform ``X``."""
        self.label, self.preproc = label, preproc
        extracted = self.transform(X)
        return extracted

## 1.1. Baseline 1: HOG feature extractor
HOG stands for **H**istogram of **O**riented **G**radients. In this feature descriptor, the distribution of gradient directions is used as the feature, because it contains valuable information about corners and edges. 

For this feature, we are going to use [scikit-image's hog descriptor](https://scikit-image.org/docs/dev/auto_examples/features_detection/plot_hog.html), which implements the descriptor popularised for object detection by [Dalal and Triggs's paper](https://lear.inrialpes.fr/people/triggs/pubs/Dalal-cvpr05.pdf).

There are a couple of steps to generate HOG features:

1. **Global Image Normalisation**: An optional power-law (gamma) compression of the whole image that reduces the influence of illumination. In scikit-image this step is controlled by `transform_sqrt`; we leave it at its default (disabled).
2. **Computing the Gradient Image in x and y**: Calculation of horizontal and vertical gradients for each cell.
3. **Computing Gradient Histograms**: Histograms are calculated per cell. In our extractor, we used 64 pixels (8x8) per cell (`pixels_per_cell=(8,8)`). The `orientations` argument defines the number of bins of each histogram. Common behaviour is to take 9-bin histograms so that each bin represents 20 unsigned degrees.
4. **Normalising Across Blocks:** After creating the histogram for each cell, the algorithm normalises over blocks that consist of 4 cells (2x2) per block (`cells_per_block=(2,2)`). The normalisation scheme is chosen with `block_norm`; we use `L1`, i.e. every block vector is divided by its L1 norm (this is unrelated to Lasso regression). Normalising over blocks reduces the effect of disturbances such as illumination or shadowing without losing important information.
5. **Flattening Into a Feature Vector**: Concatenation of all vectors obtained from each block. 

[scikit-image hog extractor source code](https://github.com/scikit-image/scikit-image/blob/main/skimage/feature/_hog.py)

In [None]:
class HOGFeatureExtractor(IdentityFeatureExtractor):
    """Feature extractor that describes every image by its HOG descriptor.

    Parameters
    ----------
    orientations, pixels_per_cell, cells_per_block, block_norm
        Forwarded unchanged to scikit-image's ``hog``.
    visualize : bool, default True
        Also compute the HOG visualisation image of every input.
    multichannel : bool, default True
        Whether the last axis of the input holds colour channels. On
        scikit-image versions whose ``hog`` takes ``channel_axis`` this is
        forwarded as ``channel_axis=-1`` (or ``None``).
    """

    def __init__(self, orientations=9, pixels_per_cell=(8,8), cells_per_block=(2,2),
                 visualize=True, multichannel=True, block_norm="L1"):
        self.params = {'orientations':orientations,
                        'pixels_per_cell':pixels_per_cell,
                        'cells_per_block':cells_per_block,
                        'visualize':visualize,
                        'multichannel':multichannel,
                        'block_norm':block_norm
                        }
        # Filled by transform(); None until then.
        self.hog_img = None

    def _hog_kwargs(self):
        """Translate ``self.params`` into keywords the installed ``hog`` accepts."""
        import inspect

        kwargs = dict(self.params)
        # Newer scikit-image replaced ``multichannel`` by ``channel_axis``.
        if 'channel_axis' in inspect.signature(hog).parameters:
            kwargs['channel_axis'] = -1 if kwargs.pop('multichannel') else None
        return kwargs

    def features(self, img):
        """Return ``(descriptor, hog_image)``; ``hog_image`` is None without visualisation."""
        result = hog(img, **self._hog_kwargs())
        if self.params['visualize']:
            fd, hog_img = result
        else:
            # Without visualisation hog() returns the descriptor only.
            fd, hog_img = result, None
        return fd, hog_img

    def get_images(self):
        """Return the HOG visualisations of the last ``transform`` call."""
        return self.hog_img

    def transform(self, X):
        """Compute the descriptor of every image in ``X`` and stack them row-wise."""
        extracted = [self.features(img) for img in X]
        self.hog_img = [hog_img for _, hog_img in extracted]
        return np.array([fd for fd, _ in extracted])

In [None]:
%%time
# Extract HOG descriptors (L1 block normalisation) from the HAAR face crops.
hog_extr = HOGFeatureExtractor(block_norm='L1')
hog_fd_train = hog_extr(train_X_HAAR)
# The visualisation images computed during the call above.
hog_img_train = hog_extr.get_images()
print(hog_fd_train.shape)
plot_image_sequence(hog_img_train, n=30, imgs_per_row=10)

### 1.1.1. t-SNE Plots
Are you having trouble visualising your high-dimensional data? Well, worry no more! t-SNE is there for you! 

**t-Distributed Stochastic Neighbor Embedding (t-SNE)** is a dimensionality reduction technique used to represent high-dimensional dataset in a low-dimensional space of two or three dimensions by calculating a similarity measure between pairs of instances in the high dimensional space and in the low dimensional space.


**Hyperparameters**

*   `perplexity` is the # of nearest neighbors to indicate the size of circle which is used to lay out the gaussian distribution around the centered point. Values between 5 and 50 are mostly used values
*   `n_components` indicates the dimension of the space to calculate the similarity
*   `n_iter` indicates the # of iterations

In [None]:
def plot_TSNE(features, target_y, n_components=2, verbose=1, perplexity=10, n_iter=10000, ttl='HoG',
              random_state=None):
    """Function that plots the t-SNE for 2 components.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Feature vectors to embed.
    target_y : array-like or None
        Class label (0, 1 or 2) of every sample; ``None`` plots all points
        in a single style.
    n_components, verbose, perplexity
        Forwarded to ``sklearn.manifold.TSNE``.
    n_iter : int, default 10000
        Number of optimisation iterations.
    ttl : str, default 'HoG'
        Name of the feature type, used in the plot title.
    random_state : int or None, default None
        Seed forwarded to TSNE; set it to make the plot reproducible.
    """
    import inspect

    tsne_kwargs = dict(n_components=n_components, verbose=verbose,
                       perplexity=perplexity, random_state=random_state)
    # Newer scikit-learn renamed ``n_iter`` to ``max_iter``; use whichever
    # keyword the installed version provides.
    iter_kw = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
    tsne_kwargs[iter_kw] = n_iter

    tsne_results = TSNE(**tsne_kwargs).fit_transform(features)
    x, y = tsne_results[:, 0], tsne_results[:, 1]

    fig, ax = plt.subplots()
    if target_y is None:
        ax.scatter(x, y)
    else:
        # asarray makes the boolean masks below work for plain lists too.
        target_y = np.asarray(target_y)
        for cls, colour, marker in ((0, 'r', 'x'), (1, 'g', '^'), (2, 'b', 'o')):
            mask = target_y == cls
            ax.scatter(x[mask], y[mask], c=colour, marker=marker, label=f'class {cls}')
        ax.legend()
    ax.set_title(f't-SNE plot using {ttl}')
    # The first embedding dimension is on the x axis, the second on the y axis.
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    plt.show()

In [None]:
# Embed the HOG descriptors with t-SNE, coloured by the HAAR training labels.
plot_TSNE(features=hog_fd_train, ttl='HoG', target_y=train_y_HAAR)

## 1.2. Baseline 2: Scale Invariant Feature Transform (SIFT)
SIFT, which is an abbreviation of Scale Invariant Feature Transform, is a feature extraction method like HOG; it transforms image content into local feature coordinates that are invariant to translation, scale and various other image transformations. An example application of SIFT can be found in this research [(Basha, Padmaja & Balaji, 2019)](https://www.proquest.com/docview/2345487825/5934A9B0D3664BEEPQ/9) regarding bone fracture detection. A combination of the HAAR and SIFT transform techniques is used to improve the image quality so that the appropriate fracture portion of the bone can be isolated for feature extraction.

Some advantages of this method are the following: 

*   Locality: the features it detects are local and are hence robust to occlusion and clutter.
*   Distinctiveness: the individual features the model extracts can be matched to a large dataset of objects.  
*   Quantity: by using SIFT, one can extract a lot of features regardless the size of the objects. 
*   Efficiency: near real-time performance. 

In [None]:
class SIFTFeatureExtractor(IdentityFeatureExtractor):
    """Extract SIFT descriptors of the strongest keypoints of each image.

    ``transform`` stores, per image, the selected keypoints (``self.kp``), their
    descriptors (``self.fd``) and a copy of the image with the keypoints drawn on
    it (``self.img``).
    """

    # Maximum number of keypoints (ranked by response) kept per image.
    MAX_KEYPOINTS = 20

    def __init__(self, **params):
        self.params = params
        self.sift = cv2.SIFT_create()

    def features(self, img):
        """Return ``(keypoints, descriptors, annotated_image)`` for one RGB image.

        ``descriptors`` is ``None`` and ``keypoints`` is empty when SIFT detects
        nothing in the image.
        """
        img_u8 = np.uint8(img)
        gray = cv2.cvtColor(img_u8, cv2.COLOR_RGB2GRAY)
        kp, fd = self.sift.detectAndCompute(gray, None)
        if len(kp) == 0:
            # np.argpartition rejects every kth on an empty array, so bail out first.
            return np.array(kp), None, cv2.drawKeypoints(img_u8, [], None)

        kp = np.array(kp)
        kp_responses = np.array([point.response for point in kp])
        n_keep = min(self.MAX_KEYPOINTS, len(kp))
        # indices of the n_keep largest responses (unordered)
        idx = np.argpartition(kp_responses, -n_keep)[-n_keep:]
        kp = kp[idx]
        fd = fd[idx]
        img = cv2.drawKeypoints(img_u8, list(kp), None)
        return kp, fd, img

    def get_images(self):
        """Annotated images from the last ``transform`` call."""
        return self.img

    def get_kp(self):
        """Selected keypoints from the last ``transform`` call."""
        return self.kp

    @staticmethod
    def _as_array(items):
        """Stack ``items`` densely when possible, else return a 1-D object array."""
        try:
            return np.array(items)
        except ValueError:
            # Ragged input (different keypoint counts, or None entries): recent NumPy
            # refuses to build it implicitly, so fill an object array element by element.
            out = np.empty(len(items), dtype=object)
            for pos, item in enumerate(items):
                out[pos] = item
            return out

    def transform(self, X):
        """Run ``features`` on every image in ``X`` and return the descriptors.

        Returns a dense array when every image yields the same number of
        descriptors, otherwise a 1-D object array with one entry per image
        (``None`` for images without keypoints).
        """
        features = [self.features(img) for img in X]
        if not features:
            self.kp, self.fd, self.img = (), (), ()
        else:
            self.kp, self.fd, self.img = zip(*features)
        return self._as_array(self.fd)

In [None]:
# Extract SIFT descriptors for the InsightFace-preprocessed training images.
# NOTE(review): calling the extractor presumably dispatches to transform() via the
# base class's __call__ (not visible here) — confirm.
sift = SIFTFeatureExtractor()
sift_fd = sift(train_X_INSIGHT)
sift_img = sift.get_images()
sift_kp = sift.get_kp()
# Show the first 30 images with their selected keypoints drawn on top.
plot_image_sequence(sift_img, n=30, imgs_per_row=10)

In [None]:
# Summarise each image's SIFT descriptors by k-means cluster centres so that every
# image is represented by a vector of the same length (n_clusters * descriptor size).
SIFT_N_CLUSTERS = 4
SIFT_DESCRIPTOR_DIM = 128  # length of a single SIFT descriptor

# random_state makes the clustering (and hence the features) reproducible across runs
kmeans = KMeans(n_clusters=SIFT_N_CLUSTERS, init='k-means++', n_init=10, random_state=0)
centers = []
for fd_ in sift_fd:
    # images without descriptors keep the all-zero vector
    image_centers = np.zeros((SIFT_N_CLUSTERS, SIFT_DESCRIPTOR_DIM))
    if fd_ is not None and len(fd_) >= SIFT_N_CLUSTERS:
        image_centers = kmeans.fit(fd_).cluster_centers_
    elif fd_ is not None and len(fd_) > 0:
        # Fewer descriptors than clusters: KMeans cannot be fitted, so use the
        # descriptors themselves and leave the remaining rows at zero.
        image_centers[:len(fd_)] = fd_
    centers.append(image_centers.flatten())
centers = np.array(centers)

### 1.2.1 t-SNE Plot

In [None]:
# t-SNE embedding of the per-image SIFT cluster centres, coloured by class label.
plot_TSNE(features=centers, ttl='SIFT descriptors', target_y=train_y_INSIGHT)

## 1.3. Baseline 3: FAST feature extractor
`FAST` (or **F**eatures from **A**ccelerated **S**egment **T**est) is a corner detection method. It was designed to offer performance comparable to `SIFT` or `ORB`. <br>
However, as the name implies, it was made to focus on computational efficiency, resulting in a lower cost of resources. This makes it more suitable for processing video in real time. <br>
FAST only detects keypoints; the descriptors are computed with `SIFT` at the detected keypoints, which gives a numpy array of 128 values for every keypoint. <br>
For the purpose of this project, we have limited the maximum number of keypoints to 20. These 20 keypoints have the highest response within the image and should most strongly characterize it. <br>
To actually use the feature descriptors within a machine learning context, the dimension has to be reduced so that every image has an array of equal size. This is done by computing the `kmeans` cluster centers of the descriptors of every image.

In [None]:
class FastFeatureExtractor(IdentityFeatureExtractor):
    """Detect keypoints with FAST and describe the strongest ones with SIFT.

    ``transform`` stores, per image, the described keypoints (``self.kp``), their
    SIFT descriptors (``self.fd``) and a copy of the image with the keypoints
    drawn on it (``self.img``).
    """

    # Maximum number of keypoints (ranked by response) described per image.
    MAX_KEYPOINTS = 20

    def __init__(self, **params):
        self.params = params
        self.fast = cv2.FastFeatureDetector_create()
        self.sift = cv2.SIFT_create()

    def features(self, img):
        """Return ``(keypoints, descriptors, annotated_image)`` for one RGB image.

        ``descriptors`` is an empty list when no keypoint could be described.
        """
        img_u8 = np.uint8(img)
        gray = cv2.cvtColor(img_u8, cv2.COLOR_RGB2GRAY)
        detected = list(self.fast.detect(gray, None))
        if len(detected) == 0:
            # np.argpartition rejects every kth on an empty array, so bail out first.
            return np.array(detected), [], cv2.drawKeypoints(img_u8, [], None)

        kp_responses = np.array([point.response for point in detected])
        n_keep = min(self.MAX_KEYPOINTS, len(detected))
        # indices of the n_keep largest responses (unordered)
        idx = np.argpartition(kp_responses, -n_keep)[-n_keep:]
        strongest = [detected[pos] for pos in idx]

        # Describe only the selected keypoints so descriptors and keypoints stay
        # aligned; use the keypoints returned by compute(), which are the ones the
        # descriptor rows belong to.
        described, fd = self.sift.compute(gray, strongest)
        if fd is None:
            described, fd = [], []
        kp = np.array(list(described))
        img = cv2.drawKeypoints(img_u8, list(described), None)
        return kp, fd, img

    def get_images(self):
        """Annotated images from the last ``transform`` call."""
        return self.img

    @staticmethod
    def _as_array(items):
        """Stack ``items`` densely when possible, else return a 1-D object array."""
        try:
            return np.array(items)
        except ValueError:
            # Ragged input: recent NumPy refuses to build it implicitly, so fill an
            # object array element by element.
            out = np.empty(len(items), dtype=object)
            for pos, item in enumerate(items):
                out[pos] = item
            return out

    def get_keypoints(self):
        """Keypoints from the last ``transform`` call, one entry per image."""
        return self._as_array(self.kp)

    def transform(self, X):
        """Run ``features`` on every image in ``X`` and return the descriptors.

        Returns a dense array when every image yields the same number of
        descriptors, otherwise a 1-D object array with one entry per image.
        """
        features = [self.features(img) for img in X]
        if not features:
            self.kp, self.fd, self.img = (), (), ()
        else:
            self.kp, self.fd, self.img = zip(*features)
        return self._as_array(self.fd)

In [None]:
# Extract FAST keypoints (described with SIFT) for the InsightFace-preprocessed images.
fast = FastFeatureExtractor()
fast_fd = fast(train_X_INSIGHT)
fast_img = fast.get_images()
fast_kp = fast.get_keypoints()
# Show the first 50 images with their keypoints drawn on top.
plot_image_sequence(fast_img, n=50, imgs_per_row=10)

In [None]:
# Summarise each image's FAST/SIFT descriptors by k-means cluster centres so that
# every image is represented by a vector of the same length.
FAST_N_CLUSTERS = 2
FAST_DESCRIPTOR_DIM = 128  # length of a single SIFT descriptor

# random_state makes the clustering (and hence the features) reproducible across runs
kmeans = KMeans(n_clusters=FAST_N_CLUSTERS, init='k-means++', n_init=10, random_state=0)
centers = []
for i, fd_ in enumerate(fast_fd):
    if fd_ is None or len(fd_) == 0:
        # FAST found nothing usable: fall back to the plain SIFT descriptors of the
        # same image (sift_fd comes from the SIFT baseline cell above).
        fd_ = sift_fd[i]
    # images without any descriptors keep the all-zero vector
    image_centers = np.zeros((FAST_N_CLUSTERS, FAST_DESCRIPTOR_DIM))
    if fd_ is not None and len(fd_) >= FAST_N_CLUSTERS:
        image_centers = kmeans.fit(fd_).cluster_centers_
    elif fd_ is not None and len(fd_) > 0:
        # Fewer descriptors than clusters: KMeans cannot be fitted, so use the
        # descriptors themselves and leave the remaining rows at zero.
        image_centers[:len(fd_)] = fd_
    centers.append(image_centers.flatten())
centers = np.array(centers)

### 1.3.1 t-SNE Plot

In [None]:
# t-SNE embedding of the per-image FAST cluster centres, coloured by class label.
plot_TSNE(features=centers, ttl='FAST descriptors', target_y=train_y_INSIGHT)

## 1.4. Baseline 4: PCA feature extractor
Another commonly used feature extraction method is Principal Component Analysis, PCA in short. PCA aims to find the eigenvectors of a covariance matrix with the highest eigenvalues and uses them to project the input data into a new space of equal or fewer dimensions. In practice, the method converts a matrix of X features into a new dataset of, ideally, < X features. In other words, it reduces the number of features by constructing a new, smaller number of variables that capture a substantial portion of the information found in the original features. An example of this method can be found in the [paper of Ma, Miao and Zhang (2013)](https://www.proquest.com/docview/1776429183/FC068F7F84A544D0PQ/8) where they used PCA in order to utilize the global features to efficiently classify vehicles. 

In [None]:
def make_flat_grays(images):
    """Convert RGB images to grayscale and flatten each one to a 1-D vector.

    Parameters
    ----------
    images : iterable of array-like
        RGB images. Any iterable works (4-D array, list, or 1-D object array of
        images), not only a 4-D ndarray.

    Returns
    -------
    list of np.ndarray
        One flattened uint8 grayscale image per input image, in input order.
    """
    return [
        cv2.cvtColor(np.asarray(image).astype('uint8'), cv2.COLOR_RGB2GRAY).reshape(-1)
        for image in images
    ]

class PCAFeatureExtractor(IdentityFeatureExtractor):
    """Eigenface extraction: standardise + PCA on the flattened grayscale images.

    The data matrix handed to the pipeline is transposed, i.e. it has one row per
    pixel and one column per image. The scaler statistics are therefore computed
    per image, and the PCA output has shape ``(n_pixels, n_components)``.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.processing = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=n_components))])

    @staticmethod
    def _pixel_matrix(images):
        """Return the ``(n_pixels, n_images)`` matrix of flattened grayscale images."""
        return np.array(make_flat_grays(images)).T

    def transf(self, images):
        """Fit the pipeline on ``images`` and return the ``(n_pixels, n_components)`` scores."""
        self.eigenfaces = self.processing.fit_transform(self._pixel_matrix(images))
        return self.eigenfaces

    def inv_transform(self, eigenfaces, undo_scaling=False):
        """Map PCA scores back to the ``(n_pixels, n_images)`` image matrix.

        Parameters
        ----------
        eigenfaces : np.ndarray
            Scores as returned by ``transf``.
        undo_scaling : bool, default False
            ``False`` inverts only the PCA step, so the result is still in the
            standardised space (the original behaviour). ``True`` inverts the whole
            pipeline, including the scaler.
        """
        if undo_scaling:
            self.reconstructed_faces = self.processing.inverse_transform(eigenfaces)
        else:
            self.reconstructed_faces = self.processing.named_steps['pca'].inverse_transform(eigenfaces)
        return self.reconstructed_faces

    def create_pca_object(self, images):
        """Fit and return a standalone whitened PCA on the unscaled pixel matrix."""
        pca = PCA(n_components = self.n_components, whiten = True)
        pca.fit(self._pixel_matrix(images))
        return pca

### 1.4.1. Eigenface Plots
When performing PCA on the faces in the training set, the principal components that we extract are called Eigenfaces instead of Eigenvectors. Each of these Eigenfaces represents a basic, standardized form of a face which displays one or more features of a face. Any existing face can be approximated by combining these Eigenfaces with different weights, and it does not take many Eigenfaces to obtain a good approximation. Therefore, Eigenfaces are frequently used for face recognition purposes, for example in [this](https://sites.cs.ucsb.edu/~mturk/Papers/mturk-CVPR91.pdf) and [this](https://www.cec.uchile.cl/~aabdie/jruizd/papers/afss2002b.pdf) research.

The first Eigenface is the face that represents the greatest variance among all the faces in the training set. Therefore, it is a very general representation of a face as can be seen in the images below. The second face explains less variance, the third even less than the second and so on. The last Eigenfaces can even be left out as they explain almost no variance.

In [None]:
# Eigenfaces of the class-2 training faces.
n_components = 39
face_shape = (FACE_SIZE[0], FACE_SIZE[1])
class2_faces = train_X_HAAR[train_y_HAAR == 2]

# Per-pixel mean and standard deviation over the class-2 faces, as images.
images_gray = make_flat_grays(class2_faces)
mean_face = np.reshape(np.mean(images_gray, axis=0), face_shape)
std_face = np.reshape(np.std(images_gray, axis = 0), face_shape)

Extractor = PCAFeatureExtractor(n_components = n_components)
Eigenfaces_flat = Extractor.transf(class2_faces)

# Column i of Eigenfaces_flat is the i-th eigenface; lay each one out as an
# (height, width, 1) image so it can be plotted.
Eigenfaces_as_image = np.empty((n_components,) + face_shape + (1,))
Eigenfaces_as_image[..., 0] = Eigenfaces_flat.T[:n_components].reshape((n_components,) + face_shape)

print(Eigenfaces_as_image.shape)
plot_gray_image_sequence(Eigenfaces_as_image, n=n_components, imgs_per_row=10)

In [None]:
# Reconstruct the class-2 faces from their eigenface representation.
# inv_transform undoes only the PCA step and returns an (n_pixels, n_images) matrix
# in the standardised space.
standardised_imgs = Extractor.inv_transform(Eigenfaces_flat)
# The scaler was fitted on the transposed (pixels x images) matrix, so its mean and
# scale are per image; its own inverse_transform is what restores pixel intensities.
reconstructed_imgs = Extractor.processing.named_steps['scaling'].inverse_transform(standardised_imgs)
print(np.shape(reconstructed_imgs))

# One image per column -> (n_images, height, width, 1)
n_reconstructed = np.shape(reconstructed_imgs)[1]
rec_imgs_as_image = reconstructed_imgs.T.reshape(n_reconstructed, FACE_SIZE[0], FACE_SIZE[1], 1)
plot_gray_image_sequence(rec_imgs_as_image, n=n_reconstructed, imgs_per_row=10)

### 1.4.2. Feature Space Plots

The following visualization displays the images projected on and reconstructed from the first two eigenfaces. The images that are close to each other are the more similar ones whereas the dissimilar images are far apart in the graph. 

In [None]:
extractor = PCAFeatureExtractor(n_components = 39)
transformed_faces = extractor.transf(train_X_HAAR)  # (n_pixels, n_components)
print(transformed_faces.shape)

pca_step = extractor.processing.named_steps['pca']
scaler_step = extractor.processing.named_steps['scaling']

# Reconstruct the faces: undo the PCA, then the per-image scaling, and transpose so
# that every row is one image before reshaping (the matrix is pixels x images).
reconstructed_faces = scaler_step.inverse_transform(extractor.inv_transform(transformed_faces)).T
n_faces = reconstructed_faces.shape[0]
reconstructed_faces = reconstructed_faces.reshape(n_faces, FACE_SIZE[0], FACE_SIZE[1])

# Because the PCA was fitted on the transposed matrix, the weight of image i on
# eigenface k is components_[k, i]; the first two rows give the 2D coordinates.
face_coords = pca_step.components_[:2].T  # (n_images, 2)

fig,ax = plt.subplots()
ax.scatter(face_coords[:,0], face_coords[:,1])
for (x0, y0), rec_face in zip(face_coords, reconstructed_faces):
    ab = AnnotationBbox(OffsetImage(rec_face, zoom=0.3, cmap='gray'), (x0, y0), frameon=False)
    ax.add_artist(ab)
ax.set_title('Projecting the images in the 2D eigenspace')
ax.set_xlabel('Eigenface 1')
ax.set_ylabel('Eigenface 2')
fig.set_figheight(10)
fig.set_figwidth(15)

## 1.5. Baseline 5: Facial encodings
This feature extractor is made using the `face_recognition` package that has earlier been used for preprocessing. <br>
In this section the same package is used to extract the facial encodings in an image. Because every image has been cropped to only show the faces, this function is (theoretically) possible with every preprocessor. <br>
However, testing shows that this is not always the case. This feature extractor returns an empty array when run on blurry images, and sometimes even on clear images when they are not cropped in the way the face_recognition package expects. 

In [None]:
class FEncodings_FeatureExtractor(IdentityFeatureExtractor):
    """Use the 128-value face encodings computed by ``face_recognition``.

    Encodings are cached on disk as ``<label>_<preproc>_FE.npy`` inside
    ``prep_path``. NOTE(review): ``self.label`` and ``self.preproc`` are not set in
    this class; presumably the inherited ``__call__`` sets them — confirm.
    """

    # Length of one face encoding; also the size of the zero placeholder.
    ENCODING_SIZE = 128

    def __init__(self):
        self.prep_path = "/kaggle/working/prepped_data/2. Features/Facial Encodings"

    def extract(self, img):
        """Return the list of face encodings found in ``img`` (possibly empty)."""
        # NOTE(review): other cells treat the images as RGB, in which case this
        # conversion swaps the channels — confirm the expected channel order.
        img_tmp = cv2.cvtColor(np.uint8(img), cv2.COLOR_BGR2RGB)
        # NOTE(review): 'hog' is a face_locations model name; confirm the model
        # value intended for face_encodings.
        return fr.face_encodings(img_tmp, model='hog')

    def _cache_file(self):
        """Path of the cache file for the current label/preprocessor."""
        return os.path.join(self.prep_path, '{}_{}_FE.npy'.format(self.label, self.preproc))

    def check_prep(self):
        """Create the cache directory if needed and report whether the cache file exists."""
        pathlib.Path(self.prep_path).mkdir(parents=True, exist_ok=True)

        bool_FE = pathlib.Path(self._cache_file()).exists()
        return bool_FE

    def transform(self, X):
        """Return one encoding per image in ``X``, shape ``(len(X), 128)``.

        Cached encodings are loaded when present. Otherwise each image is encoded;
        images without a detectable face get an all-zero row, and when several
        faces are found only the first encoding is kept so that row ``i`` always
        belongs to image ``i``. The result is then written to the cache.
        """
        if self.check_prep():
            print('Loading prepped data...')
            encodings = np.load(self._cache_file())
        else:
            encodings = np.zeros((len(X), self.ENCODING_SIZE))
            for i, img in enumerate(tqdm(X)):
                fe = self.extract(img)
                if len(fe) > 0:
                    encodings[i] = fe[0]
            np.save(self._cache_file(), encodings)
        return encodings

In [None]:
%%time
# Face encodings for the face_recognition-preprocessed training images; the extra
# arguments name the cache file ('train', 'FACEREC').
FEncodings = FEncodings_FeatureExtractor()
encodings = FEncodings(train_X_FACEREC, 'train', 'FACEREC')

### 1.1.2. Discussion
...

# 2. Cleaning the dataset

### 2.1 Remove most dissimilar faces from dataset
An important step is the cleaning of the dataset. <br>
In the previous section on preprocessing, the decision was made to allow the face detection to detect more than one face. This means that our dataset has been expanded, but now we are faced with a lot of images that are either <u>irrelevant</u> or <u>mislabeled</u>. As such this function exists to scale the dataset back to its original shape. <br><br>
To get the best possible results, there exist different functions for training sets and testing sets. <br><br>
In the scenario that a `training` dataset is passed on to the function, it will iterate over every picture and calculate the cosine distance with every image of the same class after which this value is averaged. Afterwards the dataset will go over every row for which the image id appears multiple times and will only keep the image with the highest similarity. The `cosine similarity` is calculated by comparing the features that were passed on, however, if the feature is equal to an array consisting only of zeroes, it will compare the similarities between the images directly.<br><br>
For the `testing` dataset this is done in a similar way, but because the class label is not known a similarity score is calculated for every class. After which it will keep the image in which the highest similarity was attained, regardless of the class.


In [None]:
class CleanTrainingSet():
    """Reduce a dataset with several face crops per image id to one crop per id.

    Each crop gets a similarity score (``cos_sim``): the mean cosine similarity of
    its feature vector to the crops of its class. For every id that occurs more
    than once only the crop with the highest score is kept.

    Parameters
    ----------
    train_X : sequence of images to clean.
    train_y : class labels of ``train_X``, or ``None`` for an unlabelled (test) set.
    ids : image id of every crop.
    features : one feature vector per crop.
    """

    # Class labels scored for unlabelled data.
    CLASS_IDS = (0, 1, 2)

    def __init__(self, train_X, train_y, ids, features):
        self.train_X = train_X
        self.train_y = train_y
        self.ids = ids
        self.features = features
        self.prep_path = "/kaggle/working/prepped_data/3. Clean Data/"

    def create_df(self):
        """Collect ids, labels (if any), images and features in ``self.data``."""
        self.data = pd.DataFrame({'id': self.ids})
        if self.train_y is not None:
            self.data['class'] = list(self.train_y)
        self.data['img'] = list(self.train_X)
        self.data['feature'] = list(self.features)
        return self.data

    @staticmethod
    def _mean_cosine_similarity(row, reference):
        """Mean cosine similarity between ``row`` and every row of ``reference``.

        Feature vectors are compared; when the row's feature vector is all zeros
        the flattened images are compared instead. NaN similarities are ignored;
        NaN is returned when no valid similarity remains.
        """
        if (row['feature'] == 0).all():
            target = row['img'].flatten()
            candidates = [img.flatten() for img in reference['img'].to_numpy()]
        else:
            target = row['feature']
            candidates = reference['feature'].to_numpy()
        # similarity = 1 - cosine distance
        sim = np.array([1 - distance.cosine(candidate, target) for candidate in candidates], dtype=float)
        valid = sim[~np.isnan(sim)]
        return valid.mean() if valid.size else np.nan

    def cosine_sim_train(self, row):
        """Score ``row`` against all rows of its own class in ``self.data`` (0 if undefined)."""
        same_class = self.data[self.data['class'] == row['class']]
        mean_sim = self._mean_cosine_similarity(row, same_class)
        return 0 if np.isnan(mean_sim) else mean_sim

    def cosine_sim_test(self, row, class_id):
        """Score ``row`` against the training rows of ``class_id`` (NaN if undefined)."""
        reference = self.train_data[self.train_data['class'] == class_id]
        return self._mean_cosine_similarity(row, reference)

    def calc_sim(self):
        """Add the ``cos_sim`` column to ``self.data``.

        Labelled data is scored against its own class; unlabelled data is scored
        against every class of the loaded training data and keeps the best score.
        """
        if self.train_y is not None:
            self.data['cos_sim'] = self.data.apply(self.cosine_sim_train, axis=1)
        else:
            columns = []
            for class_id in self.CLASS_IDS:
                column = 'cos_sim_{}'.format(class_id)
                self.data[column] = self.data.apply(self.cosine_sim_test, args=(class_id,), axis=1)
                columns.append(column)
            self.data['cos_sim'] = self.data[columns].max(axis=1)
        return self.data

    def drop_dissimilar(self):
        """Keep exactly one row per id: the one with the highest ``cos_sim``.

        Ties (and ids whose scores are all NaN) are resolved by keeping the first
        row, so no duplicate id survives.
        """
        counts = self.data['id'].value_counts()
        for dup_id in counts[counts > 1].index:
            scores = self.data.loc[self.data['id'] == dup_id, 'cos_sim']
            keep = scores.index[0] if scores.isna().all() else scores.idxmax()
            self.data.drop(scores.index.difference([keep]), inplace=True)
        self.data.reset_index(drop=True, inplace=True)
        return self.data

    def replace_black(self, fallback_X, fallback_ids):
        """Replace all-zero images by the non-black fallback image with the same id.

        ``fallback_X[k]`` must be the image belonging to ``fallback_ids[k]``. Ids
        that do not occur in ``fallback_ids`` are left untouched.
        """
        fallback_ids = np.asarray(fallback_ids)
        for i, row in self.data.iterrows():
            if not np.count_nonzero(row['img']):
                matches = np.flatnonzero(fallback_ids == row['id'])
                if matches.size == 0:
                    continue
                arg = matches[0]
                if np.count_nonzero(fallback_X[arg]):
                    self.data.at[i, 'img'] = fallback_X[arg]
        return

    def load_training(self, train_X, train_y, feature):
        """Store the labelled reference data used to score an unlabelled set."""
        self.train_data = pd.DataFrame({'class': train_y, 'img': list(train_X), 'feature': list(feature)})
        return self.train_data

    def get_data(self):
        """Return the working DataFrame."""
        return self.data

    def _cache_path(self, kind):
        """Path of the cached ``'X'`` or ``'y'`` array for the current label/feature/preproc."""
        return os.path.join(self.prep_path, '{}_{}_{}_{}_clean.npy'.format(self.label, self.feature_name,
                                                                           self.preproc, kind))

    def check_prep(self, y):
        """Create the cache directory and report whether the needed cache files exist.

        The label file is only required when ``y`` is ``None``, i.e. when ``clean``
        was called without separate training labels.
        """
        pathlib.Path(self.prep_path).mkdir(parents=True, exist_ok=True)

        bool_X = pathlib.Path(self._cache_path('X')).exists()
        if y is None:
            bool_y = pathlib.Path(self._cache_path('y')).exists()
        else:
            bool_y = True
        return (bool_X and bool_y)

    def get_class(self):
        """Load the cleaned labels written by ``clean`` from the cache."""
        y = np.load(self._cache_path('y'))
        return y

    def clean(self, train_X=None, train_y=None, train_features=None, fallback_X=None, fallback_ids=None,
              label='train', feature='hog', preproc='HAAR'):
        """Clean the dataset (or load the cached result) and return the kept images.

        Parameters
        ----------
        train_X, train_y, train_features
            Labelled reference data; pass all three when cleaning an unlabelled set.
        fallback_X, fallback_ids
            Optional replacement images (and their ids) for all-black images.
        label, feature, preproc
            Used to build the cache file names.

        Returns
        -------
        np.ndarray
            Object array with the kept images. When ``train_y`` is ``None`` the
            matching labels are written to the cache (see ``get_class``).
        """
        self.label = label
        self.feature_name = feature
        self.preproc = preproc

        if self.check_prep(train_y):
            print('Loading cleaned dataset...')
            # the images are stored as an object array, hence allow_pickle
            X = np.load(self._cache_path('X'), allow_pickle=True)
        else:
            self.create_df()
            if train_X is not None and train_y is not None and train_features is not None:
                self.load_training(train_X, train_y, train_features)
            self.calc_sim()
            self.drop_dissimilar()
            if fallback_X is not None and fallback_ids is not None:
                self.replace_black(fallback_X, fallback_ids)

            X = self.data['img'].to_numpy()
            np.save(self._cache_path('X'), X)
            # labels only exist when this object was built with them
            if train_y is None and 'class' in self.data.columns:
                y = self.data['class'].to_numpy()
                np.save(self._cache_path('y'), y)
        return X

## Cleaning HAAR dataset

In [None]:
# Keep one HAAR face crop per image id, scored on the HoG features.
# NOTE: train_X_HAAR / train_y_HAAR are overwritten with the cleaned versions.
cleaner = CleanTrainingSet(train_X_HAAR, train_y_HAAR, HAAR_ids, hog_fd_train)
train_X_HAAR = cleaner.clean(label='train', feature='hog', preproc='HAAR')
# get_class() reads the cleaned labels from the cache file written by clean()
train_y_HAAR = cleaner.get_class()
train_X_HAAR.shape

In [None]:
# Visual check of the cleaned HAAR training set.
plot_image_sequence(train_X_HAAR, n=train_X_HAAR.shape[0])

## Cleaning FaceRec dataset

In [None]:
# Keep one face_recognition crop per image id, scored on the face encodings.
# NOTE: train_X_FACEREC / train_y_FACEREC are overwritten with the cleaned versions.
cleaner = CleanTrainingSet(train_X_FACEREC, train_y_FACEREC, FACEREC_ids, encodings)
train_X_FACEREC = cleaner.clean(label='train', feature='FE', preproc='FACEREC')
# get_class() reads the cleaned labels from the cache file written by clean()
train_y_FACEREC = cleaner.get_class()
train_X_FACEREC.shape

In [None]:
# Visual check of the cleaned face_recognition training set.
plot_image_sequence(train_X_FACEREC, n=train_X_FACEREC.shape[0])

## Cleaning InsightFace dataset

In [None]:
# Keep one InsightFace crop per image id, scored on face encodings computed here.
feature_extractor = FEncodings_FeatureExtractor()
cleaner = CleanTrainingSet(train_X_INSIGHT, train_y_INSIGHT, INSIGHT_ids, 
                           feature_extractor(train_X_INSIGHT, 'train', 'INSIGHT'))
# All-black InsightFace crops are replaced by the face_recognition crop with the same id.
# NOTE(review): replace_black uses the position of an id in fallback_ids to index
# fallback_X, but train_X_FACEREC was overwritten with the cleaned set in the FaceRec
# cell while FACEREC_ids is not reassigned there — confirm the two are still aligned.
train_X_INSIGHT = cleaner.clean(fallback_X=train_X_FACEREC, fallback_ids=FACEREC_ids, label='train', feature='FE', preproc='INSIGHT')
# get_class() reads the cleaned labels from the cache file written by clean()
train_y_INSIGHT = cleaner.get_class()
train_X_INSIGHT.shape

In [None]:
# Visual check of the cleaned InsightFace training set.
plot_image_sequence(train_X_INSIGHT, n=train_X_INSIGHT.shape[0])

## 2.2 Removing fully black images
When visually inspecting the training dataset, it could be observed that there was one fully black image on which no face could be read. <br>
To make sure that fully black images don't skew the training, they are removed from the dataset.

In [None]:
def remove_blacks(train_X, train_y):
    """Drop every all-zero (fully black) image together with its label.

    Parameters
    ----------
    train_X : np.ndarray or sequence
        Images, indexed along the first axis (dense array, object array or list).
    train_y : array-like
        One label per image.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The non-black images and their labels, in the original order.
    """
    # Build the mask first and filter once: deleting while iterating shifts the
    # indices of the remaining elements, and np.delete without an axis flattens
    # N-D input.
    keep = np.array([bool(np.count_nonzero(img)) for img in train_X], dtype=bool)
    if isinstance(train_X, np.ndarray):
        kept_X = train_X[keep]
    else:
        kept_imgs = [img for img, is_kept in zip(train_X, keep) if is_kept]
        kept_X = np.empty(len(kept_imgs), dtype=object)
        for pos, img in enumerate(kept_imgs):
            kept_X[pos] = img
    return kept_X, np.asarray(train_y)[keep]

In [None]:
# Drop fully black crops (and their labels) from the cleaned face_recognition set.
train_X_FACEREC, train_y_FACEREC = remove_blacks(train_X_FACEREC, train_y_FACEREC)
train_X_FACEREC.shape

## 2.3 Augmenting the training set


Another way of improving our results was done by augmenting the dataset. In this instance it will take the preprocessed and cleaned data, with the black images removed. After this, it will iterate over the entire dataset and for every image it will create 7 <u>alternative</u> images using the following methods:
1. `Flipping` the image horizontally
2. Applying a `Gaussian blur` with sigma 0.50
3. Applying a `Gaussian blur` with sigma 1.50
4. `Lighten` the picture by multiplying by 1.50
5. `Darken` the picture by multiplying by 0.50
6. Add `"Salt and Pepper"` noise by changing 2% of the total pixels
7. Add `"Salt and Pepper"` noise by changing 5% of the total pixels

In [None]:
class AugmentDataset():
    """Expand a dataset eightfold: every image plus seven augmented variants."""

    def __init__(self):
        augmenters = ia.augmenters
        self.fliplr = augmenters.Fliplr(1)                     # horizontal flip, always applied
        self.blur050 = augmenters.GaussianBlur(sigma=0.50)
        self.blur100 = augmenters.GaussianBlur(sigma=1.5)      # note: sigma is 1.5 despite the name
        self.lighten = augmenters.Multiply(1.5)
        self.darken = augmenters.Multiply(0.5)
        self.noise2 = augmenters.SaltAndPepper(0.02)
        self.noise5 = augmenters.SaltAndPepper(0.05)

    def augment(self, img):
        """Return ``[original, flipped, blur, stronger blur, lighter, darker, noise 2%, noise 5%]``.

        Every entry is computed from a uint8 copy of ``img``.
        """
        transforms = (self.fliplr, self.blur050, self.blur100, self.lighten,
                      self.darken, self.noise2, self.noise5)
        variants = [np.uint8(img)]
        for transform in transforms:
            variants.append(transform(image=np.uint8(img)))
        return variants

    def __call__(self, X, y):
        """Augment every image of ``X`` and repeat its label for each variant.

        Returns a 1-D object array of images (8 per input image) and the matching
        label array.
        """
        images, labels = [], []
        for idx, img in enumerate(X):
            images.extend(self.augment(img))
            labels.extend([y[idx]] * 8)

        # pack the images in an object array: one image per slot
        packed = np.empty(len(images), dtype=object)
        packed[:] = images
        return np.array(packed), np.array(labels)

In [None]:
# Augment the cleaned face_recognition training set (8 images per original image).
augmenter = AugmentDataset()
new_train_X, new_train_y = augmenter(train_X_FACEREC,train_y_FACEREC)
print(new_train_X.shape, new_train_y.shape)

# 2. Evaluation Metrics

In [None]:
class Evaluator:
    """Compute accuracy and class-weighted recall, precision and F1 for predictions."""

    def __init__(self):
        return

    def get_metrics(self):
        """Return the metrics for the labels/predictions stored by ``__call__``."""
        y_true, y_pred = self.train_y, self.train_y_star
        metric_dict = {'Accuracy': accuracy_score(y_true, y_pred)}
        # the remaining metrics are averaged over classes, weighted by class support
        for name, scorer in (('Recall', recall_score),
                             ('Precision', precision_score),
                             ('F1', f1_score)):
            metric_dict[name] = scorer(y_true, y_pred, average="weighted")
        return metric_dict

    def __call__(self, train_y, train_y_star):
        """Store true labels ``train_y`` and predictions ``train_y_star``; return the metrics dict."""
        self.train_y = train_y
        self.train_y_star = train_y_star
        return self.get_metrics()

## 2.0. Accuracy
This metric is the ratio of the correctly predicted observation to the total number of observations. In our case this will be the number of correctly classified images divided by the total number of the images. 

## 2.1. Recall
Recall is the number of correctly predicted positive observations relative to the number of observations in the actual class. In other words, for all the images we have of person X, for how many of them did we correctly classify them as person X?

## 2.2. Precision
The ratio of correctly predicted positive observations to the total predicted positive observations. In our case, for all the images we classified as person X, how many of them were person X?

## 2.3. F1 score
It is the harmonic mean of the precision and recall metrics: $F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$. 

# 3. Classifiers

In [None]:
class RandomClassificationModel:
    """Random classifier, draws a random sample based on class distribution observed 
    during training."""

    def fit(self, X, y):
        """Adjusts the class ratio instance variable to the one observed in y. 

        Parameters
        ----------
        X : tensor
            Training set (unused)
        y : array
            Training set labels

        Returns
        -------
        self : RandomClassificationModel
        """

        self.classes, self.class_ratio = np.unique(y, return_counts=True)
        self.class_ratio = self.class_ratio / self.class_ratio.sum()
        return self

    def predict(self, X):
        """Samples labels for the input data. 

        The draw is deterministic: every call uses a fresh generator seeded with 0,
        so the global NumPy random state is left untouched.

        Parameters
        ----------
        X : tensor
            dataset; only its length is used

        Returns
        -------
        y_star : array
            'Predicted' labels
        """

        # A local RandomState(0) yields the same samples as seeding the global
        # generator with 0, without resetting the randomness of other code.
        rng = np.random.RandomState(0)
        return rng.choice(self.classes, size=len(X), p=self.class_ratio)

    def __call__(self, X):
        return self.predict(X)
    

## 3.1. Baseline 1: MultiClass SVM
The Support Vector Machine algorithm tries to find a line that maximizes the separation between a 2-class data set of 2D space points. The data points that are the closest to this line, in other words, within a minimum distance are called the support vectors. Normally, SVM does not support multiclass classification, namely when an instance needs to be classified as only one out of three or more classes. The aim is to map data points to high dimensional space to gain mutual linear separation between every two classes. An application of this algorithm can be found in the following [paper of Manerkar et al. (2016)](https://www.proquest.com/docview/2456786692/6A78351CD3F64C9FPQ/4) where they use it in the context of automated skin disease segmentation and classification. 

In [None]:
class MultiClassSVM:
    """An SVM classifier for face recognition"""

    def __init__(self):
        self.clf = None
        self.evaluator = Evaluator()

    def cross_validate(self, features, train_y, n_splits=5):
        """Print the average metrics over ``n_splits`` random 75/25 train/test splits.

        ``features`` and ``train_y`` must support indexing with integer arrays.
        Each split refits ``self.clf``.
        """
        scores = {'Accuracy': [], 'Recall': [], 'Precision': [], 'F1': []}
        ss = ShuffleSplit(n_splits=n_splits, test_size=0.25)
        for train_idx, test_idx in ss.split(features):
            self.fit(features[train_idx], train_y[train_idx])  
            train_y_star = self.predict(features[test_idx])
            metric_dict = self.evaluator(train_y[test_idx], train_y_star)
            for key, value in metric_dict.items():
                scores[key].append(value)
        for key, value in scores.items():
            print('Average {}: {:.2f}%'.format(key, np.mean(value) * 100))

    def grid_search(self, features, train_y):
        """Grid-search C (and gamma) with 8 shuffled 80/20 splits and print the best setting."""
        # initialize grid search params
        # logarithmic grid for C: 10^-2 ... 10^4.5 (C must be strictly positive)
        C_range = 10.0 ** np.arange(-2, 5, 0.5)
        # log grid for gamma; NOTE(review): gamma is only used by the 'rbf', 'poly'
        # and 'sigmoid' kernels, so with kernel='linear' this axis only repeats fits
        gamma_range = np.logspace(-9, 3, 10)
        param_grid = dict(gamma=gamma_range, C=C_range)  # create a dict with these params
        cv = ShuffleSplit(n_splits=8, test_size=0.2, random_state=0)
        grid = GridSearchCV(                            # create hyperparameter search grid
            SVC(kernel='linear', decision_function_shape='ovr', break_ties=False),
            param_grid=param_grid, cv=cv)
        grid.fit(features, train_y)                     # fit grid to training data
        print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_}")

    def fit(self, features, train_y):
        """Fit a standardise + linear SVC pipeline on ``features`` / ``train_y``."""
        self.clf = make_pipeline(
            StandardScaler(),
            SVC(kernel='linear',
                  C=10.0,
                  gamma=0.01,
                decision_function_shape='ovr',
                break_ties=False))
        self.clf.fit(features, train_y)

    def predict(self, X):
        """Predict class labels for ``X`` with the fitted pipeline."""
        return self.clf.predict(X)

    def __call__(self, X):
        return self.predict(X)

## 3.2. Baseline 2: Random Forest 🌲🌳🌲
Random forest is a supervised machine learning algorithm widely used on classification problems. A random forest model consists of decision trees that were trained on different datasets which were row sampled from the training data. After predicting each instance on every decision tree, random forest takes the majority vote as the final output. This ensemble technique is called bagging (**B**ootstrap **Agg**regat**ing**). 

Random forest is fairly robust to overfitting and to the curse of dimensionality, because every tree is trained on a different sample of the data and each split only considers a random subset of the features.

**Hyperparameters**

* `n_estimators`: # of decision trees in the model
* `max_depth`: The maximum depth of a decision tree
* `n_jobs`: # of jobs (processor cores) to run in parallel, `-1` means all
* `min_samples_split`: The minimum number of samples required to split an internal node further

We are going to use `GridSearchCV` to find the best hyperparameters for our dataset

In [None]:
class RandomForest:
    """A random forest classifier for face recognition"""

    def __init__(self):
        self.clf = None
        self.evaluator = Evaluator()

    def cross_validate(self, features, train_y, n_splits=5):
        """Print the average metrics over ``n_splits`` random 75/25 train/test splits.

        ``features`` and ``train_y`` must support indexing with integer arrays.
        Each split refits ``self.clf``.
        """
        scores = {'Accuracy': [], 'Recall': [], 'Precision': [], 'F1': []}
        ss = ShuffleSplit(n_splits=n_splits, test_size=0.25)
        for train_idx, test_idx in ss.split(features):
            self.fit(features[train_idx], train_y[train_idx])  
            train_y_star = self.predict(features[test_idx])
            metric_dict = self.evaluator(train_y[test_idx], train_y_star)
            for key, value in metric_dict.items():
                scores[key].append(value)
        for key, value in scores.items():
            print('Average {}: {:.2f}%'.format(key, np.mean(value) * 100))

    def grid_search(self, features, train_y):
        """Grid-search the forest hyperparameters with leave-one-out CV and print the best setting."""
        clf = RandomForestClassifier(random_state = 2)
        param_grid = { 
            'max_depth': [3, 4, 5],
            'min_samples_split': [2, 3, 5],
            'n_estimators': [50, 100, 150, 200]
        }
        grid = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, cv=LeaveOneOut())
        grid.fit(features, train_y)                     # fit grid to training data
        print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_}")

    def fit(self, features, train_y):
        """Fit a standardise + random forest pipeline on ``features`` / ``train_y``."""
        self.clf = make_pipeline(
            StandardScaler(),
            RandomForestClassifier(max_depth=4, min_samples_split=2, n_estimators=100)
        )
        self.clf.fit(features, train_y)

    def predict(self, X):
        """Predict class labels for ``X`` with the fitted pipeline."""
        return self.clf.predict(X)

    def __call__(self, X):
        return self.predict(X)

## 3.4. Baseline 4: Binary SVM

These Support Vector Machine classifiers work in the same way as the MultiClass SVM classifier, but here the problem is split: a separate binary SVM is trained for each class, on labels that only mark whether a sample belongs to that class. By having a classifier that focuses on one class at a time, the accuracy might improve.


**Hyperparameters**

*   `C`: Regularization parameter. The strength of the regularization is inversely proportional to C.
*   `gamma`: Kernel coefficient

We are going to use `GridSearchCV` to find the best hyperparameters for our dataset depending on leave one out cross-validation

In [None]:
class BSVM:
    """Create 2 simple linear Binary SVM classifiers, one for class 1 and one for class 2.

    Each classifier is trained one-vs-rest: its target is 1 for "its" class
    and 0 for everything else. The two binary outputs are merged as
    ``res1 + 2 * res2``, which yields 0 when neither classifier fires, 1 or 2
    when exactly one fires, and 3 when both fire. The ambiguous value 3 is
    resolved in favour of the classifier with the higher positive-class
    probability.
    """

    def __init__(self):
        self.clf1 = None                 # pipeline detecting class 1 (set by fit)
        self.clf2 = None                 # pipeline detecting class 2 (set by fit)
        self.evaluator = Evaluator()

    @staticmethod
    def _binary_targets(train_y, positive_class):
        """Return a 0/1 array shaped like ``train_y`` with ``positive_class`` marked as 1."""
        targets = np.zeros_like(train_y)
        targets[train_y == positive_class] = 1
        return targets

    @staticmethod
    def _make_svm():
        """Build the StandardScaler -> linear SVC pipeline shared by both classifiers."""
        # NOTE(review): gamma is kept from the original configuration, but
        # scikit-learn only uses it for the rbf/poly/sigmoid kernels.
        return make_pipeline(
            StandardScaler(),
            SVC(kernel='linear', C=1, gamma=10**(-9), decision_function_shape='ovr',
                break_ties=False, probability=True))

    def cross_validate(self, features, train_y, n_splits=5):
        """Evaluate the model on repeated random 75/25 splits and print mean metrics.

        Args:
            features: feature matrix (indexable by integer arrays).
            train_y: label array aligned with ``features``.
            n_splits: number of random train/validation splits to run.

        Returns:
            None. The mean of every metric returned by ``self.evaluator`` is
            printed as a percentage. The model is left fitted on the last split.
        """
        scores = {'Accuracy': [], 'Recall': [], 'Precision': [], 'F1': []}
        ss = ShuffleSplit(n_splits=n_splits, test_size=0.25)
        # Split the data that was passed in; the previous code split a
        # notebook-level global instead of the ``features`` argument.
        for train_idx, test_idx in ss.split(features):
            self.fit(features[train_idx], train_y[train_idx])
            train_y_star = self.predict(features[test_idx])
            metric_dict = self.evaluator(train_y[test_idx], train_y_star)
            for key, value in metric_dict.items():
                scores[key].append(value)
        for key, value in scores.items():
            print('Average {}: {:.2f}%'.format(key, np.mean(value) * 100))

    def grid_search(self, features, train_y):
        """Grid-search ``C``/``gamma`` for each binary task and print the best setting.

        The search is run twice with leave-one-out cross-validation: first with
        class 1 as the positive class, then with class 2.

        Args:
            features: training feature matrix (used unscaled, as before).
            train_y: training labels aligned with ``features``.
        """
        C_range = np.logspace(-2, 10, 13)
        # NOTE(review): with kernel='linear' the gamma axis does not influence
        # the fit, so it only multiplies the number of candidates. It is kept
        # so the printed best_params_ stay unchanged.
        gamma_range = np.logspace(-9, 3, 13)
        param_grid = dict(gamma=gamma_range, C=C_range)
        grid = GridSearchCV(
            SVC(kernel='linear', decision_function_shape='ovr', break_ties=False),
            param_grid=param_grid, n_jobs=-1, cv=LeaveOneOut())
        for positive_class in (1, 2):
            grid.fit(features, self._binary_targets(train_y, positive_class))
            print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_}")

    def fit(self, features, train_y):
        """Train both one-vs-rest SVM pipelines on ``features``.

        Args:
            features: training feature matrix.
            train_y: training labels; values 1 and 2 are the positive classes
                of ``clf1`` and ``clf2`` respectively.
        """
        # SVM class 1
        self.clf1 = self._make_svm()
        self.clf1.fit(features, self._binary_targets(train_y, 1))

        # SVM class 2
        self.clf2 = self._make_svm()
        self.clf2.fit(features, self._binary_targets(train_y, 2))

    def predict(self, features):
        """Predict a label in {0, 1, 2} for every row of ``features``.

        Returns:
            Array of labels: 0 when neither SVM fires, 1 or 2 when exactly one
            fires, and the more probable of the two when both fire.
        """
        res1 = self.clf1.predict(features)
        res2 = self.clf2.predict(features)
        combined = res1 + 2 * res2
        prob1 = self.clf1.predict_proba(features)
        prob2 = self.clf2.predict_proba(features)
        # Both classifiers claimed the sample (1 + 2 == 3): keep the class
        # whose SVM is more confident; ties go to class 2.
        both = combined == 3
        combined[both] = np.where(prob1[both, 1] > prob2[both, 1], 1, 2)
        return combined

    def __call__(self, X):
        """Alias for ``predict`` so the instance can be called directly."""
        return self.predict(X)
    
    

## 3.5. Baseline 5: MLP
A multilayer perceptron (MLP) is a relatively small, fully connected, feedforward artificial neural network which consists of an input layer, at least one hidden layer and an output layer. MLPs are frequently used for classification of data (input) to a class (output) especially when the data is not linearly separable as an MLP is a universal approximator for polynomial functions. 

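This particular MLP has one hidden layer with 1024 hidden neurons. The network is trained in a supervised way by optimizing the log-loss function with [LBFGS](https://en.wikipedia.org/wiki/Limited-memory_BFGS). To avoid overfitting the features of the training data, `early_stopping=True` is also set; note, however, that scikit-learn only applies early stopping with the `sgd` and `adam` solvers, so it has no effect when training with LBFGS.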

In [None]:
class MLP:
    """Multilayer-perceptron classifier with a single hidden layer of 1024 units."""

    def __init__(self):
        self.clf = None                  # fitted pipeline (set by fit)
        self.evaluator = Evaluator()

    def fit(self, features, train_y):
        """Train a StandardScaler -> MLPClassifier pipeline and store it on ``self.clf``.

        Args:
            features: training feature matrix.
            train_y: training labels, one per row of ``features``.
        """
        # NOTE(review): per the scikit-learn documentation, ``batch_size`` and
        # ``early_stopping`` are not used by the 'lbfgs' solver. They are kept
        # so the configuration is unchanged; switch to 'adam' or 'sgd' if early
        # stopping is actually required.
        self.clf = make_pipeline(
            StandardScaler(),
            MLPClassifier(solver='lbfgs', hidden_layer_sizes=(1024,), batch_size=50,
                          verbose=False, early_stopping=True)
        )
        self.clf.fit(features, train_y)

    def cross_validate(self, features, train_y, n_splits=5):
        """Evaluate the model on repeated random 75/25 splits and print mean metrics.

        Args:
            features: feature matrix (indexable by integer arrays).
            train_y: label array aligned with ``features``.
            n_splits: number of random train/validation splits to run.

        Returns:
            None. The mean of every metric returned by ``self.evaluator`` is
            printed as a percentage. The model is left fitted on the last split.
        """
        scores = {'Accuracy': [], 'Recall': [], 'Precision': [], 'F1': []}
        ss = ShuffleSplit(n_splits=n_splits, test_size=0.25)
        # Split the data that was passed in; the previous code split a
        # notebook-level global instead of the ``features`` argument.
        for train_idx, test_idx in ss.split(features):
            self.fit(features[train_idx], train_y[train_idx])
            train_y_star = self.predict(features[test_idx])
            metric_dict = self.evaluator(train_y[test_idx], train_y_star)
            for key, value in metric_dict.items():
                scores[key].append(value)
        for key, value in scores.items():
            print('Average {}: {:.2f}%'.format(key, np.mean(value) * 100))

    def predict(self, X):
        """Return the fitted pipeline's predictions for ``X``."""
        return self.clf.predict(X)

    def __call__(self, X):
        """Alias for ``predict`` so the instance can be called directly."""
        return self.predict(X)

## 3.6. Baseline 6: XGBoost

**XGBoost** or e**X**treme **G**radient **Boost**ing is a framework that has seen a lot of popularity recently in *Kaggle* competitions, where many teams have used it with great success to reach a high score on the leaderboard. <br>
XGBoost is an open-source library which implements **gradient boosted decision trees**. It focuses on speed and performance.<br>
One of the main advantages of this classifier is its ease of parallelization which allows computers with multiple cores to run the classifier faster than would regularly be expected.<br>

**Hyperparameters**

*   `gamma`: Minimum loss reduction required to make a further partition on a leaf node of the tree.
*   `learning_rate`: Step size shrinkage used in updates to prevent overfitting.
*   `subsample`: Subsample ratio of the training instances.
*   `max_depth`: Maximum depth of a tree.
*   `colsample_bytree`: Subsample ratio of columns when constructing each tree.

In [None]:
class XGBoost:
    """Classification using XGBoost"""

    def __init__(self):
        self.clf = None                  # fitted pipeline (set by fit)
        self.evaluator = Evaluator()

    @staticmethod
    def _make_classifier(**tuned):
        """Create an XGBClassifier with the settings shared by grid_search and fit.

        Args:
            **tuned: extra keyword arguments (the tuned hyperparameters)
                forwarded to ``XGBClassifier`` on top of the shared settings.
        """
        return XGBClassifier(base_score=0.33, booster='gbtree', colsample_bylevel=1,
                             colsample_bynode=1, eval_metric='mlogloss',
                             gpu_id=-1, importance_type='gain',
                             interaction_constraints='',
                             max_delta_step=0, min_child_weight=1, missing=np.nan,
                             monotone_constraints='()', n_estimators=100,
                             num_parallel_tree=1, objective='multi:softmax', random_state=0,
                             reg_alpha=0, reg_lambda=1,
                             tree_method='exact', use_label_encoder=False,
                             validate_parameters=1, verbosity=None,
                             **tuned)

    def cross_validate(self, features, train_y, n_splits=5):
        """Evaluate the model on repeated random 75/25 splits and print mean metrics.

        Args:
            features: feature matrix (indexable by integer arrays).
            train_y: label array aligned with ``features``.
            n_splits: number of random train/validation splits to run.

        Returns:
            None. The mean of every metric returned by ``self.evaluator`` is
            printed as a percentage. The model is left fitted on the last split.
        """
        scores = {'Accuracy': [], 'Recall': [], 'Precision': [], 'F1': []}
        ss = ShuffleSplit(n_splits=n_splits, test_size=0.25)
        # Split the data that was passed in; the previous code split a
        # notebook-level global instead of the ``features`` argument.
        for train_idx, test_idx in ss.split(features):
            self.fit(features[train_idx], train_y[train_idx])
            train_y_star = self.predict(features[test_idx])
            metric_dict = self.evaluator(train_y[test_idx], train_y_star)
            for key, value in metric_dict.items():
                scores[key].append(value)
        for key, value in scores.items():
            print('Average {}: {:.2f}%'.format(key, np.mean(value) * 100))

    def grid_search(self, features, train_y):
        """Grid-search the XGBoost hyperparameters and print the best setting.

        Candidates are scored on 5 random 80/20 splits (fixed seed).

        Args:
            features: training feature matrix (used unscaled, as before).
            train_y: training labels aligned with ``features``.
        """
        # initialize grid search params
        param_grid = dict(
            gamma=np.arange(0.01, 0.21, 0.05),
            learning_rate=np.arange(0.05, 0.3, 0.05),
            subsample=np.arange(0.3, 0.7, 0.05),
            max_depth=np.arange(3, 7, 1),
            colsample_bytree=np.arange(0.5, 1, 0.25),
        )
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
        grid = GridSearchCV(self._make_classifier(),
                            param_grid=param_grid, n_jobs=-1, cv=cv, verbose=10)
        grid.fit(features, train_y)
        print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_}")

    def fit(self, features, train_y):
        """Train a StandardScaler -> XGBClassifier pipeline and store it on ``self.clf``.

        Args:
            features: training feature matrix.
            train_y: training labels, one per row of ``features``.
        """
        self.clf = make_pipeline(
            StandardScaler(),
            # Hard-coded tuned values; re-run grid_search to refresh them.
            self._make_classifier(gamma=0.2, learning_rate=0.15, subsample=0.5,
                                  max_depth=5, colsample_bytree=1))
        self.clf.fit(features, train_y)                 # fit the classifier to training features

    def predict(self, X):
        """Return the fitted pipeline's predictions for ``X``."""
        return self.clf.predict(X)

    def __call__(self, X):
        """Alias for ``predict`` so the instance can be called directly."""
        return self.predict(X)

# 4. Experiments
<div class="alert alert-block alert-info"> <b>NOTE:</b> Do <i>NOT</i> use this section to keep track of every little change you make in your code! Instead, highlight the most important findings and the major (best) pipelines that you've discovered.  
</div>
<br>

## 4.1. Pipeline 1: MultiClass SVM with Facial Encodings


In [None]:
%%time

# Pipeline 1: FaceRec preprocessing -> facial-encoding features -> MultiClassSVM,
# evaluated with classifier.cross_validate on the cleaned, augmented training set.
# NOTE(review): the label/preproc/feature strings passed to the helpers below
# look like cache or lookup keys — confirm before renaming any of them.
preprocessor = FaceRec_Preprocessor(face_size=FACE_SIZE, faces=-1)
feature_extractor = FEncodings_FeatureExtractor()
classifier = MultiClassSVM()
cross_val = 10  # number of splits requested from cross_validate at the end of the cell

print('Preprocessing training data')
train_X, train_y, ids = preprocessor(train, train['class'].values, label='train')
print('Calculating feature for training dataset')
train_features = feature_extractor(train_X, label='train', preproc='FACEREC')
print('Cleaning the training dataset')
cleaner = CleanTrainingSet(train_X, train_y, ids, train_features)
train_X = cleaner.clean(label='train', feature='FE', preproc='FACEREC')
train_y = cleaner.get_class()
train_X, train_y = remove_blacks(train_X, train_y)

print('Augmenting training dataset')
train_X, train_y = augmenter(train_X,train_y)

print('Preprocessing test data')
test_X, _, test_ids = preprocessor(test, label='test')
print('Calculating feature for test dataset')
test_features = feature_extractor(test_X, label='test', preproc='FACEREC')
print('Cleaning the test dataset')
# The test-set cleaner has no labels (None) and is given the cleaned, augmented
# training images, labels and features instead.
cleaner = CleanTrainingSet(test_X, None, test_ids, test_features)
test_X = cleaner.clean(train_X=train_X, train_y=train_y, train_features=feature_extractor(train_X, label='train',
                                                                                          preproc='FACEREC_clean_augm'),
                      label='test', feature='FE', preproc='FACEREC_augm')
# Features of the cleaned, augmented training set used for the evaluation
# (same extractor call as the train_features argument just above).
f = feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm')
print('Fitting to input features\n')
print('Performance of model on the training set with {}-fold cross validation'.format(cross_val))
classifier.cross_validate(f, train_y, n_splits=cross_val)

## 4.2. Pipeline 2: RandomForest with Facial Encodings

In [None]:
%%time

# Pipeline 2: FaceRec preprocessing -> facial-encoding features -> RandomForest,
# evaluated with classifier.cross_validate on the cleaned, augmented training set.
# NOTE(review): the label/preproc/feature strings passed to the helpers below
# look like cache or lookup keys — confirm before renaming any of them.
preprocessor = FaceRec_Preprocessor(face_size=FACE_SIZE, faces=-1)
feature_extractor = FEncodings_FeatureExtractor()
classifier = RandomForest()
cross_val = 10  # number of splits requested from cross_validate at the end of the cell

print('Preprocessing training data')
train_X, train_y, ids = preprocessor(train, train['class'].values, label='train')
print('Calculating feature for training dataset')
train_features = feature_extractor(train_X, label='train', preproc='FACEREC')
print('Cleaning the training dataset')
cleaner = CleanTrainingSet(train_X, train_y, ids, train_features)
train_X = cleaner.clean(label='train', feature='FE', preproc='FACEREC')
train_y = cleaner.get_class()
train_X, train_y = remove_blacks(train_X, train_y)

print('Augmenting training dataset')
train_X, train_y = augmenter(train_X, train_y)

print('Preprocessing test data')
test_X, _, test_ids = preprocessor(test, label='test')
print('Calculating feature for test dataset')
test_features = feature_extractor(test_X, label='test', preproc='FACEREC')
print('Cleaning the test dataset')
# The test-set cleaner has no labels (None) and is given the cleaned, augmented
# training images, labels and features instead.
cleaner = CleanTrainingSet(test_X, None, test_ids, test_features)
test_X = cleaner.clean(
    train_X=train_X,
    train_y=train_y,
    train_features=feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm'),
    label='test', feature='FE', preproc='FACEREC_augm',
)
# Features of the cleaned, augmented training set used for the evaluation.
f = feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm')
print('Fitting to input features\n')
print('Performance of model on the training set with {}-fold cross validation'.format(cross_val))
# Pass cross_val explicitly: without it cross_validate falls back to its
# default number of splits, contradicting the message printed above.
classifier.cross_validate(f, train_y, n_splits=cross_val)

## 4.3. Pipeline 3: XGBoost with Facial Encodings

In [None]:
%%time

# Pipeline 3: FaceRec preprocessing -> facial-encoding features -> XGBoost,
# evaluated with classifier.cross_validate on the cleaned, augmented training set.
# NOTE(review): the label/preproc/feature strings passed to the helpers below
# look like cache or lookup keys — confirm before renaming any of them.
preprocessor = FaceRec_Preprocessor(face_size=FACE_SIZE, faces=-1)
feature_extractor = FEncodings_FeatureExtractor()
classifier = XGBoost()
cross_val = 10  # number of splits requested from cross_validate at the end of the cell

print('Preprocessing training data')
train_X, train_y, ids = preprocessor(train, train['class'].values, label='train')
print('Calculating feature for training dataset')
train_features = feature_extractor(train_X, label='train', preproc='FACEREC')
print('Cleaning the training dataset')
cleaner = CleanTrainingSet(train_X, train_y, ids, train_features)
train_X = cleaner.clean(label='train', feature='FE', preproc='FACEREC')
train_y = cleaner.get_class()
train_X, train_y = remove_blacks(train_X, train_y)

print('Augmenting training dataset')
train_X, train_y = augmenter(train_X,train_y)

print('Preprocessing test data')
test_X, _, test_ids = preprocessor(test, label='test')
print('Calculating feature for test dataset')
test_features = feature_extractor(test_X, label='test', preproc='FACEREC')
print('Cleaning the test dataset')
# The test-set cleaner has no labels (None) and is given the cleaned, augmented
# training images, labels and features instead.
cleaner = CleanTrainingSet(test_X, None, test_ids, test_features)
test_X = cleaner.clean(train_X=train_X, train_y=train_y, train_features=feature_extractor(train_X, label='train',
                                                                                          preproc='FACEREC_clean_augm'),
                      label='test', feature='FE', preproc='FACEREC_augm')
# Features of the cleaned, augmented training set used for the evaluation
# (same extractor call as the train_features argument just above).
f = feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm')
print('Fitting to input features\n')
print('Performance of model on the training set with {}-fold cross validation'.format(cross_val))
classifier.cross_validate(f, train_y, n_splits=cross_val)

## 4.4. Pipeline 4: MLP with Facial Encodings

In [None]:
%%time

# Pipeline 4: FaceRec preprocessing -> facial-encoding features -> MLP,
# evaluated with classifier.cross_validate on the cleaned, augmented training set.
# NOTE(review): the label/preproc/feature strings passed to the helpers below
# look like cache or lookup keys — confirm before renaming any of them.
preprocessor = FaceRec_Preprocessor(face_size=FACE_SIZE, faces=-1)
feature_extractor = FEncodings_FeatureExtractor()
classifier = MLP()
cross_val = 10  # number of splits requested from cross_validate at the end of the cell

print('Preprocessing training data')
train_X, train_y, ids = preprocessor(train, train['class'].values, label='train')
print('Calculating feature for training dataset')
train_features = feature_extractor(train_X, label='train', preproc='FACEREC')
print('Cleaning the training dataset')
cleaner = CleanTrainingSet(train_X, train_y, ids, train_features)
train_X = cleaner.clean(label='train', feature='FE', preproc='FACEREC')
train_y = cleaner.get_class()
train_X, train_y = remove_blacks(train_X, train_y)

print('Augmenting training dataset')
train_X, train_y = augmenter(train_X,train_y)

print('Preprocessing test data')
test_X, _, test_ids = preprocessor(test, label='test')
print('Calculating feature for test dataset')
test_features = feature_extractor(test_X, label='test', preproc='FACEREC')
print('Cleaning the test dataset')
# The test-set cleaner has no labels (None) and is given the cleaned, augmented
# training images, labels and features instead.
cleaner = CleanTrainingSet(test_X, None, test_ids, test_features)
test_X = cleaner.clean(train_X=train_X, train_y=train_y, train_features=feature_extractor(train_X, label='train',
                                                                                          preproc='FACEREC_clean_augm'),
                      label='test', feature='FE', preproc='FACEREC_augm')
# Features of the cleaned, augmented training set used for the evaluation
# (same extractor call as the train_features argument just above).
f = feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm')
print('Fitting to input features\n')
print('Performance of model on the training set with {}-fold cross validation'.format(cross_val))
classifier.cross_validate(f, train_y, n_splits=cross_val)

## 4.5. Pipeline 5: Binary SVM with Facial Encodings

In [None]:
%%time

# Binary SVM pipeline: FaceRec preprocessing -> facial-encoding features -> BSVM,
# evaluated with classifier.cross_validate on the cleaned, augmented training set.
# NOTE(review): the label/preproc/feature strings passed to the helpers below
# look like cache or lookup keys — confirm before renaming any of them.
preprocessor = FaceRec_Preprocessor(face_size=FACE_SIZE, faces=-1)
feature_extractor = FEncodings_FeatureExtractor()
classifier = BSVM()
cross_val = 10  # number of splits requested from cross_validate at the end of the cell

print('Preprocessing training data')
train_X, train_y, ids = preprocessor(train, train['class'].values, label='train')
print('Calculating feature for training dataset')
train_features = feature_extractor(train_X, label='train', preproc='FACEREC')
print('Cleaning the training dataset')
cleaner = CleanTrainingSet(train_X, train_y, ids, train_features)
train_X = cleaner.clean(label='train', feature='FE', preproc='FACEREC')
train_y = cleaner.get_class()
train_X, train_y = remove_blacks(train_X, train_y)

print('Augmenting training dataset')
train_X, train_y = augmenter(train_X, train_y)

print('Preprocessing test data')
test_X, _, test_ids = preprocessor(test, label='test')
print('Calculating feature for test dataset')
test_features = feature_extractor(test_X, label='test', preproc='FACEREC')
print('Cleaning the test dataset')
# The test-set cleaner has no labels (None) and is given the cleaned, augmented
# training images, labels and features instead.
cleaner = CleanTrainingSet(test_X, None, test_ids, test_features)
test_X = cleaner.clean(
    train_X=train_X,
    train_y=train_y,
    train_features=feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm'),
    label='test', feature='FE', preproc='FACEREC_augm',
)
# Features of the cleaned, augmented training set used for the evaluation.
f = feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm')
print('Fitting to input features\n')
print('Performance of model on the training set with {}-fold cross validation'.format(cross_val))
# Pass cross_val explicitly: without it cross_validate falls back to its
# default of 5 splits, contradicting the message printed above.
classifier.cross_validate(f, train_y, n_splits=cross_val)

## 4.6. Pipeline 6: MultiClassSVM with HOG

In [None]:
%%time

# HOG pipeline: FaceRec preprocessing -> HOG features -> MultiClassSVM,
# evaluated with classifier.cross_validate on the cleaned, augmented training set.
# NOTE(review): the label/preproc/feature strings passed to the helpers below
# look like cache or lookup keys — confirm before renaming any of them.
preprocessor = FaceRec_Preprocessor(face_size=FACE_SIZE, faces=-1)
feature_extractor = HOGFeatureExtractor()
classifier = MultiClassSVM()
cross_val = 10  # number of splits requested from cross_validate at the end of the cell

print('Preprocessing training data')
train_X, train_y, ids = preprocessor(train, train['class'].values, label='train')
print('Calculating feature for training dataset')
train_features = feature_extractor(train_X, label='train', preproc='FACEREC')
print('Cleaning the training dataset')
cleaner = CleanTrainingSet(train_X, train_y, ids, train_features)
train_X = cleaner.clean(label='train', feature='HOG', preproc='FACEREC')
train_y = cleaner.get_class()
train_X, train_y = remove_blacks(train_X, train_y)

print('Augmenting training dataset')
train_X, train_y = augmenter(train_X,train_y)

print('Preprocessing test data')
test_X, _, test_ids = preprocessor(test, label='test')
print('Calculating feature for test dataset')
test_features = feature_extractor(test_X, label='test', preproc='FACEREC')
print('Cleaning the test dataset')
# The test-set cleaner has no labels (None) and is given the cleaned, augmented
# training images, labels and features instead.
cleaner = CleanTrainingSet(test_X, None, test_ids, test_features)
test_X = cleaner.clean(train_X=train_X, train_y=train_y, train_features=feature_extractor(train_X, label='train',
                                                                                          preproc='FACEREC_clean_augm'),
                      label='test', feature='HOG', preproc='FACEREC_augm')
# Features of the cleaned, augmented training set used for the evaluation
# (same extractor call as the train_features argument just above).
f = feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm')
print('Fitting to input features\n')
print('Performance of model on the training set with {}-fold cross validation'.format(cross_val))
classifier.cross_validate(f, train_y, n_splits=cross_val)

# 5. Publishing best results

In [None]:
%%time

# Final model: FaceRec preprocessing -> facial-encoding features -> BSVM,
# fitted on the whole cleaned, augmented training set (no validation here).
# NOTE(review): the label/preproc/feature strings passed to the helpers below
# look like cache or lookup keys — confirm before renaming any of them.
preprocessor = FaceRec_Preprocessor(face_size=FACE_SIZE, faces=-1)
feature_extractor = FEncodings_FeatureExtractor()
classifier = BSVM()

print('Preprocessing training data')
train_X, train_y, ids = preprocessor(train, train['class'].values, label='train')
print('Calculating feature for training dataset')
train_features = feature_extractor(train_X, label='train', preproc='FACEREC')
print('Cleaning the training dataset')
cleaner = CleanTrainingSet(train_X, train_y, ids, train_features)
train_X = cleaner.clean(label='train', feature='FE', preproc='FACEREC')
train_y = cleaner.get_class()
train_X, train_y = remove_blacks(train_X, train_y)

print('Augmenting training dataset')
train_X, train_y = augmenter(train_X, train_y)

print('Preprocessing test data')
test_X, _, test_ids = preprocessor(test, label='test')
print('Calculating feature for test dataset')
test_features = feature_extractor(test_X, label='test', preproc='FACEREC')
print('Cleaning the test dataset')
# The test-set cleaner has no labels (None) and is given the cleaned, augmented
# training images, labels and features instead.
cleaner = CleanTrainingSet(test_X, None, test_ids, test_features)
test_X = cleaner.clean(
    train_X=train_X,
    train_y=train_y,
    train_features=feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm'),
    label='test', feature='FE', preproc='FACEREC_augm',
)
# train the model on the features
f = feature_extractor(train_X, label='train', preproc='FACEREC_clean_augm')
# The former "Performance ... cross validation" message was dropped: this cell
# only fits the final model and reports no validation score.
print('Fitting to input features\n')
classifier.fit(f, train_y)

In [None]:
# Extract features for the cleaned test images, then let the fitted
# classifier label them.
test_features_clean = feature_extractor(test_X, label='test', preproc='FACEREC_clean_augm')
test_y_star = classifier(test_features_clean)

In [None]:
# Submission frame: the test table without its image column, plus the
# predicted class of every row.
submission = test.drop(columns=['img'])
submission['class'] = test_y_star

submission

In [None]:
# Write the predictions to submission.csv in the working directory
# (the DataFrame index is written as the first column).
submission.to_csv('submission.csv')

# 6. Discussion
In this group assignment we had the opportunity to experiment with the face recognition task, using various data preprocessing and feature extraction techniques. HAAR was already given as a face detection algorithm; however, we wanted to try different techniques such as InsightFace and HAAR with multi-face detection to improve the quality of the training data. Among all the preprocessors we tried, FaceRecognition was the most successful one, providing a good-quality data set after cleaning the data and removing black images.

For feature extraction, we looked into six different methods: HOG, PCA, FAST, SIFT, FaceNet and Facial Encodings. The t-SNE plots of HOG, FAST and SIFT were not promising, i.e. clusters were not clearly separable in 2-D. Therefore, we decided to use facial encodings as features.

As for evaluation metrics, we noticed that accuracy by itself is not the most informative metric. Therefore, we used precision and recall (and hence the F1 score) as evaluation metrics in addition to accuracy to choose the right classifier.

Five classifiers were implemented, namely Multi-class SVM, Double binary SVM, Multi-layer Perceptron (MLP), Random Forest and XGBoost. By combining all preprocessors, all feature extractors and all classifiers, the highest score we got is `94.603%` with Random Forest using the hyperparameters `(max_depth=4, min_samples_split=2, n_estimators=100)`, chosen after experimenting with the grid search. 

This is a very good score considering the capabilities of the model and the size of the training data we have. If we had more time, however, we would try deep learning methods such as Convolutional Neural Networks. In order to train such a neural network, we would need a larger training set, which can be generated by data augmentation.

