Let's first define classifiers and some functions that we will need later.

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC

# Silence FutureWarning messages (raised by some sklearn classifiers)
# so they do not clutter the cell output.
import warnings
warnings.simplefilter('ignore', FutureWarning)

def _build_classifiers():
    """Build the name -> estimator mapping of all classifiers to compare.

    Entries are inserted in a fixed order: k-nearest neighbours, decision
    trees, random forests, naive Bayes, then linear-kernel SVMs.
    """
    models = {}

    # k-nearest neighbours: default and distance-weighted variant for each k
    for k in (1, 3, 5, 7):
        models["knn, k = {}".format(k)] = KNeighborsClassifier(k)
        models["knn, k = {}, weights = distance".format(k)] = KNeighborsClassifier(k, weights='distance')

    # Single decision trees, one per split criterion
    for criterion in ('gini', 'entropy'):
        models["decision tree, {}".format(criterion)] = DecisionTreeClassifier(criterion=criterion)

    # Random forests with 100 trees, one per split criterion
    for criterion in ('gini', 'entropy'):
        models["random forest, {}".format(criterion)] = RandomForestClassifier(n_estimators=100, criterion=criterion)

    models["naive bayes"] = GaussianNB()

    # Linear-kernel SVMs over several values of the C parameter
    for c in (0.025, 0.05, 0.1, 0.2):
        models["support vector machine, kernel=linear, c={}".format(c)] = SVC(kernel="linear", C=c)

    return models


# Classifiers to evaluate, keyed by a description of their settings
classifiers = _build_classifiers()

def cosine_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    The dot product of the inputs is divided by the product of their
    ``np.linalg.norm`` values (the Euclidean norm for 1-D inputs).
    There is no guard against a zero norm.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def evaluate_classifiers(x_train, x_test, y_train, y_test):
    """Fit every entry of the module-level ``classifiers`` dict and print its test score.

    Each classifier is fitted in place on ``(x_train, y_train)`` and then
    scored on ``(x_test, y_test)`` through its ``score`` method. One
    ``"<name>: <score>"`` line is printed per classifier; nothing is returned.
    """
    for label, model in classifiers.items():
        model.fit(x_train, y_train)
        score_on_test = model.score(x_test, y_test)
        print("{}: {}".format(label, score_on_test))

def _split_features_and_labels(dataset):
    # Split a DataFrame into (features, labels) arrays: the last column is
    # the label, every column before it is a feature.
    features = dataset.iloc[:, :-1].values
    labels = dataset.iloc[:, -1].values
    return features, labels


def prepare_train_test_set(train_file, test_file=None, normalize=True,
                           test_size=0.2, random_state=None):
    """Load encodings from CSV and return train/test features and labels.

    Each file is read with ``pd.read_csv``. The last column is used as the
    label and all preceding columns as features, so the label position
    always agrees with the feature slice regardless of the column count.

    Parameters
    ----------
    train_file : path of the CSV file holding the training data.
    test_file : optional path of a CSV file holding the test data. When it
        is ``None``, part of ``train_file`` is held out as the test set.
    normalize : when true, standardize the features with a
        ``StandardScaler`` fitted on the training features only and applied
        to both the training and the test features.
    test_size : forwarded to ``train_test_split``; only used when
        ``test_file`` is ``None``.
    random_state : forwarded to ``train_test_split``; pass an int to make
        the hold-out split reproducible. ``None`` keeps it random.

    Returns
    -------
    tuple ``(x_train, x_test, y_train, y_test)``.
    """
    # NOTE(review): read_csv treats the first line of a file as a header row
    # by default - confirm the encoding files actually start with one.
    x_train, y_train = _split_features_and_labels(pd.read_csv(train_file))

    if test_file is None:
        # No separate test file: hold out part of the training data
        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(
            x_train, y_train, test_size=test_size, random_state=random_state)
    else:
        # Otherwise the test set comes from its own CSV file
        x_test, y_test = _split_features_and_labels(pd.read_csv(test_file))

    # Normalize features for better results
    if normalize:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        # Fit on the training features only so the test set does not
        # influence the scaling statistics.
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test

## Evaluating classifiers
Let's first train and test our models on images taken in the Gazebo simulation.

We can see that most of the classifiers perform extremely well. The only exception is the decision tree, but it still works fairly well.

There are a few reasons why all the classifiers work really well.

1. The `face_recognition` library does a very good job of encoding the faces into embeddings. As an example later in this notebook shows, the classifiers still perform very well when only the original images of the faces are used as the training set and faces from the simulation are used as the test set.

2. We only need to recognize 21 faces, which makes this problem much easier.

3. The nature of the simulation makes this task easier as well. There is no blur in the images recorded in the simulation and the lighting conditions are very simple.

We can also see that the k-nearest neighbors algorithm works very well, even though it is fed high-dimensional data.

In [6]:
# Load the simulation encodings. No test file is passed, so
# prepare_train_test_set holds out part of this file as the test set.
x_train, x_test, y_train, y_test = prepare_train_test_set('../encodings/faces_all_simulation.txt')

# Fit every classifier on the training split and print its test score
evaluate_classifiers(x_train, x_test, y_train, y_test)

knn, k = 7, weights = distance: 0.99481865285
support vector machine, kernel=linear, c=0.05: 0.989637305699
decision tree, entropy: 0.79274611399
random forest, gini: 0.984455958549
random forest, entropy: 0.989637305699
knn, k = 1, weights = distance: 0.99481865285
naive bayes: 0.989637305699
decision tree, gini: 0.849740932642
support vector machine, kernel=linear, c=0.1: 0.989637305699
support vector machine, kernel=linear, c=0.2: 0.989637305699
knn, k = 3, weights = distance: 0.99481865285
support vector machine, kernel=linear, c=0.025: 0.989637305699
knn, k = 5, weights = distance: 0.99481865285
knn, k = 1: 0.99481865285
knn, k = 3: 0.99481865285
knn, k = 5: 0.984455958549
knn, k = 7: 0.984455958549
