In [27]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score
import face_recognition
from sklearn.svm import SVC
from tqdm import tqdm
from scipy import stats

Load the data:

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this cell on pickle files that come from a trusted source.
with open('../data/good_image_ids_list.pkl', 'rb') as f:
    good_images = pickle.load(f)
with open('../data/face_encodings.pkl', 'rb') as f:
    encodings = pd.DataFrame(pickle.load(f))
# Whitespace-separated annotation table. `sep=r'\s+'` is the documented
# replacement for the deprecated `delim_whitespace=True` and parses identically.
identities = pd.read_table('../annotations/identity_CelebA.txt', sep=r'\s+')

Prepare the data:

In [3]:
# Keep only the annotation rows whose image is listed in `good_images`.
# `.copy()` detaches the filtered frame so that adding the `person` column
# below writes to an independent frame instead of a slice of the original
# (avoids SettingWithCopyWarning).
identities = identities.loc[identities.image_id.isin(good_images)].copy()

# Identities that are left with exactly one image after filtering.
identity_counts = identities.groupby('identity').count()
only_1_photo_people = identity_counts.loc[identity_counts.image_id == 1].index.tolist()

# Label single-image identities as 'unknown'; every other identity keeps its
# id, rendered as a string. `isin` + `where` replaces the per-row
# `x in list` lookup, which was quadratic in the number of rows.
is_single_photo = identities.identity.isin(only_1_photo_people)
identities['person'] = identities.identity.astype(str).where(~is_single_photo, 'unknown')

# Attach the labels to the face encodings via the shared `image_id` column.
df = encodings.merge(identities, on='image_id')

Convert needed values to numpy arrays:

In [4]:
# Feature matrix: stack the per-row `encoding` entries vertically.
x = np.vstack(df['encoding'].to_numpy())
# Target vector: the `person` label of each row.
y = df['person'].to_numpy()

Do the train/test data split:

In [5]:
# 80/20 split, stratified on the label so each class keeps its proportion in
# both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Check the distributions:

In [6]:
# Two-column table (label, number of samples) for the training labels.
unique, counts = np.unique(y_train, return_counts=True)
train_label_table = np.asarray((unique, counts)).transpose()
print(train_label_table)

[['10000' 24]
 ['10001' 23]
 ['10003' 22]
 ...
 ['9986' 23]
 ['9987' 23]
 ['unknown' 34]]


In [7]:
# Two-column table (label, number of samples) for the test labels.
unique, counts = np.unique(y_test, return_counts=True)
test_label_table = np.asarray((unique, counts)).transpose()
print(test_label_table)

[['10000' 6]
 ['10001' 5]
 ['10003' 5]
 ...
 ['9986' 6]
 ['9987' 6]
 ['unknown' 8]]


## Models

### Distance based prediction

In [10]:
# One list entry per training row (iterating a 2-D array yields its rows).
list_of_train_encodings = list(X_train)

In [11]:
# Nearest-neighbour prediction: each test encoding gets the label of the
# training encoding with the smallest face distance.
#
# The training matrix is passed as the ndarray `X_train` rather than as a
# Python list of its rows: a list has to be converted back into an array
# inside every `face_distance` call, which is loop-invariant work repeated
# once per test sample. The computed distances are the same.
distance_predictions = []
for row in tqdm(X_test):
    distances = face_recognition.face_distance(X_train, row)
    closest = np.argmin(distances)
    distance_predictions.append(y_train[closest])

100%|████████████████████████████████████████████████████████████████████████████| 13459/13459 [17:28<00:00, 12.84it/s]


In [14]:
# Turn the collected labels into an ndarray so they can be compared
# element-wise against `y_test` in the metrics cell.
distance_predictions = np.asarray(distance_predictions)

In [26]:
# Element-wise masks over the test set, shared by all open-set counts below.
truly_unknown = y_test == 'unknown'
truly_known = y_test != 'unknown'
predicted_unknown = distance_predictions == 'unknown'
predicted_known = distance_predictions != 'unknown'

unknown_as_known = (truly_unknown & predicted_known).sum()
known_as_unknown = (truly_known & predicted_unknown).sum()
unknown_as_unknown = (truly_unknown & predicted_unknown).sum()
n_test = y_test.size

# NOTE(review): both rates are divided by the size of the whole test set, not
# by the number of unknown / known samples -- confirm this is the intended
# definition before comparing against externally reported FAR/FRR figures.
print(f"Accuracy (recognition rate): {accuracy_score(y_test, distance_predictions)}")
print(f"Macro F1-Score: {f1_score(y_test, distance_predictions, average='macro')}")
print(f"Unknown people classified as known: {unknown_as_known}")
print(f"False acceptance rate: {unknown_as_known / n_test}")
print(f"Known people classified as unknown: {known_as_unknown}")
print(f"False reject rate: {known_as_unknown / n_test}")
print(f"Unknown people classified as unknown: {unknown_as_unknown}")

Accuracy (recognition rate): 0.9485102905119251
Macro F1-Score: 0.9454698832241204
Unknown people classified as known: 7
False acceptance rate: 0.0005200980756371201
Known people classified as unknown: 0
False reject rate: 0.0
Unknown people classified as unknown: 1


### Frequency based prediction

In [28]:
# Majority-vote prediction: for each test encoding, collect the labels of all
# training encodings that `compare_faces` reports as a match and keep the
# most frequent one.
#
# `X_train` is passed as an ndarray instead of a Python list of its rows, so
# the whole training set is not re-converted to an array on every iteration;
# the match flags are unchanged.
#
# The full `stats.mode` result is appended (not just the label) because the
# following cell extracts the label with `[:, 0, 0]`.
# NOTE(review): a test face with no match at all yields an empty `agreed`
# array, and string input to `stats.mode` may not be accepted by every SciPy
# release -- confirm against the pinned SciPy version.
freq_predictions = []
for row in tqdm(X_test):
    matches = np.asarray(face_recognition.compare_faces(X_train, row), dtype=bool)
    agreed = y_train[matches]
    prediction = stats.mode(agreed)
    freq_predictions.append(prediction)

100%|████████████████████████████████████████████████████████████████████████████| 13459/13459 [18:23<00:00, 12.20it/s]


In [35]:
# Stack the collected `stats.mode` results and keep, for every test sample,
# the entry at position [0, 0] of its result (the modal label).
mode_results = np.array(freq_predictions)
freq_predictions = mode_results[:, 0, 0]

In [36]:
# Element-wise masks over the test set, shared by all open-set counts below.
truly_unknown = y_test == 'unknown'
truly_known = y_test != 'unknown'
predicted_unknown = freq_predictions == 'unknown'
predicted_known = freq_predictions != 'unknown'

unknown_as_known = (truly_unknown & predicted_known).sum()
known_as_unknown = (truly_known & predicted_unknown).sum()
unknown_as_unknown = (truly_unknown & predicted_unknown).sum()
n_test = y_test.size

# NOTE(review): both rates are divided by the size of the whole test set, not
# by the number of unknown / known samples -- confirm this is the intended
# definition before comparing against externally reported FAR/FRR figures.
print(f"Accuracy (recognition rate): {accuracy_score(y_test, freq_predictions)}")
print(f"Macro F1-Score: {f1_score(y_test, freq_predictions, average='macro')}")
print(f"Unknown people classified as known: {unknown_as_known}")
print(f"False acceptance rate: {unknown_as_known / n_test}")
print(f"Known people classified as unknown: {known_as_unknown}")
print(f"False reject rate: {known_as_unknown / n_test}")
print(f"Unknown people classified as unknown: {unknown_as_unknown}")

Accuracy (recognition rate): 0.8771825544245486
Macro F1-Score: 0.861244222796153
Unknown people classified as known: 8
False acceptance rate: 0.0005943978007281373
Known people classified as unknown: 0
False reject rate: 0.0
Unknown people classified as unknown: 0


### SVM

In [8]:
# Support-vector classifier with scikit-learn's default hyperparameters,
# trained on the training encodings and their person labels.
clf = SVC()
clf.fit(X=X_train, y=y_train)

SVC()

In [37]:
# One `predict` call per test row; each call receives a single-row batch, so
# every list element is the array returned for that one sample.
svm_predictions = [clf.predict([row]) for row in tqdm(X_test)]

100%|████████████████████████████████████████████████████████████████████████████| 13459/13459 [38:14<00:00,  5.87it/s]


In [39]:
# Unwrap the single label held by each per-sample prediction array.
# The result must be an ndarray, not a Python list: comparing a list with
# `== 'unknown'` / `!= 'unknown'` yields one scalar bool instead of an
# element-wise mask, which silently corrupts the open-set counts computed in
# the metrics cell.
svm_predictionsl = np.array([ar[0] for ar in svm_predictions])

In [41]:
# `svm_predictionsl` may be a plain Python list; comparing a list with a
# string gives a single bool rather than an element-wise mask, so convert to
# an ndarray before building the masks (no-op if it already is one).
svm_pred = np.asarray(svm_predictionsl)

# Element-wise masks over the test set, shared by all open-set counts below.
truly_unknown = y_test == 'unknown'
truly_known = y_test != 'unknown'
predicted_unknown = svm_pred == 'unknown'
predicted_known = svm_pred != 'unknown'

unknown_as_known = (truly_unknown & predicted_known).sum()
known_as_unknown = (truly_known & predicted_unknown).sum()
unknown_as_unknown = (truly_unknown & predicted_unknown).sum()
n_test = y_test.size

# NOTE(review): both rates are divided by the size of the whole test set, not
# by the number of unknown / known samples -- confirm this is the intended
# definition before comparing against externally reported FAR/FRR figures.
print(f"Accuracy (recognition rate): {accuracy_score(y_test, svm_pred)}")
print(f"Macro F1-Score: {f1_score(y_test, svm_pred, average='macro')}")
print(f"Unknown people classified as known: {unknown_as_known}")
print(f"False acceptance rate: {unknown_as_known / n_test}")
print(f"Known people classified as unknown: {known_as_unknown}")
print(f"False reject rate: {known_as_unknown / n_test}")
print(f"Unknown people classified as unknown: {unknown_as_unknown}")

Accuracy (recognition rate): 0.9570547588973921
Macro F1-Score: 0.9532790829282605
Unknown people classified as known: 8
False acceptance rate: 0.0005943978007281373
Known people classified as unknown: 0
False reject rate: 0.0
Unknown people classified as unknown: 0
