In [11]:
import face_recognition
import os
import pandas as pd
import numpy as np
import pickle
import json
from glob import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
np.random.seed(4)

In [2]:
data = []
target = []
face_names = []
num_faces = len(face_names)

In [3]:
faces_selected = set()
for file_path in glob(os.environ.get("DATA_DIR")+"/*/*"):
    class_name = file_path.split('/')[-2]
    if not class_name == 'test':
        if class_name not in faces_selected:
            faces_selected.add(class_name)
            face_names.append(class_name)
        load_image = face_recognition.load_image_file(file_path)
        list_encoding = face_recognition.face_encodings(load_image)
        if len(list_encoding) > 0:
            face_encoding = list_encoding[0]
            data.append(face_encoding)
            target.append(face_names.index(class_name))

In [4]:
face_names = np.array(face_names)
data = np.asarray(data)
dimensions = range(len(face_encoding))

In [5]:
df = pd.DataFrame(data, columns=dimensions)
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .60
df['face'] = pd.Categorical.from_codes(target, face_names)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,is_train,face
0,-0.056642,0.111967,0.045343,-0.080298,-0.121893,0.031283,-0.119775,-0.033428,0.090027,-0.03135,...,0.034124,0.021715,-0.169225,-0.166573,-0.053117,0.012711,0.003189,0.082766,False,putin
1,-0.056591,0.176904,0.05707,-0.045259,-0.127751,-0.024484,-0.099635,0.009118,0.081999,0.034808,...,0.027769,-0.041969,-0.164249,-0.125031,-0.051352,-0.031716,-0.022123,0.099742,True,putin
2,-0.075421,0.153674,0.059769,-0.064464,-0.152883,0.05188,-0.050011,-0.039049,0.196426,-0.068756,...,0.026801,0.070755,-0.186185,-0.09721,0.027087,0.018035,-0.04985,0.094557,False,putin
3,-0.043356,0.097681,0.06787,-0.068763,-0.129864,0.033538,-0.053448,-0.038728,0.116607,-0.034512,...,0.053414,0.029532,-0.152544,-0.16189,-0.020153,0.010267,0.035278,0.095204,False,putin
4,-0.0671,0.123819,-0.014953,-0.069412,-0.108969,0.032154,-0.052483,-0.032942,0.119839,-0.043241,...,0.06571,0.093189,-0.218938,-0.170097,-0.067346,-0.022166,-0.053724,0.050731,False,putin


In [6]:
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [7]:
features = df.columns[:len(face_encoding)]
clf = RandomForestClassifier(n_jobs=4)
y, _ = pd.factorize(train['face'])
clf.fit(train[features], y)

In [13]:
preds = face_names[np.array(clf.predict(test[features]))]
cross_validation = pd.crosstab(test['face'], preds, rownames=['actual'], colnames=['preds'])
print(cross_validation)
stats = {'label': 'random_forest', 'cross validation': str(cross_validation)}
model_filename = os.path.join(os.environ['OUTPUT_DIR'],'model.dat')
pickle.dump(clf, open(model_filename, 'wb'))
stats_filename = os.path.join(os.environ['OUTPUT_DIR'],'stats.json')
with open(stats_filename, 'wb') as f:
    f.write(json.dumps(stats))


preds         donald_trump  mike_pence  other  putin
actual                                              
putin                    0           0      1      7
mike_pence               0           5      1      0
other                    0           1      3      1
donald_trump             5           0      0      0
